[med-svn] [python-pysam] 01/03: New upstream version 0.12.0+ds

Andreas Tille tille at debian.org
Tue Aug 29 20:53:03 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository python-pysam.

commit 05f585113cc1da1e886586274e61c78a6464068a
Author: Andreas Tille <tille at debian.org>
Date:   Tue Aug 29 22:51:34 2017 +0200

    New upstream version 0.12.0+ds
---
 .gitignore                                         |   2 +
 README.rst                                         |  18 +-
 bcftools/HMM.c                                     |  95 +++--
 bcftools/HMM.c.pysam.c                             |  95 +++--
 bcftools/HMM.h                                     |   9 +-
 bcftools/bam2bcf.c                                 |  35 +-
 bcftools/bam2bcf.c.pysam.c                         |  35 +-
 bcftools/bam2bcf_indel.c                           |   8 +-
 bcftools/bam2bcf_indel.c.pysam.c                   |   8 +-
 bcftools/bcftools.h                                |   6 +
 bcftools/consensus.c                               |  11 +-
 bcftools/consensus.c.pysam.c                       |  11 +-
 bcftools/filter.c                                  |  47 ++
 bcftools/filter.c.pysam.c                          |  47 ++
 bcftools/main.c                                    |   6 +
 bcftools/main.c.pysam.c                            |   6 +
 bcftools/rbuf.h                                    |  66 ++-
 bcftools/vcfbuf.c                                  | 442 +++++++++++++++++++
 bcftools/vcfbuf.c.pysam.c                          | 444 +++++++++++++++++++
 bcftools/vcfbuf.h                                  |  81 ++++
 bcftools/vcfconvert.c                              |   8 +-
 bcftools/vcfconvert.c.pysam.c                      |   8 +-
 bcftools/vcfmerge.c                                |  32 +-
 bcftools/vcfmerge.c.pysam.c                        |  32 +-
 bcftools/vcfnorm.c                                 |  27 +-
 bcftools/vcfnorm.c.pysam.c                         |  27 +-
 bcftools/vcfplugin.c                               |  18 +-
 bcftools/vcfplugin.c.pysam.c                       |  18 +-
 bcftools/vcfroh.c                                  |  18 +-
 bcftools/vcfroh.c.pysam.c                          |  18 +-
 bcftools/version.h                                 |   2 +-
 cy_build.py                                        |  24 +-
 doc/conf.py                                        |   5 +-
 doc/faq.rst                                        |  14 +
 doc/glossary.rst                                   |  45 +-
 doc/installation.rst                               |  41 +-
 doc/release.rst                                    |  67 ++-
 import.py                                          |   2 +-
 .../PysamTestModule_link_pre_489/BuildRead.pyx     |  24 ++
 .../PysamTestModule_link_pre_489/__init__.py       |   3 +
 .../link_pre_489/cy_build.py                       |  24 +-
 linker_tests/link_pre_489/setup.py                 |  28 ++
 linker_tests/link_pre_489/tests/test_module.py     |  15 +
 .../PysamTestModule_link_with_rpath/BuildRead.pyx  |  24 ++
 .../PysamTestModule_link_with_rpath/__init__.py    |   3 +
 linker_tests/link_with_rpath/setup.py              |  36 ++
 linker_tests/link_with_rpath/tests/test_module.py  |  15 +
 .../BuildRead.pyx                                  |  24 ++
 .../PysamTestModule_link_without_rpath/__init__.py |   3 +
 linker_tests/link_without_rpath/setup.py           |  35 ++
 .../link_without_rpath/tests/test_module.py        |  15 +
 pysam/htslib_util.c                                |  10 +-
 pysam/htslib_util.h                                |   2 +-
 pysam/libcalignedsegment.pxd                       |  10 +-
 pysam/libcalignedsegment.pyx                       |  62 ++-
 pysam/libcalignmentfile.pxd                        |   6 +-
 pysam/libcalignmentfile.pyx                        | 475 ++++++++++-----------
 pysam/libcbcf.pyx                                  | 217 ++++++----
 pysam/libcbgzf.pyx                                 |   6 +-
 pysam/libcfaidx.pxd                                |   5 +-
 pysam/libcfaidx.pyx                                | 108 ++++-
 pysam/libchtslib.pxd                               |  20 +-
 pysam/libchtslib.pyx                               | 215 ++++++++--
 pysam/libctabix.pyx                                |  96 +++--
 pysam/libctabixproxies.pyx                         |  22 +-
 pysam/libcutils.pyx                                |   2 +-
 pysam/version.py                                   |   8 +-
 run_tests_travis.sh                                |  45 +-
 samtools/bam.h                                     |   2 +-
 samtools/bam_reheader.c.pysam.c                    |  12 +-
 samtools/bam_sort.c                                | 237 ++++++++--
 samtools/bam_sort.c.pysam.c                        | 237 ++++++++--
 samtools/bam_stat.c.pysam.c                        |  10 +-
 samtools/bamtk.c.pysam.c                           |   2 +-
 samtools/sam_view.c                                | 342 +++++++++++----
 samtools/sam_view.c.pysam.c                        | 342 +++++++++++----
 samtools/stats.c.pysam.c                           |   9 +-
 samtools/version.h                                 |   2 +-
 setup.cfg                                          |   8 +
 tests/AlignedSegment_test.py                       |  18 +-
 tests/AlignmentFile_test.py                        | 301 +++++++------
 tests/StreamFiledescriptors_test.py                |   8 +-
 tests/TestUtils.py                                 |  17 +-
 tests/VariantFile_test.py                          | 110 +++--
 tests/compile_test.py                              |  15 +-
 tests/faidx_test.py                                |  90 +++-
 tests/linking_test.py                              |  79 ++++
 tests/pysam_data/test_mapped_unmapped.sam          |  11 +
 tests/samtools_test.py                             | 244 ++++++-----
 tests/tabix_test.py                                | 129 +++---
 tests/tabixproxies_test.py                         |  40 +-
 tests/test_samtools_python.py                      |  35 +-
 92 files changed, 4309 insertions(+), 1427 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0910be8..f3e1e51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,8 @@ htslib/config.log
 htslib/config.mk
 pysam/config.py
 
+# linking tests
+BuildRead.c
 # cython files
 pysam/libc*.c
 
diff --git a/README.rst b/README.rst
index 5e37be5..eb065f0 100644
--- a/README.rst
+++ b/README.rst
@@ -12,20 +12,22 @@ next-generation sequencing methods.
 Pysam is a lightweight wrapper of the samtools_ C-API. Pysam also
 includes an interface for tabix_.
 
-The latest version is available through `pypi
-<https://pypi.python.org/pypi/pysam>`_. To install, simply type::
-
-   pip install pysam
-
 If you are using the conda packaging manager (e.g. miniconda or anaconda),
-you can install pysam from the `bioconda channel <https://bioconda.github.io/>`_:
+you can install pysam from the `bioconda channel <https://bioconda.github.io/>`_::
 
    conda config --add channels r
-
    conda config --add channels bioconda
-
    conda install pysam
 
+Installation through bioconda is the recommended way to install pysam
+as it resolves non-python dependencies and uses pre-configured
+compilation options. Especially for OS X this will potentially save a
+lot of trouble.
+
+Pysam is available through `pypi
+<https://pypi.python.org/pypi/pysam>`_. To install, type::
+
+   pip install pysam
 
 Pysam documentation is available through https://readthedocs.org/ from
 `here <http://pysam.readthedocs.org/en/latest/>`_
diff --git a/bcftools/HMM.c b/bcftools/HMM.c
index 5795987..70ad8d6 100644
--- a/bcftools/HMM.c
+++ b/bcftools/HMM.c
@@ -33,12 +33,11 @@
 
 typedef struct
 {
-    int nstates;        // number of hmm's states
-    int isite;          // take snapshot at i-th position
-    uint32_t pos;       // i-th site's position
-    double *vit_prob;   // viterbi probabilities, NULL for uniform probs
-    double *fwd_prob;   // transition probabilities
-    double *bwd_prob;   // transition probabilities
+    int nstates;            // number of hmm's states
+    uint32_t snap_at_pos;   // snapshot at this position, 0 when inactive
+    double *vit_prob;       // viterbi probabilities, NULL for uniform probs
+    double *fwd_prob;       // forward probabilities
+    double *bwd_prob;       // backward probabilities
 }
 snapshot_t;
 
@@ -61,8 +60,9 @@ struct _hmm_t
     set_tprob_f set_tprob;      // Optional user function to set / modify transition probabilities
                                 //  at each site (one step of Viterbi algorithm)
     void *set_tprob_data;
-    snapshot_t init;            // Initial state probabilities. Set isite=1 when site should be used
-    snapshot_t *snapshot;
+    snapshot_t init, state;     // Initial and current state probs. State is set from a snapshot if snapshot->snap_at_pos!=0, or from init otherwise
+    snapshot_t *snapshot;       //  snapshot->snap_at_pos  .. request a snapshot at this position
+                                //  hmm->state.snap_at_pos .. the current state comes from snapshot made at this position
 };
 
 uint8_t *hmm_get_viterbi_path(hmm_t *hmm) { return hmm->vpath; }
@@ -92,14 +92,21 @@ static inline void multiply_matrix(int n, double *a, double *b, double *dst, dou
 
 void hmm_init_states(hmm_t *hmm, double *probs)
 {
-    hmm->init.isite = 0;
-    hmm->init.pos   = 0;
+    hmm->init.snap_at_pos = hmm->state.snap_at_pos = 0;
+
     if ( !hmm->init.vit_prob )
         hmm->init.vit_prob = (double*) malloc(sizeof(double)*hmm->nstates);
     if ( !hmm->init.fwd_prob )
         hmm->init.fwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
     if ( !hmm->init.bwd_prob )
         hmm->init.bwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+
+    if ( !hmm->state.vit_prob )
+        hmm->state.vit_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+    if ( !hmm->state.fwd_prob )
+        hmm->state.fwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+    if ( !hmm->state.bwd_prob )
+        hmm->state.bwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
     
     int i;
     if ( probs )
@@ -112,8 +119,11 @@ void hmm_init_states(hmm_t *hmm, double *probs)
     else
         for (i=0; i<hmm->nstates; i++) hmm->init.vit_prob[i] = 1./hmm->nstates;
 
-    memcpy(hmm->init.fwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+    memcpy(hmm->init.fwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);  // these remain unchanged
     memcpy(hmm->init.bwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+    memcpy(hmm->state.vit_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates); // can be changed by snapshotting
+    memcpy(hmm->state.fwd_prob,hmm->init.fwd_prob,sizeof(double)*hmm->nstates);
+    memcpy(hmm->state.bwd_prob,hmm->init.bwd_prob,sizeof(double)*hmm->nstates);
 }
 hmm_t *hmm_init(int nstates, double *tprob, int ntprob)
 {
@@ -126,7 +136,7 @@ hmm_t *hmm_init(int nstates, double *tprob, int ntprob)
     return hmm;
 }
 
-void *hmm_snapshot(hmm_t *hmm, void *_snapshot, int isite)
+void *hmm_snapshot(hmm_t *hmm, void *_snapshot, uint32_t pos)
 {
     snapshot_t *snapshot = (snapshot_t*) _snapshot;
     if ( snapshot && snapshot->nstates!=hmm->nstates )
@@ -147,22 +157,33 @@ void *hmm_snapshot(hmm_t *hmm, void *_snapshot, int isite)
         snapshot->vit_prob = (double*) (mem + str_size + pad_size);
         snapshot->fwd_prob = snapshot->vit_prob + hmm->nstates;
     }
-    snapshot->isite = isite;
+    snapshot->snap_at_pos = pos;
     hmm->snapshot = snapshot;
     return snapshot;
 }
 void hmm_restore(hmm_t *hmm, void *_snapshot)
 {
     snapshot_t *snapshot = (snapshot_t*) _snapshot;
-    if ( !snapshot ) 
+    if ( !snapshot || !snapshot->snap_at_pos ) 
     {
-        hmm->init.isite = 0;
-        return;
+        hmm->state.snap_at_pos = 0;
+        memcpy(hmm->state.vit_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+        memcpy(hmm->state.fwd_prob,hmm->init.fwd_prob,sizeof(double)*hmm->nstates);
     }
-    hmm->init.isite = 1;
-    hmm->init.pos   = snapshot->pos;
-    memcpy(hmm->init.vit_prob,snapshot->vit_prob,sizeof(double)*hmm->nstates);
-    memcpy(hmm->init.fwd_prob,snapshot->fwd_prob,sizeof(double)*hmm->nstates);
+    else
+    {
+        hmm->state.snap_at_pos = snapshot->snap_at_pos;
+        memcpy(hmm->state.vit_prob,snapshot->vit_prob,sizeof(double)*hmm->nstates);
+        memcpy(hmm->state.fwd_prob,snapshot->fwd_prob,sizeof(double)*hmm->nstates);
+    }
+}
+void hmm_reset(hmm_t *hmm, void *_snapshot)
+{
+    snapshot_t *snapshot = (snapshot_t*) _snapshot;
+    if ( snapshot ) snapshot->snap_at_pos = 0;
+    hmm->state.snap_at_pos = 0;
+    memcpy(hmm->state.vit_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+    memcpy(hmm->state.fwd_prob,hmm->init.fwd_prob,sizeof(double)*hmm->nstates);
 }
 
 void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob)
@@ -219,8 +240,8 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
 
     // Init all states with equal likelihood
     int i,j, nstates = hmm->nstates;
-    memcpy(hmm->vprob, hmm->init.vit_prob, sizeof(*hmm->init.vit_prob)*nstates);
-    uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
+    memcpy(hmm->vprob, hmm->state.vit_prob, sizeof(*hmm->state.vit_prob)*nstates);
+    uint32_t prev_pos = hmm->state.snap_at_pos ? hmm->state.snap_at_pos : sites[0];
 
     // Run Viterbi
     for (i=0; i<n; i++)
@@ -250,11 +271,8 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
         for (j=0; j<nstates; j++) hmm->vprob_tmp[j] /= vnorm;
         double *tmp = hmm->vprob; hmm->vprob = hmm->vprob_tmp; hmm->vprob_tmp = tmp;
 
-        if ( hmm->snapshot && i==hmm->snapshot->isite )
-        {
-            hmm->snapshot->pos = sites[i];
+        if ( hmm->snapshot && sites[i]==hmm->snapshot->snap_at_pos )
             memcpy(hmm->snapshot->vit_prob, hmm->vprob, sizeof(*hmm->vprob)*nstates);
-        }
     }
 
     // Find the most likely state
@@ -286,12 +304,10 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
     }
 
 
-    // Init all states with equal likelihood
     int i,j,k, nstates = hmm->nstates;
-    memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
-    memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
-
-    uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
+    memcpy(hmm->fwd, hmm->state.fwd_prob, sizeof(*hmm->state.fwd_prob)*nstates);
+    memcpy(hmm->bwd, hmm->state.bwd_prob, sizeof(*hmm->state.bwd_prob)*nstates);
+    uint32_t prev_pos = hmm->state.snap_at_pos ? hmm->state.snap_at_pos : sites[0];
 
     // Run fwd 
     for (i=0; i<n; i++)
@@ -316,13 +332,9 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
             norm += fwd[j];
         }
         for (j=0; j<nstates; j++) fwd[j] /= norm;
-    }
 
-    if ( hmm->snapshot )
-    {
-        i = hmm->snapshot->isite;
-        hmm->snapshot->pos = sites[i];
-        memcpy(hmm->snapshot->fwd_prob, hmm->fwd + (i+1)*nstates, sizeof(*hmm->fwd)*nstates);
+        if ( hmm->snapshot && sites[i]==hmm->snapshot->snap_at_pos )
+            memcpy(hmm->snapshot->fwd_prob, fwd, sizeof(*fwd)*nstates);
     }
 
     // Run bwd
@@ -376,9 +388,9 @@ double *hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
 
     // Init all states with equal likelihood
     int i,j,k, nstates = hmm->nstates;
-    memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
-    memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
-    uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
+    memcpy(hmm->fwd, hmm->state.fwd_prob, sizeof(*hmm->state.fwd_prob)*nstates);
+    memcpy(hmm->bwd, hmm->state.bwd_prob, sizeof(*hmm->state.bwd_prob)*nstates);
+    uint32_t prev_pos = hmm->state.snap_at_pos ? hmm->state.snap_at_pos : sites[0];
 
     // New transition matrix: temporary values
     double *tmp_xi = (double*) calloc(nstates*nstates,sizeof(double));
@@ -480,6 +492,9 @@ void hmm_destroy(hmm_t *hmm)
     free(hmm->init.vit_prob);
     free(hmm->init.fwd_prob);
     free(hmm->init.bwd_prob);
+    free(hmm->state.vit_prob);
+    free(hmm->state.fwd_prob);
+    free(hmm->state.bwd_prob);
     free(hmm->vprob);
     free(hmm->vprob_tmp);
     free(hmm->vpath);
diff --git a/bcftools/HMM.c.pysam.c b/bcftools/HMM.c.pysam.c
index 513da35..998254c 100644
--- a/bcftools/HMM.c.pysam.c
+++ b/bcftools/HMM.c.pysam.c
@@ -35,12 +35,11 @@
 
 typedef struct
 {
-    int nstates;        // number of hmm's states
-    int isite;          // take snapshot at i-th position
-    uint32_t pos;       // i-th site's position
-    double *vit_prob;   // viterbi probabilities, NULL for uniform probs
-    double *fwd_prob;   // transition probabilities
-    double *bwd_prob;   // transition probabilities
+    int nstates;            // number of hmm's states
+    uint32_t snap_at_pos;   // snapshot at this position, 0 when inactive
+    double *vit_prob;       // viterbi probabilities, NULL for uniform probs
+    double *fwd_prob;       // forward probabilities
+    double *bwd_prob;       // backward probabilities
 }
 snapshot_t;
 
@@ -63,8 +62,9 @@ struct _hmm_t
     set_tprob_f set_tprob;      // Optional user function to set / modify transition probabilities
                                 //  at each site (one step of Viterbi algorithm)
     void *set_tprob_data;
-    snapshot_t init;            // Initial state probabilities. Set isite=1 when site should be used
-    snapshot_t *snapshot;
+    snapshot_t init, state;     // Initial and current state probs. State is set from a snapshot if snapshot->snap_at_pos!=0, or from init otherwise
+    snapshot_t *snapshot;       //  snapshot->snap_at_pos  .. request a snapshot at this position
+                                //  hmm->state.snap_at_pos .. the current state comes from snapshot made at this position
 };
 
 uint8_t *hmm_get_viterbi_path(hmm_t *hmm) { return hmm->vpath; }
@@ -94,14 +94,21 @@ static inline void multiply_matrix(int n, double *a, double *b, double *dst, dou
 
 void hmm_init_states(hmm_t *hmm, double *probs)
 {
-    hmm->init.isite = 0;
-    hmm->init.pos   = 0;
+    hmm->init.snap_at_pos = hmm->state.snap_at_pos = 0;
+
     if ( !hmm->init.vit_prob )
         hmm->init.vit_prob = (double*) malloc(sizeof(double)*hmm->nstates);
     if ( !hmm->init.fwd_prob )
         hmm->init.fwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
     if ( !hmm->init.bwd_prob )
         hmm->init.bwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+
+    if ( !hmm->state.vit_prob )
+        hmm->state.vit_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+    if ( !hmm->state.fwd_prob )
+        hmm->state.fwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+    if ( !hmm->state.bwd_prob )
+        hmm->state.bwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
     
     int i;
     if ( probs )
@@ -114,8 +121,11 @@ void hmm_init_states(hmm_t *hmm, double *probs)
     else
         for (i=0; i<hmm->nstates; i++) hmm->init.vit_prob[i] = 1./hmm->nstates;
 
-    memcpy(hmm->init.fwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+    memcpy(hmm->init.fwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);  // these remain unchanged
     memcpy(hmm->init.bwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+    memcpy(hmm->state.vit_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates); // can be changed by snapshotting
+    memcpy(hmm->state.fwd_prob,hmm->init.fwd_prob,sizeof(double)*hmm->nstates);
+    memcpy(hmm->state.bwd_prob,hmm->init.bwd_prob,sizeof(double)*hmm->nstates);
 }
 hmm_t *hmm_init(int nstates, double *tprob, int ntprob)
 {
@@ -128,7 +138,7 @@ hmm_t *hmm_init(int nstates, double *tprob, int ntprob)
     return hmm;
 }
 
-void *hmm_snapshot(hmm_t *hmm, void *_snapshot, int isite)
+void *hmm_snapshot(hmm_t *hmm, void *_snapshot, uint32_t pos)
 {
     snapshot_t *snapshot = (snapshot_t*) _snapshot;
     if ( snapshot && snapshot->nstates!=hmm->nstates )
@@ -149,22 +159,33 @@ void *hmm_snapshot(hmm_t *hmm, void *_snapshot, int isite)
         snapshot->vit_prob = (double*) (mem + str_size + pad_size);
         snapshot->fwd_prob = snapshot->vit_prob + hmm->nstates;
     }
-    snapshot->isite = isite;
+    snapshot->snap_at_pos = pos;
     hmm->snapshot = snapshot;
     return snapshot;
 }
 void hmm_restore(hmm_t *hmm, void *_snapshot)
 {
     snapshot_t *snapshot = (snapshot_t*) _snapshot;
-    if ( !snapshot ) 
+    if ( !snapshot || !snapshot->snap_at_pos ) 
     {
-        hmm->init.isite = 0;
-        return;
+        hmm->state.snap_at_pos = 0;
+        memcpy(hmm->state.vit_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+        memcpy(hmm->state.fwd_prob,hmm->init.fwd_prob,sizeof(double)*hmm->nstates);
     }
-    hmm->init.isite = 1;
-    hmm->init.pos   = snapshot->pos;
-    memcpy(hmm->init.vit_prob,snapshot->vit_prob,sizeof(double)*hmm->nstates);
-    memcpy(hmm->init.fwd_prob,snapshot->fwd_prob,sizeof(double)*hmm->nstates);
+    else
+    {
+        hmm->state.snap_at_pos = snapshot->snap_at_pos;
+        memcpy(hmm->state.vit_prob,snapshot->vit_prob,sizeof(double)*hmm->nstates);
+        memcpy(hmm->state.fwd_prob,snapshot->fwd_prob,sizeof(double)*hmm->nstates);
+    }
+}
+void hmm_reset(hmm_t *hmm, void *_snapshot)
+{
+    snapshot_t *snapshot = (snapshot_t*) _snapshot;
+    if ( snapshot ) snapshot->snap_at_pos = 0;
+    hmm->state.snap_at_pos = 0;
+    memcpy(hmm->state.vit_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+    memcpy(hmm->state.fwd_prob,hmm->init.fwd_prob,sizeof(double)*hmm->nstates);
 }
 
 void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob)
@@ -221,8 +242,8 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
 
     // Init all states with equal likelihood
     int i,j, nstates = hmm->nstates;
-    memcpy(hmm->vprob, hmm->init.vit_prob, sizeof(*hmm->init.vit_prob)*nstates);
-    uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
+    memcpy(hmm->vprob, hmm->state.vit_prob, sizeof(*hmm->state.vit_prob)*nstates);
+    uint32_t prev_pos = hmm->state.snap_at_pos ? hmm->state.snap_at_pos : sites[0];
 
     // Run Viterbi
     for (i=0; i<n; i++)
@@ -252,11 +273,8 @@ void hmm_run_viterbi(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
         for (j=0; j<nstates; j++) hmm->vprob_tmp[j] /= vnorm;
         double *tmp = hmm->vprob; hmm->vprob = hmm->vprob_tmp; hmm->vprob_tmp = tmp;
 
-        if ( hmm->snapshot && i==hmm->snapshot->isite )
-        {
-            hmm->snapshot->pos = sites[i];
+        if ( hmm->snapshot && sites[i]==hmm->snapshot->snap_at_pos )
             memcpy(hmm->snapshot->vit_prob, hmm->vprob, sizeof(*hmm->vprob)*nstates);
-        }
     }
 
     // Find the most likely state
@@ -288,12 +306,10 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
     }
 
 
-    // Init all states with equal likelihood
     int i,j,k, nstates = hmm->nstates;
-    memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
-    memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
-
-    uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
+    memcpy(hmm->fwd, hmm->state.fwd_prob, sizeof(*hmm->state.fwd_prob)*nstates);
+    memcpy(hmm->bwd, hmm->state.bwd_prob, sizeof(*hmm->state.bwd_prob)*nstates);
+    uint32_t prev_pos = hmm->state.snap_at_pos ? hmm->state.snap_at_pos : sites[0];
 
     // Run fwd 
     for (i=0; i<n; i++)
@@ -318,13 +334,9 @@ void hmm_run_fwd_bwd(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
             norm += fwd[j];
         }
         for (j=0; j<nstates; j++) fwd[j] /= norm;
-    }
 
-    if ( hmm->snapshot )
-    {
-        i = hmm->snapshot->isite;
-        hmm->snapshot->pos = sites[i];
-        memcpy(hmm->snapshot->fwd_prob, hmm->fwd + (i+1)*nstates, sizeof(*hmm->fwd)*nstates);
+        if ( hmm->snapshot && sites[i]==hmm->snapshot->snap_at_pos )
+            memcpy(hmm->snapshot->fwd_prob, fwd, sizeof(*fwd)*nstates);
     }
 
     // Run bwd
@@ -378,9 +390,9 @@ double *hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
 
     // Init all states with equal likelihood
     int i,j,k, nstates = hmm->nstates;
-    memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
-    memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
-    uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
+    memcpy(hmm->fwd, hmm->state.fwd_prob, sizeof(*hmm->state.fwd_prob)*nstates);
+    memcpy(hmm->bwd, hmm->state.bwd_prob, sizeof(*hmm->state.bwd_prob)*nstates);
+    uint32_t prev_pos = hmm->state.snap_at_pos ? hmm->state.snap_at_pos : sites[0];
 
     // New transition matrix: temporary values
     double *tmp_xi = (double*) calloc(nstates*nstates,sizeof(double));
@@ -482,6 +494,9 @@ void hmm_destroy(hmm_t *hmm)
     free(hmm->init.vit_prob);
     free(hmm->init.fwd_prob);
     free(hmm->init.bwd_prob);
+    free(hmm->state.vit_prob);
+    free(hmm->state.fwd_prob);
+    free(hmm->state.bwd_prob);
     free(hmm->vprob);
     free(hmm->vprob_tmp);
     free(hmm->vpath);
diff --git a/bcftools/HMM.h b/bcftools/HMM.h
index 3e5cf7f..70c9cb8 100644
--- a/bcftools/HMM.h
+++ b/bcftools/HMM.h
@@ -59,16 +59,21 @@ void hmm_init_states(hmm_t *hmm, double *probs);
 /**
  *   hmm_snapshot() - take the model's snapshot, intended for sliding HMM
  *   @snapshot: NULL or snapshot returned by previous hmm_snapshot() call, must be free()-ed by the caller
- *   @isite:    take the snapshot at i-th step
+ *   @pos:      take the snapshot at this position
+ *
+ *   If both restore() and snapshot() are needed, restore() must be called first.
  */
-void *hmm_snapshot(hmm_t *hmm, void *snapshot, int isite);
+void *hmm_snapshot(hmm_t *hmm, void *snapshot, uint32_t pos);
 
 /**
  *   hmm_restore() - restore model's snapshot, intended for sliding HMM
  *   @snapshot: snapshot returned by hmm_snapshot() call or NULL to reset
  *   @isite:    take the snapshot at i-th step
+ *
+ *   If both restore() and snapshot() are needed, restore() must be called first.
  */
 void hmm_restore(hmm_t *hmm, void *snapshot);
+void hmm_reset(hmm_t *hmm, void *snapshot);
 
 /**
  *   hmm_get_tprob() - return the array of transition matrices, precalculated
diff --git a/bcftools/bam2bcf.c b/bcftools/bam2bcf.c
index b4fb7f1..c8dc91b 100644
--- a/bcftools/bam2bcf.c
+++ b/bcftools/bam2bcf.c
@@ -168,27 +168,36 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
     for (i = n = 0; i < _n; ++i) {
         const bam_pileup1_t *p = pl + i;
         int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
-        // set base
-        if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
+        if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
+        if (p->is_del && !is_indel) continue;
         ++ori_depth;
+        if (is_indel)
+        {
+            b     = p->aux>>16&0x3f;
+            baseQ = q = p->aux&0xff;
+            // This read is not counted as indel. Instead of skipping it, treat it as ref. It is
+            // still only an approximation, but gives more accurate AD counts and calls correctly
+            // hets instead of alt-homs in some cases (see test/mpileup/indel-AD.1.sam)
+            if ( q < bca->min_baseQ ) b = 0, q = (int)bam_get_qual(p->b)[p->qpos];
+            seqQ  = p->aux>>8&0xff;
+            is_diff = (b != 0);
+        }
+        else
+        {
+            b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
+            b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
+            baseQ = q = (int)bam_get_qual(p->b)[p->qpos];
+            if (q < bca->min_baseQ) continue;
+            seqQ  = 99;
+            is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
+        }
         mapQ  = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
         if ( !mapQ ) r->mq0++;
-        baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality
-        seqQ = is_indel? (p->aux>>8&0xff) : 99;
-        if (q < bca->min_baseQ) continue;
         if (q > seqQ) q = seqQ;
         mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
         if (q > mapQ) q = mapQ;
         if (q > 63) q = 63;
         if (q < 4) q = 4;       // MQ=0 reads count as BQ=4
-        if (!is_indel) {
-            b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
-            b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
-            is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
-        } else {
-            b = p->aux>>16&0x3f;
-            is_diff = (b != 0);
-        }
         bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
         // collect annotations
         if (b < 4)
diff --git a/bcftools/bam2bcf.c.pysam.c b/bcftools/bam2bcf.c.pysam.c
index 5a1a443..4db42e4 100644
--- a/bcftools/bam2bcf.c.pysam.c
+++ b/bcftools/bam2bcf.c.pysam.c
@@ -170,27 +170,36 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
     for (i = n = 0; i < _n; ++i) {
         const bam_pileup1_t *p = pl + i;
         int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
-        // set base
-        if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
+        if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
+        if (p->is_del && !is_indel) continue;
         ++ori_depth;
+        if (is_indel)
+        {
+            b     = p->aux>>16&0x3f;
+            baseQ = q = p->aux&0xff;
+            // This read is not counted as indel. Instead of skipping it, treat it as ref. It is
+            // still only an approximation, but gives more accurate AD counts and calls correctly
+            // hets instead of alt-homs in some cases (see test/mpileup/indel-AD.1.sam)
+            if ( q < bca->min_baseQ ) b = 0, q = (int)bam_get_qual(p->b)[p->qpos];
+            seqQ  = p->aux>>8&0xff;
+            is_diff = (b != 0);
+        }
+        else
+        {
+            b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
+            b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
+            baseQ = q = (int)bam_get_qual(p->b)[p->qpos];
+            if (q < bca->min_baseQ) continue;
+            seqQ  = 99;
+            is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
+        }
         mapQ  = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
         if ( !mapQ ) r->mq0++;
-        baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality
-        seqQ = is_indel? (p->aux>>8&0xff) : 99;
-        if (q < bca->min_baseQ) continue;
         if (q > seqQ) q = seqQ;
         mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
         if (q > mapQ) q = mapQ;
         if (q > 63) q = 63;
         if (q < 4) q = 4;       // MQ=0 reads count as BQ=4
-        if (!is_indel) {
-            b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
-            b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
-            is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
-        } else {
-            b = p->aux>>16&0x3f;
-            is_diff = (b != 0);
-        }
         bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
         // collect annotations
         if (b < 4)
diff --git a/bcftools/bam2bcf_indel.c b/bcftools/bam2bcf_indel.c
index 52837b5..6c367da 100644
--- a/bcftools/bam2bcf_indel.c
+++ b/bcftools/bam2bcf_indel.c
@@ -357,14 +357,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
                     }
                     free(qq);
                 }
-/*
+#if 0
                 for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
                     fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr);
                 fputc('\n', stderr);
                 for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr);
                 fputc('\n', stderr);
-                fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
-*/
+                fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc);
+#endif
             }
         }
     }
@@ -454,7 +454,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
                     if (x == bca->indel_types[j]) break;
                 p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
                 if ((p->aux>>16&0x3f) > 0) ++n_alt;
-                //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
+                //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
             }
         }
 
diff --git a/bcftools/bam2bcf_indel.c.pysam.c b/bcftools/bam2bcf_indel.c.pysam.c
index 0d36841..4b37122 100644
--- a/bcftools/bam2bcf_indel.c.pysam.c
+++ b/bcftools/bam2bcf_indel.c.pysam.c
@@ -359,14 +359,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
                     }
                     free(qq);
                 }
-/*
+#if 0
                 for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
                     fputc("ACGTN"[(int)ref2[tbeg-left+l]], pysam_stderr);
                 fputc('\n', pysam_stderr);
                 for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], pysam_stderr);
                 fputc('\n', pysam_stderr);
-                fprintf(pysam_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
-*/
+                fprintf(pysam_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc);
+#endif
             }
         }
     }
@@ -456,7 +456,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
                     if (x == bca->indel_types[j]) break;
                 p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
                 if ((p->aux>>16&0x3f) > 0) ++n_alt;
-                //fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
+                //fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
             }
         }
 
diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h
index 7d2d49f..dde3ab0 100644
--- a/bcftools/bcftools.h
+++ b/bcftools/bcftools.h
@@ -63,6 +63,12 @@ static inline char gt2iupac(char a, char b)
     return iupac[(int)a][(int)b];
 }
 
+static inline char nt_to_upper(char nt)    // upper-case an ASCII letter; any other character is returned unchanged
+{
+    if ( nt < 'a' || nt > 'z' ) return nt;  // previously everything >= 97 was shifted, mangling '{', '|', '}', '~'
+    return nt - 32;                         // 'a'-'A' == 32 in ASCII
+}
+
 static inline double phred_score(double prob)
 {
     if ( prob==0 ) return 99;
diff --git a/bcftools/consensus.c b/bcftools/consensus.c
index 4fccc4f..258ef14 100644
--- a/bcftools/consensus.c
+++ b/bcftools/consensus.c
@@ -536,13 +536,14 @@ static void consensus(args_t *args)
     {
         if ( str.s[0]=='>' )
         {
-            // new sequence encountered, apply all cached variants
+            // new sequence encountered
+            if (args->chain) {
+                print_chain(args);
+                destroy_chain(args);
+            }
+            // apply all cached variants
             while ( args->vcf_rbuf.n )
             {
-                if (args->chain) {
-                    print_chain(args);
-                    destroy_chain(args);
-                }
                 bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f];
                 if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
                 int i = rbuf_shift(&args->vcf_rbuf);
diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c
index 51d9339..86e855e 100644
--- a/bcftools/consensus.c.pysam.c
+++ b/bcftools/consensus.c.pysam.c
@@ -538,13 +538,14 @@ static void consensus(args_t *args)
     {
         if ( str.s[0]=='>' )
         {
-            // new sequence encountered, apply all cached variants
+            // new sequence encountered
+            if (args->chain) {
+                print_chain(args);
+                destroy_chain(args);
+            }
+            // apply all cached variants
             while ( args->vcf_rbuf.n )
             {
-                if (args->chain) {
-                    print_chain(args);
-                    destroy_chain(args);
-                }
                 bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f];
                 if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
                 int i = rbuf_shift(&args->vcf_rbuf);
diff --git a/bcftools/filter.c b/bcftools/filter.c
index 463028f..78ff1f1 100644
--- a/bcftools/filter.c
+++ b/bcftools/filter.c
@@ -698,6 +698,40 @@ static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
     tok->values[0] = str.m;
     tok->str_value = str.s;
 }
+static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+    bcf_unpack(line, BCF_UN_FMT);
+    if ( !line->n_sample )
+    {
+        tok->nvalues = 1;
+        tok->values[0] = 0;
+        return;
+    }
+
+    int i,igt = bcf_hdr_id2int(flt->hdr, BCF_DT_ID, "GT");
+    bcf_fmt_t *fmt = NULL;
+    for (i=0; i<line->n_fmt; i++)
+        if ( line->d.fmt[i].id==igt ) { fmt = &line->d.fmt[i]; break; }
+    if ( !fmt )
+    {
+        tok->nvalues = 0;
+        return;
+    }
+    if ( fmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8\n");
+
+    int j,nmissing = 0;
+    for (i=0; i<line->n_sample; i++)
+    {
+        int8_t *ptr = (int8_t*) (fmt->p + i*fmt->size);
+        for (j=0; j<fmt->n; j++)
+        {
+            if ( ptr[j]==bcf_int8_vector_end ) break;
+            if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; }
+        }
+    }
+    tok->nvalues = 1;
+    tok->values[0] = tok->tag[0]=='N' ? nmissing : (double)nmissing / line->n_sample;
+}
 static void filters_set_nalt(filter_t *flt, bcf1_t *line, token_t *tok)
 {
     tok->nvalues = 1;
@@ -832,6 +866,7 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
         {
             char *se = ss;
             while ( *se && *se!=',' ) se++;
+            hts_expand(double, i+1, tok->mvalues, tok->values);
             if ( !*se ) tok->values[i] = strlen(ss);
             else
             {
@@ -1303,6 +1338,18 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->threshold = bcf_hdr_nsamples(filter->hdr);
             return 0;
         }
+        else if ( !strncasecmp(str,"N_MISSING",len) )
+        {
+            tok->setter = &filters_set_nmissing;
+            tok->tag = strdup("N_MISSING");
+            return 0;
+        }
+        else if ( !strncasecmp(str,"F_MISSING",len) )
+        {
+            tok->setter = &filters_set_nmissing;
+            tok->tag = strdup("F_MISSING");
+            return 0;
+        }
     }
 
     // does it have array subscript?
diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c
index 44046f2..25200c4 100644
--- a/bcftools/filter.c.pysam.c
+++ b/bcftools/filter.c.pysam.c
@@ -700,6 +700,40 @@ static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
     tok->values[0] = str.m;
     tok->str_value = str.s;
 }
+static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
+{
+    bcf_unpack(line, BCF_UN_FMT);
+    if ( !line->n_sample )
+    {
+        tok->nvalues = 1;
+        tok->values[0] = 0;
+        return;
+    }
+
+    int i,igt = bcf_hdr_id2int(flt->hdr, BCF_DT_ID, "GT");
+    bcf_fmt_t *fmt = NULL;
+    for (i=0; i<line->n_fmt; i++)
+        if ( line->d.fmt[i].id==igt ) { fmt = &line->d.fmt[i]; break; }
+    if ( !fmt )
+    {
+        tok->nvalues = 0;
+        return;
+    }
+    if ( fmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8\n");
+
+    int j,nmissing = 0;
+    for (i=0; i<line->n_sample; i++)
+    {
+        int8_t *ptr = (int8_t*) (fmt->p + i*fmt->size);
+        for (j=0; j<fmt->n; j++)
+        {
+            if ( ptr[j]==bcf_int8_vector_end ) break;
+            if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; }
+        }
+    }
+    tok->nvalues = 1;
+    tok->values[0] = tok->tag[0]=='N' ? nmissing : (double)nmissing / line->n_sample;
+}
 static void filters_set_nalt(filter_t *flt, bcf1_t *line, token_t *tok)
 {
     tok->nvalues = 1;
@@ -834,6 +868,7 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
         {
             char *se = ss;
             while ( *se && *se!=',' ) se++;
+            hts_expand(double, i+1, tok->mvalues, tok->values);
             if ( !*se ) tok->values[i] = strlen(ss);
             else
             {
@@ -1305,6 +1340,18 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->threshold = bcf_hdr_nsamples(filter->hdr);
             return 0;
         }
+        else if ( !strncasecmp(str,"N_MISSING",len) )
+        {
+            tok->setter = &filters_set_nmissing;
+            tok->tag = strdup("N_MISSING");
+            return 0;
+        }
+        else if ( !strncasecmp(str,"F_MISSING",len) )
+        {
+            tok->setter = &filters_set_nmissing;
+            tok->tag = strdup("F_MISSING");
+            return 0;
+        }
     }
 
     // does it have array subscript?
diff --git a/bcftools/main.c b/bcftools/main.c
index 9350ff8..4e3e0e5 100644
--- a/bcftools/main.c
+++ b/bcftools/main.c
@@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.  */
 
+#include "config.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -110,7 +111,12 @@ static cmd_t cmds[] =
     },
     { .func  = main_plugin,
       .alias = "plugin",
+#ifdef ENABLE_BCF_PLUGINS
       .help  = "user-defined plugins"
+#else
+      /* Do not advertise when plugins disabled. */
+      .help  = "-user-defined plugins"
+#endif
     },
     { .func  = main_vcfquery,
       .alias = "query",
diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c
index a2b4a99..f148252 100644
--- a/bcftools/main.c.pysam.c
+++ b/bcftools/main.c.pysam.c
@@ -24,6 +24,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.  */
 
+#include "config.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -112,7 +113,12 @@ static cmd_t cmds[] =
     },
     { .func  = main_plugin,
       .alias = "plugin",
+#ifdef ENABLE_BCF_PLUGINS
       .help  = "user-defined plugins"
+#else
+      /* Do not advertise when plugins disabled. */
+      .help  = "-user-defined plugins"
+#endif
     },
     { .func  = main_vcfquery,
       .alias = "query",
diff --git a/bcftools/rbuf.h b/bcftools/rbuf.h
index 3d2805c..2c0e5b1 100644
--- a/bcftools/rbuf.h
+++ b/bcftools/rbuf.h
@@ -46,11 +46,16 @@ static inline void rbuf_init(rbuf_t *rbuf, int size)
 /**
  *  rbuf_kth() - get index of the k-th element of the round buffer
  *  @rbuf:  the rbuf_t holder
- *  @k:     0-based index
+ *  @k:     0-based index. If negative, return k-th element from the end, 1-based
  */
 static inline int rbuf_kth(rbuf_t *rbuf, int k)
 {
-    if ( k >= rbuf->n || k<0 ) return -1;
+    if ( k >= rbuf->n ) return -1;
+    if ( k < 0 )
+    {
+        k = rbuf->n + k; 
+        if ( k < 0 ) return -1;
+    }
     int i = k + rbuf->f;
     if ( i >= rbuf->m ) i -= rbuf->m;
     return i;
@@ -58,9 +63,29 @@ static inline int rbuf_kth(rbuf_t *rbuf, int k)
 /**
  *  rbuf_last() - get index of the last element of the round buffer
  *  @rbuf:  the rbuf_t holder
+ */
+#define rbuf_last(rbuf) rbuf_kth(rbuf, -1)
+
+/**
+ *  rbuf_l2ridx() - get 0-based rbuf index which corresponds to i-th linear index
+ *  @rbuf:  the rbuf_t holder
+ *  @idx:   0-based linear index
  *
+ *  Returns 0-based circular index or -1 if out of bounds
  */
-#define rbuf_last(rbuf) rbuf_kth(rbuf, (rbuf)->n - 1)
+static inline int rbuf_l2ridx(rbuf_t *rbuf, int idx)
+{
+    if ( idx < 0 || idx >= rbuf->n ) return -1;
+    if ( idx >= rbuf->f )
+    {
+        int i = idx - rbuf->f;
+        if ( i >= rbuf->n ) return -1;
+        return i;
+    }
+    int i = rbuf->m - rbuf->f + idx;
+    if ( i >= rbuf->n ) return -1;
+    return i;
+}
 
 /**
  *  rbuf_next() - get index of the next element in the round buffer
@@ -198,4 +223,39 @@ static inline void rbuf_shift_n(rbuf_t *rbuf, int n)
     } \
 }
 
+/**
+ *  rbuf_remove_kth() - remove k-th rbuf element (0-based) and memmove the data block
+ *  @rbuf:      the rbuf holder
+ *  @type_t:    data type
+ *  @kth:       k-th element to remove
+ *  @data:      data array to be modified
+ */
+#define rbuf_remove_kth(rbuf, type_t, kth, data) \
+{ \
+    int k = rbuf_kth(rbuf, kth); \
+    if ( k < (rbuf)->f )    /* shrink from back */ \
+    { \
+        int l = rbuf_kth(rbuf, -1); \
+        if ( k < l ) \
+        { \
+            type_t tmp = (data)[k]; \
+            memmove(data+k, data+k+1, (l - k)*sizeof(type_t)); \
+            (data)[l] = tmp; \
+        } \
+        (rbuf)->n--; \
+    } \
+    else                    /* shrink from front */ \
+    { \
+        if ( k > (rbuf)->f ) \
+        { \
+            type_t tmp = (data)[k]; \
+            memmove(&data[(rbuf)->f+1], &data[(rbuf)->f], (k - (rbuf)->f)*sizeof(type_t)); \
+            (data)[(rbuf)->f] = tmp; \
+        } \
+        (rbuf)->f++; \
+        (rbuf)->n--; \
+        if ( (rbuf)->f == (rbuf)->m ) (rbuf)->f = 0; \
+    } \
+}
+
 #endif
diff --git a/bcftools/vcfbuf.c b/bcftools/vcfbuf.c
new file mode 100644
index 0000000..d896d3a
--- /dev/null
+++ b/bcftools/vcfbuf.c
@@ -0,0 +1,442 @@
+/* The MIT License
+
+   Copyright (c) 2016 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3 at sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <htslib/vcf.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "vcfbuf.h"
+#include "rbuf.h"
+
+typedef struct
+{
+    double max;
+    int rand_missing, skip_filter;
+}
+ld_t;
+
+typedef struct
+{
+    bcf1_t *rec;
+    double af;
+    int af_set:1, idx:31;
+}
+vcfrec_t;
+
+typedef struct
+{
+    int max_sites, mvrec, mac, mfarr;
+    int *ac, *idx;
+    float *farr;
+    char *af_tag;
+    vcfrec_t **vrec;
+}
+prune_t;
+
+typedef struct
+{
+    int active, rid, end;
+}
+overlap_t;
+
+struct _vcfbuf_t
+{
+    int win;
+    bcf_hdr_t *hdr;
+    vcfrec_t *vcf;
+    rbuf_t rbuf;
+    ld_t ld;
+    prune_t prune;
+    overlap_t overlap;
+};
+
+vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win)
+{
+    vcfbuf_t *buf = (vcfbuf_t*) calloc(1,sizeof(vcfbuf_t));
+    buf->hdr = hdr;
+    buf->win = win;
+    buf->overlap.rid = -1;
+    rbuf_init(&buf->rbuf, 0);
+    return buf;
+}
+
+void vcfbuf_destroy(vcfbuf_t *buf)
+{
+    int i;
+    for (i=0; i<buf->rbuf.m; i++)
+        if ( buf->vcf[i].rec ) bcf_destroy(buf->vcf[i].rec);
+    free(buf->vcf);
+    free(buf->prune.farr);
+    free(buf->prune.vrec);
+    free(buf->prune.ac);
+    free(buf->prune.idx);
+    free(buf);
+}
+
+void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value)
+{
+    if ( key==VCFBUF_LD_MAX ) { buf->ld.max = *((double*)value); return; }
+    if ( key==VCFBUF_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
+    if ( key==VCFBUF_SKIP_FILTER ) { buf->ld.skip_filter = *((int*)value); return; }
+    if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; }
+    if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; }
+    if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; }
+}
+
+int vcfbuf_nsites(vcfbuf_t *buf)
+{
+    return buf->rbuf.n;
+}
+
+bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap)
+{
+    if ( !swap ) error("todo: swap=%d\n", swap);
+
+    rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);
+
+    int i = rbuf_append(&buf->rbuf);
+    if ( !buf->vcf[i].rec ) buf->vcf[i].rec = bcf_init1();
+    
+    bcf1_t *ret = buf->vcf[i].rec;
+    buf->vcf[i].rec = rec;
+    buf->vcf[i].af_set = 0;
+
+    return ret;
+}
+
+static int cmpvrec(const void *_a, const void *_b)
+{
+    vcfrec_t *a = *((vcfrec_t**) _a);
+    vcfrec_t *b = *((vcfrec_t**) _b);
+    if ( a->af < b->af ) return -1;
+    if ( a->af == b->af ) return 0;
+    return 1;
+}
+static int cmpint_desc(const void *_a, const void *_b)
+{
+    int a = *((int*)_a);
+    int b = *((int*)_b);
+    if ( a < b ) return 1;
+    if ( a == b ) return 0;
+    return -1;
+}
+
+// Prune low allele-frequency sites so that at most prune.max_sites of the considered sites remain.
+// Unless flush_all is set, only the first n-1 sites returned by rbuf_next() are considered.
+static void _prune_sites(vcfbuf_t *buf, int flush_all)
+{
+    int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1;
+    if ( nbuf > buf->prune.mvrec )
+    {
+        buf->prune.idx   = (int*) realloc(buf->prune.idx, nbuf*sizeof(int));
+        buf->prune.vrec  = (vcfrec_t**) realloc(buf->prune.vrec, nbuf*sizeof(vcfrec_t*));
+        buf->prune.mvrec = nbuf;
+    }
+
+    // set allele frequency and prepare buffer for sorting
+    int i,k,irec = 0;
+    for (i=-1; rbuf_next(&buf->rbuf,&i) && irec<nbuf; )
+    {
+        bcf1_t *line = buf->vcf[i].rec;
+        if ( line->n_allele > buf->prune.mac ) 
+        {
+            buf->prune.ac = (int*) realloc(buf->prune.ac, line->n_allele*sizeof(*buf->prune.ac));
+            buf->prune.mac = line->n_allele;
+        }
+        if ( !buf->vcf[i].af_set )
+        {
+            buf->vcf[i].af = 0;
+            if ( buf->prune.af_tag )
+            {
+                if ( bcf_get_info_float(buf->hdr,line,buf->prune.af_tag,&buf->prune.farr, &buf->prune.mfarr) > 0 ) buf->vcf[i].af = buf->prune.farr[0];
+            }
+            else if ( bcf_calc_ac(buf->hdr, line, buf->prune.ac, BCF_UN_INFO|BCF_UN_FMT) )
+            {
+                int ntot = buf->prune.ac[0], nalt = 0;  // ac[0] is the REF count only, the ALT counts are added to the total below
+                for (k=1; k<line->n_allele; k++) { nalt += buf->prune.ac[k]; ntot += buf->prune.ac[k]; }
+                buf->vcf[i].af = ntot ? (float)nalt/ntot : 0;
+            }
+            buf->vcf[i].af_set = 1;     // the value is cached, it is not recomputed on subsequent calls
+        }
+        buf->vcf[i].idx = irec;         // k-th position within the rbuf, as expected by rbuf_remove_kth()
+        buf->prune.vrec[irec++] = &buf->vcf[i];
+    }
+
+    // sort by allele frequency, low AF will be removed preferentially
+    qsort(buf->prune.vrec, nbuf, sizeof(*buf->prune.vrec), cmpvrec);
+
+    // sort the rbuf indexes to be pruned in descending order so that the j-th
+    // rbuf index is removed before the i-th index if i<j
+    int nprune = nbuf - buf->prune.max_sites;
+    for (i=0; i<nprune; i++)
+        buf->prune.idx[i] = buf->prune.vrec[i]->idx;
+    qsort(buf->prune.idx, nprune, sizeof(int), cmpint_desc);
+
+    for (i=0; i<nprune; i++)
+        rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->prune.idx[i], buf->vcf);
+}
+
+static int _overlap_can_flush(vcfbuf_t *buf, int flush_all)
+{
+    if ( flush_all ) { buf->overlap.rid = -1; return 1; }
+
+    int i = rbuf_last(&buf->rbuf);
+    vcfrec_t *last = &buf->vcf[i];
+    if ( buf->overlap.rid != last->rec->rid ) buf->overlap.end = 0;
+
+    int beg_pos = last->rec->pos;
+    int end_pos = last->rec->pos + last->rec->rlen - 1;
+
+    // Assuming left-aligned indels. In case it is a deletion, the real variant
+    // starts one base after. If an insertion, the overlap with the previous one has zero length.
+    int imin = last->rec->rlen;
+    for (i=0; i<last->rec->n_allele; i++)
+    {
+        char *ref = last->rec->d.allele[0];
+        char *alt = last->rec->d.allele[i];
+        if ( *alt == '<' ) continue;    // ignore symbolic alleles
+        while ( *ref && *alt && nt_to_upper(*ref)==nt_to_upper(*alt) ) { ref++; alt++; }
+        if ( imin > ref - last->rec->d.allele[0] ) imin = ref - last->rec->d.allele[0];
+    }
+
+    if ( beg_pos <= buf->overlap.end )
+    {
+        beg_pos += imin;
+        if ( beg_pos > end_pos ) end_pos = beg_pos;
+    }
+
+    if ( buf->rbuf.n==1 )
+    {
+        buf->overlap.rid = last->rec->rid;
+        buf->overlap.end = end_pos;
+        return 0; 
+    }
+    if ( beg_pos <= buf->overlap.end )
+    {
+        if ( buf->overlap.end < end_pos ) buf->overlap.end = end_pos;
+        return 0;
+    }
+    return 1;
+}
+
+bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all)
+{
+    int i,j;
+
+    if ( buf->rbuf.n==0 ) return NULL;
+    if ( flush_all ) goto ret;
+
+    i = rbuf_kth(&buf->rbuf, 0);    // first
+    j = rbuf_last(&buf->rbuf);      // last
+
+    if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) goto ret;
+    if ( buf->overlap.active )
+    {
+        int ret = _overlap_can_flush(buf, flush_all);
+        //printf("can_flush: %d  %d - %d\n", ret, buf->vcf[i].rec->pos+1, buf->vcf[j].rec->pos+1);
+        if ( ret ) goto ret;
+    }
+    //if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret;
+
+    if ( buf->win > 0 )
+    {
+        if ( buf->rbuf.n <= buf->win ) return NULL;
+        goto ret;
+    }
+    else if ( buf->win < 0 )
+    {
+        if ( buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win ) return NULL;
+    }
+    else return NULL;
+    
+ret:
+    if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all);
+
+    i = rbuf_shift(&buf->rbuf);
+    return buf->vcf[i].rec;
+}
+
+static double _estimate_af(int8_t *ptr, int size, int nvals, int nsamples)
+{
+    int i,j, nref = 0, nalt = 0;
+    for (i=0; i<nsamples; i++)
+    {
+        for (j=0; j<nvals; j++)
+        {
+            if ( ptr[j]==bcf_gt_missing ) break;
+            if ( ptr[j]==bcf_int8_vector_end ) break;
+            if ( bcf_gt_allele(ptr[j]) ) nalt++;
+            else nref++;
+        }
+        ptr += size;
+    }
+    if ( nref+nalt == 0 ) return 0;
+    return (double)nalt/(nref+nalt);
+}
+
+/*
+    Returns r^2 between the non-REF allele dosages of two records, or -1 when GT is absent/empty or no sample is usable in both.
+    For unphased genotypes D is approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/:  D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb))
+*/
+static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec)
+{
+    if ( arec->n_sample!=brec->n_sample ) error("Different number of samples: %d vs %d\n",arec->n_sample,brec->n_sample);
+    assert( arec->n_sample );
+
+    int i,j,igt = bcf_hdr_id2int(buf->hdr, BCF_DT_ID, "GT");
+    bcf_unpack(arec, BCF_UN_FMT);
+    bcf_unpack(brec, BCF_UN_FMT);
+    bcf_fmt_t *afmt = NULL, *bfmt = NULL;
+    for (i=0; i<arec->n_fmt; i++)
+        if ( arec->d.fmt[i].id==igt ) { afmt = &arec->d.fmt[i]; break; }
+    if ( !afmt ) return -1;  // no GT tag
+    for (i=0; i<brec->n_fmt; i++)
+        if ( brec->d.fmt[i].id==igt ) { bfmt = &brec->d.fmt[i]; break; }
+    if ( !bfmt ) return -1;  // no GT tag
+
+    if ( afmt->n==0 ) return -1;   // empty?!
+    if ( bfmt->n==0 ) return -1;   // empty?!
+    if ( afmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8!\n");
+    if ( bfmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8!\n");
+
+    // Determine non-REF allele frequencies, used to randomly fill in missing genotypes. NOTE(review): the allele is drawn when the random number is >= AF, i.e. with probability 1-AF - confirm this is intended
+    double aaf = 0, baf = 0;
+    if ( buf->ld.rand_missing )
+    {
+        aaf = _estimate_af((int8_t*)afmt->p, afmt->size, afmt->n, arec->n_sample);
+        baf = _estimate_af((int8_t*)bfmt->p, bfmt->size, bfmt->n, brec->n_sample);
+    }
+
+    // Calculate correlation 
+    double ab = 0, aa = 0, bb = 0, a = 0, b = 0;
+    int nab = 0, na = 0, nb = 0, ndiff = 0;
+    for (i=0; i<arec->n_sample; i++)
+    {
+        int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size);
+        int8_t *bptr = (int8_t*) (bfmt->p + i*bfmt->size);
+        int adsg = 0, bdsg = 0, an = 0, bn = 0;     // per-sample dosage and number of alleles counted
+        for (j=0; j<afmt->n; j++)
+        {
+            if ( aptr[j]==bcf_int8_vector_end ) break;
+            if ( aptr[j]==bcf_gt_missing )
+            {
+                if ( !buf->ld.rand_missing ) break;
+                if ( (double)rand()/RAND_MAX >= aaf ) adsg += 1;    // cast: the int/int division was 0 unless rand()==RAND_MAX
+            }
+            else if ( bcf_gt_allele(aptr[j]) ) adsg += 1;
+            an++;
+        }
+        for (j=0; j<bfmt->n; j++)
+        {
+            if ( bptr[j]==bcf_int8_vector_end ) break;
+            if ( bptr[j]==bcf_gt_missing )
+            {
+                if ( !buf->ld.rand_missing ) break;
+                if ( (double)rand()/RAND_MAX >= baf ) bdsg += 1;    // cast: the int/int division was 0 unless rand()==RAND_MAX
+            }
+            else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1;
+            bn++;
+        }
+        if ( an )
+        {
+            aa += adsg*adsg;
+            a  += adsg;
+            na++;
+        }
+        if ( bn )
+        {
+            bb += bdsg*bdsg;
+            b  += bdsg;
+            nb++;
+        }
+        if ( an && bn )
+        {
+            if ( adsg!=bdsg ) ndiff++;
+            ab += adsg*bdsg;
+            nab++;
+        }
+    }
+    if ( !nab ) return -1;
+
+    double cor;
+    if ( !ndiff ) cor = 1;
+    else
+    {
+        // Don't know how to deal with zero variance. Since the purpose is filtering,
+        // it is not enough to say the value is undefined. Therefore an artificial noise is
+        // added to make the denominator non-zero.
+        if ( aa == a*a/na || bb == b*b/nb )
+        {
+            aa += 3*3;
+            bb += 3*3;
+            ab += 3*3;
+            a  += 3;
+            b  += 3;
+            na++;
+            nb++;
+            nab++;
+        }
+        cor = (ab/nab - a/na*b/nb) / sqrt(aa/na - a/na*a/na) / sqrt(bb/nb - b/nb*b/nb);
+    }
+    return cor*cor;
+}
+
+// Returns the buffered record with the highest r^2 to rec (stored in *ld), or the first one exceeding ld.max if that is set.
+// Returns NULL and *ld=-1 if the buffer is empty or holds a different chromosome than rec.
+bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld)
+{
+    *ld = -1;
+    if ( !buf->rbuf.n ) return NULL;
+    int i = buf->rbuf.f;
+    // Relying on vcfbuf being properly flushed - all sites in the buffer
+    // must come from the same chromosome
+    if ( buf->vcf[i].rec->rid != rec->rid ) return NULL;
+
+    int imax = i;       // fall back to the first buffered record; array slot 0 may lie outside the active window
+    double max = 0;
+    for (i=-1; rbuf_next(&buf->rbuf,&i); )
+    {   
+        if ( buf->ld.skip_filter )
+        {
+            if ( buf->vcf[i].rec->d.n_flt > 1 ) continue;   // multiple filters are set
+            if ( buf->vcf[i].rec->d.n_flt==1 && buf->vcf[i].rec->d.flt[0]!=0 ) continue;    // not PASS
+        }
+        double val = _calc_ld(buf, buf->vcf[i].rec, rec);
+        if ( buf->ld.max && buf->ld.max < val ) 
+        {
+            *ld = val;
+            return buf->vcf[i].rec;
+        }
+        if ( val > max )
+        {
+            max  = val;
+            imax = i;
+        }
+    }
+    *ld = max;
+    return buf->vcf[imax].rec;
+}
+
+
diff --git a/bcftools/vcfbuf.c.pysam.c b/bcftools/vcfbuf.c.pysam.c
new file mode 100644
index 0000000..2dc3dae
--- /dev/null
+++ b/bcftools/vcfbuf.c.pysam.c
@@ -0,0 +1,444 @@
+#include "pysam.h"
+
+/* The MIT License
+
+   Copyright (c) 2016 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3 at sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <htslib/vcf.h>
+#include <htslib/vcfutils.h>
+#include "bcftools.h"
+#include "vcfbuf.h"
+#include "rbuf.h"
+
+typedef struct
+{
+    double max;     // LD threshold; when non-zero, vcfbuf_max_ld() returns the first record whose value exceeds it
+    int rand_missing, skip_filter;  // rand_missing: _calc_ld() fills in missing alleles at random instead of stopping at them; skip_filter: vcfbuf_max_ld() ignores buffered records that are not PASS
+}
+ld_t;
+
+typedef struct
+{
+    bcf1_t *rec;    // the buffered VCF record
+    double af;      // allele frequency, the sort key used by _prune_sites(); meaningful only once af_set is set
+    int af_set:1, idx:31;   // af_set: af has been computed (signed 1-bit field, only test it for zero/non-zero); idx: order of the record in the buffer as numbered by _prune_sites()
+}
+vcfrec_t;
+
+typedef struct
+{
+    int max_sites, mvrec, mac, mfarr;   // max_sites: keep at most this many sites, 0 = pruning off; mvrec: allocated length of idx and vrec; mac: allocated length of ac; mfarr: size bookkeeping for farr, passed to bcf_get_info_float()
+    int *ac, *idx;  // ac: per-allele counts filled by bcf_calc_ac(); idx: buffer positions selected for removal
+    float *farr;    // scratch array receiving the values of the af_tag INFO field
+    char *af_tag;   // INFO tag to read the allele frequency from, NULL = derive it from allele counts; vcfbuf_destroy() does not free it
+    vcfrec_t **vrec;    // pointers to buffered records, sorted by af in _prune_sites()
+}
+prune_t;
+
+typedef struct
+{
+    int active, rid, end;   // active: overlap mode enabled via VCFBUF_OVERLAP_WIN; rid: chromosome id of the current overlap block, -1 = none; end: last position covered by the block
+}
+overlap_t;
+
+struct _vcfbuf_t
+{
+    int win;            // window size: >0 number of sites, <0 distance in bp, 0 = no window (see vcfbuf_flush)
+    bcf_hdr_t *hdr;     // VCF header passed to vcfbuf_init(); vcfbuf_destroy() does not free it
+    vcfrec_t *vcf;      // record storage, indexed by rbuf slots
+    rbuf_t rbuf;        // ring-buffer bookkeeping for vcf
+    ld_t ld;            // LD options
+    prune_t prune;      // pruning options and scratch space
+    overlap_t overlap;  // overlap-mode state
+};
+
+vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win)
+{   // Create an empty buffer; hdr is stored by pointer (not copied), win is the window size interpreted by vcfbuf_flush()
+    vcfbuf_t *buf = (vcfbuf_t*) calloc(1,sizeof(vcfbuf_t));    // zero-filled, so all options start switched off; NOTE(review): the result is not checked for NULL
+    buf->hdr = hdr;
+    buf->win = win;
+    buf->overlap.rid = -1;      // no overlap block started yet
+    rbuf_init(&buf->rbuf, 0);
+    return buf;
+}
+
+void vcfbuf_destroy(vcfbuf_t *buf)
+{   // Free the buffer, every record still stored in it and the pruning scratch arrays; hdr and prune.af_tag are left untouched
+    int i;
+    for (i=0; i<buf->rbuf.m; i++)   // loops over rbuf.m slots (presumably the allocated capacity), not just the rbuf.n records in use
+        if ( buf->vcf[i].rec ) bcf_destroy(buf->vcf[i].rec);
+    free(buf->vcf);
+    free(buf->prune.farr);
+    free(buf->prune.vrec);
+    free(buf->prune.ac);
+    free(buf->prune.idx);
+    free(buf);
+}
+
+void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value)
+{   // Set one option; value must point to an object of the type noted per key below. Unrecognised keys are silently ignored.
+    if ( key==VCFBUF_LD_MAX ) { buf->ld.max = *((double*)value); return; }     // value: double*
+    if ( key==VCFBUF_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }     // value: int*
+    if ( key==VCFBUF_SKIP_FILTER ) { buf->ld.skip_filter = *((int*)value); return; }       // value: int*
+    if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; }           // value: int*
+    if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; }            // value: char**; the string pointer is stored, the string itself is not copied
+    if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; }       // value: int*
+}
+
+int vcfbuf_nsites(vcfbuf_t *buf)
+{   // Number of records currently held in the buffer
+    return buf->rbuf.n;
+}
+
+bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap)
+{   // Append rec to the buffer (the pointer itself is stored) and return a different record the caller can use in its place
+    if ( !swap ) error("todo: swap=%d\n", swap);    // the copying mode (swap==0) is not implemented
+
+    rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);    // make room for one more record
+
+    int i = rbuf_append(&buf->rbuf);    // slot that will hold the new record
+    if ( !buf->vcf[i].rec ) buf->vcf[i].rec = bcf_init1();     // slot holds no record yet: create the one to hand back
+    
+    bcf1_t *ret = buf->vcf[i].rec;
+    buf->vcf[i].rec = rec;
+    buf->vcf[i].af_set = 0;     // any cached allele frequency belonged to the slot's previous record
+
+    return ret;
+}
+
+static int cmpvrec(const void *_a, const void *_b)
+{   // qsort comparator for arrays of vcfrec_t*: orders records by increasing allele frequency
+    const vcfrec_t *lhs = *((vcfrec_t**) _a);
+    const vcfrec_t *rhs = *((vcfrec_t**) _b);
+    if ( lhs->af < rhs->af ) return -1;
+    // anything that is neither smaller nor equal (NaN included) sorts as greater
+    return lhs->af == rhs->af ? 0 : 1;
+}
+static int cmpint_desc(const void *_a, const void *_b)
+{   // qsort comparator ordering ints from largest to smallest
+    const int lhs = *((const int*)_a);
+    const int rhs = *((const int*)_b);
+    if ( lhs > rhs ) return -1;
+    if ( lhs < rhs ) return 1;
+    return 0;
+}
+
+static void _prune_sites(vcfbuf_t *buf, int flush_all)
+{   // Cut the buffer down to prune.max_sites records, removing the records with the lowest allele frequency first
+    int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1;   // unless flushing everything, the last record in iteration order is left out
+
+    if ( nbuf > buf->prune.mvrec )  // grow the scratch arrays; NOTE(review): the realloc results are not checked
+    {
+        buf->prune.idx   = (int*) realloc(buf->prune.idx, nbuf*sizeof(int));
+        buf->prune.vrec  = (vcfrec_t**) realloc(buf->prune.vrec, nbuf*sizeof(vcfrec_t*));
+        buf->prune.mvrec = nbuf;
+    }
+
+    // set allele frequency and prepare buffer for sorting
+    int i,k,irec = 0;
+    for (i=-1; rbuf_next(&buf->rbuf,&i) && irec<nbuf; )
+    {
+        bcf1_t *line = buf->vcf[i].rec;
+        if ( line->n_allele > buf->prune.mac )  // make sure ac[] can hold one count per allele
+        {
+            buf->prune.ac = (int*) realloc(buf->prune.ac, line->n_allele*sizeof(*buf->prune.ac));
+            buf->prune.mac = line->n_allele;
+        }
+        if ( !buf->vcf[i].af_set )  // the frequency is computed once per buffered record and then cached
+        {
+            buf->vcf[i].af = 0;     // value kept when the tag is absent or no counts are available
+            if ( buf->prune.af_tag )
+            {
+                if ( bcf_get_info_float(buf->hdr,line,buf->prune.af_tag,&buf->prune.farr, &buf->prune.mfarr) > 0 ) buf->vcf[i].af = buf->prune.farr[0];    // only the first value of the tag is used
+            }
+            else if ( bcf_calc_ac(buf->hdr, line, buf->prune.ac, BCF_UN_INFO|BCF_UN_FMT) )
+            {
+                int ntot = buf->prune.ac[0], nalt = 0;  // NOTE(review): despite its name, ntot holds only ac[0] (presumably the REF count), so af is nalt/ac[0] and is 0 whenever ac[0] is 0 - confirm this is intended
+                for (k=1; k<line->n_allele; k++) nalt += buf->prune.ac[k];
+                buf->vcf[i].af = ntot ? (float)nalt/ntot : 0;
+            }
+            buf->vcf[i].af_set = 1;
+        }
+        buf->vcf[i].idx = irec;     // order of this record in the buffer, later passed to rbuf_remove_kth()
+        buf->prune.vrec[irec++] = &buf->vcf[i];
+    }
+
+    // sort by allele frequency, low AF will be removed preferentially
+    qsort(buf->prune.vrec, nbuf, sizeof(*buf->prune.vrec), cmpvrec);
+
+    // sort the rbuf indexes to be pruned descendently so that j-th rbuf index
+    // is removed before i-th index if i<j
+    int nprune = nbuf - buf->prune.max_sites;   // vcfbuf_flush() only calls this when max_sites < rbuf.n, so nprune is not negative
+    for (i=0; i<nprune; i++)
+        buf->prune.idx[i] = buf->prune.vrec[i]->idx;
+
+    qsort(buf->prune.idx, nprune, sizeof(int), cmpint_desc);
+
+    for (i=0; i<nprune; i++)
+        rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->prune.idx[i], buf->vcf);
+}
+
+static int _overlap_can_flush(vcfbuf_t *buf, int flush_all)
+{   // Returns 1 when the current overlap block is finished (the newest record starts past it, or flush_all is set), 0 while it is still growing
+    if ( flush_all ) { buf->overlap.rid = -1; return 1; }   // forget the current block
+
+    int i = rbuf_last(&buf->rbuf);
+    vcfrec_t *last = &buf->vcf[i];
+    if ( buf->overlap.rid != last->rec->rid ) buf->overlap.end = 0;     // chromosome differs from the block's: reset the block end
+
+    int beg_pos = last->rec->pos;
+    int end_pos = last->rec->pos + last->rec->rlen - 1;    // last position spanned by the record
+
+    // Assuming left-aligned indels. In case it is a deletion, the real variant
+    // starts one base after. If an insertion, the overlap with previous zero length.
+    int imin = last->rec->rlen;     // becomes the shortest prefix shared by REF and any non-symbolic allele
+    for (i=0; i<last->rec->n_allele; i++)   // i==0 compares REF with itself and so contributes the full REF length
+    {
+        char *ref = last->rec->d.allele[0];
+        char *alt = last->rec->d.allele[i];
+        if ( *alt == '<' ) continue;    // ignore symbolic alleles
+        while ( *ref && *alt && nt_to_upper(*ref)==nt_to_upper(*alt) ) { ref++; alt++; }
+        if ( imin > ref - last->rec->d.allele[0] ) imin = ref - last->rec->d.allele[0];
+    }
+
+    if ( beg_pos <= buf->overlap.end )  // record starts inside the current block: skip the bases it shares with REF
+    {
+        beg_pos += imin;
+        if ( beg_pos > end_pos ) end_pos = beg_pos;
+    }
+
+    if ( buf->rbuf.n==1 )   // a single buffered record opens a new block
+    {
+        buf->overlap.rid = last->rec->rid;
+        buf->overlap.end = end_pos;
+        return 0; 
+    }
+    if ( beg_pos <= buf->overlap.end )  // still overlapping: extend the block if the record reaches further
+    {
+        if ( buf->overlap.end < end_pos ) buf->overlap.end = end_pos;
+        return 0;
+    }
+    return 1;   // the newest record lies past the block
+}
+
+bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all)
+{   // Take the next record out of the buffer if one may leave it, otherwise return NULL; flush_all releases records regardless of the window
+    int i,j;
+
+    if ( buf->rbuf.n==0 ) return NULL;
+    if ( flush_all ) goto ret;
+
+    i = rbuf_kth(&buf->rbuf, 0);    // first
+    j = rbuf_last(&buf->rbuf);      // last
+
+    if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) goto ret;  // first and last records are on different chromosomes
+    if ( buf->overlap.active )
+    {
+        int ret = _overlap_can_flush(buf, flush_all);   // this variable and the label below share the name "ret"; C keeps labels in a separate namespace
+        //printf("can_flush: %d  %d - %d\n", ret, buf->vcf[i].rec->pos+1, buf->vcf[j].rec->pos+1);
+        if ( ret ) goto ret;
+    }
+    //if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret;
+
+    if ( buf->win > 0 )     // window counted in sites
+    {
+        if ( buf->rbuf.n <= buf->win ) return NULL;
+        goto ret;
+    }
+    else if ( buf->win < 0 )    // window counted in bp; win itself is negative
+    {
+        if ( buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win ) return NULL;     // first-to-last distance is still shorter than |win|
+    }
+    else return NULL;   // win==0: nothing is released unless one of the conditions above applied
+    
+ret:
+    if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all);
+
+    i = rbuf_shift(&buf->rbuf);
+    return buf->vcf[i].rec;     // the slot keeps this pointer, it is not cleared here
+}
+
+static double _estimate_af(int8_t *ptr, int size, int nvals, int nsamples)
+{   // Fraction of non-reference alleles among the alleles counted; ptr advances by size per sample, nvals values are read per sample
+    int ismpl, ial, nref = 0, nalt = 0;
+    for (ismpl=0; ismpl<nsamples; ismpl++, ptr += size)
+    {
+        for (ial=0; ial<nvals; ial++)
+        {
+            int8_t gt = ptr[ial];
+            if ( gt==bcf_gt_missing || gt==bcf_int8_vector_end ) break;    // stop counting this sample at a missing value or the end of its vector
+            if ( bcf_gt_allele(gt) ) nalt++;
+            else nref++;
+        }
+    }
+    int ncalled = nref + nalt;
+    if ( !ncalled ) return 0;   // nothing counted at all
+    return (double)nalt/ncalled;
+}
+
+/*
+    For unphased genotypes D is approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/
+        D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb))
+*/
+static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec)
+{   // Squared correlation of per-sample non-reference allele dosages between arec and brec; returns -1 when it cannot be computed
+    if ( arec->n_sample!=brec->n_sample ) error("Different number of samples: %d vs %d\n",arec->n_sample,brec->n_sample);
+    assert( arec->n_sample );
+
+    int i,j,igt = bcf_hdr_id2int(buf->hdr, BCF_DT_ID, "GT");
+    bcf_unpack(arec, BCF_UN_FMT);
+    bcf_unpack(brec, BCF_UN_FMT);
+    bcf_fmt_t *afmt = NULL, *bfmt = NULL;
+    for (i=0; i<arec->n_fmt; i++)
+        if ( arec->d.fmt[i].id==igt ) { afmt = &arec->d.fmt[i]; break; }
+    if ( !afmt ) return -1;  // no GT tag
+    for (i=0; i<brec->n_fmt; i++)
+        if ( brec->d.fmt[i].id==igt ) { bfmt = &brec->d.fmt[i]; break; }
+    if ( !bfmt ) return -1;  // no GT tag
+
+    if ( afmt->n==0 ) return -1;   // empty?!
+    if ( bfmt->n==0 ) return -1;   // empty?!
+    if ( afmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8!\n");
+    if ( bfmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8!\n");
+
+    // Determine allele frequencies, this is to sample randomly missing genotypes
+    double aaf = 0, baf = 0;
+    if ( buf->ld.rand_missing )
+    {
+        aaf = _estimate_af((int8_t*)afmt->p, afmt->size, afmt->n, arec->n_sample);
+        baf = _estimate_af((int8_t*)bfmt->p, bfmt->size, bfmt->n, brec->n_sample);
+    }
+
+    // Calculate correlation 
+    double ab = 0, aa = 0, bb = 0, a = 0, b = 0;    // running sums of products, squares and dosages
+    int nab = 0, na = 0, nb = 0, ndiff = 0;         // samples counted in each sum; ndiff: samples whose two dosages differ
+    for (i=0; i<arec->n_sample; i++)
+    {
+        int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size);
+        int8_t *bptr = (int8_t*) (bfmt->p + i*bfmt->size);
+        int adsg = 0, bdsg = 0, an = 0, bn = 0;     // dosage and number of alleles counted for this sample
+        for (j=0; j<afmt->n; j++)
+        {
+            if ( aptr[j]==bcf_int8_vector_end ) break;
+            if ( aptr[j]==bcf_gt_missing )
+            {
+                if ( !buf->ld.rand_missing ) break;
+                if ( (double)rand()/RAND_MAX >= aaf ) adsg += 1;    // cast needed: integer rand()/RAND_MAX is 0 unless rand()==RAND_MAX; NOTE(review): ">=" adds a non-ref allele with probability 1-aaf - confirm the direction
+            }
+            else if ( bcf_gt_allele(aptr[j]) ) adsg += 1;
+            an++;
+        }
+        for (j=0; j<bfmt->n; j++)
+        {
+            if ( bptr[j]==bcf_int8_vector_end ) break;
+            if ( bptr[j]==bcf_gt_missing )
+            {
+                if ( !buf->ld.rand_missing ) break;
+                if ( (double)rand()/RAND_MAX >= baf ) bdsg += 1;    // same cast and same review note as for the first record
+            }
+            else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1;
+            bn++;
+        }
+        if ( an )
+        {
+            aa += adsg*adsg;
+            a  += adsg;
+            na++;
+        }
+        if ( bn )
+        {
+            bb += bdsg*bdsg;
+            b  += bdsg;
+            nb++;
+        }
+        if ( an && bn )
+        {
+            if ( adsg!=bdsg ) ndiff++;
+            ab += adsg*bdsg;
+            nab++;
+        }
+    }
+    if ( !nab ) return -1;  // no sample had alleles counted in both records
+
+    double cor;
+    if ( !ndiff ) cor = 1;  // dosages identical in every shared sample
+    else
+    {
+        // Don't know how to deal with zero variance. Since the purpose is filtering,
+        // it is not enough to say the value is undefined. Therefore artificial noise is
+        // added to make the denominator non-zero.
+        if ( aa == a*a/na || bb == b*b/nb )
+        {
+            aa += 3*3;
+            bb += 3*3;
+            ab += 3*3;
+            a  += 3;
+            b  += 3;
+            na++;
+            nb++;
+            nab++;
+        }
+        cor = (ab/nab - a/na*b/nb) / sqrt(aa/na - a/na*a/na) / sqrt(bb/nb - b/nb*b/nb);
+    }
+    return cor*cor;
+}
+
+bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld)
+{   // Find the buffered record with the highest _calc_ld() value against rec; *ld receives that value, or -1 (with NULL returned) when there is none
+    *ld = -1;
+    if ( !buf->rbuf.n ) return NULL;    // empty buffer, nothing to compare against
+
+    int i = buf->rbuf.f;
+
+    // Relying on vcfbuf being properly flushed - all sites in the buffer
+    // must come from the same chromosome
+    if ( buf->vcf[i].rec->rid != rec->rid ) return NULL;
+
+    int imax = -1;      // slot of the best record so far, -1 = none yet (slot 0 is not necessarily one of the scanned records)
+    double max = -1;    // best value so far; -1 is what _calc_ld() returns when it cannot compute a value
+    for (i=-1; rbuf_next(&buf->rbuf,&i); )
+    {   
+        if ( buf->ld.skip_filter )
+        {
+            if ( buf->vcf[i].rec->d.n_flt > 1 ) continue;   // multiple filters are set
+            if ( buf->vcf[i].rec->d.n_flt==1 && buf->vcf[i].rec->d.flt[0]!=0 ) continue;    // not PASS
+        }
+        double val = _calc_ld(buf, buf->vcf[i].rec, rec);
+        if ( buf->ld.max && buf->ld.max < val )     // threshold set and exceeded: stop at the first such record
+        {
+            *ld = val;
+            return buf->vcf[i].rec;
+        }
+        if ( val > max )
+        {
+            max  = val;
+            imax = i;
+        }
+    }
+    *ld = max;
+    return imax<0 ? NULL : buf->vcf[imax].rec;  // NULL when every record was skipped or had no computable value
+}
+
+
diff --git a/bcftools/vcfbuf.h b/bcftools/vcfbuf.h
new file mode 100644
index 0000000..5494323
--- /dev/null
+++ b/bcftools/vcfbuf.h
@@ -0,0 +1,81 @@
+/* The MIT License
+
+   Copyright (c) 2017 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3 at sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+/*
+    Buffer VCF records and perform operations on the buffer
+*/
+
+#ifndef __VCFBUF_H__
+#define __VCFBUF_H__
+
+#include <htslib/vcf.h>
+
+typedef struct _vcfbuf_t vcfbuf_t;
+
+// Modes of operation
+typedef enum
+{
+    VCFBUF_LD_MAX,          // double: vcfbuf_max_ld() stops at the first record that exceeds the threshold
+    VCFBUF_RAND_MISSING,    // int: randomize rather than ignore missing genotypes
+    VCFBUF_SKIP_FILTER,     // int: skip sites with FILTER different from "PASS" or "."
+    VCFBUF_NSITES,          // int: leave at max this many sites in the window
+    VCFBUF_AF_TAG,          // char*: use this INFO tag with VCFBUF_NSITES
+    VCFBUF_OVERLAP_WIN,     // int: keep only overlapping variants in the window
+}
+vcfbuf_opt_t;
+
+#define vcfbuf_set_opt(buf,type,key,value) { type tmp = value; vcfbuf_set(buf, key, (void*)&tmp); }
+void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value);
+
+
+/*
+ *  vcfbuf_init() - init buffer
+ *  @win:   number of sites (>0) or bp (<0)
+ */
+vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win);
+void vcfbuf_destroy(vcfbuf_t *buf);
+
+/*
+ *  vcfbuf_push() - push a new site for analysis
+ *  @swap:  if set, do not create a copy, but return a substitute
+ */
+bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap);
+
+bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all);
+
+/*
+ *  vcfbuf_nsites() - return the number of sites in the buffer
+ */
+int vcfbuf_nsites(vcfbuf_t *buf);
+
+/*
+ *  vcfbuf_max_ld() - return a record that has maximum D or first record exceeding the threshold
+ *  @ld:        will be filled with the maximum D found
+ */
+bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld);
+
+#endif
+
diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c
index f650bea..8f596d4 100644
--- a/bcftools/vcfconvert.c
+++ b/bcftools/vcfconvert.c
@@ -862,7 +862,7 @@ static void vcf_to_haplegendsample(args_t *args)
     if ( legend_fname && (strlen(legend_fname)<3 || strcasecmp(".gz",legend_fname+strlen(legend_fname)-3)) ) legend_compressed = 0;
     if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
 
-    if (hap_fname) fprintf(stderr, "Haps file: %s\n", hap_fname);
+    if (hap_fname) fprintf(stderr, "Hap file: %s\n", hap_fname);
     if (legend_fname) fprintf(stderr, "Legend file: %s\n", legend_fname);
     if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
 
@@ -1010,7 +1010,7 @@ static void vcf_to_hapsample(args_t *args)
     if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
     if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
 
-    if (hap_fname) fprintf(stderr, "Haps file: %s\n", hap_fname);
+    if (hap_fname) fprintf(stderr, "Hap file: %s\n", hap_fname);
     if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
 
     // write samples file
@@ -1397,8 +1397,8 @@ static void usage(void)
     fprintf(stderr, "   -f, --fasta-ref <file>      reference sequence in fasta format\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n");
-    fprintf(stderr, "       --hapsample2vcf <...>   <prefix>|<haps-file>,<sample-file>\n");
-    fprintf(stderr, "       --hapsample <...>       <prefix>|<haps-file>,<sample-file>\n");
+    fprintf(stderr, "       --hapsample2vcf <...>   <prefix>|<hap-file>,<sample-file>\n");
+    fprintf(stderr, "       --hapsample <...>       <prefix>|<hap-file>,<sample-file>\n");
     fprintf(stderr, "       --haploid2diploid       convert haploid genotypes to diploid homozygotes\n");
     fprintf(stderr, "       --sex <file>            output sex column in the sample-file, input format is: Sample\\t[MF]\n");
     fprintf(stderr, "       --vcf-ids               output VCF IDs instead of CHROM:POS_REF_ALT\n");
diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c
index 4d3469c..53df3d9 100644
--- a/bcftools/vcfconvert.c.pysam.c
+++ b/bcftools/vcfconvert.c.pysam.c
@@ -864,7 +864,7 @@ static void vcf_to_haplegendsample(args_t *args)
     if ( legend_fname && (strlen(legend_fname)<3 || strcasecmp(".gz",legend_fname+strlen(legend_fname)-3)) ) legend_compressed = 0;
     if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
 
-    if (hap_fname) fprintf(pysam_stderr, "Haps file: %s\n", hap_fname);
+    if (hap_fname) fprintf(pysam_stderr, "Hap file: %s\n", hap_fname);
     if (legend_fname) fprintf(pysam_stderr, "Legend file: %s\n", legend_fname);
     if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
 
@@ -1012,7 +1012,7 @@ static void vcf_to_hapsample(args_t *args)
     if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0;
     if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0;
 
-    if (hap_fname) fprintf(pysam_stderr, "Haps file: %s\n", hap_fname);
+    if (hap_fname) fprintf(pysam_stderr, "Hap file: %s\n", hap_fname);
     if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
 
     // write samples file
@@ -1399,8 +1399,8 @@ static void usage(void)
     fprintf(pysam_stderr, "   -f, --fasta-ref <file>      reference sequence in fasta format\n");
     fprintf(pysam_stderr, "\n");
     fprintf(pysam_stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n");
-    fprintf(pysam_stderr, "       --hapsample2vcf <...>   <prefix>|<haps-file>,<sample-file>\n");
-    fprintf(pysam_stderr, "       --hapsample <...>       <prefix>|<haps-file>,<sample-file>\n");
+    fprintf(pysam_stderr, "       --hapsample2vcf <...>   <prefix>|<hap-file>,<sample-file>\n");
+    fprintf(pysam_stderr, "       --hapsample <...>       <prefix>|<hap-file>,<sample-file>\n");
     fprintf(pysam_stderr, "       --haploid2diploid       convert haploid genotypes to diploid homozygotes\n");
     fprintf(pysam_stderr, "       --sex <file>            output sex column in the sample-file, input format is: Sample\\t[MF]\n");
     fprintf(pysam_stderr, "       --vcf-ids               output VCF IDs instead of CHROM:POS_REF_ALT\n");
diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c
index 1aeb739..e9ed5ad 100644
--- a/bcftools/vcfmerge.c
+++ b/bcftools/vcfmerge.c
@@ -104,7 +104,6 @@ typedef struct
     int rid;        // current rid
     int beg,end;    // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
     int cur;        // current line or -1 if none
-    int npos;       // number of unprocessed lines at this position
     int mrec;       // allocated size of buf
     maux1_t *rec;   // buffer to keep reader's lines
     bcf1_t **lines; // source buffer: either gvcf or readers' buffer
@@ -758,10 +757,12 @@ void maux_reset(maux_t *ma)
         ma->pos = line->pos;
         break;
     }
-    if ( chr )
+    int new_chr = 0;
+    if ( chr && (!ma->chr || strcmp(ma->chr,chr)) )
     {
         free(ma->chr);
         ma->chr = strdup(chr);
+        new_chr = 1;
     }
     for (i=0; i<ma->n; i++)
     {
@@ -781,6 +782,7 @@ void maux_reset(maux_t *ma)
             ma->buf[i].lines = ma->files->readers[i].buffer;
             if ( ma->gvcf ) ma->gvcf[i].active = 0;     // gvcf block cannot overlap with the next record
         }
+        if ( new_chr && ma->gvcf ) ma->gvcf[i].active = 0;  // make sure to close active gvcf block on new chr
     }
 }
 void maux_debug(maux_t *ma, int ir, int ib)
@@ -1985,6 +1987,31 @@ void debug_maux(args_t *args)
     fprintf(stderr,"\n\n");
 }
 
+void debug_state(args_t *args)
+{   // Debugging aid: print every reader's buffer range and gVCF block state to stderr
+    maux_t *maux = args->maux;
+    int i,j;
+    for (i=0; i<args->files->nreaders; i++)
+    {
+        fprintf(stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
+        if ( maux->buf[i].cur >=0 )     // the reader has a current line: list the positions in its [beg,end) range
+        {
+            bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
+            const char *chr = bcf_hdr_id2name(hdr, maux->buf[i].rid);
+            fprintf(stderr,"\t");
+            for (j=maux->buf[i].beg; j<maux->buf[i].end; j++) fprintf(stderr," %s:%d",chr,maux->buf[i].lines[j]->pos+1);
+        }
+        fprintf(stderr,"\n");
+    }
+    for (i=0; maux->gvcf && i<args->files->nreaders; i++)  // gvcf can be NULL (maux_reset tests it before use), so skip this part then
+    {
+        fprintf(stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
+        if ( maux->gvcf[i].active ) fprintf(stderr,"\tpos,end=%d,%d", maux->gvcf[i].line->pos+1,maux->gvcf[i].end+1);
+        fprintf(stderr,"\n");
+    }
+    fprintf(stderr,"\n");
+}
+
 
 /*
    Determine which line should be merged from which reader: go through all
@@ -2294,6 +2321,7 @@ void merge_vcf(args_t *args)
             merge_line(args);
         }
         clean_buffer(args);
+        // debug_state(args);
     }
     if ( args->do_gvcf )
         gvcf_flush(args,1);
diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c
index db9aff5..a162905 100644
--- a/bcftools/vcfmerge.c.pysam.c
+++ b/bcftools/vcfmerge.c.pysam.c
@@ -106,7 +106,6 @@ typedef struct
     int rid;        // current rid
     int beg,end;    // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
     int cur;        // current line or -1 if none
-    int npos;       // number of unprocessed lines at this position
     int mrec;       // allocated size of buf
     maux1_t *rec;   // buffer to keep reader's lines
     bcf1_t **lines; // source buffer: either gvcf or readers' buffer
@@ -760,10 +759,12 @@ void maux_reset(maux_t *ma)
         ma->pos = line->pos;
         break;
     }
-    if ( chr )
+    int new_chr = 0;
+    if ( chr && (!ma->chr || strcmp(ma->chr,chr)) )
     {
         free(ma->chr);
         ma->chr = strdup(chr);
+        new_chr = 1;
     }
     for (i=0; i<ma->n; i++)
     {
@@ -783,6 +784,7 @@ void maux_reset(maux_t *ma)
             ma->buf[i].lines = ma->files->readers[i].buffer;
             if ( ma->gvcf ) ma->gvcf[i].active = 0;     // gvcf block cannot overlap with the next record
         }
+        if ( new_chr && ma->gvcf ) ma->gvcf[i].active = 0;  // make sure to close active gvcf block on new chr
     }
 }
 void maux_debug(maux_t *ma, int ir, int ib)
@@ -1987,6 +1989,31 @@ void debug_maux(args_t *args)
     fprintf(pysam_stderr,"\n\n");
 }
 
+void debug_state(args_t *args)
+{   // Debugging aid: print every reader's buffer range and gVCF block state to pysam_stderr
+    maux_t *maux = args->maux;
+    int i,j;
+    for (i=0; i<args->files->nreaders; i++)
+    {
+        fprintf(pysam_stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
+        if ( maux->buf[i].cur >=0 )     // the reader has a current line: list the positions in its [beg,end) range
+        {
+            bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
+            const char *chr = bcf_hdr_id2name(hdr, maux->buf[i].rid);
+            fprintf(pysam_stderr,"\t");
+            for (j=maux->buf[i].beg; j<maux->buf[i].end; j++) fprintf(pysam_stderr," %s:%d",chr,maux->buf[i].lines[j]->pos+1);
+        }
+        fprintf(pysam_stderr,"\n");
+    }
+    for (i=0; maux->gvcf && i<args->files->nreaders; i++)  // gvcf can be NULL (maux_reset tests it before use), so skip this part then
+    {
+        fprintf(pysam_stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
+        if ( maux->gvcf[i].active ) fprintf(pysam_stderr,"\tpos,end=%d,%d", maux->gvcf[i].line->pos+1,maux->gvcf[i].end+1);
+        fprintf(pysam_stderr,"\n");
+    }
+    fprintf(pysam_stderr,"\n");
+}
+
 
 /*
    Determine which line should be merged from which reader: go through all
@@ -2296,6 +2323,7 @@ void merge_vcf(args_t *args)
             merge_line(args);
         }
         clean_buffer(args);
+        // debug_state(args);
     }
     if ( args->do_gvcf )
         gvcf_flush(args,1);
diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c
index 3a1706b..86c20ab 100644
--- a/bcftools/vcfnorm.c
+++ b/bcftools/vcfnorm.c
@@ -104,6 +104,15 @@ static inline int has_non_acgtn(char *seq, int nseq)
     return 0;
 }
 
+static void seq_to_upper(char *seq, int len)
+{   // Pass seq through nt_to_upper() in place: exactly len characters, or up to the terminating NUL when len is 0
+    int k = 0;
+    if ( !len )
+        while ( seq[k] ) { seq[k] = nt_to_upper(seq[k]); k++; }
+    else
+        while ( k<len ) { seq[k] = nt_to_upper(seq[k]); k++; }
+}
+
 static void fix_ref(args_t *args, bcf1_t *line)
 {
     int reflen = strlen(line->d.allele[0]);
@@ -274,6 +283,7 @@ static int realign(args_t *args, bcf1_t *line)
     int i, nref, reflen = strlen(line->d.allele[0]);
     char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
     if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
+    seq_to_upper(ref,0);
     replace_iupac_codes(ref,nref);  // any non-ACGT character in fasta ref is replaced with N
 
     // does VCF REF contain non-standard bases?
@@ -298,7 +308,16 @@ static int realign(args_t *args, bcf1_t *line)
     free(ref);
     ref = NULL;
 
-    if ( line->n_allele == 1 ) return ERR_OK;    // a REF
+    if ( line->n_allele == 1 ) // a REF
+    {
+        if ( line->rlen > 1 )
+        {
+            line->d.allele[0][1] = 0;
+            bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+        }
+        return ERR_OK;
+    }
+    if ( bcf_get_variant_types(line)==VCF_BND ) return ERR_SYMBOLIC;   // breakend, not an error
 
     // make a copy of each allele for trimming
     hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
@@ -307,7 +326,6 @@ static int realign(args_t *args, bcf1_t *line)
     {
         if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC;  // symbolic allele
         if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION;  // spanning deletion
-        if ( bcf_get_variant_type(line,i)==VCF_BND ) return ERR_SYMBOLIC;   // breakend, not an error
         if ( has_non_acgtn(line->d.allele[i],0) )
         {
             if ( args->check_ref==CHECK_REF_EXIT )
@@ -319,8 +337,9 @@ static int realign(args_t *args, bcf1_t *line)
 
         als[i].l = 0;
         kputs(line->d.allele[i], &als[i]);
+        seq_to_upper(als[i].s,0);
 
-        if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
+        if ( i>0 && als[i].l==als[0].l && !strcmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
     }
 
     // trim from right
@@ -388,7 +407,7 @@ static int realign(args_t *args, bcf1_t *line)
 
     // Have the alleles changed?
     als[0].s[ als[0].l ] = 0;  // in order for strcmp to work
-    if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+    if ( ori_pos==line->pos && !strcmp(line->d.allele[0],als[0].s) ) return ERR_OK;
 
     // Create new block of alleles and update
     args->tmp_als_str.l = 0;
diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c
index da5a2aa..a54180d 100644
--- a/bcftools/vcfnorm.c.pysam.c
+++ b/bcftools/vcfnorm.c.pysam.c
@@ -106,6 +106,15 @@ static inline int has_non_acgtn(char *seq, int nseq)
     return 0;
 }
 
+static void seq_to_upper(char *seq, int len)
+{   // Pass seq through nt_to_upper() in place: exactly len characters, or up to the terminating NUL when len is 0
+    int k = 0;
+    if ( !len )
+        while ( seq[k] ) { seq[k] = nt_to_upper(seq[k]); k++; }
+    else
+        while ( k<len ) { seq[k] = nt_to_upper(seq[k]); k++; }
+}
+
 static void fix_ref(args_t *args, bcf1_t *line)
 {
     int reflen = strlen(line->d.allele[0]);
@@ -276,6 +285,7 @@ static int realign(args_t *args, bcf1_t *line)
     int i, nref, reflen = strlen(line->d.allele[0]);
     char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
     if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
+    seq_to_upper(ref,0);
     replace_iupac_codes(ref,nref);  // any non-ACGT character in fasta ref is replaced with N
 
     // does VCF REF contain non-standard bases?
@@ -300,7 +310,16 @@ static int realign(args_t *args, bcf1_t *line)
     free(ref);
     ref = NULL;
 
-    if ( line->n_allele == 1 ) return ERR_OK;    // a REF
+    if ( line->n_allele == 1 ) // a REF
+    {
+        if ( line->rlen > 1 )
+        {
+            line->d.allele[0][1] = 0;
+            bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+        }
+        return ERR_OK;
+    }
+    if ( bcf_get_variant_types(line)==VCF_BND ) return ERR_SYMBOLIC;   // breakend, not an error
 
     // make a copy of each allele for trimming
     hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
@@ -309,7 +328,6 @@ static int realign(args_t *args, bcf1_t *line)
     {
         if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC;  // symbolic allele
         if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION;  // spanning deletion
-        if ( bcf_get_variant_type(line,i)==VCF_BND ) return ERR_SYMBOLIC;   // breakend, not an error
         if ( has_non_acgtn(line->d.allele[i],0) )
         {
             if ( args->check_ref==CHECK_REF_EXIT )
@@ -321,8 +339,9 @@ static int realign(args_t *args, bcf1_t *line)
 
         als[i].l = 0;
         kputs(line->d.allele[i], &als[i]);
+        seq_to_upper(als[i].s,0);
 
-        if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
+        if ( i>0 && als[i].l==als[0].l && !strcmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
     }
 
     // trim from right
@@ -390,7 +409,7 @@ static int realign(args_t *args, bcf1_t *line)
 
     // Have the alleles changed?
     als[0].s[ als[0].l ] = 0;  // in order for strcmp to work
-    if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+    if ( ori_pos==line->pos && !strcmp(line->d.allele[0],als[0].s) ) return ERR_OK;
 
     // Create new block of alleles and update
     args->tmp_als_str.l = 0;
diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c
index bfd6ad2..a53ac3c 100644
--- a/bcftools/vcfplugin.c
+++ b/bcftools/vcfplugin.c
@@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.  */
 
+#include "config.h"
 #include <stdio.h>
 #include <strings.h>
 #include <unistd.h>
@@ -42,6 +43,8 @@ THE SOFTWARE.  */
 #include "vcmp.h"
 #include "filter.h"
 
+#ifdef ENABLE_BCF_PLUGINS
+
 typedef struct _plugin_t plugin_t;
 
 /**
@@ -209,7 +212,7 @@ static void *dlopen_plugin(args_t *args, const char *fname)
         int i;
         for (i=0; i<args->nplugin_paths; i++)
         {
-            tmp = msprintf("%s/%s.so", args->plugin_paths[i],fname);
+	    tmp = msprintf("%s/%s%s", args->plugin_paths[i], fname, PLUGIN_EXT);
             handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
             if ( args->verbose > 1 )
             {
@@ -362,6 +365,7 @@ static int list_plugins(args_t *args)
     init_plugin_paths(args);
 
     kstring_t str = {0,0,0};
+    int plugin_ext_len = strlen(PLUGIN_EXT);
     int i;
     for (i=0; i<args->nplugin_paths; i++)
     {
@@ -372,7 +376,7 @@ static int list_plugins(args_t *args)
         while ( (ep=readdir(dp)) )
         {
             int len = strlen(ep->d_name);
-            if ( strcasecmp(".so",ep->d_name+len-3) ) continue;
+            if ( strcasecmp(PLUGIN_EXT,ep->d_name+len-plugin_ext_len) ) continue;
             str.l = 0;
             ksprintf(&str,"%s/%s", args->plugin_paths[i],ep->d_name);
             hts_expand(plugin_t, nplugins+1, mplugins, plugins);
@@ -641,3 +645,13 @@ int main_plugin(int argc, char *argv[])
     return 0;
 }
 
+#else /* ENABLE_BCF_PLUGINS */
+
+int main_plugin(int argc, char *argv[])
+{
+    fprintf(stderr, "bcftools plugins are disabled.  To use them, you will need to rebuild\n"
+	    "bcftools from the source distribution with plugins enabled.\n");
+    return 1;
+}
+
+#endif /* ENABLE_BCF_PLUGINS */
diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c
index ec1d586..6ea7eb6 100644
--- a/bcftools/vcfplugin.c.pysam.c
+++ b/bcftools/vcfplugin.c.pysam.c
@@ -24,6 +24,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.  */
 
+#include "config.h"
 #include <stdio.h>
 #include <strings.h>
 #include <unistd.h>
@@ -44,6 +45,8 @@ THE SOFTWARE.  */
 #include "vcmp.h"
 #include "filter.h"
 
+#ifdef ENABLE_BCF_PLUGINS
+
 typedef struct _plugin_t plugin_t;
 
 /**
@@ -211,7 +214,7 @@ static void *dlopen_plugin(args_t *args, const char *fname)
         int i;
         for (i=0; i<args->nplugin_paths; i++)
         {
-            tmp = msprintf("%s/%s.so", args->plugin_paths[i],fname);
+	    tmp = msprintf("%s/%s%s", args->plugin_paths[i], fname, PLUGIN_EXT);
             handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
             if ( args->verbose > 1 )
             {
@@ -364,6 +367,7 @@ static int list_plugins(args_t *args)
     init_plugin_paths(args);
 
     kstring_t str = {0,0,0};
+    int plugin_ext_len = strlen(PLUGIN_EXT);
     int i;
     for (i=0; i<args->nplugin_paths; i++)
     {
@@ -374,7 +378,7 @@ static int list_plugins(args_t *args)
         while ( (ep=readdir(dp)) )
         {
             int len = strlen(ep->d_name);
-            if ( strcasecmp(".so",ep->d_name+len-3) ) continue;
+            if ( strcasecmp(PLUGIN_EXT,ep->d_name+len-plugin_ext_len) ) continue;
             str.l = 0;
             ksprintf(&str,"%s/%s", args->plugin_paths[i],ep->d_name);
             hts_expand(plugin_t, nplugins+1, mplugins, plugins);
@@ -643,3 +647,13 @@ int main_plugin(int argc, char *argv[])
     return 0;
 }
 
+#else /* ENABLE_BCF_PLUGINS */
+
+int main_plugin(int argc, char *argv[])
+{
+    fprintf(pysam_stderr, "bcftools plugins are disabled.  To use them, you will need to rebuild\n"
+	    "bcftools from the source distribution with plugins enabled.\n");
+    return 1;
+}
+
+#endif /* ENABLE_BCF_PLUGINS */
diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c
index 9437d7e..8c1d055 100644
--- a/bcftools/vcfroh.c
+++ b/bcftools/vcfroh.c
@@ -342,7 +342,7 @@ static void destroy_data(args_t *args)
     free(args->samples);
 }
 
-static int load_genmap(args_t *args, bcf1_t *line)
+static int load_genmap(args_t *args, const char *chr)
 {
     if ( !args->genmap_fname ) { args->ngenmap = 0; return 0; }
 
@@ -351,7 +351,7 @@ static int load_genmap(args_t *args, bcf1_t *line)
     if ( fname )
     {
         kputsn(args->genmap_fname, fname - args->genmap_fname, &str);
-        kputs(bcf_seqname(args->hdr,line), &str);
+        kputs(chr, &str);
         kputs(fname+7,&str);
         fname = str.s;
     }
@@ -488,7 +488,7 @@ static void flush_viterbi(args_t *args, int ismpl)
         hmm_restore(args->hmm, smpl->snapshot); 
         int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites;
         if ( end < smpl->nsites )
-            smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->nsites - args->nbuf_olap - 1);
+            smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->sites[smpl->nsites - args->nbuf_olap - 1]);
 
         args->igenmap = smpl->igenmap;
         hmm_run_viterbi(args->hmm, smpl->nsites, smpl->eprob, smpl->sites);
@@ -631,7 +631,6 @@ static void flush_viterbi(args_t *args, int ismpl)
     }
 }
 
-
 int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
 {
     if ( tgt->nals != line->n_allele ) return -1;    // number of alleles does not match
@@ -998,20 +997,25 @@ static void vcfroh(args_t *args, bcf1_t *line)
     {
         args->prev_rid = line->rid;
         args->prev_pos = line->pos;
-        skip_rid = load_genmap(args, line);
+        skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
     }
 
     // New chromosome?
     if ( args->prev_rid!=line->rid )
     {
-        skip_rid = load_genmap(args, line);
         if ( !args->vi_training )
         {
-            for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
+            for (i=0; i<args->roh_smpl->n; i++)
+            {
+                flush_viterbi(args, i);
+                hmm_reset(args->hmm, args->smpl[i].snapshot);
+            }
         }
         args->prev_rid = line->rid;
         args->prev_pos = line->pos;
+        skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
     }
+    else if ( args->prev_pos == line->pos ) return;     // skip duplicate positions
 
     if ( skip_rid )
     {
diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c
index 70ed798..2bf45b7 100644
--- a/bcftools/vcfroh.c.pysam.c
+++ b/bcftools/vcfroh.c.pysam.c
@@ -344,7 +344,7 @@ static void destroy_data(args_t *args)
     free(args->samples);
 }
 
-static int load_genmap(args_t *args, bcf1_t *line)
+static int load_genmap(args_t *args, const char *chr)
 {
     if ( !args->genmap_fname ) { args->ngenmap = 0; return 0; }
 
@@ -353,7 +353,7 @@ static int load_genmap(args_t *args, bcf1_t *line)
     if ( fname )
     {
         kputsn(args->genmap_fname, fname - args->genmap_fname, &str);
-        kputs(bcf_seqname(args->hdr,line), &str);
+        kputs(chr, &str);
         kputs(fname+7,&str);
         fname = str.s;
     }
@@ -490,7 +490,7 @@ static void flush_viterbi(args_t *args, int ismpl)
         hmm_restore(args->hmm, smpl->snapshot); 
         int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites;
         if ( end < smpl->nsites )
-            smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->nsites - args->nbuf_olap - 1);
+            smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->sites[smpl->nsites - args->nbuf_olap - 1]);
 
         args->igenmap = smpl->igenmap;
         hmm_run_viterbi(args->hmm, smpl->nsites, smpl->eprob, smpl->sites);
@@ -633,7 +633,6 @@ static void flush_viterbi(args_t *args, int ismpl)
     }
 }
 
-
 int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
 {
     if ( tgt->nals != line->n_allele ) return -1;    // number of alleles does not match
@@ -1000,20 +999,25 @@ static void vcfroh(args_t *args, bcf1_t *line)
     {
         args->prev_rid = line->rid;
         args->prev_pos = line->pos;
-        skip_rid = load_genmap(args, line);
+        skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
     }
 
     // New chromosome?
     if ( args->prev_rid!=line->rid )
     {
-        skip_rid = load_genmap(args, line);
         if ( !args->vi_training )
         {
-            for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
+            for (i=0; i<args->roh_smpl->n; i++)
+            {
+                flush_viterbi(args, i);
+                hmm_reset(args->hmm, args->smpl[i].snapshot);
+            }
         }
         args->prev_rid = line->rid;
         args->prev_pos = line->pos;
+        skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
     }
+    else if ( args->prev_pos == line->pos ) return;     // skip duplicate positions
 
     if ( skip_rid )
     {
diff --git a/bcftools/version.h b/bcftools/version.h
index 84247e7..11ee02d 100644
--- a/bcftools/version.h
+++ b/bcftools/version.h
@@ -1 +1 @@
-#define BCFTOOLS_VERSION "1.4.1"
+#define BCFTOOLS_VERSION "1.5"
diff --git a/cy_build.py b/cy_build.py
index 29af588..fae7055 100644
--- a/cy_build.py
+++ b/cy_build.py
@@ -60,26 +60,22 @@ class cy_build_ext(build_ext):
             ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
 
         if sys.platform == 'darwin':
+            # The idea is to give shared libraries an install name of the form
+            # `@rpath/<library-name.so>`, and to set the rpath equal to
+            # @loader_path. This will allow Python packages to find the library
+            # in the expected place, while still giving enough flexibility to
+            # external applications to link against the library.
             relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"]
-
-            if "develop" in sys.argv or "test" in sys.argv:
-                # develop-mode and tests use local directory
-                pkg_root = os.path.dirname(__file__)
-                linker_path = os.path.join(pkg_root, relative_module_path)
-            elif "bdist_wheel" in sys.argv or is_pip_install():
-                # making a wheel, or pip is secretly involved
-                linker_path = os.path.join("@rpath", relative_module_path)
-            else:
-                # making an egg: `python setup.py install` default behavior
-                egg_name = '%s.egg' % self._get_egg_name()
-                linker_path = os.path.join("@rpath", egg_name, relative_module_path)
+            library_path = os.path.join(
+                "@rpath", os.path.basename(relative_module_path)
+            )
 
             if not ext.extra_link_args:
                 ext.extra_link_args = []
             ext.extra_link_args += ['-dynamiclib',
-                                    '-rpath', get_python_lib(),
+                                    '-rpath', '@loader_path',
                                     '-Wl,-headerpad_max_install_names',
-                                    '-Wl,-install_name,%s' % linker_path,
+                                    '-Wl,-install_name,%s' % library_path,
                                     '-Wl,-x']
         else:
             if not ext.extra_link_args:
diff --git a/doc/conf.py b/doc/conf.py
index 5b92efd..375aa55 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -57,12 +57,13 @@ rst_epilog = '''
 .. _samtools: http://samtools.sourceforge.net/
 .. _bcftools: https://samtools.github.io/bcftools/bcftools.html
 .. _htslib: http://www.htslib.org/
-.. _tabix: http://samtools.sourceforge.net/tabix.shtml/
+.. _tabix: http://samtools.sourceforge.net/tabix.shtml
 .. _Galaxy: https://main.g2.bx.psu.edu/
 .. _cython: http://cython.org/
 .. _python: http://python.org/
 .. _pyximport: http://www.prescod.net/pyximport/
-
+.. _conda: https://conda.io/docs/
+.. _bioconda: https://bioconda.github.io/
 '''
 
 autosummary_generate = True
diff --git a/doc/faq.rst b/doc/faq.rst
index d5d84c4..e07e1cf 100644
--- a/doc/faq.rst
+++ b/doc/faq.rst
@@ -261,4 +261,18 @@ cython_ when building pysam. There are some known incompatibilities:
 
 .. _global interpreter lock: https://en.wikipedia.org/wiki/Global_interpreter_lock
 
+ImportError: cannot import name csamtools
+=========================================
+
+In version 0.10.0 and onwards, all pysam extension modules contain a
+``lib``-prefix. This facilitates linking against pysam extension modules
+with compilers that require library names to start with ``lib``. As a
+consequence, all code using pysam extension modules directly will need
+to be adapted. For example::
+
+   cimport pysam.csamtools
+
+will become::
+
+   cimport pysam.libcsamtools
 
diff --git a/doc/glossary.rst b/doc/glossary.rst
index e35a537..3e739f9 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -1,4 +1,4 @@
-========
+ ========
 Glossary
 ========
 
@@ -6,12 +6,16 @@ Glossary
    :sorted:
 
    cigar
-      An alignment format string. In the python API, the cigar alignment is 
-      presented as a list of tuples ``(operation,length)``. For example, the tuple
-      ``[ (0,3), (1,5), (0,2) ]`` refers to an alignment with 3 matches, 5 insertions
-      and another 2 matches.
-
-   region 
+      Stands for Compact Idiosyncratic Gapped Alignment Report and
+      represents a compressed (run-length encoded) pairwise alignment
+      format.  It was first defined by the Exonerate Aligner, but was later
+      adapted and adopted as part of the :term:`SAM` standard and many other
+      aligners.  In the Python API, the cigar alignment is presented as a
+      list of tuples ``(operation,length)``.  For example, the tuple ``[
+      (0,3), (1,5), (0,2) ]`` refers to an alignment with 3 matches, 5
+      insertions and another 2 matches.
+
+   region
       A genomic region, stated relative to a reference sequence. A
       region consists of reference name ('chr1'), start (10000), and
       end (20000). Start and end can be omitted for regions spanning
@@ -22,27 +26,30 @@ Glossary
       :term:`samtools` compatible region strings such as
       'chr1:10000:20000', which are closed, i.e., both positions 10,000
       and 20,000 are part of the interval.
- 
+
    column
       Reads that are aligned to a base in the :term:`reference` sequence.
-     
+
    tid
       The :term:`target` id. The target id is 0 or a positive integer mapping to
-      entries within the sequence dictionary in the header section of 
+      entries within the sequence dictionary in the header section of
       a :term:`TAM` file or :term:`BAM` file.
 
-   Reference
+   contig
       The sequence that a :term:`tid` refers to. For example ``chr1``, ``contig123``.
 
+   Reference
+      Synonym for contig
+
    SAM
        A textual format for storing genomic alignment information.
 
    BAM
-       Binary SAM format. BAM files are binary formatted, indexed and 
+       Binary SAM format. BAM files are binary formatted, indexed and
        allow random access.
 
    TAM
-       Text SAM file. TAM files are human readable files of 
+       Text SAM file. TAM files are human readable files of
        tab-separated fields. TAM files do not allow random access.
 
    sam file
@@ -50,7 +57,7 @@ Glossary
        be a :term:`BAM` file or a :term:`TAM` file.
 
    pileup
-      Pileup     
+      Pileup
 
    samtools
       The samtools_ package.
@@ -63,7 +70,7 @@ Glossary
 
    target
       The sequence that a read has been aligned to. Target
-      sequences have bot a numerical identifier (:term:`tid`) 
+      sequences have both a numerical identifier (:term:`tid`)
       and an alphanumeric name (:term:`Reference`).
 
    tabix file
@@ -73,15 +80,15 @@ Glossary
       is indexed by chromosomal coordinates.
 
    tabix row
-      A row in a :term:`tabix file`. Fields within a row are 
-      tab-separated. 
+      A row in a :term:`tabix file`. Fields within a row are
+      tab-separated.
 
    soft clipping
    soft clipped
 
       In alignments with soft clipping part of the query sequence
       are not aligned. The unaligned query sequence is still part
-      of the alignment record. This is in difference to 
+      of the alignment record. This is in contrast to
       :term:`hard clipped` reads.
 
    hard clipping
@@ -92,7 +99,7 @@ Glossary
       recorded in the :term:`cigar` alignment, but the removed
       sequence will not be part of the alignment record, in contrast
       to :term:`soft clipped` reads.
-     
+
    VCF
       Variant call format
 
diff --git a/doc/installation.rst b/doc/installation.rst
index 2dbf2a4..e404701 100644
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -4,12 +4,32 @@
 Installing pysam
 ================
 
+Pysam can be installed through conda_, pypi_ and from the repository.
+The recommended way to install pysam is through conda/bioconda.
+
+Conda installation
+==================
+
+To install pysam in your current conda_ environment, type::
+
+   conda config --add channels r
+   conda config --add channels bioconda
+   conda install pysam
+
+This will install pysam from the bioconda_ channel and automatically
+make sure that dependencies are installed. Also, compilation flags
+will be set automatically, which will potentially save a lot of
+trouble on OS X.
+
+Pypi installation
+=================
+
 Pysam provides a python interface to the functionality contained
 within the htslib_ C library. There are two ways that these two
 can be combined, ``builtin`` and ``external``.
 
 Builtin
-=======
+-------
 
 The typical installation will be through pypi_::
 
@@ -34,7 +54,7 @@ For example::
   pip install pysam
 
 External
-========
+--------
 
 pysam can be combined with an externally installed htslib_
 library. This is a good way to avoid duplication of libraries. To link
@@ -49,12 +69,17 @@ Note that the location of the file :file:`libhts.so` needs to be known
 to the linker once you run pysam, for example by setting the
 environment-varirable `LD_LIBRARY_PATH`.
 
-cython
-======
+Installation from repository
+============================
 
 pysam depends on cython_ to provide the connectivity to the htslib_ C
 library. The installation of the source tarball (:file:`.tar.gz`)
-python 2.7 contains pre-built C-files and cython needs not be present
-during installation. However, when installing the source tarball on
-python 3 or building from the repository, these pre-built C-files are
-not present and cython needs to be installed beforehand.
+contains pre-built C-files and cython need not be present
+during installation. However, when installing from the repository,
+cython needs to be installed beforehand.
+
+To install from repository, type::
+
+    python setup.py install
+
+For compilation options, see the section on Pypi installation above.
diff --git a/doc/release.rst b/doc/release.rst
index 3874856..4a9b35c 100644
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,6 +2,57 @@
 Release notes
 =============
 
+Release 0.12.0
+==============
+
+This release wraps htslib/samtools/bcftools versions 1.5.0 and
+contains a series of bugfixes.
+
+* [#473] A new FastxRecord class that can be instantiated from class and
+  modified in-place. Replaces PersistentFastqProxy.
+* [#521] In AlignmentFile, simplify file detection logic and allow remote index files
+  * Removed attempts to guess data and index file names; this is magic left
+    to htslib.
+  * Removed file existence check prior to opening files with htslib
+  * Better error checking after opening files that raise the appropriate
+    error (IOError for when errno is set, ValueError otherwise for backward
+    compatibility).
+  * Report IO errors when loading an index by name.
+  * Allow remote indices (tested using S3 signed URLs).
+  * Document filepath_index and make it an alias for index_filename.
+  * Added a require_index parameter to AlignmentFile
+* [#526] handle unset ref when creating new records
+* [#513] fix bcf_translate to skip deleted FORMAT fields to avoid
+  segfaults
+* [#516] expose IO errors via IOError exceptions
+* [#487] add tabix line_skip, remove 'pileup' preset
+* add FastxRecord, replaces PersistentFastqProxy (still present for
+  backwards compatibility)
+* [#496] upgrade to htslib/samtools/bcftools versions 1.5
+* add start/stop to AlignmentFile.fetch() to be consistent with
+  VariantFile.fetch(). "end" is kept for backwards compatibility.
+
+Upcoming changes:
+
+In the next release we are planning to separate the header information
+from AlignmentFile into a separate class AlignmentHeader. This layout
+is similar to VariantFile/VariantHeader. With this change we will
+ensure that an AlignedSegment record will be linked to a header so
+that chromosome names can be automatically translated from the numeric
+representation. As a consequence, the way new AlignedSegment records
+are created will need to change as the constructor requires a header::
+
+    header = pysam.AlignmentHeader(
+        reference_names=["chr1", "chr2"],
+        reference_lengths=[1000, 1000])
+        
+    read = pysam.AlignedSegment(header)
+
+This will affect all code that instantiates AlignedSegment objects
+directly. We have not yet merged this change, to allow users to provide feedback.
+The pull-request is here: https://github.com/pysam-developers/pysam/pull/518
+Please comment on github.
+
 Release 0.11.2.2
 ================
 
@@ -11,14 +62,12 @@ Bugfix release to address two issues:
   more tests have been added.
 * [#479] Correct VariantRecord edge cases described in issue
 
-
 Release 0.11.2.1
 ================
 
 Release to fix release tar-ball containing 0.11.1 pre-compiled
 C-files.
 
-
 Release 0.11.2
 ==============
 
@@ -106,6 +155,17 @@ and includes several bugfixes:
 * Preliminary (potentially unsafe) support for removing and altering header metadata
 * Many minor fixes and improvements to VariantFile and related objects
 
+Please note that all internal cython extensions now have a lib prefix
+to facilitate linking against pysam extension modules. Any user cython
+extensions using cimport to import pysam definitions will need
+changes, for example::
+
+   cimport pysam.csamtools
+
+will become::
+
+   cimport pysam.libcsamtools
+
 Release 0.9.1
 =============
 
@@ -386,9 +446,6 @@ Backwards incompatible changes
 * fancy_str() has been removed
 * qual, qqual now return arrays
 
-
-
-
 Release 0.8.0
 =============
 
diff --git a/import.py b/import.py
index b8eab01..c50f623 100644
--- a/import.py
+++ b/import.py
@@ -47,7 +47,7 @@ EXCLUDE = {
         "bamcheck.c",
         "chk_indel.c",
         "vcf-miniview.c",
-        "htslib-1.3",   # do not import twice
+        "htslib-1.5",   # do not import twice
         "hfile_irods.c",  # requires irods library
     ),
     "bcftools": (
diff --git a/linker_tests/link_pre_489/PysamTestModule_link_pre_489/BuildRead.pyx b/linker_tests/link_pre_489/PysamTestModule_link_pre_489/BuildRead.pyx
new file mode 100644
index 0000000..9c51e8f
--- /dev/null
+++ b/linker_tests/link_pre_489/PysamTestModule_link_pre_489/BuildRead.pyx
@@ -0,0 +1,24 @@
+from __future__ import absolute_import
+
+from pysam.libchtslib         cimport bam1_t, bam_endpos
+from pysam.libcsamfile        cimport aux_type2size
+from pysam.libcalignedsegment cimport AlignedSegment
+
+import pysam
+
+
+cpdef build_read():
+    cdef AlignedSegment read = pysam.AlignedSegment()
+    read.query_name = "hello"
+    read.query_sequence = "ACGT"
+    read.reference_start = 10
+    read.cigarstring = "4M"
+
+    # Test calling htslib function
+    cdef bam1_t *calign = read._delegate
+    print(bam_endpos(calign))
+
+    # Test calling pysam htslib_util function
+    print(aux_type2size(12))
+
+    return read
diff --git a/linker_tests/link_pre_489/PysamTestModule_link_pre_489/__init__.py b/linker_tests/link_pre_489/PysamTestModule_link_pre_489/__init__.py
new file mode 100644
index 0000000..e441021
--- /dev/null
+++ b/linker_tests/link_pre_489/PysamTestModule_link_pre_489/__init__.py
@@ -0,0 +1,3 @@
+from PysamTestModule_link_pre_489.BuildRead import build_read
+
+all = ["build_read"]
diff --git a/cy_build.py b/linker_tests/link_pre_489/cy_build.py
similarity index 73%
copy from cy_build.py
copy to linker_tests/link_pre_489/cy_build.py
index 29af588..fae7055 100644
--- a/cy_build.py
+++ b/linker_tests/link_pre_489/cy_build.py
@@ -60,26 +60,22 @@ class cy_build_ext(build_ext):
             ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
 
         if sys.platform == 'darwin':
+            # The idea is to give shared libraries an install name of the form
+            # `@rpath/<library-name.so>`, and to set the rpath equal to
+            # @loader_path. This will allow Python packages to find the library
+            # in the expected place, while still giving enough flexibility to
+            # external applications to link against the library.
             relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"]
-
-            if "develop" in sys.argv or "test" in sys.argv:
-                # develop-mode and tests use local directory
-                pkg_root = os.path.dirname(__file__)
-                linker_path = os.path.join(pkg_root, relative_module_path)
-            elif "bdist_wheel" in sys.argv or is_pip_install():
-                # making a wheel, or pip is secretly involved
-                linker_path = os.path.join("@rpath", relative_module_path)
-            else:
-                # making an egg: `python setup.py install` default behavior
-                egg_name = '%s.egg' % self._get_egg_name()
-                linker_path = os.path.join("@rpath", egg_name, relative_module_path)
+            library_path = os.path.join(
+                "@rpath", os.path.basename(relative_module_path)
+            )
 
             if not ext.extra_link_args:
                 ext.extra_link_args = []
             ext.extra_link_args += ['-dynamiclib',
-                                    '-rpath', get_python_lib(),
+                                    '-rpath', '@loader_path',
                                     '-Wl,-headerpad_max_install_names',
-                                    '-Wl,-install_name,%s' % linker_path,
+                                    '-Wl,-install_name,%s' % library_path,
                                     '-Wl,-x']
         else:
             if not ext.extra_link_args:
diff --git a/linker_tests/link_pre_489/setup.py b/linker_tests/link_pre_489/setup.py
new file mode 100644
index 0000000..5fc75d5
--- /dev/null
+++ b/linker_tests/link_pre_489/setup.py
@@ -0,0 +1,28 @@
+import glob
+import sys
+import os
+
+from setuptools import setup, find_packages, Extension
+from cy_build import CyExtension as Extension, cy_build_ext as build_ext
+
+import pysam
+
+test_module_suffix = os.path.dirname(os.path.abspath(__file__)).split(os.sep)[-1]
+test_module_name = "PysamTestModule_{}".format(test_module_suffix)
+
+TestModule = Extension(
+    "{}.BuildRead".format(test_module_name),
+    ["{}/BuildRead.pyx".format(test_module_name)],
+    include_dirs=pysam.get_include(),
+    extra_link_args=pysam.get_libraries(),
+    define_macros=pysam.get_defines(),
+)
+
+setup(
+    name=test_module_name,
+    version='0.1',
+    packages=find_packages(),
+    package_dir={test_module_name: test_module_name},
+    ext_modules=[TestModule],
+    cmdclass={'build_ext': build_ext},
+)
diff --git a/linker_tests/link_pre_489/tests/test_module.py b/linker_tests/link_pre_489/tests/test_module.py
new file mode 100644
index 0000000..46ba27a
--- /dev/null
+++ b/linker_tests/link_pre_489/tests/test_module.py
@@ -0,0 +1,15 @@
+import unittest
+
+from PysamTestModule_link_pre_489 import build_read
+
+        
+class TestModule(unittest.TestCase):
+
+    def test_pass_if_module_can_be_called(self):
+        read = build_read()
+        self.assertEqual(read.query_name, "hello")
+        self.assertEqual(read.query_sequence, "ACGT")
+        
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/linker_tests/link_with_rpath/PysamTestModule_link_with_rpath/BuildRead.pyx b/linker_tests/link_with_rpath/PysamTestModule_link_with_rpath/BuildRead.pyx
new file mode 100644
index 0000000..9c51e8f
--- /dev/null
+++ b/linker_tests/link_with_rpath/PysamTestModule_link_with_rpath/BuildRead.pyx
@@ -0,0 +1,24 @@
+from __future__ import absolute_import
+
+from pysam.libchtslib         cimport bam1_t, bam_endpos
+from pysam.libcsamfile        cimport aux_type2size
+from pysam.libcalignedsegment cimport AlignedSegment
+
+import pysam
+
+
+cpdef build_read():
+    cdef AlignedSegment read = pysam.AlignedSegment()
+    read.query_name = "hello"
+    read.query_sequence = "ACGT"
+    read.reference_start = 10
+    read.cigarstring = "4M"
+
+    # Test calling htslib function
+    cdef bam1_t *calign = read._delegate
+    print(bam_endpos(calign))
+
+    # Test calling pysam htslib_util function
+    print(aux_type2size(12))
+
+    return read
diff --git a/linker_tests/link_with_rpath/PysamTestModule_link_with_rpath/__init__.py b/linker_tests/link_with_rpath/PysamTestModule_link_with_rpath/__init__.py
new file mode 100644
index 0000000..c4b1a09
--- /dev/null
+++ b/linker_tests/link_with_rpath/PysamTestModule_link_with_rpath/__init__.py
@@ -0,0 +1,3 @@
+from PysamTestModule_link_with_rpath.BuildRead import build_read
+
+all = ["build_read"]
diff --git a/linker_tests/link_with_rpath/setup.py b/linker_tests/link_with_rpath/setup.py
new file mode 100644
index 0000000..2e5a8af
--- /dev/null
+++ b/linker_tests/link_with_rpath/setup.py
@@ -0,0 +1,36 @@
+import glob
+import sys
+import os
+
+from setuptools import setup, find_packages, Extension
+from Cython.Distutils import build_ext
+
+import pysam
+
+test_module_suffix = os.path.dirname(os.path.abspath(__file__)).split(os.sep)[-1]
+test_module_name = "PysamTestModule_{}".format(test_module_suffix)
+
+pysam_libraries = pysam.get_libraries()
+pysam_libdirs, pysam_libs = zip(*[os.path.split(x) for x in pysam_libraries])
+pysam_libdir = pysam_libdirs[0]
+# remove lib and .so
+pysam_libs = [x[3:-3] for x in pysam_libs]
+
+TestModule = Extension(
+    "{}.BuildRead".format(test_module_name),
+    ["{}/BuildRead.pyx".format(test_module_name)],
+    include_dirs=pysam.get_include(),
+    library_dirs=[pysam_libdir],
+    libraries=pysam_libs,
+    extra_link_args=['-Wl,-rpath,{}'.format(pysam_libdir)],
+    language="C",
+)
+
+setup(
+    name=test_module_name,
+    version='0.1',
+    packages=find_packages(),
+    package_dir={test_module_name: test_module_name},
+    ext_modules=[TestModule],
+    cmdclass={'build_ext': build_ext},
+)
diff --git a/linker_tests/link_with_rpath/tests/test_module.py b/linker_tests/link_with_rpath/tests/test_module.py
new file mode 100644
index 0000000..f8621b4
--- /dev/null
+++ b/linker_tests/link_with_rpath/tests/test_module.py
@@ -0,0 +1,15 @@
+import unittest
+
+from PysamTestModule_link_with_rpath import build_read
+
+        
+class TestModule(unittest.TestCase):
+
+    def test_pass_if_module_can_be_called(self):
+        read = build_read()
+        self.assertEqual(read.query_name, "hello")
+        self.assertEqual(read.query_sequence, "ACGT")
+        
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/linker_tests/link_without_rpath/PysamTestModule_link_without_rpath/BuildRead.pyx b/linker_tests/link_without_rpath/PysamTestModule_link_without_rpath/BuildRead.pyx
new file mode 100644
index 0000000..9c51e8f
--- /dev/null
+++ b/linker_tests/link_without_rpath/PysamTestModule_link_without_rpath/BuildRead.pyx
@@ -0,0 +1,24 @@
+from __future__ import absolute_import
+
+from pysam.libchtslib         cimport bam1_t, bam_endpos
+from pysam.libcsamfile        cimport aux_type2size
+from pysam.libcalignedsegment cimport AlignedSegment
+
+import pysam
+
+
+cpdef build_read():
+    cdef AlignedSegment read = pysam.AlignedSegment()
+    read.query_name = "hello"
+    read.query_sequence = "ACGT"
+    read.reference_start = 10
+    read.cigarstring = "4M"
+
+    # Test calling htslib function
+    cdef bam1_t *calign = read._delegate
+    print(bam_endpos(calign))
+
+    # Test calling pysam htslib_util function
+    print(aux_type2size(12))
+
+    return read
diff --git a/linker_tests/link_without_rpath/PysamTestModule_link_without_rpath/__init__.py b/linker_tests/link_without_rpath/PysamTestModule_link_without_rpath/__init__.py
new file mode 100644
index 0000000..722c5d0
--- /dev/null
+++ b/linker_tests/link_without_rpath/PysamTestModule_link_without_rpath/__init__.py
@@ -0,0 +1,3 @@
+from PysamTestModule_link_without_rpath.BuildRead import build_read
+
+__all__ = ["build_read"]
diff --git a/linker_tests/link_without_rpath/setup.py b/linker_tests/link_without_rpath/setup.py
new file mode 100644
index 0000000..7846e4b
--- /dev/null
+++ b/linker_tests/link_without_rpath/setup.py
@@ -0,0 +1,35 @@
+import glob
+import sys
+import os
+
+from setuptools import setup, find_packages, Extension
+from Cython.Distutils import build_ext
+
+import pysam
+
+test_module_suffix = os.path.dirname(os.path.abspath(__file__)).split(os.sep)[-1]
+test_module_name = "PysamTestModule_{}".format(test_module_suffix)
+
+pysam_libraries = pysam.get_libraries()
+pysam_libdirs, pysam_libs = zip(*[os.path.split(x) for x in pysam_libraries])
+pysam_libdir = pysam_libdirs[0]
+# remove lib and .so
+pysam_libs = [x[3:-3] for x in pysam_libs]
+
+TestModule = Extension(
+    "{}.BuildRead".format(test_module_name),
+    ["{}/BuildRead.pyx".format(test_module_name)],
+    include_dirs=pysam.get_include(),
+    library_dirs=[pysam_libdir],
+    libraries=pysam_libs,
+    language="C",
+)
+
+setup(
+    name=test_module_name,
+    version='0.1',
+    packages=find_packages(),
+    package_dir={test_module_name: test_module_name},
+    ext_modules=[TestModule],
+    cmdclass={'build_ext': build_ext},
+)
diff --git a/linker_tests/link_without_rpath/tests/test_module.py b/linker_tests/link_without_rpath/tests/test_module.py
new file mode 100644
index 0000000..1da5bbc
--- /dev/null
+++ b/linker_tests/link_without_rpath/tests/test_module.py
@@ -0,0 +1,15 @@
+import unittest
+
+from PysamTestModule_link_without_rpath import build_read
+
+        
+class TestModule(unittest.TestCase):
+
+    def test_pass_if_module_can_be_called(self):
+        read = build_read()
+        self.assertEqual(read.query_name, "hello")
+        self.assertEqual(read.query_sequence, "ACGT")
+        
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pysam/htslib_util.c b/pysam/htslib_util.c
index 28eeca2..c584a23 100644
--- a/pysam/htslib_util.c
+++ b/pysam/htslib_util.c
@@ -32,7 +32,7 @@ int hts_get_hts_verbose();
 
 
 // taken from samtools/bam_import.c
-static inline uint8_t *alloc_data(bam1_t *b, size_t size)
+static inline uint8_t * alloc_data(bam1_t *b, size_t size)
 {
   if (b->m_data < size)
     {
@@ -47,6 +47,7 @@ static inline uint8_t *alloc_data(bam1_t *b, size_t size)
 // Adds *nbytes_new* - *nbytes_old* into the variable length data of *src* at *pos*.
 // Data within the bam1_t entry is moved so that it is
 // consistent with the data field lengths.
+// Return NULL on error (memory allocation)
 bam1_t * pysam_bam_update(bam1_t * b,
 			  const size_t nbytes_old,
 			  const size_t nbytes_new, 
@@ -55,7 +56,8 @@ bam1_t * pysam_bam_update(bam1_t * b,
   int d = nbytes_new - nbytes_old;
   int new_size;
   size_t nbytes_before;
-
+  uint8_t * retval = NULL;
+    
   // no change
   if (d == 0)
     return b;
@@ -75,7 +77,9 @@ bam1_t * pysam_bam_update(bam1_t * b,
   // increase memory if required
   if (d > 0)
     {
-      alloc_data(b, new_size);
+      retval = alloc_data(b, new_size);
+      if (retval == NULL)
+	return NULL;
       field_start = b->data + nbytes_before;
     }
   
diff --git a/pysam/htslib_util.h b/pysam/htslib_util.h
index c714986..cb72853 100644
--- a/pysam/htslib_util.h
+++ b/pysam/htslib_util.h
@@ -44,7 +44,7 @@ typedef khash_t(s2i) s2i_t;
   Old data is deleted and the data within b are re-arranged to 
   make place for new data.
   
-  @discussion Returns b
+  @discussion Return NULL on error, otherwise b is returned.
 
   @param  b           bam1_t data
   @param  nbytes_old  size of old data
diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd
index 8441313..d65beee 100644
--- a/pysam/libcalignedsegment.pxd
+++ b/pysam/libcalignedsegment.pxd
@@ -4,9 +4,9 @@ cdef extern from "htslib_util.h":
 
     # add *nbytes* into the variable length data of *src* at *pos*
     bam1_t * pysam_bam_update(bam1_t * b,
-                              size_t nbytes_old,
-                              size_t nbytes_new,
-                              uint8_t * pos)
+           	         size_t nbytes_old,
+                         size_t nbytes_new,
+                         uint8_t * pos)
 
     # now: static
     int aux_type2size(int)
@@ -81,5 +81,5 @@ cdef class PileupRead:
 # factor methods
 cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file)
 cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu, AlignmentFile alignment_file)
-cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file)
-cdef inline uint32_t get_alignment_length(bam1_t * src)
+cdef makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file)
+cdef uint32_t get_alignment_length(bam1_t * src)
diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx
index 73d426a..67967c4 100644
--- a/pysam/libcalignedsegment.pyx
+++ b/pysam/libcalignedsegment.pyx
@@ -593,7 +593,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
     cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
     if s == NULL:
         raise ValueError(
-            "could not allocated sequence of length %i" % max_len)
+            "could not allocate sequence of length %i" % max_len)
 
     for k from 0 <= k < pysam_get_n_cigar(src):
         op = cigar_p[k] & BAM_CIGAR_MASK
@@ -703,12 +703,16 @@ cdef class AlignedSegment:
     def __init__(self):
         # see bam_init1
         self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
+        if self._delegate == NULL:
+            raise MemoryError("could not allocate memory of {} bytes".format(sizeof(bam1_t)))
         # allocate some memory. If size is 0, calloc does not return a
         # pointer that can be passed to free() so allocate 40 bytes
         # for a new read
         self._delegate.m_data = 40
         self._delegate.data = <uint8_t *>calloc(
             self._delegate.m_data, 1)
+        if self._delegate.data == NULL:
+            raise MemoryError("could not allocate memory of {} bytes".format(self._delegate.m_data))
         self._delegate.l_data = 0
         # set some data to make read approximately legit.
         # Note, SAM writing fails with q_name of length 0
@@ -887,10 +891,12 @@ cdef class AlignedSegment:
             if l % 4 != 0:
                 l_extranul = 4 - l % 4
 
-            pysam_bam_update(src,
-                             src.core.l_qname,
-                             l + l_extranul,
-                             <uint8_t*>p)
+            cdef bam1_t * retval = pysam_bam_update(src,
+                                                    src.core.l_qname,
+                                                    l + l_extranul,
+                                                    <uint8_t*>p)
+            if retval == NULL:
+                raise MemoryError("could not allocate memory")
 
             src.core.l_extranul = l_extranul
             src.core.l_qname = l + l_extranul
@@ -1106,10 +1112,13 @@ cdef class AlignedSegment:
             src.core.l_qseq = l
 
             # change length of data field
-            pysam_bam_update(src,
-                             nbytes_old,
-                             nbytes_new,
-                             p)
+            cdef bam1_t * retval = pysam_bam_update(src,
+                                                    nbytes_old,
+                                                    nbytes_new,
+                                                    p)
+            
+            if retval == NULL:
+                raise MemoryError("could not allocate memory")
 
             if l > 0:
                 # re-acquire pointer to location in memory
@@ -1460,7 +1469,7 @@ cdef class AlignedSegment:
         return result
 
     def infer_query_length(self, always=False):
-        """infer query length from sequence or CIGAR alignment.
+        """infer query length from CIGAR alignment.
 
         This method deduces the query length from the CIGAR alignment
         but does not include hard-clipped bases.
@@ -1472,7 +1481,11 @@ cdef class AlignedSegment:
         """
         if always is True:
             return self.infer_read_length()
-        return calculateQueryLengthWithoutHardClipping(self._delegate)
+        cdef int32_t l = calculateQueryLengthWithoutHardClipping(self._delegate)
+        if l > 0:
+            return l
+        else:
+            return None
 
     def infer_read_length(self):
         """infer read length from CIGAR alignment.
@@ -1482,7 +1495,11 @@ cdef class AlignedSegment:
 
         Returns None if CIGAR alignment is not present.
         """
-        return calculateQueryLengthWithHardClipping(self._delegate)
+        cdef int32_t l = calculateQueryLengthWithHardClipping(self._delegate)
+        if l > 0:
+            return l
+        else:
+            return None
 
     def get_reference_sequence(self):
         """return the reference sequence.
@@ -1864,10 +1881,13 @@ cdef class AlignedSegment:
 
             ncigar = len(values)
             # create space for cigar data within src.data
-            pysam_bam_update(src,
-                             pysam_get_n_cigar(src) * 4,
-                             ncigar * 4,
-                             <uint8_t*>p)
+            cdef bam1_t * retval = pysam_bam_update(src,
+                                                    pysam_get_n_cigar(src) * 4,
+                                                    ncigar * 4,
+                                                    <uint8_t*>p)
+
+            if retval == NULL:
+                raise MemoryError("could not allocate memory")
 
             # length is number of cigar operations, not bytes
             pysam_set_n_cigar(src, ncigar)
@@ -2180,10 +2200,12 @@ cdef class AlignedSegment:
         # If total_size == 0, the aux field will be
         # empty
         old_size = pysam_bam_get_l_aux(src)
-        pysam_bam_update(src,
-                         old_size,
-                         new_size,
-                         pysam_bam_get_aux(src))
+        cdef bam1_t * retval = pysam_bam_update(src,
+                                                old_size,
+                                                new_size,
+                                                pysam_bam_get_aux(src))
+        if retval == NULL:
+            raise MemoryError("could not allocate memory")
 
         # copy data only if there is any
         if new_size > 0:
diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd
index 6f32f47..d59e704 100644
--- a/pysam/libcalignmentfile.pxd
+++ b/pysam/libcalignmentfile.pxd
@@ -131,16 +131,16 @@ cdef class IteratorColumn:
     cdef setupIteratorData(self,
                            int tid,
                            int start,
-                           int end,
+                           int stop,
                            int multiple_iterators=?)
 
-    cdef reset(self, tid, start, end)
+    cdef reset(self, tid, start, stop)
     cdef _free_pileup_iter(self)
 
 
 cdef class IteratorColumnRegion(IteratorColumn):
     cdef int start
-    cdef int end
+    cdef int stop
     cdef int truncate
 
 
diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx
index 0b248c1..cea312c 100644
--- a/pysam/libcalignmentfile.pyx
+++ b/pysam/libcalignmentfile.pyx
@@ -75,6 +75,13 @@ else:
 
 cimport cython
 
+
+IndexStats = collections.namedtuple("IndexStats",
+                                    ("contig",
+                                     "mapped",
+                                     "unmapped",
+                                     "total"))
+
 ########################################################
 ## global variables
 # maximum genomic coordinace
@@ -176,8 +183,11 @@ cdef bam_hdr_t * build_header_from_dict(new_header):
                 lines.append(build_header_line(fields, record))
 
     text = "\n".join(lines) + "\n"
-    if dest.text != NULL: free( dest.text )
+    if dest.text != NULL:
+        free(dest.text)
     dest.text = <char*>calloc(len(text), sizeof(char))
+    if dest.text == NULL:
+        raise MemoryError("could not allocate {} bytes".format(len(text) * sizeof(char)))
     dest.l_text = len(text)
     cdef bytes btext = text.encode('ascii')
     strncpy(dest.text, btext, dest.l_text)
@@ -194,12 +204,18 @@ cdef bam_hdr_t * build_header_from_dict(new_header):
 
         dest.n_targets = len(seqs)
         dest.target_name = <char**>calloc(dest.n_targets, sizeof(char*))
+        if dest.target_name == NULL:
+            raise MemoryError("could not allocate {} bytes".format(dest.n_targets * sizeof(char *)))
         dest.target_len = <uint32_t*>calloc(dest.n_targets, sizeof(uint32_t))
+        if dest.target_len == NULL:
+            raise MemoryError("could not allocate {} bytes".format(dest.n_targets * sizeof(uint32_t)))
 
         for x from 0 <= x < dest.n_targets:
             seqname, seqlen = seqs[x]
             dest.target_name[x] = <char*>calloc(
                 len(seqname) + 1, sizeof(char))
+            if dest.target_name[x] == NULL:
+                raise MemoryError("could not allocate {} bytes".format((len(seqname) + 1) * sizeof(char)))
             bseqname = seqname.encode('ascii')
             strncpy(dest.target_name[x], bseqname,
                     len(seqname) + 1)
@@ -225,12 +241,20 @@ cdef bam_hdr_t * build_header_from_list(reference_names,
     for x in reference_names:
         n += len(x) + 1
     dest.target_name = <char**>calloc(n, sizeof(char*))
+    if dest.target_name == NULL:
+        raise MemoryError("could not allocate {} bytes".format(n * sizeof(char *)))
+
     dest.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
+    if dest.target_len == NULL:
+        raise MemoryError("could not allocate {} bytes".format(n * sizeof(uint32_t)))
+
     for x from 0 <= x < dest.n_targets:
         dest.target_len[x] = reference_lengths[x]
         name = reference_names[x]
         dest.target_name[x] = <char*>calloc(
             len(name) + 1, sizeof(char))
+        if dest.target_name[x] == NULL:
+            raise MemoryError("could not allocate {} bytes".format((len(name) + 1) * sizeof(char)))
         strncpy(dest.target_name[x], name, len(name))
 
     # Optionally, if there is no text, add a SAM
@@ -252,6 +276,8 @@ cdef bam_hdr_t * build_header_from_list(reference_names,
         dest.l_text = strlen(ctext)
         dest.text = <char*>calloc(
             strlen(ctext), sizeof(char))
+        if dest.text == NULL:
+            raise MemoryError("could not allocate {} bytes".format(strlen(ctext) * sizeof(char)))
         memcpy(dest.text, ctext, strlen(ctext))
 
     return dest
@@ -261,7 +287,8 @@ cdef class AlignmentFile(HTSFile):
     """AlignmentFile(filepath_or_object, mode=None, template=None,
     reference_names=None, reference_lengths=None, text=NULL,
     header=None, add_sq_text=False, check_header=True, check_sq=True,
-    reference_filename=None, filename=None, duplicate_filehandle=True,
+    reference_filename=None, filename=None, index_filename=None,
+    filepath_index=None, require_index=False, duplicate_filehandle=True,
     ignore_truncation=False)
 
     A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.
@@ -270,10 +297,13 @@ cdef class AlignmentFile(HTSFile):
     opened. If `filepath_or_object` is a python File object, the
     already opened file will be used.
 
-    If the file is opened for reading an index for a BAM file exists
-    (.bai), it will be opened automatically. Without an index random
-    access via :meth:`~pysam.AlignmentFile.fetch` and
-    :meth:`~pysam.AlignmentFile.pileup` is disabled.
+    If the file is opened for reading and an index exists (if file is BAM, a
+    .bai file or if CRAM a .crai file), it will be opened automatically.
+    `index_filename` may be specified explicitly if the index is not named
+    in the standard manner, not located in the same directory as the
+    BAM/CRAM file, or is remote.  Without an index, random access via
+    :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup`
+    is disabled.
 
     For writing, the header of a :term:`SAM` file/:term:`BAM` file can
     be constituted from several sources (see also the samtools format
@@ -368,6 +398,19 @@ cdef class AlignmentFile(HTSFile):
         specified in the header (``UR`` tag), which are normally used to find
         the reference.
 
+    index_filename : string
+        Explicit path to the index file.  Only needed if the index is not
+        named in the standard manner, not located in the same directory as
+        the BAM/CRAM file, or is remote.  An IOError is raised if the index
+        cannot be found or is invalid.
+
+    filepath_index : string
+        Alias for `index_filename`.
+
+    require_index : bool
+        When reading, require that an index file is present and is valid or
+        raise an IOError.  (default=False)
+
     filename : string
         Alternative to filepath_or_object. Filename of the file
         to be opened.
@@ -402,6 +445,8 @@ cdef class AlignmentFile(HTSFile):
 
         # allocate memory for iterator
         self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
+        if self.b is NULL:
+            raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t)))
 
     def has_index(self):
         """return true if htsfile has an existing (and opened) index.
@@ -446,7 +491,9 @@ cdef class AlignmentFile(HTSFile):
               add_sam_header=True,
               check_header=True,
               check_sq=True,
+              index_filename=None,
               filepath_index=None,
+              require_index=False,
               referencenames=None,
               referencelengths=None,
               duplicate_filehandle=True,
@@ -546,6 +593,13 @@ cdef class AlignmentFile(HTSFile):
 
             self.htsfile = self._open_htsfile()
 
+            if self.htsfile == NULL:
+                if errno:
+                    raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
+                                  force_str(strerror(errno))))
+                else:
+                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
+
             # set filename with reference sequences. If no filename
             # is given, the CRAM reference arrays will be built from
             # the @SQ header in the header
@@ -560,15 +614,14 @@ cdef class AlignmentFile(HTSFile):
 
         elif mode[0] == "r":
             # open file for reading
-            if not self._exists():
-                raise IOError("file `%s` not found" % self.filename)
-
             self.htsfile = self._open_htsfile()
 
             if self.htsfile == NULL:
-                raise ValueError(
-                    "could not open file (mode='%s') - "
-                    "is it SAM/BAM format?" % mode)
+                if errno:
+                    raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
+                                  force_str(strerror(errno))))
+                else:
+                    raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
 
             if self.htsfile.format.category != sequence_data:
                 raise ValueError("file does not contain alignment data")
@@ -579,26 +632,26 @@ cdef class AlignmentFile(HTSFile):
             if self.is_bam or self.is_cram:
                 with nogil:
                     self.header = sam_hdr_read(self.htsfile)
+
+            # in sam files a header is optional, but requires
+            # reference names and lengths
+            elif reference_names and reference_lengths:
+                self.header = build_header_from_list(
+                    reference_names,
+                    reference_lengths,
+                    add_sq_text=add_sq_text,
+                    text=text)
+            else:
+                with nogil:
+                    self.header = sam_hdr_read(self.htsfile)
+
                 if self.header == NULL:
                     raise ValueError(
-                        "file does not have valid header (mode='%s') "
-                        "- is it BAM format?" % mode )
-            else:
-                # in sam files a header is optional, but requires
-                # reference names and lengths
-                if reference_names and reference_lengths:
-                    self.header = build_header_from_list(
-                        reference_names,
-                        reference_lengths,
-                        add_sq_text=add_sq_text,
-                        text=text)
-                else:
-                    with nogil:
-                        self.header = sam_hdr_read(self.htsfile)
-                    if self.header == NULL:
-                        raise ValueError(
-                            "file does not have valid header (mode='%s'), "
-                            "please provide reference_names and reference_lengths")
+                        "file `{}` does not have valid header, "
+                        "please provide reference_names and reference_lengths".format(force_str(filename)))
+
+            if self.header == NULL:
+                raise ValueError("file `{}` does not have valid header".format(force_str(filename)))
 
             # set filename with reference sequences
             if self.is_cram and reference_filename:
@@ -613,69 +666,36 @@ cdef class AlignmentFile(HTSFile):
                      "is it SAM/BAM format? Consider opening with "
                      "check_sq=False") % mode)
 
-        assert self.htsfile != NULL
+            if self.is_bam or self.is_cram:
+                # open index for remote files
+                # returns NULL if there is no index or index could
+                # not be opened
+                index_filename = index_filename or filepath_index
+                if index_filename:
+                    cindexname = bindex_filename = encode_filename(index_filename)
+
+                if cfilename or cindexname:
+                    with nogil:
+                        self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
 
-        # check for index and open if present
-        cdef int format_index = -1
-        if self.is_bam:
-            format_index = HTS_FMT_BAI
-        elif self.is_cram:
-            format_index = HTS_FMT_CRAI
+                    if not self.index and (cindexname or require_index):
+                        if errno:
+                            raise IOError(errno, force_str(strerror(errno)))
+                        else:
+                            raise IOError('unable to open index file `%s`' % index_filename)
 
-        if mode[0] == "r" and (self.is_bam or self.is_cram):
-            # open index for remote files
-            if self.is_remote and not filepath_index:
-                with nogil:
-                    self.index = hts_idx_load(cfilename, format_index)
-                if self.index == NULL:
-                    warnings.warn(
-                        "unable to open remote index for '%s'" % cfilename)
-            else:
-                has_index = True
-                if filepath_index:
-                    if not os.path.exists(filepath_index):
-                        warnings.warn(
-                            "unable to open index at %s" % cfilename)
-                        self.index = NULL
-                        has_index = False
-                elif filename is not None:
-                    if self.is_bam \
-                            and not os.path.exists(filename + b".bai") \
-                            and not os.path.exists(filename[:-4] + b".bai") \
-                            and not os.path.exists(filename + b".csi") \
-                            and not os.path.exists(filename[:-4] + b".csi"):
-                        self.index = NULL
-                        has_index = False
-                    elif self.is_cram \
-                            and not os.path.exists(filename + b".crai") \
-                            and not os.path.exists(filename[:-5] + b".crai"):
-                        self.index = NULL
-                        has_index = False
-                else:
-                    self.index = NULL
-                    has_index = False
-
-                if has_index:
-                    # returns NULL if there is no index or index could
-                    # not be opened
-                    if filepath_index:
-                        cindexname = filepath_index = encode_filename(filepath_index)
-                        with nogil:
-                            self.index = sam_index_load2(self.htsfile,
-                                                         cfilename,
-                                                         cindexname)
-                    else:
-                        with nogil:
-                            self.index = sam_index_load(self.htsfile,
-                                                        cfilename)
-                    if self.index == NULL:
-                        raise IOError(
-                            "error while opening index for '%s'" %
-                            filename)
-
-            # save start of data section
-            if not self.is_stream:
-                self.start_offset = self.tell()
+                elif require_index:
+                    raise IOError('unable to open index file')
+
+                # save start of data section
+                if not self.is_stream:
+                    self.start_offset = self.tell()
+
+    def is_valid_tid(self, tid):
+        """
+        return True if the numerical :term:`tid` is valid; False otherwise.
+        """
+        return 0 <= tid < self.header.n_targets
 
     def get_tid(self, reference):
         """
@@ -700,106 +720,24 @@ cdef class AlignmentFile(HTSFile):
                              (tid, self.header.n_targets))
         return charptr_to_str(self.header.target_name[tid])
 
-    def parse_region(self,
-                     reference=None,
-                     start=None,
-                     end=None,
-                     region=None,
-                     tid=None):
-        """parse alternative ways to specify a genomic region. A region can
-        either be specified by :term:`reference`, `start` and
-        `end`. `start` and `end` denote 0-based, half-open
-        intervals.
-
-        Alternatively, a samtools :term:`region` string can be
-        supplied.
-
-        If any of the coordinates are missing they will be replaced by the
-        minimum (`start`) or maximum (`end`) coordinate.
-
-        Note that region strings are 1-based, while `start` and `end` denote
-        an interval in python coordinates.
-
-        Returns
-        -------
-
-        tuple :  a tuple of `flag`, :term:`tid`, `start` and `end`. The
-        flag indicates whether no coordinates were supplied and the
-        genomic region is the complete genomic space.
-
-        Raises
-        ------
-
-        ValueError
-           for invalid or out of bounds regions.
-
-        """
-        cdef int rtid
-        cdef long long rstart
-        cdef long long rend
-
-        if reference is None and tid is None and region is None:
-            return 0, 0, 0, 0
-
-        rtid = -1
-        rstart = 0
-        rend = MAX_POS
-        if start != None:
-            try:
-                rstart = start
-            except OverflowError:
-                raise ValueError('start out of range (%i)' % start)
-
-        if end != None:
-            try:
-                rend = end
-            except OverflowError:
-                raise ValueError('end out of range (%i)' % end)
-
-        if region:
-            region = force_str(region)
-            parts = re.split("[:-]", region)
-            reference = parts[0]
-            if len(parts) >= 2:
-                rstart = int(parts[1]) - 1
-            if len(parts) >= 3:
-                rend = int(parts[2])
-
-        if tid is not None:
-            rtid = tid
-            if rtid < 0 or rtid >= self.header.n_targets:
-                raise IndexError("invalid reference, {} out of range 0-{}".format(
-                        rtid, self.header.n_targets))
-        else:
-            rtid = self.gettid(reference)
-
-        if rtid < 0:
-            raise ValueError(
-                "invalid reference `%s`" % reference)
-        if rstart > rend:
-            raise ValueError(
-                'invalid coordinates: start (%i) > end (%i)' % (rstart, rend))
-        if not 0 <= rstart < MAX_POS:
-            raise ValueError('start out of range (%i)' % rstart)
-        if not 0 <= rend <= MAX_POS:
-            raise ValueError('end out of range (%i)' % rend)
-
-        return 1, rtid, rstart, rend
-
     def fetch(self,
-              reference=None,
+              contig=None,
               start=None,
-              end=None,
+              stop=None,
               region=None,
               tid=None,
               until_eof=False,
-              multiple_iterators=False):
+              multiple_iterators=False,
+              reference=None,
+              end=None):
         """fetch reads aligned in a :term:`region`.
 
         See :meth:`AlignmentFile.parse_region` for more information
-        on genomic regions.
+        on genomic regions.  :term:`reference` and `end` are also accepted for
+        backward compatibility as synonyms for :term:`contig` and `stop`,
+        respectively.
 
-        Without a `reference` or `region` all mapped reads in the file
+        Without a `contig` or `region` all mapped reads in the file
         will be fetched. The reads will be returned ordered by reference
         sequence, which will not necessarily be the order within the
         file. This mode of iteration still requires an index. If there is
@@ -809,7 +747,7 @@ cdef class AlignmentFile(HTSFile):
         will be fetched.
 
         A :term:`SAM` file does not allow random access. If `region`
-        or `reference` are given, an exception is raised.
+        or `contig` are given, an exception is raised.
 
         :class:`~pysam.FastaFile`
         :class:`~pysam.IteratorRow`
@@ -847,17 +785,13 @@ cdef class AlignmentFile(HTSFile):
             file does not permit random access to genomic coordinates.
 
         """
-        cdef int rtid, rstart, rend, has_coord
+        cdef int rtid, rstart, rstop, has_coord
 
         if not self.is_open:
             raise ValueError( "I/O operation on closed file" )
 
-        has_coord, rtid, rstart, rend = self.parse_region(
-            reference,
-            start,
-            end,
-            region,
-            tid)
+        has_coord, rtid, rstart, rstop = self.parse_region(contig, start, stop, region, tid,
+                                                          end=end, reference=reference)
 
         # Turn of re-opening if htsfile is a stream
         if self.is_stream:
@@ -871,7 +805,7 @@ cdef class AlignmentFile(HTSFile):
 
             if has_coord:
                 return IteratorRowRegion(
-                    self, rtid, rstart, rend,
+                    self, rtid, rstart, rstop,
                     multiple_iterators=multiple_iterators)
             else:
                 if until_eof:
@@ -978,23 +912,27 @@ cdef class AlignmentFile(HTSFile):
         return mate
 
     def pileup(self,
-               reference=None,
+               contig=None,
                start=None,
-               end=None,
+               stop=None,
                region=None,
+               reference=None,
+               end=None,
                **kwargs):
         """perform a :term:`pileup` within a :term:`region`. The region is
-        specified by :term:`reference`, 'start' and 'end' (using
-        0-based indexing).  Alternatively, a samtools 'region' string
+        specified by :term:`contig`, `start` and `stop` (using
+        0-based indexing).  :term:`reference` and `end` are also accepted for
+        backward compatibility as synonyms for :term:`contig` and `stop`,
+        respectively.  Alternatively, a samtools 'region' string
         can be supplied.
 
-        Without 'reference' or 'region' all reads will be used for the
+        Without 'contig' or 'region' all reads will be used for the
         pileup. The reads will be returned ordered by
-        :term:`reference` sequence, which will not necessarily be the
+        :term:`contig` sequence, which will not necessarily be the
         order within the file.
 
         Note that :term:`SAM` formatted files do not allow random
-        access.  In these files, if a 'region' or 'reference' are
+        access.  In these files, if a 'region' or 'contig' are
         given an exception is raised.
 
         .. note::
@@ -1043,13 +981,13 @@ cdef class AlignmentFile(HTSFile):
         an iterator over genomic positions.
 
         """
-        cdef int rtid, rstart, rend, has_coord
+        cdef int rtid, rstart, rstop, has_coord
 
         if not self.is_open:
             raise ValueError("I/O operation on closed file")
 
-        has_coord, rtid, rstart, rend = self.parse_region(
-            reference, start, end, region)
+        has_coord, rtid, rstart, rstop = self.parse_region(
+            contig, start, stop, region, reference=reference, end=end)
 
         if self.is_bam or self.is_cram:
             if not self.has_index():
@@ -1059,8 +997,8 @@ cdef class AlignmentFile(HTSFile):
                 return IteratorColumnRegion(self,
                                             tid=rtid,
                                             start=rstart,
-                                            end=rend,
-                                            **kwargs )
+                                            stop=rstop,
+                                            **kwargs)
             else:
                 return IteratorColumnAllRefs(self, **kwargs )
 
@@ -1069,32 +1007,36 @@ cdef class AlignmentFile(HTSFile):
                 "pileup of samfiles not implemented yet")
 
     def count(self,
-              reference=None,
+              contig=None,
               start=None,
-              end=None,
+              stop=None,
               region=None,
               until_eof=False,
-              read_callback="nofilter"):
+              read_callback="nofilter",
+              reference=None,
+              end=None):
         '''count the number of reads in :term:`region`
 
-        The region is specified by :term:`reference`, `start` and
-        `end`. Alternatively, a :term:`samtools` :term:`region` string
-        can be supplied.
+        The region is specified by :term:`contig`, `start` and `stop`.
+        :term:`reference` and `end` are also accepted for backward
+        compatibility as synonyms for :term:`contig` and `stop`,
+        respectively.  Alternatively, a :term:`samtools` :term:`region`
+        string can be supplied.
 
         A :term:`SAM` file does not allow random access and if
-        `region` or `reference` are given, an exception is raised.
+        `region` or `contig` are given, an exception is raised.
 
         Parameters
         ----------
 
-        reference : string
+        contig : string
             reference_name of the genomic region (chromosome)
 
         start : int
-            start of the genomic region
+            start of the genomic region (0-based inclusive)
 
-        end : int
-            end of the genomic region
+        stop : int
+            end of the genomic region (0-based exclusive)
 
         region : string
             a region string in samtools format.
@@ -1120,6 +1062,12 @@ cdef class AlignmentFile(HTSFile):
             ``check_read(read)`` that should return True only for
             those reads that shall be included in the counting.
 
+        reference : string
+            backward compatible synonym for `contig`
+
+        end : int
+            backward compatible synonym for `stop`
+
         Raises
         ------
 
@@ -1139,8 +1087,10 @@ cdef class AlignmentFile(HTSFile):
         elif read_callback == "nofilter":
             filter_method = 2
 
-        for read in self.fetch(reference=reference,
+        for read in self.fetch(contig=contig,
                                start=start,
+                               stop=stop,
+                               reference=reference,
                                end=end,
                                region=region,
                                until_eof=until_eof):
@@ -1161,29 +1111,33 @@ cdef class AlignmentFile(HTSFile):
 
     @cython.boundscheck(False)  # we do manual bounds checking
     def count_coverage(self,
-                       reference=None,
+                       contig=None,
                        start=None,
-                       end=None,
+                       stop=None,
                        region=None,
                        quality_threshold=15,
-                       read_callback='all'):
+                       read_callback='all',
+                       reference=None,
+                       end=None):
         """count the coverage of genomic positions by reads in :term:`region`.
 
-        The region is specified by :term:`reference`, `start` and
-        `end`. Alternatively, a :term:`samtools` :term:`region` string
-        can be supplied. The coverage is computed per-base [ACGT].
+        The region is specified by :term:`contig`, `start` and `stop`.
+        :term:`reference` and `end` are also accepted for backward
+        compatibility as synonyms for :term:`contig` and `stop`,
+        respectively.  Alternatively, a :term:`samtools` :term:`region`
+        string can be supplied.  The coverage is computed per-base [ACGT].
 
         Parameters
         ----------
 
-        reference : string
+        contig : string
             reference_name of the genomic region (chromosome)
 
         start : int
-            start of the genomic region
+            start of the genomic region (0-based inclusive)
 
-        end : int
-            end of the genomic region
+        stop : int
+            end of the genomic region (0-based exclusive)
 
         region : int
             a region string.
@@ -1209,6 +1163,12 @@ cdef class AlignmentFile(HTSFile):
             ``check_read(read)`` that should return True only for
             those reads that shall be included in the counting.
 
+        reference : string
+            backward compatible synonym for `contig`
+
+        end : int
+            backward compatible synonym for `stop`
+
         Raises
         ------
 
@@ -1223,7 +1183,7 @@ cdef class AlignmentFile(HTSFile):
         """
 
         cdef int _start = start
-        cdef int _stop = end
+        cdef int _stop = stop if stop is not None else end
         cdef int length = _stop - _start
         cdef c_array.array int_array_template = array.array('L', [])
         cdef c_array.array count_a
@@ -1248,8 +1208,10 @@ cdef class AlignmentFile(HTSFile):
             filter_method = 2
 
         cdef int _threshold = quality_threshold
-        for read in self.fetch(reference=reference,
+        for read in self.fetch(contig=contig,
+                               reference=reference,
                                start=start,
+                               stop=stop,
                                end=end,
                                region=region):
             # apply filter
@@ -1329,7 +1291,7 @@ cdef class AlignmentFile(HTSFile):
             if errno == EPIPE:
                 errno = 0
             else:
-                raise OSError(errno, force_str(strerror(errno)))
+                raise IOError(errno, force_str(strerror(errno)))
 
     def __dealloc__(self):
         cdef int ret = 0
@@ -1355,7 +1317,7 @@ cdef class AlignmentFile(HTSFile):
             if errno == EPIPE:
                 errno = 0
             else:
-                raise OSError(errno, force_str(strerror(errno)))
+                raise IOError(errno, force_str(strerror(errno)))
 
     cpdef int write(self, AlignedSegment read) except -1:
         '''
@@ -1479,6 +1441,33 @@ cdef class AlignmentFile(HTSFile):
                 n = hts_idx_get_n_no_coor(self.index)
             return n
 
+    def get_index_statistics(self):
+        """return statistics about mapped/unmapped reads per chromosome as
+        they are stored in the index.
+
+        Returns
+        -------
+        list : a list of records for each chromosome. Each record has the attributes 'contig',
+               'mapped', 'unmapped' and 'total'.
+        """
+        
+        self.check_index()
+        cdef int tid
+        cdef uint64_t mapped, unmapped
+        results = []
+        # TODO: use header
+        for tid from 0 <= tid < self.nreferences:
+            with nogil:
+                hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
+            results.append(
+                IndexStats._make((
+                    self.get_reference_name(tid),
+                    mapped,
+                    unmapped,
+                    mapped + unmapped)))
+                
+        return results
+
     property text:
         '''string with the full contents of the :term:`sam file` header as a
         string.
@@ -1708,7 +1697,7 @@ cdef class IteratorRow:
 
 
 cdef class IteratorRowRegion(IteratorRow):
-    """*(AlignmentFile samfile, int tid, int beg, int end,
+    """*(AlignmentFile samfile, int tid, int beg, int stop,
     int multiple_iterators=False)*
 
     iterate over mapped reads in a region.
@@ -1722,7 +1711,7 @@ cdef class IteratorRowRegion(IteratorRow):
     """
 
     def __init__(self, AlignmentFile samfile,
-                 int tid, int beg, int end,
+                 int tid, int beg, int stop,
                  int multiple_iterators=False):
 
         IteratorRow.__init__(self, samfile,
@@ -1736,7 +1725,7 @@ cdef class IteratorRowRegion(IteratorRow):
                 self.samfile.index,
                 tid,
                 beg,
-                end)
+                stop)
 
     def __iter__(self):
         return self
@@ -1971,9 +1960,9 @@ cdef class IteratorRowSelection(IteratorRow):
 
     def __next__(self):
         cdef int ret = self.cnext()
-        if (ret >= 0):
+        if ret >= 0:
             return makeAlignedSegment(self.b, self.samfile)
-        elif (ret == -2):
+        elif ret == -2:
             raise IOError('truncated file')
         else:
             raise StopIteration
@@ -2186,11 +2175,11 @@ cdef class IteratorColumn:
     cdef setupIteratorData( self,
                             int tid,
                             int start,
-                            int end,
+                            int stop,
                             int multiple_iterators=0 ):
         '''setup the iterator structure'''
 
-        self.iter = IteratorRowRegion(self.samfile, tid, start, end, multiple_iterators)
+        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
         self.iterdata.htsfile = self.samfile.htsfile
         self.iterdata.iter = self.iter.iter
         self.iterdata.seq = NULL
@@ -2231,13 +2220,13 @@ cdef class IteratorColumn:
 
         # bam_plp_set_mask( self.pileup_iter, self.mask )
 
-    cdef reset( self, tid, start, end ):
+    cdef reset(self, tid, start, stop):
         '''reset iterator position.
 
         This permits using the iterator multiple times without
         having to incur the full set-up costs.
         '''
-        self.iter = IteratorRowRegion( self.samfile, tid, start, end, multiple_iterators = 0 )
+        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
         self.iterdata.iter = self.iter.iter
 
         # invalidate sequence if different tid
@@ -2280,14 +2269,14 @@ cdef class IteratorColumnRegion(IteratorColumn):
     def __cinit__(self, AlignmentFile samfile,
                   int tid = 0,
                   int start = 0,
-                  int end = MAX_POS,
+                  int stop = MAX_POS,
                   int truncate = False,
                   **kwargs ):
 
         # initialize iterator
-        self.setupIteratorData(tid, start, end, 1)
+        self.setupIteratorData(tid, start, stop, 1)
         self.start = start
-        self.end = end
+        self.stop = stop
         self.truncate = truncate
 
     def __next__(self):
@@ -2302,7 +2291,7 @@ cdef class IteratorColumnRegion(IteratorColumn):
 
             if self.truncate:
                 if self.start > self.pos: continue
-                if self.pos >= self.end: raise StopIteration
+                if self.pos >= self.stop: raise StopIteration
 
             return makePileupColumn(&self.plp,
                                    self.tid,
@@ -2467,6 +2456,8 @@ cdef class IndexedReads:
         # position if you decide
         cdef int ret = 1
         cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
+        if b == NULL:
+            raise ValueError("could not allocate {} bytes".format(sizeof(bam1_t)))
 
         cdef uint64_t pos
 
diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx
index 9413e70..67565f0 100644
--- a/pysam/libcbcf.pyx
+++ b/pysam/libcbcf.pyx
@@ -1159,7 +1159,13 @@ cdef inline bcf_sync_end(VariantRecord record):
     cdef bcf_hdr_t *hdr = record.header.ptr
     cdef bcf_info_t *info
     cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
-    cdef int ref_len = len(record.ref)
+    cdef int ref_len 
+
+    # allow missing ref when instantiating a new record
+    if record.ref is not None:
+        ref_len = len(record.ref)
+    else:
+        ref_len = 0
 
     # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
     if not record.ptr.n_allele or record.ptr.rlen == ref_len:
@@ -2034,6 +2040,10 @@ cdef class VariantHeader(object):
 
         """
         rec = makeVariantRecord(self, bcf_init())
+
+        if not rec:
+            raise MemoryError('unable to allocate BCF record')
+
         rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
 
         if contig is not None:
@@ -3641,8 +3651,8 @@ cdef class BCFIndex(object):
             hts_idx_destroy(self.ptr)
             self.ptr = NULL
 
-    def fetch(self, bcf, contig, start, stop, region, reopen):
-        return BCFIterator(bcf, contig, start, stop, region, reopen)
+    def fetch(self, bcf, contig, start, stop, reopen):
+        return BCFIterator(bcf, contig, start, stop, reopen)
 
 
 cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
@@ -3680,8 +3690,8 @@ cdef class TabixIndex(BaseIndex):
             tbx_destroy(self.ptr)
             self.ptr = NULL
 
-    def fetch(self, bcf, contig, start, stop, region, reopen):
-        return TabixIterator(bcf, contig, start, stop, region, reopen)
+    def fetch(self, bcf, contig, start, stop, reopen):
+        return TabixIterator(bcf, contig, start, stop, reopen)
 
 
 cdef TabixIndex makeTabixIndex(tbx_t *idx):
@@ -3716,54 +3726,44 @@ cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
 
 
 cdef class BCFIterator(BaseIterator):
-    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
+    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
         if bcf is None:
             raise ValueError('bcf must not be None')
 
+        if contig is None:
+            raise ValueError('contig must be specified')
+
         if not isinstance(bcf.index, BCFIndex):
             raise ValueError('bcf index required')
 
         cdef BCFIndex index = bcf.index
-        cdef int rid, cstart, cstop
-        cdef char *cregion
-
-        if not index:
-            raise ValueError('bcf index required')
-
-        if reopen:
-            bcf = bcf.copy()
 
-        if region is not None:
-            if contig is not None or start is not None or stop is not None:
-                raise ValueError  # FIXME
-
-            bregion = force_bytes(region)
-            cregion = bregion
-            with nogil:
-                self.iter = bcf_itr_querys(index.ptr, bcf.header.ptr, cregion)
-        else:
-            if contig is None:
-                raise ValueError  # FIXME
+        self.bcf = bcf
+        self.index = index
 
-            try:
-                rid = index.refmap[contig]
-            except KeyError:
-                raise ValueError('Unknown contig specified: {}'.format(contig))
+        cdef int rid, cstart, cstop
 
-            if start is None:
-                start = 0
-            if stop is None:
-                stop = MAX_POS
+        try:
+            rid = index.refmap[contig]
+        except KeyError:
+            # A query for a non-existent contig yields an empty iterator, does not raise an error
+            self.iter = NULL
+            return
 
-            cstart, cstop = start, stop
+        if reopen:
+            self.bcf = self.bcf.copy()
 
-            with nogil:
-                self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
+        cstart = start if start is not None else 0
+        cstop  = stop  if stop  is not None else MAX_POS
 
-        # Do not fail on self.iter == NULL, since it signifies a null query.
+        with nogil:
+            self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
 
-        self.bcf = bcf
-        self.index = index
+        if not self.iter:
+            if errno:
+                raise IOError(errno, strerror(errno))
+            else:
+                raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
 
     def __dealloc__(self):
         if self.iter:
@@ -3779,6 +3779,9 @@ cdef class BCFIterator(BaseIterator):
 
         cdef bcf1_t *record = bcf_init1()
 
+        if not record:
+            raise MemoryError('unable to allocate BCF record')
+
         record.pos = -1
         if self.bcf.drop_samples:
             record.max_unpack = BCF_UN_SHR
@@ -3792,8 +3795,12 @@ cdef class BCFIterator(BaseIterator):
             _stop_BCFIterator(self, record)
             if ret == -1:
                 raise StopIteration
+            elif ret == -2:
+                raise IOError('truncated file')
+            elif errno:
+                raise IOError(errno, strerror(errno))
             else:
-                raise ValueError('error reading BCF file')
+                raise IOError('unable to fetch next record')
 
         ret = bcf_subset_format(self.bcf.header.ptr, record)
 
@@ -3810,7 +3817,7 @@ cdef class TabixIterator(BaseIterator):
         self.line_buffer.m = 0
         self.line_buffer.s = NULL
 
-    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
+    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
         if bcf is None:
             raise ValueError('bcf must not be None')
 
@@ -3819,34 +3826,31 @@ cdef class TabixIterator(BaseIterator):
 
         cdef TabixIndex index = bcf.index
 
-        if not index:
-            raise ValueError('bcf index required')
-
-        if reopen:
-            bcf = bcf.copy()
-
-        if region is not None:
-            if contig is not None or start is not None or stop is not None:
-                raise ValueError  # FIXME
+        self.bcf = bcf
+        self.index = index
 
-            self.iter = tbx_itr_querys(index.ptr, region)
-        else:
-            if contig is None:
-                raise ValueError  # FIXME
+        cdef int rid, cstart, cstop
 
-            rid = index.refmap.get(contig, -1)
+        try:
+            rid = index.refmap[contig]
+        except KeyError:
+            # A query for a non-existent contig yields an empty iterator, does not raise an error
+            self.iter = NULL
+            return
 
-            if start is None:
-                start = 0
-            if stop is None:
-                stop = MAX_POS
+        if reopen:
+            self.bcf = self.bcf.copy()
 
-            self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
+        cstart = start if start is not None else 0
+        cstop  = stop  if stop  is not None else MAX_POS
 
-        # Do not fail on self.iter == NULL, since it signifies a null query.
+        self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
 
-        self.bcf = bcf
-        self.index = index
+        if not self.iter:
+            if errno:
+                raise IOError(errno, strerror(errno))
+            else:
+                raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
 
     def __dealloc__(self):
         if self.iter:
@@ -3877,11 +3881,18 @@ cdef class TabixIterator(BaseIterator):
             self.iter = NULL
             if ret == -1:
                 raise StopIteration
+            elif ret == -2:
+                raise IOError('truncated file')
+            elif errno:
+                raise IOError(errno, strerror(errno))
             else:
-                raise ValueError('error reading indexed VCF file')
+                raise IOError('unable to fetch next record')
 
         cdef bcf1_t *record = bcf_init1()
 
+        if not record:
+            raise MemoryError('unable to allocate BCF record')
+
         record.pos = -1
         if self.bcf.drop_samples:
             record.max_unpack = BCF_UN_SHR
@@ -3996,7 +4007,7 @@ cdef class VariantFile(HTSFile):
             if errno == EPIPE:
                 errno = 0
             else:
-                raise OSError(errno, force_str(strerror(errno)))
+                raise IOError(errno, force_str(strerror(errno)))
 
     def close(self):
         """closes the :class:`pysam.VariantFile`."""
@@ -4017,7 +4028,7 @@ cdef class VariantFile(HTSFile):
             if errno == EPIPE:
                 errno = 0
             else:
-                raise OSError(errno, force_str(strerror(errno)))
+                raise IOError(errno, force_str(strerror(errno)))
 
     def __iter__(self):
         if not self.is_open:
@@ -4033,6 +4044,9 @@ cdef class VariantFile(HTSFile):
         cdef int ret
         cdef bcf1_t *record = bcf_init1()
 
+        if not record:
+            raise MemoryError('unable to allocate BCF record')
+
         record.pos = -1
         if self.drop_samples:
             record.max_unpack = BCF_UN_SHR
@@ -4045,9 +4059,11 @@ cdef class VariantFile(HTSFile):
             if ret == -1:
                 raise StopIteration
             elif ret == -2:
-                raise OSError('truncated file')
+                raise IOError('truncated file')
+            elif errno:
+                raise IOError(errno, strerror(errno))
             else:
-                raise ValueError('Variant read failed')
+                raise IOError('unable to fetch next record')
 
         return makeVariantRecord(self.header, record)
 
@@ -4177,16 +4193,16 @@ cdef class VariantFile(HTSFile):
 
         elif mode.startswith(b'r'):
             # open file for reading
-            if not self._exists():
-                raise IOError('file `{}` not found'.format(filename))
-
             self.htsfile = self._open_htsfile()
 
             if not self.htsfile:
-                raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+                if errno:
+                    raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno))))
+                else:
+                    raise ValueError('could not open variant file `{}`'.format(filename))
 
             if self.htsfile.format.format not in (bcf, vcf):
-                raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+                raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
 
             self.check_truncation(ignore_truncation)
 
@@ -4196,7 +4212,7 @@ cdef class VariantFile(HTSFile):
             try:
                 self.header = makeVariantHeader(hdr)
             except ValueError:
-                raise ValueError("file `{}` does not have valid header (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+                raise ValueError('file `{}` does not have valid header (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
 
             if isinstance(self.filename, bytes):
                 cfilename = self.filename
@@ -4221,13 +4237,54 @@ cdef class VariantFile(HTSFile):
             if not self.is_stream:
                 self.start_offset = self.tell()
         else:
-            raise ValueError("unknown mode {}".format(mode))
+            raise ValueError('unknown mode {}'.format(mode))
 
     def reset(self):
         """reset file position to beginning of file just after the header."""
         return self.seek(self.start_offset)
 
-    def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
+    def is_valid_tid(self, tid):
+        """
+        return True if the numerical :term:`tid` is valid; False otherwise.
+
+        raises ValueError if the file is closed.
+        """
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef int rid = tid
+        return 0 <= rid < hdr.n[BCF_DT_CTG]
+
+    def get_tid(self, reference):
+        """
+        return the numerical :term:`tid` corresponding to
+        :term:`reference`
+
+        returns -1 if reference is not known.
+        """
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+        cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
+        reference = force_bytes(reference)
+        cdef khint_t k = kh_get_vdict(d, reference)
+        return kh_val_vdict(d, k).id if k != kh_end(d) else -1
+
+    def get_reference_name(self, tid):
+        """
+        return :term:`reference` name corresponding to numerical :term:`tid`
+        """
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef int rid = tid
+        if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
+            raise ValueError('Invalid tid')
+        return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
+
+    def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
         """fetch records in a :term:`region` using 0-based indexing. The
         region is specified by :term:`contig`, *start* and *end*.
         Alternatively, a samtools :term:`region` string can be supplied.
@@ -4263,8 +4320,14 @@ cdef class VariantFile(HTSFile):
         if not self.index:
             raise ValueError('fetch requires an index')
 
+        _, tid, start, stop = self.parse_region(contig, start, stop, region,
+                                                None, end=end, reference=reference)
+
+        if contig is None:
+            contig = self.get_reference_name(tid)
+
         self.is_reading = 1
-        return self.index.fetch(self, contig, start, stop, region, reopen)
+        return self.index.fetch(self, contig, start, stop, reopen)
 
     def new_record(self, *args, **kwargs):
         """Create a new empty :class:`VariantRecord`.
diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx
index f1d2fa9..02ff2a2 100644
--- a/pysam/libcbgzf.pyx
+++ b/pysam/libcbgzf.pyx
@@ -70,7 +70,7 @@ cdef class BGZFile(object):
 
         if not self.bgzf.is_write:
             import errno
-            raise OSError(errno.EBADF, "write() on read-only BGZFile object")
+            raise IOError(errno.EBADF, "write() on read-only BGZFile object")
 
         if isinstance(data, bytes):
             length = len(data)
@@ -92,7 +92,7 @@ cdef class BGZFile(object):
 
         if self.bgzf.is_write:
             import errno
-            raise OSError(errno.EBADF, "read() on write-only BGZFile object")
+            raise IOError(errno.EBADF, "read() on write-only BGZFile object")
 
         if size < 0:
             chunks = []
@@ -167,7 +167,7 @@ cdef class BGZFile(object):
         if not self.bgzf:
             raise ValueError("rewind() on closed BGZFile object")
         if not self.bgzf.is_write:
-            raise OSError("Can't rewind in write mode")
+            raise IOError("Can't rewind in write mode")
         if bgzf_seek(self.bgzf, 0, SEEK_SET) < 0:
             raise IOError('Error seeking BGZFFile object')
 
diff --git a/pysam/libcfaidx.pxd b/pysam/libcfaidx.pxd
index 2f5f44b..9ac09e6 100644
--- a/pysam/libcfaidx.pxd
+++ b/pysam/libcfaidx.pxd
@@ -39,7 +39,7 @@ cdef class FastaFile:
     cdef object _filename, _references, _lengths, reference2length
     cdef faidx_t* fastafile
     cdef char* _fetch(self, char* reference,
-                      int start, int end, int* length)
+                      int start, int end, int* length) except? NULL
 
 
 cdef class FastqProxy:
@@ -48,7 +48,7 @@ cdef class FastqProxy:
     cpdef array.array get_quality_array(self, int offset=*)
 
 
-cdef class PersistentFastqProxy:
+cdef class FastxRecord:
     """
     Python container for pysam.libcfaidx.FastqProxy with persistence.
     """
@@ -56,7 +56,6 @@ cdef class PersistentFastqProxy:
     cdef cython.str tostring(self)
     cpdef array.array get_quality_array(self, int offset=*)
 
-
 cdef class FastxFile:
     cdef object _filename
     cdef BGZF * fastqfile
diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx
index 3af76f6..bf04217 100644
--- a/pysam/libcfaidx.pyx
+++ b/pysam/libcfaidx.pyx
@@ -13,7 +13,7 @@
 # of the internal API. These are:
 #
 # class FastqProxy
-# class PersistentFastqProxy
+# class FastxRecord
 #
 # For backwards compatibility, the following classes are also defined:
 #
@@ -48,6 +48,11 @@
 import sys
 import os
 import re
+
+
+from libc.errno  cimport errno
+from libc.string cimport strerror
+
 from cpython cimport array
 
 from cpython cimport PyErr_SetString, \
@@ -283,24 +288,35 @@ cdef class FastaFile:
                                   rend-1,
                                   &length)
 
-        if seq == NULL:
-            raise ValueError(
-                "failure when retrieving sequence on '%s'" % reference)
+        if not seq:
+            if errno:
+                raise IOError(errno, strerror(errno))
+            else:
+                raise ValueError("failure when retrieving sequence on '%s'" % reference)
 
         try:
             return charptr_to_str(seq)
         finally:
             free(seq)
 
-    cdef char * _fetch(self, char * reference, int start, int end, int * length):
+    cdef char *_fetch(self, char *reference, int start, int end, int *length) except? NULL:
         '''fetch sequence for reference, start and end'''
 
+        cdef char *seq
         with nogil:
-            return faidx_fetch_seq(self.fastafile,
-                                   reference,
-                                   start,
-                                   end-1,
-                                   length)
+            seq = faidx_fetch_seq(self.fastafile,
+                                  reference,
+                                  start,
+                                  end-1,
+                                  length)
+
+        if not seq:
+            if errno:
+                raise IOError(errno, strerror(errno))
+            else:
+                raise ValueError("failure when retrieving sequence on '%s'" % reference)
+
+        return seq
 
     def get_reference_length(self, reference):
         '''return the length of reference.'''
@@ -365,18 +381,43 @@ cdef class FastqProxy:
         return qualitystring_to_array(force_bytes(self.quality),
                                       offset=offset)
 
-cdef class PersistentFastqProxy:
-    """
-    Python container for pysam.libcfaidx.FastqProxy with persistence.
-    Needed to compare multiple fastq records from the same file.
+cdef class FastxRecord:
+    """A fasta/fastq record.
+
+    A record must contain a name and a sequence. If either of them is
+    None, a ValueError is raised on writing.
+
     """
-    def __init__(self, FastqProxy FastqRead):
-        self.comment = FastqRead.comment
-        self.quality = FastqRead.quality
-        self.sequence = FastqRead.sequence
-        self.name = FastqRead.name
+    def __init__(self,
+                 name=None,
+                 comment=None,
+                 sequence=None,
+                 quality=None,
+                 FastqProxy proxy=None):
+        if proxy is not None:
+            self.comment = proxy.comment
+            self.quality = proxy.quality
+            self.sequence = proxy.sequence
+            self.name = proxy.name
+        else:
+            self.comment = comment
+            self.quality = quality
+            self.sequence = sequence
+            self.name = name
+
+    def __copy__(self):
+        return FastxRecord(self.name, self.comment, self.sequence, self.quality)
+
+    def __deepcopy__(self, memo):
+        return FastxRecord(self.name, self.comment, self.sequence, self.quality)
 
     cdef cython.str tostring(self):
+        if self.name is None:
+            raise ValueError("can not write record without name")
+
+        if self.sequence is None:
+            raise ValueError("can not write record without a sequence")
+        
         if self.comment is None:
             comment = ""
         else:
@@ -388,6 +429,28 @@ cdef class PersistentFastqProxy:
             return "@%s%s\n%s\n+\n%s" % (self.name, comment,
                                          self.sequence, self.quality)
 
+    def set_name(self, name):
+        if name is None:
+            raise ValueError("FastxRecord must have a name and not None")
+        self.name = name
+
+    def set_comment(self, comment):
+        self.comment = comment    
+        
+    def set_sequence(self, sequence, quality=None):
+        """set sequence of this record.
+
+        """
+        self.sequence = sequence
+        if quality is not None:
+            if len(sequence) != len(quality):
+                raise ValueError("sequence and quality length do not match: {} vs {}".format(
+                    len(sequence), len(quality)))
+
+            self.quality = quality
+        else:
+            self.quality = None
+
     def __str__(self):
         return self.tostring()
 
@@ -422,11 +485,11 @@ cdef class FastxFile:
         If True (default) make a copy of the entry in the file during
         iteration. If set to False, no copy will be made. This will
         permit faster iteration, but an entry will not persist when
-        the iteration continues.
+        the iteration continues and cannot be modified in place.
 
     Notes
     -----
-    Prior to version 0.8.2, this was called FastqFile.
+    Prior to version 0.8.2, this class was called FastqFile.
 
     Raises
     ------
@@ -549,7 +612,7 @@ cdef class FastxFile:
             l = kseq_read(self.entry)
         if (l >= 0):
             if self.persist:
-                return PersistentFastqProxy(makeFastqProxy(self.entry))
+                return FastxRecord(proxy=makeFastqProxy(self.entry))
             return makeFastqProxy(self.entry)
         else:
             raise StopIteration
@@ -568,4 +631,5 @@ __all__ = ["FastaFile",
            "FastqFile",
            "FastxFile",
            "Fastafile",
+           "FastxRecord",
            "FastqProxy"]
diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd
index 78a55f8..7abd472 100644
--- a/pysam/libchtslib.pxd
+++ b/pysam/libchtslib.pxd
@@ -757,7 +757,7 @@ cdef extern from "htslib/hts.h" nogil:
 
     ctypedef struct probaln_par_t:
         float d, e
-        int bw;
+        int bw
 
     int probaln_glocal(const uint8_t *ref,
                        int l_ref,
@@ -807,17 +807,17 @@ cdef extern from "htslib/hts.h" nogil:
     # /*! @abstract Deallocates any memory allocated by hts_md5_init. */
     void hts_md5_destroy(hts_md5_context *ctx)
 
-    inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
-    inline int hts_bin_bot(int bin, int n_lvls)
+    int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
+    int hts_bin_bot(int bin, int n_lvls)
 
     # * Endianness *
-    inline int ed_is_big()
-    inline uint16_t ed_swap_2(uint16_t v)
-    inline void *ed_swap_2p(void *x)
-    inline uint32_t ed_swap_4(uint32_t v)
-    inline void *ed_swap_4p(void *x)
-    inline uint64_t ed_swap_8(uint64_t v)
-    inline void *ed_swap_8p(void *x)
+    int ed_is_big()
+    uint16_t ed_swap_2(uint16_t v)
+    void *ed_swap_2p(void *x)
+    uint32_t ed_swap_4(uint32_t v)
+    void *ed_swap_4p(void *x)
+    uint64_t ed_swap_8(uint64_t v)
+    void *ed_swap_8p(void *x)
 
 
 cdef extern from "htslib/sam.h" nogil:
diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx
index 4b8d9c0..f6943ea 100644
--- a/pysam/libchtslib.pyx
+++ b/pysam/libchtslib.pyx
@@ -1,8 +1,11 @@
 # cython: embedsignature=True
 # cython: profile=True
 # adds doc-strings for sphinx
-import os
-import io
+
+########################################################################
+########################################################################
+## Cython cimports
+########################################################################
 
 from posix.unistd cimport dup
 from libc.errno  cimport errno
@@ -14,9 +17,22 @@ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_
 from pysam.libcutils cimport encode_filename, from_string_and_size
 
 
+########################################################################
+########################################################################
+## Python imports
+########################################################################
+
+import os
+import io
+import re
 from warnings         import warn
 
 
+########################################################################
+########################################################################
+## Constants
+########################################################################
+
 __all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile']
 
 # defines imported from samtools
@@ -24,11 +40,6 @@ DEF SEEK_SET = 0
 DEF SEEK_CUR = 1
 DEF SEEK_END = 2
 
-########################################################################
-########################################################################
-## Constants
-########################################################################
-
 # maximum genomic coordinace
 cdef int   MAX_POS = 2 << 29
 
@@ -38,6 +49,12 @@ cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', '
 cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
 
 
+########################################################################
+########################################################################
+## Verbosity functions
+########################################################################
+
+
 cpdef set_verbosity(int verbosity):
     """Set htslib's hts_verbose global variable to the specified value."""
     return hts_set_verbosity(verbosity)
@@ -47,6 +64,11 @@ cpdef get_verbosity():
     return hts_get_verbosity()
 
 
+########################################################################
+########################################################################
+## HFile wrapper class
+########################################################################
+
 cdef class HFile(object):
     cdef hFILE *fp
     cdef readonly object name, mode
@@ -76,7 +98,7 @@ cdef class HFile(object):
             self.fp = hopen(name, mode)
 
         if not self.fp:
-            raise OSError(errno, 'failed to open HFile', self.name)
+            raise IOError(errno, 'failed to open HFile', self.name)
 
     def close(self):
         if self.fp == NULL:
@@ -86,11 +108,11 @@ cdef class HFile(object):
         self.fp = NULL
 
         if hclose(fp) != 0:
-            raise OSError(herrno(self.fp), 'failed to close HFile', self.name)
+            raise IOError(herrno(self.fp), 'failed to close HFile', self.name)
 
     def fileno(self):
         if self.fp == NULL:
-            raise OSError('operation on closed HFile')
+            raise IOError('operation on closed HFile')
         if isinstance(self.name, int):
             return self.name
         else:
@@ -113,13 +135,13 @@ cdef class HFile(object):
 
     def flush(self):
         if self.fp == NULL:
-            raise OSError('operation on closed HFile')
+            raise IOError('operation on closed HFile')
         if hflush(self.fp) != 0:
-            raise OSError(herrno(self.fp), 'failed to flush HFile', self.name)
+            raise IOError(herrno(self.fp), 'failed to flush HFile', self.name)
 
     def isatty(self):
         if self.fp == NULL:
-            raise OSError('operation on closed HFile')
+            raise IOError('operation on closed HFile')
         return False
 
     def readable(self):
@@ -127,7 +149,7 @@ cdef class HFile(object):
 
     def read(self, Py_ssize_t size=-1):
         if self.fp == NULL:
-            raise OSError('operation on closed HFile')
+            raise IOError('operation on closed HFile')
 
         if size == 0:
             return b''
@@ -147,7 +169,7 @@ cdef class HFile(object):
             ret = hread(self.fp, <void *>cpart, chunk_size)
 
             if ret < 0:
-                OSError(herrno(self.fp), 'failed to read HFile', self.name)
+                IOError(herrno(self.fp), 'failed to read HFile', self.name)
             elif not ret:
                 break
 
@@ -165,7 +187,7 @@ cdef class HFile(object):
 
     def readinto(self, buf):
         if self.fp == NULL:
-            raise OSError('operation on closed HFile')
+            raise IOError('operation on closed HFile')
 
         size = len(buf)
 
@@ -176,13 +198,13 @@ cdef class HFile(object):
         ret = hread(self.fp, <void *>mv, size)
 
         if ret < 0:
-            OSError(herrno(self.fp), 'failed to read HFile', self.name)
+            IOError(herrno(self.fp), 'failed to read HFile', self.name)
 
         return ret
 
     def readline(self, Py_ssize_t size=-1):
         if self.fp == NULL:
-            raise OSError('operation on closed HFile')
+            raise IOError('operation on closed HFile')
 
         if size == 0:
             return b''
@@ -204,7 +226,7 @@ cdef class HFile(object):
             ret = hgetln(cpart, chunk_size+1, self.fp)
 
             if ret < 0:
-                OSError(herrno(self.fp), 'failed to read HFile', self.name)
+                IOError(herrno(self.fp), 'failed to read HFile', self.name)
             elif not ret:
                 break
 
@@ -226,23 +248,23 @@ cdef class HFile(object):
 
     def seek(self, Py_ssize_t offset, int whence=SEEK_SET):
         if self.fp == NULL:
-            raise OSError('operation on closed HFile')
+            raise IOError('operation on closed HFile')
 
         cdef Py_ssize_t off = hseek(self.fp, offset, whence)
 
         if off < 0:
-            raise OSError(herrno(self.fp), 'seek failed on HFile', self.name)
+            raise IOError(herrno(self.fp), 'seek failed on HFile', self.name)
 
         return off
 
     def tell(self):
         if self.fp == NULL:
-            raise OSError('operation on closed HFile')
+            raise IOError('operation on closed HFile')
 
         ret = htell(self.fp)
 
         if ret < 0:
-            raise OSError(herrno(self.fp), 'tell failed on HFile', self.name)
+            raise IOError(herrno(self.fp), 'tell failed on HFile', self.name)
 
         return ret
 
@@ -257,12 +279,12 @@ cdef class HFile(object):
 
     def write(self, bytes b):
         if self.fp == NULL:
-            raise OSError('operation on closed HFile')
+            raise IOError('operation on closed HFile')
 
         got = hwrite(self.fp, <void *>b, len(b))
 
         if got < 0:
-            raise OSError(herrno(self.fp), 'write failed on HFile', self.name)
+            raise IOError(herrno(self.fp), 'write failed on HFile', self.name)
 
         return got
 
@@ -271,6 +293,12 @@ cdef class HFile(object):
             self.write(line)
 
 
+########################################################################
+########################################################################
+## Helpers for backward compatibility to hide the difference between
+## boolean properties and methods
+########################################################################
+
 class CallableValue(object):
     def __init__(self, value):
         self.value = value
@@ -290,6 +318,11 @@ CTrue = CallableValue(True)
 CFalse = CallableValue(False)
 
 
+########################################################################
+########################################################################
+## HTSFile wrapper class (base class for AlignmentFile and VariantFile)
+########################################################################
+
 cdef class HTSFile(object):
     """
     Base class for HTS file types
@@ -322,13 +355,13 @@ cdef class HTSFile(object):
 
         cdef int ret = bgzf_check_EOF(bgzfp)
         if ret < 0:
-            raise OSError(errno, 'error checking for EOF marker')
+            raise IOError(errno, 'error checking for EOF marker')
         elif ret == 0:
             msg = 'no BGZF EOF marker; file may be truncated'.format(self.filename)
             if ignore_truncation:
                 warn(msg)
             else:
-                raise OSError(msg)
+                raise IOError(msg)
 
     def __enter__(self):
         return self
@@ -449,7 +482,7 @@ cdef class HTSFile(object):
         if not self.is_open:
             raise ValueError('I/O operation on closed file')
         if self.is_stream:
-            raise OSError('seek not available in streams')
+            raise IOError('seek not available in streams')
 
         cdef int64_t ret
         if self.htsfile.format.compression == bgzf:
@@ -468,7 +501,7 @@ cdef class HTSFile(object):
         if not self.is_open:
             raise ValueError('I/O operation on closed file')
         if self.is_stream:
-            raise OSError('tell not available in streams')
+            raise IOError('tell not available in streams')
 
         cdef int64_t ret
         if self.htsfile.format.compression == bgzf:
@@ -529,10 +562,122 @@ cdef class HTSFile(object):
             with nogil:
                 return hts_hopen(hfile, cfilename, cmode)
 
-    def _exists(self):
-        """return False iff file is local, a file and exists.
+    def parse_region(self, contig=None, start=None, stop=None, region=None,tid=None,
+                           reference=None, end=None):
+        """parse alternative ways to specify a genomic region. A region can
+        either be specified by :term:`contig`, `start` and
+        `stop`. `start` and `stop` denote 0-based, half-open
+        intervals.  :term:`reference` and `end` are also accepted for
+        backward compatibility as synonyms for :term:`contig` and `stop`,
+        respectively.
+
+        Alternatively, a samtools :term:`region` string can be
+        supplied.
+
+        If any of the coordinates are missing they will be replaced by the
+        minimum (`start`) or maximum (`stop`) coordinate.
+
+        Note that region strings are 1-based inclusive, while `start` and `stop` denote
+        an interval in 0-based, half-open coordinates (like BED files and Python slices).
+
+        Returns
+        -------
+
+        tuple :  a tuple of `flag`, :term:`tid`, `start` and `stop`. The
+        flag is 0 if no region was specified (i.e. the region is the
+        complete genomic space) and 1 otherwise.
+
+        Raises
+        ------
+
+        ValueError
+           for invalid or out of bounds regions.
+
+        """
+        cdef int rtid
+        cdef long long rstart
+        cdef long long rstop
+
+        if reference is not None:
+            if contig is not None:
+                raise ValueError('contig and reference should not both be specified')
+            contig = reference
+
+        if end is not None:
+            if stop is not None:
+                raise ValueError('stop and end should not both be specified')
+            stop = end
+
+        if contig is None and tid is None and region is None:
+            return 0, 0, 0, 0
+
+        rtid = -1
+        rstart = 0
+        rstop = MAX_POS
+        if start is not None:
+            try:
+                rstart = start
+            except OverflowError:
+                raise ValueError('start out of range (%i)' % start)
+
+        if stop is not None:
+            try:
+                rstop = stop
+            except OverflowError:
+                raise ValueError('stop out of range (%i)' % stop)
+
+        if region:
+            region = force_str(region)
+            parts = re.split('[:-]', region)
+            contig = parts[0]
+            if len(parts) >= 2:
+                rstart = int(parts[1]) - 1
+            if len(parts) >= 3:
+                rstop = int(parts[2])
+
+        if tid is not None:
+            if not self.is_valid_tid(tid):
+                raise IndexError('invalid tid')
+            rtid = tid
+        else:
+            rtid = self.get_tid(contig)
+
+        if rtid < 0:
+            raise ValueError('invalid contig `%s`' % contig)
+        if rstart > rstop:
+            raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop))
+        if not 0 <= rstart < MAX_POS:
+            raise ValueError('start out of range (%i)' % rstart)
+        if not 0 <= rstop <= MAX_POS:
+            raise ValueError('stop out of range (%i)' % rstop)
+
+        return 1, rtid, rstart, rstop
+
+    def is_valid_tid(self, tid):
+        """
+        return True if the numerical :term:`tid` is valid; False otherwise.
+
+        Subclasses must implement this; the base class raises NotImplementedError.
+        """
+        raise NotImplementedError()
+
+    def is_valid_reference_name(self, contig):
+        """
+        return True if the contig name :term:`contig` is valid; False otherwise.
+        """
+        return self.get_tid(contig) != -1
+
+    def get_tid(self, contig):
         """
-        return (not isinstance(self.filename, (str, bytes)) or
-                self.filename == b'-' or
-                self.is_remote or
-                os.path.exists(self.filename))
+        return the numerical :term:`tid` corresponding to
+        :term:`contig`
+
+        returns -1 if contig is not known.
+        """
+        raise NotImplementedError()
+
+    def get_reference_name(self, tid):
+        """
+        return :term:`contig` name corresponding to numerical :term:`tid`
+        """
+        raise NotImplementedError()
diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx
index b10c0d0..f8b0e38 100644
--- a/pysam/libctabix.pyx
+++ b/pysam/libctabix.pyx
@@ -73,7 +73,8 @@ from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
     BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
     tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
     tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
-    tbx_destroy, hisremote, region_list
+    tbx_destroy, hisremote, region_list, \
+    TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC
 
 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
 from pysam.libcutils cimport encode_filename, from_string_and_size
@@ -828,55 +829,57 @@ def tabix_compress(filename_in,
             r = bgzf_write(fp, buffer, c)
         if r < 0:
             free(buffer)
-            raise OSError("writing failed")
+            raise IOError("writing failed")
         
     free(buffer)
     r = bgzf_close(fp)
     if r < 0:
-        raise OSError("error %i when writing to file %s" % (r, filename_out))
+        raise IOError("error %i when writing to file %s" % (r, filename_out))
 
     r = close(fd_src)
     # an empty file will return with -1, thus ignore this.
     if r < 0:
         if not (r == -1 and is_empty):
-            raise OSError("error %i when closing file %s" % (r, filename_in))
-
-
-def tabix_index( filename, 
-                 force = False,
-                 seq_col = None, 
-                 start_col = None, 
-                 end_col = None,
-                 preset = None,
-                 meta_char = "#",
-                 zerobased = False,
-                 int min_shift = -1,
+            raise IOError("error %i when closing file %s" % (r, filename_in))
+
+
+def tabix_index(filename, 
+                force=False,
+                seq_col=None, 
+                start_col=None, 
+                end_col=None,
+                preset=None,
+                meta_char="#",
+                int line_skip=0,
+                zerobased=False,
+                int min_shift=-1,
                 ):
     '''index tab-separated *filename* using tabix.
 
-    An existing index will not be overwritten unless
-    *force* is set.
+    An existing index will not be overwritten unless *force* is set.
 
-    The index will be built from coordinates
-    in columns *seq_col*, *start_col* and *end_col*.
+    The index will be built from coordinates in columns *seq_col*,
+    *start_col* and *end_col*.
 
-    The contents of *filename* have to be sorted by 
-    contig and position - the method does not check
-    if the file is sorted.
+    The contents of *filename* have to be sorted by contig and
+    position - the method does not check if the file is sorted.
 
-    Column indices are 0-based. Coordinates in the file
-    are assumed to be 1-based.
+    Column indices are 0-based. Note that this is different from the
+    tabix command line utility where column indices start at 1.
+    
+    Coordinates in the file are assumed to be 1-based unless
+    *zerobased* is set.
 
-    If *preset* is provided, the column coordinates
-    are taken from a preset. Valid values for preset
-    are "gff", "bed", "sam", "vcf", psltbl", "pileup".
+    If *preset* is provided, the column coordinates are taken from a
+    preset. Valid values for preset are "gff", "bed", "sam", "vcf" and
+    "psltbl".
     
-    Lines beginning with *meta_char* and the first
-    *line_skip* lines will be skipped.
+    Lines beginning with *meta_char* and the first *line_skip* lines
+    will be skipped.
     
     If *filename* does not end in ".gz", it will be automatically
-    compressed. The original file will be removed and only the 
-    compressed file will be retained. 
+    compressed. The original file will be removed and only the
+    compressed file will be retained.
 
     If *filename* ends in *gz*, the file is assumed to be already
     compressed with bgzf.
@@ -911,12 +914,11 @@ def tabix_index( filename,
     #     comments, lines to ignore at beginning
     # 0 is a missing column
     preset2conf = {
-        'gff' : (0, 1, 4, 5, ord('#'), 0),
-        'bed' : (0x10000, 1, 2, 3, ord('#'), 0),
-        'psltbl' : (0x10000, 15, 17, 18, ord('#'), 0),
-        'sam' : (1, 3, 4, 0, ord('@'), 0),
-        'vcf' : (2, 1, 2, 0, ord('#'), 0),
-        'pileup': (3, 1, 2, 0, ord('#'), 0),
+        'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
+        'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
+        'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
+        'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
+        'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
         }
 
     if preset:
@@ -927,20 +929,20 @@ def tabix_index( filename,
                 "unknown preset '%s', valid presets are '%s'" %
                 (preset, ",".join(preset2conf.keys())))
     else:
-        if end_col == None:
+        if end_col is None:
             end_col = -1
+            
         preset = 0
-
-        # note that tabix internally works with 0-based coordinates
-        # and open/closed intervals.  When using a preset, conversion
-        # is automatically taken care of.  Otherwise, the coordinates
-        # are assumed to be 1-based closed intervals and -1 is
-        # subtracted from the start coordinate. To avoid doing this,
-        # set the TI_FLAG_UCSC=0x10000 flag:
+        # tabix internally works with 0-based coordinates and
+        # open/closed intervals.  When using a preset, conversion is
+        # automatically taken care of.  Otherwise, the coordinates are
+        # assumed to be 1-based closed intervals and -1 is subtracted
+        # from the start coordinate. To avoid doing this, set the
+        # TBX_UCSC (0x10000) flag:
         if zerobased:
-            preset = preset | 0x10000
+            preset = preset | TBX_UCSC
 
-        conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
+        conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), line_skip)
                 
     cdef tbx_conf_t conf
     conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
diff --git a/pysam/libctabixproxies.pyx b/pysam/libctabixproxies.pyx
index dc434e0..5ff0948 100644
--- a/pysam/libctabixproxies.pyx
+++ b/pysam/libctabixproxies.pyx
@@ -9,6 +9,7 @@ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
 from pysam.libcutils cimport encode_filename, from_string_and_size
 
 import collections
+import copy
 
 
 cdef char *StrOrEmpty(char * buffer):
@@ -450,12 +451,18 @@ cdef class GTFProxy(NamedTupleProxy):
         '''return max number of fields.'''
         return 9
 
-    def asDict(self):
+    def as_dict(self):
         """parse attributes - return as dict
+
+        The dictionary can be modified to update attributes.
         """
-        return collections.OrderedDict(self.attribute_iterator())
+        if not self.attribute_dict:
+            self.attribute_dict = self.attribute_string2dict(
+                self.attributes)
+            self.is_modified = True
+        return self.attribute_dict
     
-    def fromDict(self, d):
+    def from_dict(self, d):
         '''set attributes from a dictionary.'''
         self.attribute_dict = None
         attribute_string = force_bytes(
@@ -645,7 +652,14 @@ cdef class GTFProxy(NamedTupleProxy):
                     self.attributes)
             self.attribute_dict[key] = value
             self.is_modified = True
-            
+
+    # for backwards compatibility
+    def asDict(self, *args, **kwargs):
+        return self.as_dict(*args, **kwargs)
+
+    def fromDict(self, *args, **kwargs):
+        return self.from_dict(*args, **kwargs)
+    
 
 cdef class GFF3Proxy(GTFProxy):
 
diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx
index 2b90420..3609c3b 100644
--- a/pysam/libcutils.pyx
+++ b/pysam/libcutils.pyx
@@ -270,7 +270,7 @@ def _pysam_dispatch(collection,
         stdout_h = c_open(force_bytes(stdout_f),
                           O_WRONLY)
         if stdout_h == -1:
-            raise OSError("error while opening {} for writing".format(stdout_f))
+            raise IOError("error while opening {} for writing".format(stdout_f))
 
         pysam_set_stdout_fn(force_bytes(stdout_f))
         pysam_set_stdout(stdout_h)
diff --git a/pysam/version.py b/pysam/version.py
index ac832cf..89cf2de 100644
--- a/pysam/version.py
+++ b/pysam/version.py
@@ -1,10 +1,10 @@
 # pysam versioning information
-__version__ = "0.11.2.2"
+__version__ = "0.12"
 
 # TODO: upgrade number
-__samtools_version__ = "1.4.1"
+__samtools_version__ = "1.5"
 
 # TODO: upgrade code and number
-__bcftools_version__ = "1.4.1"
+__bcftools_version__ = "1.5"
 
-__htslib_version__ = "1.4.1"
+__htslib_version__ = "1.5"
diff --git a/run_tests_travis.sh b/run_tests_travis.sh
index a229ff5..45c928e 100755
--- a/run_tests_travis.sh
+++ b/run_tests_travis.sh
@@ -1,5 +1,16 @@
 #!/usr/bin/env bash
 
+# test script for pysam.
+# The script performs the following tasks:
+# 1. Setup a conda environment and install dependencies via conda
+# 2. Build pysam via the conda recipe
+# 3. Build pysam via setup.py from repository
+# 4. Run tests on the setup.py version
+# 5. Additional build tests
+# 5.1 pip install with cython
+# 5.2 pip install without cython
+# 5.3 pip install without cython and without configure options
+
 pushd .
 
 WORKDIR=`pwd`
@@ -15,7 +26,7 @@ bash Miniconda3.sh -b
 
 # Create a new conda environment with the target python version
 ~/miniconda3/bin/conda install conda-build -y
-~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose psutil pip 
+~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy pytest psutil pip 
 
 # activate testenv environment
 source ~/miniconda3/bin/activate testenv
@@ -25,7 +36,8 @@ conda config --add channels defaults
 conda config --add channels r
 conda config --add channels bioconda
 
-conda install -y samtools bcftools htslib
+# pin versions, so that tests do not fail when pysam and htslib are out of step
+conda install -y "samtools=1.5" "bcftools=1.5" "htslib=1.5"
 
 # Need to make C compiler and linker use the anaconda includes and libraries:
 export PREFIX=~/miniconda3/
@@ -39,36 +51,29 @@ bcftools --version
 # Try building conda recipe first
 ~/miniconda3/bin/conda-build ci/conda-recipe/ --python=$CONDA_PY
 
-# install code from the repository
+# install code from the repository via setup.py
+echo "installing via setup.py from repository"
 python setup.py install
 
-# find build/
-
-# change into tests directory. Otherwise,
-# 'import pysam' will import the repository,
-# not the installed version. This causes
-# problems in the compilation test.
-cd tests
-
 # create auxilliary data
 echo
 echo 'building test data'
 echo
-make -C pysam_data
-make -C cbcf_data
+make -C tests/pysam_data
+make -C tests/cbcf_data
+
+# echo any limits that are in place
+ulimit -a
 
-# run nosetests
-# -s: do not capture stdout, conflicts with pysam.dispatch
-# -v: verbose output
-nosetests -s -v
+# run tests
+pytest
 
 if [ $? != 0 ]; then
     exit 1
 fi
 
-# build source tar-ball. Make sure to build so that .pyx files
-# are cythonized.
-cd ..
+# build source tar-ball. Make sure to run 'build' target so that .pyx
+# files are cythonized.
 python setup.py build sdist
 
 if [ $? != 0 ]; then
diff --git a/samtools/bam.h b/samtools/bam.h
index 108987c..48388b7 100644
--- a/samtools/bam.h
+++ b/samtools/bam.h
@@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE.  */
   @copyright Genome Research Ltd.
  */
 
-#define BAM_VERSION "1.4.1"
+#define BAM_VERSION "1.5"
 
 #include <stdint.h>
 #include <stdlib.h>
diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c
index 18cb6c4..f82686d 100644
--- a/samtools/bam_reheader.c.pysam.c
+++ b/samtools/bam_reheader.c.pysam.c
@@ -436,7 +436,7 @@ int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
     }
 }
 
-static int usage(FILE *fp, int ret) {
+static void usage(FILE *fp, int ret) {
     fprintf(fp,
            "Usage: samtools reheader [-P] in.header.sam in.bam > out.bam\n"
            "   or  samtools reheader [-P] -i in.header.sam file.bam\n"
@@ -445,7 +445,7 @@ static int usage(FILE *fp, int ret) {
            "    -P, --no-PG      Do not generate an @PG header line.\n"
            "    -i, --in-place   Modify the bam/cram file directly.\n"
            "                     (Defaults to outputting to pysam_stdout.)\n");
-    return(ret);
+    exit(ret);
 }
 
 int main_reheader(int argc, char *argv[])
@@ -466,16 +466,16 @@ int main_reheader(int argc, char *argv[])
         switch (c) {
         case 'P': add_PG = 0; break;
         case 'i': inplace = 1; break;
-        case 'h': return(usage(pysam_stdout, 0)); break;
+        case 'h': usage(pysam_stdout, 0); break;
         default:
             fprintf(pysam_stderr, "Invalid option '%c'\n", c);
-            return(usage(pysam_stderr, 1));
+            usage(pysam_stderr, 1);
         }
     }
 
     if (argc - optind != 2)
-      return(usage(pysam_stderr, 1));
-
+        usage(pysam_stderr, 1);
+    
     { // read the header
         samFile *fph = sam_open(argv[optind], "r");
         if (fph == 0) {
diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c
index be9789c..d32a241 100644
--- a/samtools/bam_sort.c
+++ b/samtools/bam_sort.c
@@ -37,6 +37,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
+#include <pthread.h>
 #include "htslib/ksort.h"
 #include "htslib/khash.h"
 #include "htslib/klist.h"
@@ -45,6 +46,14 @@ DEALINGS IN THE SOFTWARE.  */
 #include "sam_opts.h"
 #include "samtools.h"
 
+
+// Struct which contains a record and the pointer to its sort tag (if any).
+// Used to speed up sort-by-tag.
+typedef struct bam1_p {
+    bam1_t *b;
+    const uint8_t *tag;
+} bam1_p;
+
 /* Minimum memory required in megabytes before sort will attempt to run. This
    is to prevent accidents where failing to use the -m option correctly results
    in the creation of a temporary file for each read in the input file.
@@ -81,6 +90,8 @@ KHASH_MAP_INIT_STR(c2i, int)
 KLIST_INIT(hdrln, char*, hdrln_free_char)
 
 static int g_is_by_qname = 0;
+static int g_is_by_tag = 0;
+static char g_sort_tag[2] = {0,0};
 
 static int strnum_cmp(const char *_a, const char *_b)
 {
@@ -111,20 +122,29 @@ static int strnum_cmp(const char *_a, const char *_b)
 typedef struct {
     int i;
     uint64_t pos, idx;
-    bam1_t *b;
+    bam1_p b;
 } heap1_t;
 
 #define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx))))
 
+static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b);
+
 // Function to compare reads in the heap and determine which one is < the other
 static inline int heap_lt(const heap1_t a, const heap1_t b)
 {
-    if (g_is_by_qname) {
+    if (g_is_by_tag) {
+        int t;
+        if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0;
+        t = bam1_lt_by_tag(b.b,a.b);
+        return t;
+    } else if (g_is_by_qname) {
         int t;
-        if (a.b == NULL || b.b == NULL) return a.b == NULL? 1 : 0;
-        t = strnum_cmp(bam_get_qname(a.b), bam_get_qname(b.b));
-        return (t > 0 || (t == 0 && (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0)));
-    } else return __pos_cmp(a, b);
+        if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0;
+        t = strnum_cmp(bam_get_qname(a.b.b), bam_get_qname(b.b.b));
+        return (t > 0 || (t == 0 && (a.b.b->core.flag&0xc0) > (b.b.b->core.flag&0xc0)));
+    } else {
+        return __pos_cmp(a, b);
+    }
 }
 
 KSORT_INIT(heap, heap1_t, heap_lt)
@@ -1098,7 +1118,8 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
 
 /*!
   @abstract    Merge multiple sorted BAM.
-  @param  is_by_qname whether to sort by query name
+  @param  by_qname    whether to sort by query name
+  @param  sort_tag    if non-null, sort by the given tag
   @param  out         output BAM file name
   @param  mode        sam_open() mode to be used to create the final output file
                       (overrides level settings from UNCOMP and LEVEL1 flags)
@@ -1115,7 +1136,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
   @discussion Padding information may NOT correctly maintained. This
   function is NOT thread safe.
  */
-int bam_merge_core2(int by_qname, const char *out, const char *mode,
+int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode,
                     const char *headers, int n, char * const *fn, int flag,
                     const char *reg, int n_threads, const char *cmd,
                     const htsFormat *in_fmt, const htsFormat *out_fmt)
@@ -1150,6 +1171,12 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
     }
 
     g_is_by_qname = by_qname;
+    if (sort_tag) {
+        g_is_by_tag = 1;
+        g_sort_tag[0] = sort_tag[0];
+        g_sort_tag[1] = sort_tag[1];
+    }
+
     fp = (samFile**)calloc(n, sizeof(samFile*));
     if (!fp) goto mem_fail;
     heap = (heap1_t*)calloc(n, sizeof(heap1_t));
@@ -1324,18 +1351,25 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
         heap1_t *h = heap + i;
         int res;
         h->i = i;
-        h->b = bam_init1();
-        if (!h->b) goto mem_fail;
-        res = iter[i] ? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b);
+        h->b.b = bam_init1();
+        h->b.tag = NULL;
+        if (!h->b.b) goto mem_fail;
+        res = iter[i] ? sam_itr_next(fp[i], iter[i], h->b.b) : sam_read1(fp[i], hdr[i], h->b.b);
         if (res >= 0) {
-            bam_translate(h->b, translation_tbl + i);
-            h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b);
+            bam_translate(h->b.b, translation_tbl + i);
+            h->pos = ((uint64_t)h->b.b->core.tid<<32) | (uint32_t)((int32_t)h->b.b->core.pos+1)<<1 | bam_is_rev(h->b.b);
             h->idx = idx++;
+            if (g_is_by_tag) {
+                h->b.tag = bam_aux_get(h->b.b, g_sort_tag);
+            } else {
+                h->b.tag = NULL;
+            }
         }
         else if (res == -1 && (!iter[i] || iter[i]->finished)) {
             h->pos = HEAP_EMPTY;
-            bam_destroy1(h->b);
-            h->b = NULL;
+            bam_destroy1(h->b.b);
+            h->b.b = NULL;
+            h->b.tag = NULL;
         } else {
             print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
             goto fail;
@@ -1357,7 +1391,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
     // Begin the actual merge
     ks_heapmake(heap, n, heap);
     while (heap->pos != HEAP_EMPTY) {
-        bam1_t *b = heap->b;
+        bam1_t *b = heap->b.b;
         if (flag & MERGE_RG) {
             uint8_t *rg = bam_aux_get(b, "RG");
             if (rg) bam_aux_del(b, rg);
@@ -1372,10 +1406,16 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
             bam_translate(b, translation_tbl + heap->i);
             heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
             heap->idx = idx++;
+            if (g_is_by_tag) {
+                heap->b.tag = bam_aux_get(heap->b.b, g_sort_tag);
+            } else {
+                heap->b.tag = NULL;
+            }
         } else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) {
             heap->pos = HEAP_EMPTY;
-            bam_destroy1(heap->b);
-            heap->b = NULL;
+            bam_destroy1(heap->b.b);
+            heap->b.b = NULL;
+            heap->b.tag = NULL;
         } else {
             print_error(cmd, "\"%s\" is truncated", fn[heap->i]);
             goto fail;
@@ -1419,7 +1459,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
         if (iter && iter[i]) hts_itr_destroy(iter[i]);
         if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]);
         if (fp && fp[i]) sam_close(fp[i]);
-        if (heap && heap[i].b) bam_destroy1(heap[i].b);
+        if (heap && heap[i].b.b) bam_destroy1(heap[i].b.b);
     }
     if (hout) bam_hdr_destroy(hout);
     free(RG);
@@ -1439,7 +1479,7 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch
     strcpy(mode, "wb");
     if (flag & MERGE_UNCOMP) strcat(mode, "0");
     else if (flag & MERGE_LEVEL1) strcat(mode, "1");
-    return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL);
+    return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL);
 }
 
 static void merge_usage(FILE *to)
@@ -1449,6 +1489,7 @@ static void merge_usage(FILE *to)
 "\n"
 "Options:\n"
 "  -n         Input files are sorted by read name\n"
+"  -t TAG     Input files are sorted by TAG value\n"
 "  -r         Attach RG tag (inferred from file names)\n"
 "  -u         Uncompressed BAM output\n"
 "  -f         Overwrite the output BAM if exist\n"
@@ -1467,6 +1508,7 @@ int bam_merge(int argc, char *argv[])
 {
     int c, is_by_qname = 0, flag = 0, ret = 0, level = -1;
     char *fn_headers = NULL, *reg = NULL, mode[12];
+    char *sort_tag = NULL;
     long random_seed = (long)time(NULL);
     char** fn = NULL;
     int fn_size = 0;
@@ -1483,12 +1525,13 @@ int bam_merge(int argc, char *argv[])
         return 0;
     }
 
-    while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:", lopts, NULL)) >= 0) {
         switch (c) {
         case 'r': flag |= MERGE_RG; break;
         case 'f': flag |= MERGE_FORCE; break;
         case 'h': fn_headers = strdup(optarg); break;
         case 'n': is_by_qname = 1; break;
+        case 't': sort_tag = strdup(optarg); break;
         case '1': flag |= MERGE_LEVEL1; level = 1; break;
         case 'u': flag |= MERGE_UNCOMP; level = 0; break;
         case 'R': reg = strdup(optarg); break;
@@ -1551,7 +1594,7 @@ int bam_merge(int argc, char *argv[])
     strcpy(mode, "wb");
     sam_open_mode(mode+1, argv[optind], NULL);
     if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
-    if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers,
+    if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers,
                         fn_size+nargcfiles, fn, flag, reg, ga.nthreads,
                         "merge", &ga.in, &ga.out) < 0)
         ret = 1;
@@ -1572,10 +1615,6 @@ end:
  * BAM sorting *
  ***************/
 
-#include <pthread.h>
-
-typedef bam1_t *bam1_p;
-
 static int change_SO(bam_hdr_t *h, const char *so)
 {
     char *p, *q, *beg = NULL, *end = NULL, *newtext;
@@ -1611,13 +1650,95 @@ static int change_SO(bam_hdr_t *h, const char *so)
 }
 
 // Function to compare reads and determine which one is < the other
-static inline int bam1_lt(const bam1_p a, const bam1_p b)
+// Handle sort-by-pos and sort-by-name. Used as the secondary sort in bam1_lt_by_tag, if reads are equivalent by tag.
+static inline int bam1_lt_core(const bam1_p a, const bam1_p b)
 {
     if (g_is_by_qname) {
-        int t = strnum_cmp(bam_get_qname(a), bam_get_qname(b));
-        return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0)));
-    } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam_is_rev(a)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam_is_rev(b)));
+        int t = strnum_cmp(bam_get_qname(a.b), bam_get_qname(b.b));
+        return (t < 0 || (t == 0 && (a.b->core.flag&0xc0) < (b.b->core.flag&0xc0)));
+    } else {
+        return (((uint64_t)a.b->core.tid<<32|(a.b->core.pos+1)<<1|bam_is_rev(a.b)) < ((uint64_t)b.b->core.tid<<32|(b.b->core.pos+1)<<1|bam_is_rev(b.b)));
+    }
 }
+
+uint8_t normalize_type(const uint8_t* aux) {
+    if (*aux == 'c' || *aux == 'C' || *aux == 's' || *aux == 'S' || *aux == 'i' || *aux == 'I') {
+        return 'c';
+    } else if (*aux == 'f' || *aux == 'd') {
+        return 'f';
+    } else if (*aux == 'H' || *aux == 'Z') {
+         return 'H';
+    } else {
+        return *aux;
+    }
+}
+
+// Sort records by tag, using pos or read name as a secondary key if the tag values are identical. Reads not carrying the tag sort first.
+// Tag types are normalized first; an int/float mix is compared as floats, any other type mismatch is ordered by type character, and matching types use the comparator for that type.
+static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b)
+{
+    const uint8_t* aux_a = a.tag;
+    const uint8_t* aux_b = b.tag;
+
+    if (aux_a == NULL && aux_b != NULL) {
+        return 1;
+    } else if (aux_a != NULL && aux_b == NULL) {
+        return 0;
+    } else if (aux_a == NULL && aux_b == NULL) {
+        return bam1_lt_core(a,b);
+    }
+
+    // 'Normalize' the letters of the datatypes to a canonical letter,
+    // so that comparison of different types
+    // forms a correct total ordering.
+    uint8_t a_type = normalize_type(aux_a);
+    uint8_t b_type = normalize_type(aux_b);
+
+    if (a_type != b_type) {
+        // Fix int to float comparisons by using bam_aux2f() to read the int
+        if (a_type == 'c' && b_type == 'f') {
+            a_type = 'f';
+        } else if (a_type == 'f' && b_type == 'c') {
+            b_type = 'f';
+        } else {
+            // Unfixable mismatched types
+            return a_type < b_type ? 1 : 0;
+        }
+    }
+
+    if (a_type == 'c') {
+        int64_t va = bam_aux2i(aux_a);
+        int64_t vb = bam_aux2i(aux_b);
+        return (va < vb || (va == vb && bam1_lt_core(a, b)));
+    } else if (a_type == 'f') {
+        double va = bam_aux2f(aux_a);
+        double vb = bam_aux2f(aux_b);
+        return (va < vb || (va == vb && bam1_lt_core(a,b)));
+    } else if (a_type == 'A') {
+        char va = bam_aux2A(aux_a);
+        char vb = bam_aux2A(aux_b);
+        return (va < vb || (va == vb && bam1_lt_core(a,b)));
+    } else if (a_type == 'H') {
+        int t = strcmp(bam_aux2Z(aux_a), bam_aux2Z(aux_b));
+        return (t < 0 || (t == 0 && bam1_lt_core(a,b)));
+    } else {
+        return bam1_lt_core(a,b);
+    }
+}
+
+// Function to compare reads and determine which one is < the other
+// Handle sort-by-pos, sort-by-name, or sort-by-tag
+static inline int bam1_lt(const bam1_p a, const bam1_p b)
+{
+    if (g_is_by_tag) {
+        return bam1_lt_by_tag(a, b);
+    } else {
+        return bam1_lt_core(a,b);
+    }
+}
+
+
+
 KSORT_INIT(sort, bam1_p, bam1_lt)
 
 typedef struct {
@@ -1640,7 +1761,7 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf,
     if (sam_hdr_write(fp, h) != 0) goto fail;
     if (n_threads > 1) hts_set_threads(fp, n_threads);
     for (i = 0; i < l; ++i) {
-        if (sam_write1(fp, h, buf[i]) < 0) goto fail;
+        if (sam_write1(fp, h, buf[i].b) < 0) goto fail;
     }
     if (sam_close(fp) < 0) return -1;
     return 0;
@@ -1662,7 +1783,7 @@ static void *worker(void *data)
     uint32_t max_ncigar = 0;
     int i;
     for (i = 0; i < w->buf_len; i++) {
-        uint32_t nc = w->buf[i]->core.n_cigar;
+        uint32_t nc = w->buf[i].b->core.n_cigar;
         if (max_ncigar < nc)
             max_ncigar = nc;
     }
@@ -1730,6 +1851,7 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
   and the leftmost position of an alignment
 
   @param  is_by_qname whether to sort by query name
+  @param  sort_by_tag if non-null, sort by the given tag
   @param  fn       name of the file to be sorted
   @param  prefix   prefix of the temporary files (prefix.NNNN.bam are written)
   @param  fnout    name of the final output file to be written
@@ -1743,7 +1865,7 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
   and then merge them by calling bam_merge_core2(). This function is
   NOT thread safe.
  */
-int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
+int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix,
                       const char *fnout, const char *modeout,
                       size_t _max_mem, int n_threads,
                       const htsFormat *in_fmt, const htsFormat *out_fmt)
@@ -1752,10 +1874,16 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
     size_t mem, max_k, k, max_mem;
     bam_hdr_t *header = NULL;
     samFile *fp;
-    bam1_t *b, **buf;
+    bam1_p *buf;
+    bam1_t *b;
 
     if (n_threads < 2) n_threads = 1;
     g_is_by_qname = is_by_qname;
+    if (sort_by_tag) {
+        g_is_by_tag = 1;
+        strncpy(g_sort_tag, sort_by_tag, 2);
+    }
+
     max_k = k = 0; mem = 0;
     max_mem = _max_mem * n_threads;
     buf = NULL;
@@ -1769,8 +1897,13 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
         print_error("sort", "failed to read header from \"%s\"", fn);
         goto err;
     }
-    if (is_by_qname) change_SO(header, "queryname");
-    else change_SO(header, "coordinate");
+
+    if (sort_by_tag != NULL)
+        change_SO(header, "unknown");
+    else if (is_by_qname)
+        change_SO(header, "queryname");
+    else
+        change_SO(header, "coordinate");
 
     // No gain to using the thread pool here as the flow of this code
     // is such that we are *either* reading *or* sorting.  Hence a shared
@@ -1783,17 +1916,28 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
         if (k == max_k) {
             size_t kk, old_max = max_k;
             max_k = max_k? max_k<<1 : 0x10000;
-            buf = (bam1_t**)realloc(buf, max_k * sizeof(bam1_t*));
-            for (kk = old_max; kk < max_k; ++kk) buf[kk] = NULL;
+            buf = (bam1_p*)realloc(buf, max_k * sizeof(bam1_p));
+            for (kk = old_max; kk < max_k; ++kk) {
+                buf[kk].b = NULL;
+                buf[kk].tag = NULL;
+            }
         }
-        if (buf[k] == NULL) buf[k] = bam_init1();
-        b = buf[k];
+        if (buf[k].b == NULL) buf[k].b = bam_init1();
+        b = buf[k].b;
         if ((ret = sam_read1(fp, header, b)) < 0) break;
         if (b->l_data < b->m_data>>2) { // shrink
             b->m_data = b->l_data;
             kroundup32(b->m_data);
             b->data = (uint8_t*)realloc(b->data, b->m_data);
         }
+
+        // Pull out the pointer to the sort tag if applicable
+        if (g_is_by_tag) {
+            buf[k].tag = bam_aux_get(b, g_sort_tag);
+        } else {
+            buf[k].tag = NULL;
+        }
+
         mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
         ++k;
         if (mem >= max_mem) {
@@ -1832,7 +1976,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
             fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
             sprintf(fns[i], "%s.%.4d.bam", prefix, i);
         }
-        if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns,
+        if (bam_merge_core2(is_by_qname, sort_by_tag, fnout, modeout, NULL, n_files, fns,
                             MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO,
                             NULL, n_threads, "sort", in_fmt, out_fmt) < 0) {
             // Propagate bam_merge_core2() failure; it has already emitted a
@@ -1850,7 +1994,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
 
  err:
     // free
-    for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]);
+    for (k = 0; k < max_k; ++k) bam_destroy1(buf[k].b);
     free(buf);
     bam_hdr_destroy(header);
     sam_close(fp);
@@ -1863,7 +2007,7 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma
     int ret;
     char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
     sprintf(fnout, "%s.bam", prefix);
-    ret = bam_sort_core_ext(is_by_qname, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL);
+    ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL);
     free(fnout);
     return ret;
 }
@@ -1876,6 +2020,7 @@ static void sort_usage(FILE *fp)
 "  -l INT     Set compression level, from 0 (uncompressed) to 9 (best)\n"
 "  -m INT     Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
 "  -n         Sort by read name\n"
+"  -t TAG     Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
 "  -o FILE    Write final output to FILE rather than standard output\n"
 "  -T PREFIX  Write temporary files to PREFIX.nnnn.bam\n");
     sam_global_opt_help(fp, "-.O..@");
@@ -1902,6 +2047,7 @@ int bam_sort(int argc, char *argv[])
 {
     size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
     int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1;
+    char* sort_tag = NULL;
     char *fnout = "-", modeout[12];
     kstring_t tmpprefix = { 0, 0, NULL };
     struct stat st;
@@ -1913,10 +2059,11 @@ int bam_sort(int argc, char *argv[])
         { NULL, 0, NULL, 0 }
     };
 
-    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:", lopts, NULL)) >= 0) {
         switch (c) {
         case 'o': fnout = optarg; o_seen = 1; break;
         case 'n': is_by_qname = 1; break;
+        case 't': sort_tag = strdup(optarg); break;
         case 'm': {
                 char *q;
                 max_mem = strtol(optarg, &q, 0);
@@ -1970,7 +2117,7 @@ int bam_sort(int argc, char *argv[])
         ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
     }
 
-    ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-",
+    ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-",
                             tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
                             &ga.in, &ga.out);
     if (ret >= 0)
diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c
index ea2a30d..524f724 100644
--- a/samtools/bam_sort.c.pysam.c
+++ b/samtools/bam_sort.c.pysam.c
@@ -39,6 +39,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
+#include <pthread.h>
 #include "htslib/ksort.h"
 #include "htslib/khash.h"
 #include "htslib/klist.h"
@@ -47,6 +48,14 @@ DEALINGS IN THE SOFTWARE.  */
 #include "sam_opts.h"
 #include "samtools.h"
 
+
+// Struct which contains a record and the pointer to its sort tag (if any).
+// Used to speed up sort-by-tag.
+typedef struct bam1_p {
+    bam1_t *b;
+    const uint8_t *tag;
+} bam1_p;
+
 /* Minimum memory required in megabytes before sort will attempt to run. This
    is to prevent accidents where failing to use the -m option correctly results
    in the creation of a temporary file for each read in the input file.
@@ -83,6 +92,8 @@ KHASH_MAP_INIT_STR(c2i, int)
 KLIST_INIT(hdrln, char*, hdrln_free_char)
 
 static int g_is_by_qname = 0;
+static int g_is_by_tag = 0;
+static char g_sort_tag[2] = {0,0};
 
 static int strnum_cmp(const char *_a, const char *_b)
 {
@@ -113,20 +124,29 @@ static int strnum_cmp(const char *_a, const char *_b)
 typedef struct {
     int i;
     uint64_t pos, idx;
-    bam1_t *b;
+    bam1_p b;
 } heap1_t;
 
 #define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx))))
 
+static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b);
+
 // Function to compare reads in the heap and determine which one is < the other
 static inline int heap_lt(const heap1_t a, const heap1_t b)
 {
-    if (g_is_by_qname) {
+    if (g_is_by_tag) {
+        int t;
+        if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0;
+        t = bam1_lt_by_tag(b.b,a.b);
+        return t;
+    } else if (g_is_by_qname) {
         int t;
-        if (a.b == NULL || b.b == NULL) return a.b == NULL? 1 : 0;
-        t = strnum_cmp(bam_get_qname(a.b), bam_get_qname(b.b));
-        return (t > 0 || (t == 0 && (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0)));
-    } else return __pos_cmp(a, b);
+        if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0;
+        t = strnum_cmp(bam_get_qname(a.b.b), bam_get_qname(b.b.b));
+        return (t > 0 || (t == 0 && (a.b.b->core.flag&0xc0) > (b.b.b->core.flag&0xc0)));
+    } else {
+        return __pos_cmp(a, b);
+    }
 }
 
 KSORT_INIT(heap, heap1_t, heap_lt)
@@ -1100,7 +1120,8 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
 
 /*!
   @abstract    Merge multiple sorted BAM.
-  @param  is_by_qname whether to sort by query name
+  @param  by_qname    whether to sort by query name
+  @param  sort_tag    if non-null, sort by the given tag
   @param  out         output BAM file name
   @param  mode        sam_open() mode to be used to create the final output file
                       (overrides level settings from UNCOMP and LEVEL1 flags)
@@ -1117,7 +1138,7 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
   @discussion Padding information may NOT correctly maintained. This
   function is NOT thread safe.
  */
-int bam_merge_core2(int by_qname, const char *out, const char *mode,
+int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode,
                     const char *headers, int n, char * const *fn, int flag,
                     const char *reg, int n_threads, const char *cmd,
                     const htsFormat *in_fmt, const htsFormat *out_fmt)
@@ -1152,6 +1173,12 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
     }
 
     g_is_by_qname = by_qname;
+    if (sort_tag) {
+        g_is_by_tag = 1;
+        g_sort_tag[0] = sort_tag[0];
+        g_sort_tag[1] = sort_tag[1];
+    }
+
     fp = (samFile**)calloc(n, sizeof(samFile*));
     if (!fp) goto mem_fail;
     heap = (heap1_t*)calloc(n, sizeof(heap1_t));
@@ -1326,18 +1353,25 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
         heap1_t *h = heap + i;
         int res;
         h->i = i;
-        h->b = bam_init1();
-        if (!h->b) goto mem_fail;
-        res = iter[i] ? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b);
+        h->b.b = bam_init1();
+        h->b.tag = NULL;
+        if (!h->b.b) goto mem_fail;
+        res = iter[i] ? sam_itr_next(fp[i], iter[i], h->b.b) : sam_read1(fp[i], hdr[i], h->b.b);
         if (res >= 0) {
-            bam_translate(h->b, translation_tbl + i);
-            h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b);
+            bam_translate(h->b.b, translation_tbl + i);
+            h->pos = ((uint64_t)h->b.b->core.tid<<32) | (uint32_t)((int32_t)h->b.b->core.pos+1)<<1 | bam_is_rev(h->b.b);
             h->idx = idx++;
+            if (g_is_by_tag) {
+                h->b.tag = bam_aux_get(h->b.b, g_sort_tag);
+            } else {
+                h->b.tag = NULL;
+            }
         }
         else if (res == -1 && (!iter[i] || iter[i]->finished)) {
             h->pos = HEAP_EMPTY;
-            bam_destroy1(h->b);
-            h->b = NULL;
+            bam_destroy1(h->b.b);
+            h->b.b = NULL;
+            h->b.tag = NULL;
         } else {
             print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
             goto fail;
@@ -1359,7 +1393,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
     // Begin the actual merge
     ks_heapmake(heap, n, heap);
     while (heap->pos != HEAP_EMPTY) {
-        bam1_t *b = heap->b;
+        bam1_t *b = heap->b.b;
         if (flag & MERGE_RG) {
             uint8_t *rg = bam_aux_get(b, "RG");
             if (rg) bam_aux_del(b, rg);
@@ -1374,10 +1408,16 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
             bam_translate(b, translation_tbl + heap->i);
             heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
             heap->idx = idx++;
+            if (g_is_by_tag) {
+                heap->b.tag = bam_aux_get(heap->b.b, g_sort_tag);
+            } else {
+                heap->b.tag = NULL;
+            }
         } else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) {
             heap->pos = HEAP_EMPTY;
-            bam_destroy1(heap->b);
-            heap->b = NULL;
+            bam_destroy1(heap->b.b);
+            heap->b.b = NULL;
+            heap->b.tag = NULL;
         } else {
             print_error(cmd, "\"%s\" is truncated", fn[heap->i]);
             goto fail;
@@ -1421,7 +1461,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode,
         if (iter && iter[i]) hts_itr_destroy(iter[i]);
         if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]);
         if (fp && fp[i]) sam_close(fp[i]);
-        if (heap && heap[i].b) bam_destroy1(heap[i].b);
+        if (heap && heap[i].b.b) bam_destroy1(heap[i].b.b);
     }
     if (hout) bam_hdr_destroy(hout);
     free(RG);
@@ -1441,7 +1481,7 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch
     strcpy(mode, "wb");
     if (flag & MERGE_UNCOMP) strcat(mode, "0");
     else if (flag & MERGE_LEVEL1) strcat(mode, "1");
-    return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL);
+    return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL);
 }
 
 static void merge_usage(FILE *to)
@@ -1451,6 +1491,7 @@ static void merge_usage(FILE *to)
 "\n"
 "Options:\n"
 "  -n         Input files are sorted by read name\n"
+"  -t TAG     Input files are sorted by TAG value\n"
 "  -r         Attach RG tag (inferred from file names)\n"
 "  -u         Uncompressed BAM output\n"
 "  -f         Overwrite the output BAM if exist\n"
@@ -1469,6 +1510,7 @@ int bam_merge(int argc, char *argv[])
 {
     int c, is_by_qname = 0, flag = 0, ret = 0, level = -1;
     char *fn_headers = NULL, *reg = NULL, mode[12];
+    char *sort_tag = NULL;
     long random_seed = (long)time(NULL);
     char** fn = NULL;
     int fn_size = 0;
@@ -1485,12 +1527,13 @@ int bam_merge(int argc, char *argv[])
         return 0;
     }
 
-    while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:", lopts, NULL)) >= 0) {
         switch (c) {
         case 'r': flag |= MERGE_RG; break;
         case 'f': flag |= MERGE_FORCE; break;
         case 'h': fn_headers = strdup(optarg); break;
         case 'n': is_by_qname = 1; break;
+        case 't': sort_tag = strdup(optarg); break;
         case '1': flag |= MERGE_LEVEL1; level = 1; break;
         case 'u': flag |= MERGE_UNCOMP; level = 0; break;
         case 'R': reg = strdup(optarg); break;
@@ -1553,7 +1596,7 @@ int bam_merge(int argc, char *argv[])
     strcpy(mode, "wb");
     sam_open_mode(mode+1, argv[optind], NULL);
     if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
-    if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers,
+    if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers,
                         fn_size+nargcfiles, fn, flag, reg, ga.nthreads,
                         "merge", &ga.in, &ga.out) < 0)
         ret = 1;
@@ -1574,10 +1617,6 @@ end:
  * BAM sorting *
  ***************/
 
-#include <pthread.h>
-
-typedef bam1_t *bam1_p;
-
 static int change_SO(bam_hdr_t *h, const char *so)
 {
     char *p, *q, *beg = NULL, *end = NULL, *newtext;
@@ -1613,13 +1652,95 @@ static int change_SO(bam_hdr_t *h, const char *so)
 }
 
 // Function to compare reads and determine which one is < the other
-static inline int bam1_lt(const bam1_p a, const bam1_p b)
+// Handle sort-by-pos and sort-by-name. Used as the secondary sort in bam1_lt_by_tag, if reads are equivalent by tag.
+static inline int bam1_lt_core(const bam1_p a, const bam1_p b)
 {
     if (g_is_by_qname) {
-        int t = strnum_cmp(bam_get_qname(a), bam_get_qname(b));
-        return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0)));
-    } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam_is_rev(a)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam_is_rev(b)));
+        int t = strnum_cmp(bam_get_qname(a.b), bam_get_qname(b.b));
+        return (t < 0 || (t == 0 && (a.b->core.flag&0xc0) < (b.b->core.flag&0xc0)));
+    } else {
+        return (((uint64_t)a.b->core.tid<<32|(a.b->core.pos+1)<<1|bam_is_rev(a.b)) < ((uint64_t)b.b->core.tid<<32|(b.b->core.pos+1)<<1|bam_is_rev(b.b)));
+    }
 }
+
+uint8_t normalize_type(const uint8_t* aux) {
+    if (*aux == 'c' || *aux == 'C' || *aux == 's' || *aux == 'S' || *aux == 'i' || *aux == 'I') {
+        return 'c';
+    } else if (*aux == 'f' || *aux == 'd') {
+        return 'f';
+    } else if (*aux == 'H' || *aux == 'Z') {
+         return 'H';
+    } else {
+        return *aux;
+    }
+}
+
+// Sort record by tag, using pos or read name as a secondary key if tags are identical. Reads not carrying the tag sort first.
+// Tags are first sorted by the type character (in case the types differ), or by the appropriate comparator for that type if they agree.
+static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b)
+{
+    const uint8_t* aux_a = a.tag;
+    const uint8_t* aux_b = b.tag;
+
+    if (aux_a == NULL && aux_b != NULL) {
+        return 1;
+    } else if (aux_a != NULL && aux_b == NULL) {
+        return 0;
+    } else if (aux_a == NULL && aux_b == NULL) {
+        return bam1_lt_core(a,b);
+    }
+
+    // 'Normalize' the letters of the datatypes to a canonical letter,
+    // so that comparison of different types
+    // forms a correct total ordering.
+    uint8_t a_type = normalize_type(aux_a);
+    uint8_t b_type = normalize_type(aux_b);
+
+    if (a_type != b_type) {
+        // Fix int to float comparisons by using bam_aux2f() to read the int
+        if (a_type == 'c' && b_type == 'f') {
+            a_type = 'f';
+        } else if (a_type == 'f' && b_type == 'c') {
+            b_type = 'f';
+        } else {
+            // Unfixable mismatched types
+            return a_type < b_type ? 1 : 0;
+        }
+    }
+
+    if (a_type == 'c') {
+        int64_t va = bam_aux2i(aux_a);
+        int64_t vb = bam_aux2i(aux_b);
+        return (va < vb || (va == vb && bam1_lt_core(a, b)));
+    } else if (a_type == 'f') {
+        double va = bam_aux2f(aux_a);
+        double vb = bam_aux2f(aux_b);
+        return (va < vb || (va == vb && bam1_lt_core(a,b)));
+    } else if (a_type == 'A') {
+        char va = bam_aux2A(aux_a);
+        char vb = bam_aux2A(aux_b);
+        return (va < vb || (va == vb && bam1_lt_core(a,b)));
+    } else if (a_type == 'H') {
+        int t = strcmp(bam_aux2Z(aux_a), bam_aux2Z(aux_b));
+        return (t < 0 || (t == 0 && bam1_lt_core(a,b)));
+    } else {
+        return bam1_lt_core(a,b);
+    }
+}
+
+// Function to compare reads and determine which one is < the other
+// Handle sort-by-pos, sort-by-name, or sort-by-tag
+static inline int bam1_lt(const bam1_p a, const bam1_p b)
+{
+    if (g_is_by_tag) {
+        return bam1_lt_by_tag(a, b);
+    } else {
+        return bam1_lt_core(a,b);
+    }
+}
+
+
+
 KSORT_INIT(sort, bam1_p, bam1_lt)
 
 typedef struct {
@@ -1642,7 +1763,7 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf,
     if (sam_hdr_write(fp, h) != 0) goto fail;
     if (n_threads > 1) hts_set_threads(fp, n_threads);
     for (i = 0; i < l; ++i) {
-        if (sam_write1(fp, h, buf[i]) < 0) goto fail;
+        if (sam_write1(fp, h, buf[i].b) < 0) goto fail;
     }
     if (sam_close(fp) < 0) return -1;
     return 0;
@@ -1664,7 +1785,7 @@ static void *worker(void *data)
     uint32_t max_ncigar = 0;
     int i;
     for (i = 0; i < w->buf_len; i++) {
-        uint32_t nc = w->buf[i]->core.n_cigar;
+        uint32_t nc = w->buf[i].b->core.n_cigar;
         if (max_ncigar < nc)
             max_ncigar = nc;
     }
@@ -1732,6 +1853,7 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
   and the leftmost position of an alignment
 
   @param  is_by_qname whether to sort by query name
+  @param  sort_by_tag if non-null, sort by the given tag
   @param  fn       name of the file to be sorted
   @param  prefix   prefix of the temporary files (prefix.NNNN.bam are written)
   @param  fnout    name of the final output file to be written
@@ -1745,7 +1867,7 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c
   and then merge them by calling bam_merge_core2(). This function is
   NOT thread safe.
  */
-int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
+int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix,
                       const char *fnout, const char *modeout,
                       size_t _max_mem, int n_threads,
                       const htsFormat *in_fmt, const htsFormat *out_fmt)
@@ -1754,10 +1876,16 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
     size_t mem, max_k, k, max_mem;
     bam_hdr_t *header = NULL;
     samFile *fp;
-    bam1_t *b, **buf;
+    bam1_p *buf;
+    bam1_t *b;
 
     if (n_threads < 2) n_threads = 1;
     g_is_by_qname = is_by_qname;
+    if (sort_by_tag) {
+        g_is_by_tag = 1;
+        strncpy(g_sort_tag, sort_by_tag, 2);
+    }
+
     max_k = k = 0; mem = 0;
     max_mem = _max_mem * n_threads;
     buf = NULL;
@@ -1771,8 +1899,13 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
         print_error("sort", "failed to read header from \"%s\"", fn);
         goto err;
     }
-    if (is_by_qname) change_SO(header, "queryname");
-    else change_SO(header, "coordinate");
+
+    if (sort_by_tag != NULL)
+        change_SO(header, "unknown");
+    else if (is_by_qname)
+        change_SO(header, "queryname");
+    else
+        change_SO(header, "coordinate");
 
     // No gain to using the thread pool here as the flow of this code
     // is such that we are *either* reading *or* sorting.  Hence a shared
@@ -1785,17 +1918,28 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
         if (k == max_k) {
             size_t kk, old_max = max_k;
             max_k = max_k? max_k<<1 : 0x10000;
-            buf = (bam1_t**)realloc(buf, max_k * sizeof(bam1_t*));
-            for (kk = old_max; kk < max_k; ++kk) buf[kk] = NULL;
+            buf = (bam1_p*)realloc(buf, max_k * sizeof(bam1_p));
+            for (kk = old_max; kk < max_k; ++kk) {
+                buf[kk].b = NULL;
+                buf[kk].tag = NULL;
+            }
         }
-        if (buf[k] == NULL) buf[k] = bam_init1();
-        b = buf[k];
+        if (buf[k].b == NULL) buf[k].b = bam_init1();
+        b = buf[k].b;
         if ((ret = sam_read1(fp, header, b)) < 0) break;
         if (b->l_data < b->m_data>>2) { // shrink
             b->m_data = b->l_data;
             kroundup32(b->m_data);
             b->data = (uint8_t*)realloc(b->data, b->m_data);
         }
+
+        // Pull out the pointer to the sort tag if applicable
+        if (g_is_by_tag) {
+            buf[k].tag = bam_aux_get(b, g_sort_tag);
+        } else {
+            buf[k].tag = NULL;
+        }
+
         mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
         ++k;
         if (mem >= max_mem) {
@@ -1834,7 +1978,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
             fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
             sprintf(fns[i], "%s.%.4d.bam", prefix, i);
         }
-        if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns,
+        if (bam_merge_core2(is_by_qname, sort_by_tag, fnout, modeout, NULL, n_files, fns,
                             MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO,
                             NULL, n_threads, "sort", in_fmt, out_fmt) < 0) {
             // Propagate bam_merge_core2() failure; it has already emitted a
@@ -1852,7 +1996,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix,
 
  err:
     // free
-    for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]);
+    for (k = 0; k < max_k; ++k) bam_destroy1(buf[k].b);
     free(buf);
     bam_hdr_destroy(header);
     sam_close(fp);
@@ -1865,7 +2009,7 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma
     int ret;
     char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
     sprintf(fnout, "%s.bam", prefix);
-    ret = bam_sort_core_ext(is_by_qname, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL);
+    ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL);
     free(fnout);
     return ret;
 }
@@ -1878,6 +2022,7 @@ static void sort_usage(FILE *fp)
 "  -l INT     Set compression level, from 0 (uncompressed) to 9 (best)\n"
 "  -m INT     Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
 "  -n         Sort by read name\n"
+"  -t TAG     Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
 "  -o FILE    Write final output to FILE rather than standard output\n"
 "  -T PREFIX  Write temporary files to PREFIX.nnnn.bam\n");
     sam_global_opt_help(fp, "-.O..@");
@@ -1904,6 +2049,7 @@ int bam_sort(int argc, char *argv[])
 {
     size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
     int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1;
+    char* sort_tag = NULL;
     char *fnout = "-", modeout[12];
     kstring_t tmpprefix = { 0, 0, NULL };
     struct stat st;
@@ -1915,10 +2061,11 @@ int bam_sort(int argc, char *argv[])
         { NULL, 0, NULL, 0 }
     };
 
-    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:", lopts, NULL)) >= 0) {
         switch (c) {
         case 'o': fnout = optarg; o_seen = 1; break;
         case 'n': is_by_qname = 1; break;
+        case 't': sort_tag = strdup(optarg); break;
         case 'm': {
                 char *q;
                 max_mem = strtol(optarg, &q, 0);
@@ -1972,7 +2119,7 @@ int bam_sort(int argc, char *argv[])
         ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
     }
 
-    ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-",
+    ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-",
                             tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
                             &ga.in, &ga.out);
     if (ret >= 0)
diff --git a/samtools/bam_stat.c.pysam.c b/samtools/bam_stat.c.pysam.c
index bbfe602..31e9b28 100644
--- a/samtools/bam_stat.c.pysam.c
+++ b/samtools/bam_stat.c.pysam.c
@@ -95,11 +95,11 @@ static const char *percent(char *buffer, long long n, long long total)
     return buffer;
 }
 
-static int usage_exit(FILE *fp, int exit_status)
+static void usage_exit(FILE *fp, int exit_status)
 {
     fprintf(fp, "Usage: samtools flagstat [options] <in.bam>\n");
     sam_global_opt_help(fp, "-.---@");
-    return(exit_status);
+    exit(exit_status);
 }
 
 int bam_flagstat(int argc, char *argv[])
@@ -125,13 +125,13 @@ int bam_flagstat(int argc, char *argv[])
         default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
             /* else fall-through */
         case '?':
-	  return(usage_exit(pysam_stderr, EXIT_FAILURE));
+            usage_exit(pysam_stderr, EXIT_FAILURE);
         }
     }
 
     if (argc != optind+1) {
-      if (argc == optind) return(usage_exit(pysam_stdout, EXIT_SUCCESS));
-      else return(usage_exit(pysam_stderr, EXIT_FAILURE));
+        if (argc == optind) usage_exit(pysam_stdout, EXIT_SUCCESS);
+        else usage_exit(pysam_stderr, EXIT_FAILURE);
     }
     fp = sam_open_format(argv[optind], "r", &ga.in);
     if (fp == NULL) {
diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c
index 8956b1f..248bc81 100644
--- a/samtools/bamtk.c.pysam.c
+++ b/samtools/bamtk.c.pysam.c
@@ -183,7 +183,7 @@ int samtools_main(int argc, char *argv[])
         fprintf(pysam_stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
         return 1;
     }
-/* AH:    else if (strcmp(argv[1], "tview") == 0)   ret = bam_tview_main(argc-1, argv+1); */
+    /* else if (strcmp(argv[1], "tview") == 0)   ret = bam_tview_main(argc-1, argv+1); */
     else if (strcmp(argv[1], "--version") == 0) {
         fprintf(pysam_stdout, 
 "samtools %s\n"
diff --git a/samtools/sam_view.c b/samtools/sam_view.c
index 9c2d15b..ee65fcd 100644
--- a/samtools/sam_view.c
+++ b/samtools/sam_view.c
@@ -40,7 +40,9 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/faidx.h"
 #include "htslib/kstring.h"
 #include "htslib/khash.h"
+#include "htslib/klist.h"
 #include "htslib/thread_pool.h"
+#include "htslib/bgzf.h"
 #include "samtools.h"
 #include "sam_opts.h"
 
@@ -48,6 +50,8 @@ DEALINGS IN THE SOFTWARE.  */
 #define DEFAULT_QUALITY_TAG "QT"
 
 KHASH_SET_INIT_STR(rg)
+#define taglist_free(p)
+KLIST_INIT(ktaglist, char*, taglist_free)
 
 typedef khash_t(rg) *rghash_t;
 
@@ -675,8 +679,13 @@ static void bam2fq_usage(FILE *to, const char *command)
 "  -s FILE              write singleton reads to FILE [assume single-end]\n"
 "  -t                   copy RG, BC and QT tags to the %s header line\n",
     fq ? "FASTQ" : "FASTA");
+    fprintf(to,
+"  -T TAGLIST           copy arbitrary tags to the %s header line\n",
+    fq ? "FASTQ" : "FASTA");
     if (fq) fprintf(to,
 "  -v INT               default quality score if not given in file [1]\n"
+"  -i                   add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
+"  -c                   compression level [0..9] to use when creating gz or bgzf fastq files\n"
 "  --i1 FILE            write first index reads to FILE\n"
 "  --i2 FILE            write second index reads to FILE\n"
 "  --barcode-tag TAG    Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n"
@@ -700,7 +709,7 @@ typedef struct bam2fq_opts {
     char *fnse;
     char *fnr[3];
     char *fn_input; // pointer to input filename in argv do not free
-    bool has12, has12always, use_oq, copy_tags;
+    bool has12, has12always, use_oq, copy_tags, illumina_tag;
     int flag_on, flag_off, flag_alloff;
     sam_global_args ga;
     fastfile filetype;
@@ -709,18 +718,24 @@ typedef struct bam2fq_opts {
     char *quality_tag;
     char *index_file[2];
     char *index_format;
+    char *extra_tags;
+    char compression_level;
 } bam2fq_opts_t;
 
 typedef struct bam2fq_state {
     samFile *fp;
-    FILE *fpse;
-    FILE *fpr[3];
-    FILE *fpi[2];
+    BGZF *fpse;
+    BGZF *fpr[3];
+    BGZF *fpi[2];
+    BGZF *hstdout;
     bam_hdr_t *h;
-    bool has12, use_oq, copy_tags;
+    bool has12, use_oq, copy_tags, illumina_tag;
     int flag_on, flag_off, flag_alloff;
     fastfile filetype;
     int def_qual;
+    klist_t(ktaglist) *taglist;
+    char *index_sequence;
+    char compression_level;
 } bam2fq_state_t;
 
 /*
@@ -771,19 +786,26 @@ static char *get_read(const bam1_t *rec)
 /*
  * get and decode the quality from a BAM record
  */
-static char *get_quality(const bam1_t *rec)
+static int get_quality(const bam1_t *rec, char **qual_out)
 {
     char *quality = calloc(1, rec->core.l_qseq + 1);
     char *q = (char *)bam_get_qual(rec);
     int n;
 
-    if (*q == '\xff') { free(quality); return NULL; }
+    if (!quality) return -1;
+
+    if (*q == '\xff') {
+        free(quality);
+        *qual_out = NULL;
+        return 0;
+    }
 
     for (n=0; n < rec->core.l_qseq; n++) {
         quality[n] = q[n]+33;
     }
     if (rec->core.flag & BAM_FREVERSE) reverse(quality);
-    return quality;
+    *qual_out = quality;
+    return 0;
 }
 
 //
@@ -817,49 +839,131 @@ static int getLength(char **s)
     return n;
 }
 
+static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf)
+{
+    uint8_t *s = bam_aux_get(rec, tag);
+    if (s) {
+        char aux_type = *s;
+        switch (aux_type) {
+            case 'C':
+            case 'S': aux_type = 'I'; break;
+            case 'c':
+            case 's': aux_type = 'i'; break;
+            case 'd': aux_type = 'f'; break;
+        }
+
+        // Ensure space.  Need 6 chars + length of tag.  Max length of
+        // i is 16, A is 21, B currently 26, Z is unknown, so
+        // have to check that one later.
+        if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false;
+
+        kputc('\t', linebuf);
+        kputsn(tag, 2, linebuf);
+        kputc(':', linebuf);
+        kputc(aux_type=='I'? 'i': aux_type, linebuf);
+        kputc(':', linebuf);
+        switch (aux_type) {
+            case 'H':
+            case 'Z':
+                if (kputs(bam_aux2Z(s), linebuf) < 0) return false;
+                break;
+            case 'i': kputw(bam_aux2i(s), linebuf); break;
+            case 'I': kputuw(bam_aux2i(s), linebuf); break;
+            case 'A': kputc(bam_aux2A(s), linebuf); break;
+            case 'f': kputd(bam_aux2f(s), linebuf); break;
+            case 'B': kputs("*** Unhandled aux type ***", linebuf); return false;
+            default:  kputs("*** Unknown aux type ***", linebuf); return false;
+       }
+    }
+    return true;
+}
+
+static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec)
+{
+    if (!index_sequence) return 0;
+
+    kstring_t new = {0,0,NULL};
+    if (linebuf->s) {
+        char *s = strchr(linebuf->s, '\n');
+        if (s) {
+            if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0)
+                return -1;
+            *s = 0;
+            kputs(linebuf->s, &new);
+            kputc(' ', &new);
+            readpart readpart = which_readpart(rec);
+            if (readpart == READ_1) kputc('1', &new);
+            else if (readpart == READ_2) kputc('2', &new);
+            else kputc('0', &new);
+
+            kputc(':', &new);
+            if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new);
+            else                              kputc('N', &new);
+
+            kputs(":0:", &new);
+            kputs(index_sequence, &new);
+            kputc('\n', &new);
+            kputs(s+1, &new);
+            free(ks_release(linebuf));
+            linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m;
+        }
+    }
+    return 0;
+}
+
 static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state)
 {
     int i;
 
     linebuf->l = 0;
     // Write read name
-    kputc(state->filetype == FASTA? '>' : '@', linebuf);
-    kputs(bam_get_qname(rec), linebuf);
+    if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false;
+    if (kputs(bam_get_qname(rec), linebuf) < 0) return false;
     // Add the /1 /2 if requested
     if (state->has12) {
         readpart readpart = which_readpart(rec);
-        if (readpart == READ_1) kputs("/1", linebuf);
-        else if (readpart == READ_2) kputs("/2", linebuf);
+        if (readpart == READ_1) {
+            if (kputs("/1", linebuf) < 0) return false;
+        } else if (readpart == READ_2) {
+            if (kputs("/2", linebuf) < 0) return false;
+        }
     }
     if (state->copy_tags) {
         for (i = 0; copied_tags[i]; ++i) {
-            uint8_t *s;
-            if ((s = bam_aux_get(rec, copied_tags[i])) != 0) {
-                if (*s == 'Z') {
-                    kputc('\t', linebuf);
-                    kputsn(copied_tags[i], 2, linebuf);
-                    kputsn(":Z:", 3, linebuf);
-                    kputs(bam_aux2Z(s), linebuf);
-                }
+            if (!copy_tag(copied_tags[i], rec, linebuf)) {
+                fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
+                return false;
+            }
+        }
+    }
+
+    if (state->taglist->size) {
+        kliter_t(ktaglist) *p;
+        for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) {
+            if (!copy_tag(kl_val(p), rec, linebuf)) {
+                fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
+                return false;
             }
         }
     }
-    kputc('\n', linebuf);
-    kputs(seq, linebuf);
-    kputc('\n', linebuf);
+
+    if (kputc('\n', linebuf) < 0) return false;
+    if (kputs(seq, linebuf) < 0) return false;
+    if (kputc('\n', linebuf) < 0) return false;
 
     if (state->filetype == FASTQ) {
         // Write quality
-        kputs("+\n", linebuf);
+        if (kputs("+\n", linebuf) < 0) return false;
         if (qual && *qual) {
-            kputs(qual, linebuf);
+            if (kputs(qual, linebuf) < 0) return false;
         } else {
             int len = strlen(seq);
+            if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false;
             for (i = 0; i < len; ++i) {
                 kputc(33 + state->def_qual, linebuf);
             }
         }
-        kputc('\n', linebuf);
+        if (kputc('\n', linebuf) < 0) return false;
     }
     return true;
 }
@@ -867,21 +971,31 @@ static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *li
 /*
  * Create FASTQ lines from the barcode tag using the index-format 
  */
-static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_t* opts)
+static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts)
 {
     uint8_t *p;
     char *ifmt = opts->index_format;
     char *tag = NULL;
     char *qual = NULL;
+    char *sub_tag = NULL;
+    char *sub_qual = NULL;
+    size_t tag_len;
     int file_number = 0;
     kstring_t linebuf = { 0, 0, NULL }; // Buffer
 
+
     // read barcode tag
     p = bam_aux_get(rec,opts->barcode_tag);
     if (p) tag = bam_aux2Z(p);
 
     if (!tag) return true; // there is no tag
 
+    tag_len = strlen(tag);
+    sub_tag = calloc(1, tag_len + 1);
+    if (!sub_tag) goto fail;
+    sub_qual = calloc(1, tag_len + 1);
+    if (!sub_qual) goto fail;
+
     // read quality tag
     p = bam_aux_get(rec, opts->quality_tag);
     if (p) qual = bam_aux2Z(p);
@@ -892,9 +1006,6 @@ static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_
         char action = *ifmt;        // should be 'i' or 'n'
         ifmt++; // skip over action
         int index_len = getLength(&ifmt);
-
-        char *sub_tag = calloc(1, strlen(tag)+1);
-        char *sub_qual = calloc(1, strlen(tag)+1);
         int n = 0;
 
         if (index_len < 0) {
@@ -916,15 +1027,32 @@ static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_
                 n++;
             }
         }
+        sub_tag[n] = '\0';
+        sub_qual[n] = '\0';
 
         if (action=='i' && *sub_tag && state->fpi[file_number]) {
-            make_fq_line(rec, sub_tag, sub_qual, &linebuf, state);
-            fputs(linebuf.s, state->fpi[file_number++]);
+            //if (file_number==0) state->index_sequence = strdup(sub_tag);    // we're going to need this later...
+            state->index_sequence = strdup(sub_tag);    // we're going to need this later...
+            if (!state->index_sequence) goto fail;
+            if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail;
+            if (state->illumina_tag) {
+                if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf, rec) < 0) {
+                    goto fail;
+                }
+            }
+            if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0)
+                goto fail;
         }
-        free(sub_qual); free(sub_tag);
 
     }
 
+    free(sub_qual); free(sub_tag);
+    free(linebuf.s);
+    return true;
+
+ fail:
+    perror(__func__);
+    free(sub_qual); free(sub_tag);
     free(linebuf.s);
     return true;
 }
@@ -939,25 +1067,32 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t
     char *qual = NULL;
 
     char *seq = get_read(b);
+    if (!seq) return false;
 
     if (state->use_oq) {
         oq = bam_aux_get(b, "OQ");
         if (oq) {
             oq++; 
             qual = strdup(bam_aux2Z(oq));
+            if (!qual) goto fail;
             if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
                 reverse(qual);
             }
         }
     } else {
-        qual = get_quality(b);
+        if (get_quality(b, &qual) < 0) goto fail;
     }
 
-    make_fq_line(b, seq, qual, linebuf, state);
+    if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail;
 
     free(qual);
     free(seq);
     return true;
+
+ fail:
+    free(seq);
+    free(qual);
+    return false;
 }
 
 static void free_opts(bam2fq_opts_t *opts)
@@ -965,6 +1100,7 @@ static void free_opts(bam2fq_opts_t *opts)
     free(opts->barcode_tag);
     free(opts->quality_tag);
     free(opts->index_format);
+    free(opts->extra_tags);
     free(opts);
 }
 
@@ -982,6 +1118,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
     opts->index_format = NULL;
     opts->index_file[0] = NULL;
     opts->index_file[1] = NULL;
+    opts->extra_tags = NULL;
+    opts->compression_level = 1;
 
     int c;
     sam_global_args_init(&opts->ga);
@@ -998,7 +1136,7 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
         {"quality-tag", required_argument, NULL, 'q'},
         { NULL, 0, NULL, 0 }
     };
-    while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:nNOs:tv:@:", lopts, NULL)) > 0) {
+    while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) {
         switch (c) {
             case 'b': opts->barcode_tag = strdup(optarg); break;
             case 'q': opts->quality_tag = strdup(optarg); break;
@@ -1016,6 +1154,9 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
             case 'O': opts->use_oq = true; break;
             case 's': opts->fnse = optarg; break;
             case 't': opts->copy_tags = true; break;
+            case 'i': opts->illumina_tag = true; break;
+            case 'c': opts->compression_level = atoi(optarg); break;
+            case 'T': opts->extra_tags = strdup(optarg); break;
             case 'v': opts->def_qual = atoi(optarg); break;
             case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false;
             default:
@@ -1104,6 +1245,24 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
     return true;
 }
 
+static BGZF *open_fqfile(char *filename, int c)
+{
+    char mode[4] = "w";
+    size_t len = strlen(filename);
+
+    mode[2] = 0; mode[3] = 0;
+    if (len > 3 && strstr(filename + (len - 3),".gz")) {
+        mode[1] = 'g'; mode[2] = c+'0';
+    } else if ((len > 4 && strstr(filename + (len - 4),".bgz"))
+               || (len > 5 && strstr(filename + (len - 5),".bgzf"))) {
+        mode[1] = c+'0';
+    } else {
+        mode[1] = 'u';
+    }
+
+    return bgzf_open(filename,mode);
+}
+
 static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
 {
     bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
@@ -1112,9 +1271,29 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
     state->flag_alloff = opts->flag_alloff;
     state->has12 = opts->has12;
     state->use_oq = opts->use_oq;
+    state->illumina_tag = opts->illumina_tag;
     state->copy_tags = opts->copy_tags;
     state->filetype = opts->filetype;
     state->def_qual = opts->def_qual;
+    state->index_sequence = NULL;
+    state->hstdout = bgzf_dopen(fileno(stdout), "wu");
+    state->compression_level = opts->compression_level;
+
+    state->taglist = kl_init(ktaglist);
+    if (opts->extra_tags) {
+        char *save_p;
+        char *s = strtok_r(opts->extra_tags, ",", &save_p);
+        while (s) {
+            if (strlen(s) != 2) {
+                fprintf(stderr, "Parsing extra tags - '%s' is not two characters\n", s);
+                free(state);
+                return false;
+            }
+            char **et = kl_pushp(ktaglist, state->taglist);
+            *et = s;
+            s = strtok_r(NULL, ",", &save_p);
+        }
+    }
 
     state->fp = sam_open(opts->fn_input, "r");
     if (state->fp == NULL) {
@@ -1125,7 +1304,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
     if (opts->ga.nthreads > 0)
         hts_set_threads(state->fp, opts->ga.nthreads);
     uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
-    if (opts->use_oq) rf |= SAM_AUX;
+    if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX;
     if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
         fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
         free(state);
@@ -1137,7 +1316,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
         return false;
     }
     if (opts->fnse) {
-        state->fpse = fopen(opts->fnse,"w");
+        state->fpse = open_fqfile(opts->fnse, state->compression_level);
         if (state->fpse == NULL) {
             print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse);
             free(state);
@@ -1148,20 +1327,20 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
     int i;
     for (i = 0; i < 3; ++i) {
         if (opts->fnr[i]) {
-            state->fpr[i] = fopen(opts->fnr[i], "w");
+            state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level);
             if (state->fpr[i] == NULL) {
                 print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", i, opts->fnr[i]);
                 free(state);
                 return false;
             }
         } else {
-            state->fpr[i] = stdout;
+            state->fpr[i] = state->hstdout;
         }
     }
     for (i = 0; i < 2; i++) {
         state->fpi[i] = NULL;
         if (opts->index_file[i]) {
-            state->fpi[i] = fopen(opts->index_file[i], "w");
+            state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level);
             if (state->fpi[i] == NULL) {
                 print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]);
                 free(state);
@@ -1186,17 +1365,23 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
     bool valid = true;
     bam_hdr_destroy(state->h);
     check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);
-    if (state->fpse && fclose(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
+    if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
     int i;
     for (i = 0; i < 3; ++i) {
-        if (state->fpr[i] != stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
+        if (state->fpr[i] == state->hstdout) {
+            if (i==0 && bgzf_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing STDOUT"); valid = false; }
+        } else {
+            if (bgzf_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
+        }
     }
     for (i = 0; i < 2; i++) {
-        if (state->fpi[i] && fclose(state->fpi[i])) { 
+        if (state->fpi[i] && bgzf_close(state->fpi[i])) { 
             print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
             valid = false;
         }
     }
+    kl_destroy(ktaglist,state->taglist);
+    free(state->index_sequence);
     free(state);
     return valid;
 }
@@ -1210,8 +1395,10 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
 
 }
 
-static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t* opts)
+static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
 {
+    int n;
+    bam1_t *records[3];
     bam1_t* b = bam_init1();
     char *current_qname = NULL;
     int64_t n_reads = 0, n_singletons = 0; // Statistics
@@ -1219,35 +1406,47 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t*
     int score[3];
     int at_eof;
     if (b == NULL ) {
-        perror("[bam2fq_mainloop_singletontrack] Malloc error for bam record buffer.");
+        perror("[bam2fq_mainloop] Malloc error for bam record buffer.");
         return false;
     }
 
     bool valid = true;
     while (true) {
-        at_eof = sam_read1(state->fp, state->h, b) < 0;
+        int res = sam_read1(state->fp, state->h, b);
+        if (res < -1) {
+            fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n");
+            return false;
+        }
+        at_eof = res < 0;
 
         if (!at_eof && filter_it_out(b, state)) continue;
         if (!at_eof) ++n_reads;
 
         if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) {
             if (current_qname) {
+                if (state->illumina_tag) {
+                    for (n=0; valid && n<3; n++) {
+                        if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false;
+                    }
+                    if (!valid) break;
+                }
+                free(state->index_sequence); state->index_sequence = NULL;
                 if (score[1] > 0 && score[2] > 0) {
                     // print linebuf[1] to fpr[1], linebuf[2] to fpr[2]
-                    if (fputs(linebuf[1].s, state->fpr[1]) == EOF) { valid = false; break; }
-                    if (fputs(linebuf[2].s, state->fpr[2]) == EOF) { valid = false; break; }
-                } else if (score[1] > 0 || score[2] > 0) {
+                    if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
+                    if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
+                } else if ((score[1] > 0 || score[2] > 0) && state->fpse) {
                     // print whichever one exists to fpse
                     if (score[1] > 0) {
-                        if (fputs(linebuf[1].s, state->fpse) == EOF) { valid = false; break; }
+                        if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
                     } else {
-                        if (fputs(linebuf[2].s, state->fpse) == EOF) { valid = false; break; }
+                        if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
                     }
                     ++n_singletons;
                 }
                 if (score[0]) { // TODO: check this
                     // print linebuf[0] to fpr[0]
-                    if (fputs(linebuf[0].s, state->fpr[0]) == EOF) { valid = false; break; }
+                    if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; }
                 }
             }
 
@@ -1255,23 +1454,25 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t*
 
             free(current_qname);
             current_qname = strdup(bam_get_qname(b));
+            if (!current_qname) { valid = false; break; }
             score[0] = score[1] = score[2] = 0;
         }
 
         // Prefer a copy of the read that has base qualities
         int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1;
         if (b_score > score[which_readpart(b)]) {
+            if (state->fpi[0]) if (!tags2fq(b, state, opts)) return false;
+            records[which_readpart(b)] = b;
             if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) {
                 fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__);
                 return false;
             }
             score[which_readpart(b)] = b_score;
-            if (state->fpi[0]) tags2fq(b, state, opts);
         }
     }
     if (!valid)
     {
-        perror("[bam2fq_mainloop_singletontrack] Error writing to FASTx files.");
+        perror("[bam2fq_mainloop] Error writing to FASTx files.");
     }
     bam_destroy1(b);
     free(current_qname);
@@ -1284,31 +1485,6 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t*
     return valid;
 }
 
-static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
-{
-    // process a name collated BAM into fastq
-    bam1_t* b = bam_init1();
-    if (b == NULL) {
-        perror(NULL);
-        return false;
-    }
-    int64_t n_reads = 0; // Statistics
-    kstring_t linebuf = { 0, 0, NULL }; // Buffer
-    while (sam_read1(state->fp, state->h, b) >= 0) {
-        if (filter_it_out(b, state)) continue;
-        ++n_reads;
-
-        if (!bam1_to_fq(b, &linebuf, state)) return false;
-        fputs(linebuf.s, state->fpr[which_readpart(b)]);
-        if (state->fpi[0]) tags2fq(b, state, opts);
-    }
-    free(linebuf.s);
-    bam_destroy1(b);
-
-    fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
-    return true;
-}
-
 int main_bam2fq(int argc, char *argv[])
 {
     int status = EXIT_SUCCESS;
@@ -1320,11 +1496,7 @@ int main_bam2fq(int argc, char *argv[])
 
     if (!init_state(opts, &state)) return EXIT_FAILURE;
 
-    if (state->fpse) {
-        if (!bam2fq_mainloop_singletontrack(state,opts)) status = EXIT_FAILURE;
-    } else {
-        if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
-    }
+    if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
 
     if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
     sam_global_args_free(&opts->ga);
diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c
index 6df47c9..f46cc9f 100644
--- a/samtools/sam_view.c.pysam.c
+++ b/samtools/sam_view.c.pysam.c
@@ -42,7 +42,9 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/faidx.h"
 #include "htslib/kstring.h"
 #include "htslib/khash.h"
+#include "htslib/klist.h"
 #include "htslib/thread_pool.h"
+#include "htslib/bgzf.h"
 #include "samtools.h"
 #include "sam_opts.h"
 
@@ -50,6 +52,8 @@ DEALINGS IN THE SOFTWARE.  */
 #define DEFAULT_QUALITY_TAG "QT"
 
 KHASH_SET_INIT_STR(rg)
+#define taglist_free(p)
+KLIST_INIT(ktaglist, char*, taglist_free)
 
 typedef khash_t(rg) *rghash_t;
 
@@ -677,8 +681,13 @@ static void bam2fq_usage(FILE *to, const char *command)
 "  -s FILE              write singleton reads to FILE [assume single-end]\n"
 "  -t                   copy RG, BC and QT tags to the %s header line\n",
     fq ? "FASTQ" : "FASTA");
+    fprintf(to,
+"  -T TAGLIST           copy arbitrary tags to the %s header line\n",
+    fq ? "FASTQ" : "FASTA");
     if (fq) fprintf(to,
 "  -v INT               default quality score if not given in file [1]\n"
+"  -i                   add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
+"  -c                   compression level [0..9] to use when creating gz or bgzf fastq files\n"
 "  --i1 FILE            write first index reads to FILE\n"
 "  --i2 FILE            write second index reads to FILE\n"
 "  --barcode-tag TAG    Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n"
@@ -702,7 +711,7 @@ typedef struct bam2fq_opts {
     char *fnse;
     char *fnr[3];
     char *fn_input; // pointer to input filename in argv do not free
-    bool has12, has12always, use_oq, copy_tags;
+    bool has12, has12always, use_oq, copy_tags, illumina_tag;
     int flag_on, flag_off, flag_alloff;
     sam_global_args ga;
     fastfile filetype;
@@ -711,18 +720,24 @@ typedef struct bam2fq_opts {
     char *quality_tag;
     char *index_file[2];
     char *index_format;
+    char *extra_tags;
+    char compression_level;
 } bam2fq_opts_t;
 
 typedef struct bam2fq_state {
     samFile *fp;
-    FILE *fpse;
-    FILE *fpr[3];
-    FILE *fpi[2];
+    BGZF *fpse;
+    BGZF *fpr[3];
+    BGZF *fpi[2];
+    BGZF *hpysam_stdout;
     bam_hdr_t *h;
-    bool has12, use_oq, copy_tags;
+    bool has12, use_oq, copy_tags, illumina_tag;
     int flag_on, flag_off, flag_alloff;
     fastfile filetype;
     int def_qual;
+    klist_t(ktaglist) *taglist;
+    char *index_sequence;
+    char compression_level;
 } bam2fq_state_t;
 
 /*
@@ -773,19 +788,26 @@ static char *get_read(const bam1_t *rec)
 /*
  * get and decode the quality from a BAM record
  */
-static char *get_quality(const bam1_t *rec)
+static int get_quality(const bam1_t *rec, char **qual_out)
 {
     char *quality = calloc(1, rec->core.l_qseq + 1);
     char *q = (char *)bam_get_qual(rec);
     int n;
 
-    if (*q == '\xff') { free(quality); return NULL; }
+    if (!quality) return -1;
+
+    if (*q == '\xff') {
+        free(quality);
+        *qual_out = NULL;
+        return 0;
+    }
 
     for (n=0; n < rec->core.l_qseq; n++) {
         quality[n] = q[n]+33;
     }
     if (rec->core.flag & BAM_FREVERSE) reverse(quality);
-    return quality;
+    *qual_out = quality;
+    return 0;
 }
 
 //
@@ -819,49 +841,131 @@ static int getLength(char **s)
     return n;
 }
 
+static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf)
+{
+    uint8_t *s = bam_aux_get(rec, tag);
+    if (s) {
+        char aux_type = *s;
+        switch (aux_type) {
+            case 'C':
+            case 'S': aux_type = 'I'; break;
+            case 'c':
+            case 's': aux_type = 'i'; break;
+            case 'd': aux_type = 'f'; break;
+        }
+
+        // Ensure space.  Need 6 chars + length of tag.  Max length of
+        // i is 16, A is 21, B currently 26, Z is unknown, so
+        // have to check that one later.
+        if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false;
+
+        kputc('\t', linebuf);
+        kputsn(tag, 2, linebuf);
+        kputc(':', linebuf);
+        kputc(aux_type=='I'? 'i': aux_type, linebuf);
+        kputc(':', linebuf);
+        switch (aux_type) {
+            case 'H':
+            case 'Z':
+                if (kputs(bam_aux2Z(s), linebuf) < 0) return false;
+                break;
+            case 'i': kputw(bam_aux2i(s), linebuf); break;
+            case 'I': kputuw(bam_aux2i(s), linebuf); break;
+            case 'A': kputc(bam_aux2A(s), linebuf); break;
+            case 'f': kputd(bam_aux2f(s), linebuf); break;
+            case 'B': kputs("*** Unhandled aux type ***", linebuf); return false;
+            default:  kputs("*** Unknown aux type ***", linebuf); return false;
+       }
+    }
+    return true;
+}
+
+static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec)
+{
+    if (!index_sequence) return 0;
+
+    kstring_t new = {0,0,NULL};
+    if (linebuf->s) {
+        char *s = strchr(linebuf->s, '\n');
+        if (s) {
+            if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0)
+                return -1;
+            *s = 0;
+            kputs(linebuf->s, &new);
+            kputc(' ', &new);
+            readpart readpart = which_readpart(rec);
+            if (readpart == READ_1) kputc('1', &new);
+            else if (readpart == READ_2) kputc('2', &new);
+            else kputc('0', &new);
+
+            kputc(':', &new);
+            if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new);
+            else                              kputc('N', &new);
+
+            kputs(":0:", &new);
+            kputs(index_sequence, &new);
+            kputc('\n', &new);
+            kputs(s+1, &new);
+            free(ks_release(linebuf));
+            linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m;
+        }
+    }
+    return 0;
+}
+
 static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state)
 {
     int i;
 
     linebuf->l = 0;
     // Write read name
-    kputc(state->filetype == FASTA? '>' : '@', linebuf);
-    kputs(bam_get_qname(rec), linebuf);
+    if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false;
+    if (kputs(bam_get_qname(rec), linebuf) < 0) return false;
     // Add the /1 /2 if requested
     if (state->has12) {
         readpart readpart = which_readpart(rec);
-        if (readpart == READ_1) kputs("/1", linebuf);
-        else if (readpart == READ_2) kputs("/2", linebuf);
+        if (readpart == READ_1) {
+            if (kputs("/1", linebuf) < 0) return false;
+        } else if (readpart == READ_2) {
+            if (kputs("/2", linebuf) < 0) return false;
+        }
     }
     if (state->copy_tags) {
         for (i = 0; copied_tags[i]; ++i) {
-            uint8_t *s;
-            if ((s = bam_aux_get(rec, copied_tags[i])) != 0) {
-                if (*s == 'Z') {
-                    kputc('\t', linebuf);
-                    kputsn(copied_tags[i], 2, linebuf);
-                    kputsn(":Z:", 3, linebuf);
-                    kputs(bam_aux2Z(s), linebuf);
-                }
+            if (!copy_tag(copied_tags[i], rec, linebuf)) {
+                fprintf(pysam_stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
+                return false;
+            }
+        }
+    }
+
+    if (state->taglist->size) {
+        kliter_t(ktaglist) *p;
+        for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) {
+            if (!copy_tag(kl_val(p), rec, linebuf)) {
+                fprintf(pysam_stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
+                return false;
             }
         }
     }
-    kputc('\n', linebuf);
-    kputs(seq, linebuf);
-    kputc('\n', linebuf);
+
+    if (kputc('\n', linebuf) < 0) return false;
+    if (kputs(seq, linebuf) < 0) return false;
+    if (kputc('\n', linebuf) < 0) return false;
 
     if (state->filetype == FASTQ) {
         // Write quality
-        kputs("+\n", linebuf);
+        if (kputs("+\n", linebuf) < 0) return false;
         if (qual && *qual) {
-            kputs(qual, linebuf);
+            if (kputs(qual, linebuf) < 0) return false;
         } else {
             int len = strlen(seq);
+            if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false;
             for (i = 0; i < len; ++i) {
                 kputc(33 + state->def_qual, linebuf);
             }
         }
-        kputc('\n', linebuf);
+        if (kputc('\n', linebuf) < 0) return false;
     }
     return true;
 }
@@ -869,21 +973,31 @@ static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *li
 /*
  * Create FASTQ lines from the barcode tag using the index-format 
  */
-static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_t* opts)
+static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts)
 {
     uint8_t *p;
     char *ifmt = opts->index_format;
     char *tag = NULL;
     char *qual = NULL;
+    char *sub_tag = NULL;
+    char *sub_qual = NULL;
+    size_t tag_len;
     int file_number = 0;
     kstring_t linebuf = { 0, 0, NULL }; // Buffer
 
+
     // read barcode tag
     p = bam_aux_get(rec,opts->barcode_tag);
     if (p) tag = bam_aux2Z(p);
 
     if (!tag) return true; // there is no tag
 
+    tag_len = strlen(tag);
+    sub_tag = calloc(1, tag_len + 1);
+    if (!sub_tag) goto fail;
+    sub_qual = calloc(1, tag_len + 1);
+    if (!sub_qual) goto fail;
+
     // read quality tag
     p = bam_aux_get(rec, opts->quality_tag);
     if (p) qual = bam_aux2Z(p);
@@ -894,9 +1008,6 @@ static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_
         char action = *ifmt;        // should be 'i' or 'n'
         ifmt++; // skip over action
         int index_len = getLength(&ifmt);
-
-        char *sub_tag = calloc(1, strlen(tag)+1);
-        char *sub_qual = calloc(1, strlen(tag)+1);
         int n = 0;
 
         if (index_len < 0) {
@@ -918,15 +1029,32 @@ static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_
                 n++;
             }
         }
+        sub_tag[n] = '\0';
+        sub_qual[n] = '\0';
 
         if (action=='i' && *sub_tag && state->fpi[file_number]) {
-            make_fq_line(rec, sub_tag, sub_qual, &linebuf, state);
-            fputs(linebuf.s, state->fpi[file_number++]);
+            //if (file_number==0) state->index_sequence = strdup(sub_tag);    // we're going to need this later...
+            state->index_sequence = strdup(sub_tag);    // we're going to need this later...
+            if (!state->index_sequence) goto fail;
+            if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail;
+            if (state->illumina_tag) {
+                if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf, rec) < 0) {
+                    goto fail;
+                }
+            }
+            if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0)
+                goto fail;
         }
-        free(sub_qual); free(sub_tag);
 
     }
 
+    free(sub_qual); free(sub_tag);
+    free(linebuf.s);
+    return true;
+
+ fail:
+    perror(__func__);
+    free(sub_qual); free(sub_tag);
     free(linebuf.s);
     return true;
 }
@@ -941,25 +1069,32 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t
     char *qual = NULL;
 
     char *seq = get_read(b);
+    if (!seq) return false;
 
     if (state->use_oq) {
         oq = bam_aux_get(b, "OQ");
         if (oq) {
             oq++; 
             qual = strdup(bam_aux2Z(oq));
+            if (!qual) goto fail;
             if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
                 reverse(qual);
             }
         }
     } else {
-        qual = get_quality(b);
+        if (get_quality(b, &qual) < 0) goto fail;
     }
 
-    make_fq_line(b, seq, qual, linebuf, state);
+    if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail;
 
     free(qual);
     free(seq);
     return true;
+
+ fail:
+    free(seq);
+    free(qual);
+    return false;
 }
 
 static void free_opts(bam2fq_opts_t *opts)
@@ -967,6 +1102,7 @@ static void free_opts(bam2fq_opts_t *opts)
     free(opts->barcode_tag);
     free(opts->quality_tag);
     free(opts->index_format);
+    free(opts->extra_tags);
     free(opts);
 }
 
@@ -984,6 +1120,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
     opts->index_format = NULL;
     opts->index_file[0] = NULL;
     opts->index_file[1] = NULL;
+    opts->extra_tags = NULL;
+    opts->compression_level = 1;
 
     int c;
     sam_global_args_init(&opts->ga);
@@ -1000,7 +1138,7 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
         {"quality-tag", required_argument, NULL, 'q'},
         { NULL, 0, NULL, 0 }
     };
-    while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:nNOs:tv:@:", lopts, NULL)) > 0) {
+    while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) {
         switch (c) {
             case 'b': opts->barcode_tag = strdup(optarg); break;
             case 'q': opts->quality_tag = strdup(optarg); break;
@@ -1018,6 +1156,9 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
             case 'O': opts->use_oq = true; break;
             case 's': opts->fnse = optarg; break;
             case 't': opts->copy_tags = true; break;
+            case 'i': opts->illumina_tag = true; break;
+            case 'c': opts->compression_level = atoi(optarg); break;
+            case 'T': opts->extra_tags = strdup(optarg); break;
             case 'v': opts->def_qual = atoi(optarg); break;
             case '?': bam2fq_usage(pysam_stderr, argv[0]); free_opts(opts); return false;
             default:
@@ -1106,6 +1247,24 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
     return true;
 }
 
+static BGZF *open_fqfile(char *filename, int c)
+{
+    char mode[4] = "w";
+    size_t len = strlen(filename);
+
+    mode[2] = 0; mode[3] = 0;
+    if (len > 3 && strstr(filename + (len - 3),".gz")) {
+        mode[1] = 'g'; mode[2] = c+'0';
+    } else if ((len > 4 && strstr(filename + (len - 4),".bgz"))
+               || (len > 5 && strstr(filename + (len - 5),".bgzf"))) {
+        mode[1] = c+'0';
+    } else {
+        mode[1] = 'u';
+    }
+
+    return bgzf_open(filename,mode);
+}
+
 static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
 {
     bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
@@ -1114,9 +1273,29 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
     state->flag_alloff = opts->flag_alloff;
     state->has12 = opts->has12;
     state->use_oq = opts->use_oq;
+    state->illumina_tag = opts->illumina_tag;
     state->copy_tags = opts->copy_tags;
     state->filetype = opts->filetype;
     state->def_qual = opts->def_qual;
+    state->index_sequence = NULL;
+    state->hpysam_stdout = bgzf_dopen(fileno(pysam_stdout), "wu");
+    state->compression_level = opts->compression_level;
+
+    state->taglist = kl_init(ktaglist);
+    if (opts->extra_tags) {
+        char *save_p;
+        char *s = strtok_r(opts->extra_tags, ",", &save_p);
+        while (s) {
+            if (strlen(s) != 2) {
+                fprintf(pysam_stderr, "Parsing extra tags - '%s' is not two characters\n", s);
+                free(state);
+                return false;
+            }
+            char **et = kl_pushp(ktaglist, state->taglist);
+            *et = s;
+            s = strtok_r(NULL, ",", &save_p);
+        }
+    }
 
     state->fp = sam_open(opts->fn_input, "r");
     if (state->fp == NULL) {
@@ -1127,7 +1306,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
     if (opts->ga.nthreads > 0)
         hts_set_threads(state->fp, opts->ga.nthreads);
     uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
-    if (opts->use_oq) rf |= SAM_AUX;
+    if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX;
     if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
         fprintf(pysam_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
         free(state);
@@ -1139,7 +1318,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
         return false;
     }
     if (opts->fnse) {
-        state->fpse = fopen(opts->fnse,"w");
+        state->fpse = open_fqfile(opts->fnse, state->compression_level);
         if (state->fpse == NULL) {
             print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse);
             free(state);
@@ -1150,20 +1329,20 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
     int i;
     for (i = 0; i < 3; ++i) {
         if (opts->fnr[i]) {
-            state->fpr[i] = fopen(opts->fnr[i], "w");
+            state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level);
             if (state->fpr[i] == NULL) {
                 print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", i, opts->fnr[i]);
                 free(state);
                 return false;
             }
         } else {
-            state->fpr[i] = pysam_stdout;
+            state->fpr[i] = state->hpysam_stdout;
         }
     }
     for (i = 0; i < 2; i++) {
         state->fpi[i] = NULL;
         if (opts->index_file[i]) {
-            state->fpi[i] = fopen(opts->index_file[i], "w");
+            state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level);
             if (state->fpi[i] == NULL) {
                 print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]);
                 free(state);
@@ -1188,17 +1367,23 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
     bool valid = true;
     bam_hdr_destroy(state->h);
     check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);
-    if (state->fpse && fclose(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
+    if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
     int i;
     for (i = 0; i < 3; ++i) {
-        if (state->fpr[i] != pysam_stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
+        if (state->fpr[i] == state->hpysam_stdout) {
+            if (i==0 && bgzf_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing STDOUT"); valid = false; }
+        } else {
+            if (bgzf_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
+        }
     }
     for (i = 0; i < 2; i++) {
-        if (state->fpi[i] && fclose(state->fpi[i])) { 
+        if (state->fpi[i] && bgzf_close(state->fpi[i])) { 
             print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
             valid = false;
         }
     }
+    kl_destroy(ktaglist,state->taglist);
+    free(state->index_sequence);
     free(state);
     return valid;
 }
@@ -1212,8 +1397,10 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
 
 }
 
-static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t* opts)
+static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
 {
+    int n;
+    bam1_t *records[3];
     bam1_t* b = bam_init1();
     char *current_qname = NULL;
     int64_t n_reads = 0, n_singletons = 0; // Statistics
@@ -1221,35 +1408,47 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t*
     int score[3];
     int at_eof;
     if (b == NULL ) {
-        perror("[bam2fq_mainloop_singletontrack] Malloc error for bam record buffer.");
+        perror("[bam2fq_mainloop] Malloc error for bam record buffer.");
         return false;
     }
 
     bool valid = true;
     while (true) {
-        at_eof = sam_read1(state->fp, state->h, b) < 0;
+        int res = sam_read1(state->fp, state->h, b);
+        if (res < -1) {
+            fprintf(pysam_stderr, "[bam2fq_mainloop] Failed to read bam record.\n");
+            return false;
+        }
+        at_eof = res < 0;
 
         if (!at_eof && filter_it_out(b, state)) continue;
         if (!at_eof) ++n_reads;
 
         if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) {
             if (current_qname) {
+                if (state->illumina_tag) {
+                    for (n=0; valid && n<3; n++) {
+                        if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false;
+                    }
+                    if (!valid) break;
+                }
+                free(state->index_sequence); state->index_sequence = NULL;
                 if (score[1] > 0 && score[2] > 0) {
                     // print linebuf[1] to fpr[1], linebuf[2] to fpr[2]
-                    if (fputs(linebuf[1].s, state->fpr[1]) == EOF) { valid = false; break; }
-                    if (fputs(linebuf[2].s, state->fpr[2]) == EOF) { valid = false; break; }
-                } else if (score[1] > 0 || score[2] > 0) {
+                    if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
+                    if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
+                } else if ((score[1] > 0 || score[2] > 0) && state->fpse) {
                     // print whichever one exists to fpse
                     if (score[1] > 0) {
-                        if (fputs(linebuf[1].s, state->fpse) == EOF) { valid = false; break; }
+                        if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
                     } else {
-                        if (fputs(linebuf[2].s, state->fpse) == EOF) { valid = false; break; }
+                        if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
                     }
                     ++n_singletons;
                 }
                 if (score[0]) { // TODO: check this
                     // print linebuf[0] to fpr[0]
-                    if (fputs(linebuf[0].s, state->fpr[0]) == EOF) { valid = false; break; }
+                    if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; }
                 }
             }
 
@@ -1257,23 +1456,25 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t*
 
             free(current_qname);
             current_qname = strdup(bam_get_qname(b));
+            if (!current_qname) { valid = false; break; }
             score[0] = score[1] = score[2] = 0;
         }
 
         // Prefer a copy of the read that has base qualities
         int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1;
         if (b_score > score[which_readpart(b)]) {
+            if (state->fpi[0]) if (!tags2fq(b, state, opts)) return false;
+            records[which_readpart(b)] = b;
             if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) {
                 fprintf(pysam_stderr, "[%s] Error converting read to FASTA/Q\n", __func__);
                 return false;
             }
             score[which_readpart(b)] = b_score;
-            if (state->fpi[0]) tags2fq(b, state, opts);
         }
     }
     if (!valid)
     {
-        perror("[bam2fq_mainloop_singletontrack] Error writing to FASTx files.");
+        perror("[bam2fq_mainloop] Error writing to FASTx files.");
     }
     bam_destroy1(b);
     free(current_qname);
@@ -1286,31 +1487,6 @@ static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t*
     return valid;
 }
 
-static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
-{
-    // process a name collated BAM into fastq
-    bam1_t* b = bam_init1();
-    if (b == NULL) {
-        perror(NULL);
-        return false;
-    }
-    int64_t n_reads = 0; // Statistics
-    kstring_t linebuf = { 0, 0, NULL }; // Buffer
-    while (sam_read1(state->fp, state->h, b) >= 0) {
-        if (filter_it_out(b, state)) continue;
-        ++n_reads;
-
-        if (!bam1_to_fq(b, &linebuf, state)) return false;
-        fputs(linebuf.s, state->fpr[which_readpart(b)]);
-        if (state->fpi[0]) tags2fq(b, state, opts);
-    }
-    free(linebuf.s);
-    bam_destroy1(b);
-
-    fprintf(pysam_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
-    return true;
-}
-
 int main_bam2fq(int argc, char *argv[])
 {
     int status = EXIT_SUCCESS;
@@ -1322,11 +1498,7 @@ int main_bam2fq(int argc, char *argv[])
 
     if (!init_state(opts, &state)) return EXIT_FAILURE;
 
-    if (state->fpse) {
-        if (!bam2fq_mainloop_singletontrack(state,opts)) status = EXIT_FAILURE;
-    } else {
-        if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
-    }
+    if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
 
     if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
     sam_global_args_free(&opts->ga);
diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c
index 8ebb52a..bbae50c 100644
--- a/samtools/stats.c.pysam.c
+++ b/samtools/stats.c.pysam.c
@@ -220,7 +220,7 @@ typedef struct
 stats_t;
 KHASH_MAP_INIT_STR(c2stats, stats_t*)
 
-static int error(const char *format, ...);
+static void error(const char *format, ...);
 int is_in_regions(bam1_t *bam_line, stats_t *stats);
 void realloc_buffers(stats_t *stats, int seq_len);
 
@@ -1352,7 +1352,7 @@ void init_group_id(stats_t *stats, const char *id)
 }
 
 
-static int error(const char *format, ...)
+static void error(const char *format, ...)
 {
     if ( !format )
     {
@@ -1379,7 +1379,6 @@ static int error(const char *format, ...)
         fprintf(pysam_stdout, "    -x, --sparse                        Suppress outputting IS rows where there are no insertions.\n");
         sam_global_opt_help(pysam_stdout, "-.--.@");
         fprintf(pysam_stdout, "\n");
-	return(0);
     }
     else
     {
@@ -1647,7 +1646,7 @@ int main_stats(int argc, char *argv[])
             case 'S': info->split_tag = optarg; break;
             case 'P': info->split_prefix = optarg; break;
             case '?':
-	    case 'h': return(error(NULL));
+            case 'h': error(NULL);
             default:
                 if (parse_sam_global_opt(opt, optarg, loptions, &ga) != 0)
                     error("Unknown argument: %s\n", optarg);
@@ -1660,7 +1659,7 @@ int main_stats(int argc, char *argv[])
     if ( !bam_fname )
     {
         if ( isatty(STDIN_FILENO) )
-	  return(error(NULL));
+            error(NULL);
         bam_fname = "-";
     }
 
diff --git a/samtools/version.h b/samtools/version.h
index 004d7ed..1f3fa45 100644
--- a/samtools/version.h
+++ b/samtools/version.h
@@ -1 +1 @@
-#define SAMTOOLS_VERSION "1.4.1"
+#define SAMTOOLS_VERSION "1.5"
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..5cb6c3f
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,8 @@
+[bdist_wheel]
+universal = 0
+
+[tool:pytest]
+# -s: do not capture stdout, conflicts with pysam.dispatch
+# -v: verbose output
+addopts = -s -v
+testpaths = pysam tests
diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py
index 6d9101c..aafa826 100644
--- a/tests/AlignedSegment_test.py
+++ b/tests/AlignedSegment_test.py
@@ -5,13 +5,9 @@ import collections
 import copy
 import array
 
-from TestUtils import checkFieldEqual
+from TestUtils import checkFieldEqual, BAM_DATADIR, WORKDIR
 
 
-SAMTOOLS = "samtools"
-WORKDIR = "pysam_test_work"
-DATADIR = "pysam_data"
-
 
 class ReadTest(unittest.TestCase):
 
@@ -252,6 +248,8 @@ class TestAlignedSegment(ReadTest):
         self.assertEqual(a.infer_query_length(), 35)
         a.cigarstring = '35M5S'
         self.assertEqual(a.infer_query_length(), 40)
+        a.cigarstring = None
+        self.assertEqual(a.infer_query_length(), None)
 
     def test_infer_read_length(self):
         '''Test infer_read_length on M|=|X|I|D|H|S cigar ops'''
@@ -274,6 +272,8 @@ class TestAlignedSegment(ReadTest):
         self.assertEqual(a.infer_read_length(), 40)
         a.cigarstring = '35M5S'
         self.assertEqual(a.infer_read_length(), 40)
+        a.cigarstring = None
+        self.assertEqual(a.infer_read_length(), None)
 
     def test_get_aligned_pairs_soft_clipping(self):
         a = self.buildRead()
@@ -486,7 +486,7 @@ class TestCigarStats(ReadTest):
 
 
 class TestAlignedPairs(unittest.TestCase):
-    filename = os.path.join(DATADIR, "example_aligned_pairs.bam")
+    filename = os.path.join(BAM_DATADIR, "example_aligned_pairs.bam")
 
     def testReferenceBases(self):
         """reference bases should always be the same nucleotide
@@ -633,7 +633,7 @@ class TestTags(ReadTest):
         see http://groups.google.com/group/pysam-user-group/browse_thread/thread/67ca204059ea465a
         '''
         samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex8.bam"),
+            os.path.join(BAM_DATADIR, "ex8.bam"),
             "rb")
 
         for entry in samfile:
@@ -808,11 +808,11 @@ class TestCopy(ReadTest):
 class TestAsString(unittest.TestCase):
 
     def testAsString(self):
-        with open(os.path.join(DATADIR, "ex2.sam")) as samf:
+        with open(os.path.join(BAM_DATADIR, "ex2.sam")) as samf:
             reference = [x[:-1] for x in samf if not x.startswith("@")]
 
         with pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex2.bam"), "r") as pysamf:
+            os.path.join(BAM_DATADIR, "ex2.bam"), "r") as pysamf:
             for s, p in zip(reference, pysamf):
                 self.assertEqual(s, p.tostring(pysamf))
 
diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py
index a866881..f81d752 100644
--- a/tests/AlignmentFile_test.py
+++ b/tests/AlignmentFile_test.py
@@ -26,11 +26,7 @@ import pysam
 import pysam.samtools
 from TestUtils import checkBinaryEqual, checkURL, \
     check_samtools_view_equal, checkFieldEqual, force_str, \
-    get_temp_filename
-
-
-DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                       "pysam_data"))
+    get_temp_filename, BAM_DATADIR
 
 
 ##################################################
@@ -49,7 +45,7 @@ class BasicTestBAMFromFetch(unittest.TestCase):
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex3.bam"),
+            os.path.join(BAM_DATADIR, "ex3.bam"),
             "rb")
         self.reads = list(self.samfile.fetch())
 
@@ -273,7 +269,7 @@ class BasicTestSAMFromFetch(BasicTestBAMFromFetch):
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex3.sam"),
+            os.path.join(BAM_DATADIR, "ex3.sam"),
             "r")
         self.reads = list(self.samfile.fetch())
 
@@ -282,7 +278,7 @@ class BasicTestCRAMFromFetch(BasicTestBAMFromFetch):
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex3.cram"),
+            os.path.join(BAM_DATADIR, "ex3.cram"),
             "rc")
         self.reads = list(self.samfile.fetch())
 
@@ -328,7 +324,7 @@ class BasicTestSAMFromFilename(BasicTestBAMFromFetch):
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex3.sam"),
+            os.path.join(BAM_DATADIR, "ex3.sam"),
             "r")
         self.reads = [r for r in self.samfile]
 
@@ -337,7 +333,7 @@ class BasicTestCRAMFromFilename(BasicTestCRAMFromFetch):
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex3.cram"),
+            os.path.join(BAM_DATADIR, "ex3.cram"),
             "rc")
         self.reads = [r for r in self.samfile]
 
@@ -346,7 +342,7 @@ class BasicTestBAMFromFilename(BasicTestBAMFromFetch):
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex3.bam"),
+            os.path.join(BAM_DATADIR, "ex3.bam"),
             "rb")
         self.reads = [r for r in self.samfile]
 
@@ -354,7 +350,7 @@ class BasicTestBAMFromFilename(BasicTestBAMFromFetch):
 class BasicTestBAMFromFile(BasicTestBAMFromFetch):
 
     def setUp(self):
-        with open(os.path.join(DATADIR, "ex3.bam")) as f:
+        with open(os.path.join(BAM_DATADIR, "ex3.bam")) as f:
             self.samfile = pysam.AlignmentFile(
                 f, "rb")
         self.reads = [r for r in self.samfile]
@@ -363,7 +359,7 @@ class BasicTestBAMFromFile(BasicTestBAMFromFetch):
 class BasicTestBAMFromFileNo(BasicTestBAMFromFetch):
 
     def setUp(self):
-        with open(os.path.join(DATADIR, "ex3.bam")) as f:
+        with open(os.path.join(BAM_DATADIR, "ex3.bam")) as f:
             self.samfile = pysam.AlignmentFile(
                 f.fileno(), "rb")
         self.reads = [r for r in self.samfile]
@@ -372,7 +368,7 @@ class BasicTestBAMFromFileNo(BasicTestBAMFromFetch):
 class BasicTestSAMFromFile(BasicTestBAMFromFetch):
 
     def setUp(self):
-        with open(os.path.join(DATADIR, "ex3.sam")) as f:
+        with open(os.path.join(BAM_DATADIR, "ex3.sam")) as f:
             self.samfile = pysam.AlignmentFile(
                 f, "r")
         self.reads = [r for r in self.samfile]
@@ -381,7 +377,7 @@ class BasicTestSAMFromFile(BasicTestBAMFromFetch):
 class BasicTestSAMFromFileNo(BasicTestBAMFromFetch):
 
     def setUp(self):
-        with open(os.path.join(DATADIR, "ex3.sam")) as f:
+        with open(os.path.join(BAM_DATADIR, "ex3.sam")) as f:
             self.samfile = pysam.AlignmentFile(
                 f.fileno(), "r")
         self.reads = [r for r in self.samfile]
@@ -390,7 +386,7 @@ class BasicTestSAMFromFileNo(BasicTestBAMFromFetch):
 class BasicTestCRAMFromFile(BasicTestCRAMFromFetch):
 
     def setUp(self):
-        with open(os.path.join(DATADIR, "ex3.cram")) as f:
+        with open(os.path.join(BAM_DATADIR, "ex3.cram")) as f:
             self.samfile = pysam.AlignmentFile(f, "rc")
         self.reads = [r for r in self.samfile]
 
@@ -398,7 +394,7 @@ class BasicTestCRAMFromFile(BasicTestCRAMFromFetch):
 class BasicTestCRAMFromFileNo(BasicTestCRAMFromFetch):
 
     def setUp(self):
-        with open(os.path.join(DATADIR, "ex3.cram")) as f:
+        with open(os.path.join(BAM_DATADIR, "ex3.cram")) as f:
             self.samfile = pysam.AlignmentFile(
                 f.fileno(), "rc")
         self.reads = [r for r in self.samfile]
@@ -408,7 +404,7 @@ class BasicTestSAMFromStringIO(BasicTestBAMFromFetch):
 
     def testRaises(self):
         statement = "samtools view -h {}".format(
-                os.path.join(DATADIR, "ex3.bam"))
+                os.path.join(BAM_DATADIR, "ex3.bam"))
         stdout = subprocess.check_output(statement.split(" "))
         bam = StringIO()
         if sys.version_info.major >= 3:
@@ -461,7 +457,7 @@ class TestIO(unittest.TestCase):
         '''
 
         with pysam.AlignmentFile(
-                os.path.join(DATADIR, input_filename),
+                os.path.join(BAM_DATADIR, input_filename),
                 input_mode) as infile:
 
             if "b" in input_mode:
@@ -498,7 +494,7 @@ class TestIO(unittest.TestCase):
             outfile.close()
 
         self.assertTrue(checkf(
-            os.path.join(DATADIR, reference_filename),
+            os.path.join(BAM_DATADIR, reference_filename),
             output_filename),
             "files %s and %s are not the same" %
             (reference_filename,
@@ -530,7 +526,7 @@ class TestIO(unittest.TestCase):
                        "ex2.cram",
                        "tmp_ex2.cram",
                        "rc", "wc",
-                       sequence_filename="pysam_data/ex1.fa",
+                       sequence_filename=os.path.join(BAM_DATADIR, "ex1.fa"),
                        checkf=check_samtools_view_equal)
 
     def testSAM2BAM(self):
@@ -551,7 +547,7 @@ class TestIO(unittest.TestCase):
                        "ex2.cram",
                        "tmp_ex2.cram",
                        "rb", "wc",
-                       sequence_filename="pysam_data/ex1.fa",
+                       sequence_filename=os.path.join(BAM_DATADIR, "ex1.fa"),
                        checkf=partial(
                            check_samtools_view_equal,
                            without_header=True))
@@ -562,7 +558,7 @@ class TestIO(unittest.TestCase):
                        "ex2.bam",
                        "tmp_ex2.bam",
                        "rc", "wb",
-                       sequence_filename="pysam_data/ex1.fa",
+                       sequence_filename=os.path.join(BAM_DATADIR, "ex1.fa"),
                        checkf=partial(
                            check_samtools_view_equal,
                            without_header=True))
@@ -572,7 +568,7 @@ class TestIO(unittest.TestCase):
                        "ex2.cram",
                        "tmp_ex2.cram",
                        "r", "wc",
-                       sequence_filename="pysam_data/ex1.fa",
+                       sequence_filename=os.path.join(BAM_DATADIR, "ex1.fa"),
                        checkf=partial(
                            check_samtools_view_equal,
                            without_header=True))
@@ -582,7 +578,7 @@ class TestIO(unittest.TestCase):
                        "ex2.sam",
                        "tmp_ex2.sam",
                        "rc", "wh",
-                       sequence_filename="pysam_data/ex1.fa",
+                       sequence_filename=os.path.join(BAM_DATADIR, "ex1.fa"),
                        checkf=partial(
                            check_samtools_view_equal,
                            without_header=True))
@@ -601,7 +597,7 @@ class TestIO(unittest.TestCase):
     def testReadSamWithoutTargetNames(self):
         '''see issue 104.'''
         input_filename = os.path.join(
-            DATADIR,
+            BAM_DATADIR,
             "example_unmapped_reads_no_sq.sam")
 
         # raise exception in default mode
@@ -626,7 +622,7 @@ class TestIO(unittest.TestCase):
     def testReadBamWithoutTargetNames(self):
         '''see issue 104.'''
         input_filename = os.path.join(
-            DATADIR, "example_unmapped_reads_no_sq.bam")
+            BAM_DATADIR, "example_unmapped_reads_no_sq.bam")
 
         # raise exception in default mode
         self.assertRaises(ValueError,
@@ -646,7 +642,7 @@ class TestIO(unittest.TestCase):
             result = list(infile.fetch(until_eof=True))
 
     def test_fail_read_sam_without_header(self):
-        input_filename = os.path.join(DATADIR, "ex1.sam")
+        input_filename = os.path.join(BAM_DATADIR, "ex1.sam")
 
         self.assertRaises(ValueError,
                           pysam.AlignmentFile,
@@ -654,20 +650,20 @@ class TestIO(unittest.TestCase):
                           "r")
 
     def test_pass_read_sam_without_header_with_refs(self):
-        with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.sam"),
+        with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.sam"),
                                  "r",
                                  reference_names=["chr1", "chr2"],
                                  reference_lengths=[1575, 1584]) as samfile:
             self.assertEqual(len(list(samfile.fetch(until_eof=True))), 3270)
 
     def test_pass_read_sam_with_header_without_header_check(self):
-        with pysam.AlignmentFile(os.path.join(DATADIR, "ex2.sam"),
+        with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex2.sam"),
                                  "r", check_header=False) as samfile:
             self.assertEqual(len(list(samfile.fetch(until_eof=True))), 3270)
 
     def test_fail_when_reading_unformatted_files(self):
         '''test reading from a file that is not bam/sam formatted'''
-        input_filename = os.path.join(DATADIR, 'Makefile')
+        input_filename = os.path.join(BAM_DATADIR, 'Makefile')
 
         self.assertRaises(ValueError,
                           pysam.AlignmentFile,
@@ -681,7 +677,7 @@ class TestIO(unittest.TestCase):
 
     def testBAMWithoutAlignedSegments(self):
         '''see issue 117'''
-        input_filename = os.path.join(DATADIR, "test_unaligned.bam")
+        input_filename = os.path.join(BAM_DATADIR, "test_unaligned.bam")
         samfile = pysam.AlignmentFile(input_filename,
                                       "rb",
                                       check_sq=False)
@@ -689,7 +685,7 @@ class TestIO(unittest.TestCase):
 
     def testBAMWithShortBAI(self):
         '''see issue 116'''
-        input_filename = os.path.join(DATADIR, "example_bai.bam")
+        input_filename = os.path.join(BAM_DATADIR, "example_bai.bam")
         samfile = pysam.AlignmentFile(input_filename,
                                       "rb",
                                       check_sq=False)
@@ -698,14 +694,14 @@ class TestIO(unittest.TestCase):
     def testFetchFromClosedFile(self):
 
         samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex1.bam"),
+            os.path.join(BAM_DATADIR, "ex1.bam"),
             "rb")
         samfile.close()
         self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120)
 
     def testFetchFromClosedFileObject(self):
 
-        f = open(os.path.join(DATADIR, "ex1.bam"))
+        f = open(os.path.join(BAM_DATADIR, "ex1.bam"))
         samfile = pysam.AlignmentFile(f, "rb")
         f.close()
         self.assertTrue(f.closed)
@@ -715,7 +711,7 @@ class TestIO(unittest.TestCase):
                        "tmp_ex1.bam",
                        "rb", "wb")
 
-        f = open(os.path.join(DATADIR, "ex1.bam"))
+        f = open(os.path.join(BAM_DATADIR, "ex1.bam"))
         samfile = pysam.AlignmentFile(f, "rb")
         self.assertFalse(f.closed)
         samfile.close()
@@ -725,7 +721,7 @@ class TestIO(unittest.TestCase):
     def testClosedFile(self):
         '''test that access to a closed samfile raises ValueError.'''
 
-        samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
+        samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"),
                                       "rb")
         samfile.close()
         self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120)
@@ -748,7 +744,7 @@ class TestIO(unittest.TestCase):
     # def testReadingFromSamFileWithoutHeader(self):
     #     '''read from samfile without header.
     #     '''
-    #     samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex7.sam"),
+    #     samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex7.sam"),
     #                             check_header=False,
     #                             check_sq=False)
     #     self.assertRaises(NotImplementedError, samfile.__iter__)
@@ -756,15 +752,15 @@ class TestIO(unittest.TestCase):
     def testReadingFromFileWithoutIndex(self):
         '''read from bam file without index.'''
 
-        shutil.copyfile(os.path.join(DATADIR, "ex2.bam"),
-                        'tmp_ex2.bam')
-        samfile = pysam.AlignmentFile('tmp_ex2.bam',
+        shutil.copyfile(os.path.join(BAM_DATADIR, "ex2.bam"),
+                        'tests/tmp_ex2.bam')
+        samfile = pysam.AlignmentFile('tests/tmp_ex2.bam',
                                       "rb")
         self.assertRaises(ValueError, samfile.fetch)
         self.assertEqual(
             len(list(samfile.fetch(until_eof=True))),
             3270)
-        os.unlink('tmp_ex2.bam')
+        os.unlink('tests/tmp_ex2.bam')
 
     # def testReadingUniversalFileMode(self):
     #     '''read from samfile without header.
@@ -781,7 +777,7 @@ class TestIO(unittest.TestCase):
 
     def testHead(self):
         '''test IteratorRowHead'''
-        samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
+        samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"),
                                       "rb")
         l10 = list(samfile.head(10))
         l100 = list(samfile.head(100))
@@ -808,7 +804,7 @@ class TestIO(unittest.TestCase):
                        "r", "wbu")
 
     def testEmptyBAM(self):
-        samfile = pysam.Samfile(os.path.join(DATADIR, "empty.bam"),
+        samfile = pysam.Samfile(os.path.join(BAM_DATADIR, "empty.bam"),
                                 "rb")
         self.assertEqual(samfile.mapped, 0)
         self.assertEqual(samfile.unmapped, 0)
@@ -818,11 +814,11 @@ class TestIO(unittest.TestCase):
         self.assertRaises(
             ValueError,
             pysam.Samfile,
-            os.path.join(DATADIR, "example_empty_with_header.bam"),
+            os.path.join(BAM_DATADIR, "example_empty_with_header.bam"),
             "rb")
 
         samfile = pysam.Samfile(
-            os.path.join(DATADIR, "example_empty_with_header.bam"),
+            os.path.join(BAM_DATADIR, "example_empty_with_header.bam"),
             "rb",
             check_sq=False)
         self.assertEqual(samfile.mapped, 0)
@@ -833,20 +829,20 @@ class TestIO(unittest.TestCase):
     def testOpenFromFilename(self):
 
         samfile = pysam.AlignmentFile(
-            filename=os.path.join(DATADIR, "ex1.bam"),
+            filename=os.path.join(BAM_DATADIR, "ex1.bam"),
             mode="rb")
         self.assertEqual(len(list(samfile.fetch())), 3270)
 
     def testBAMWithCSIIndex(self):
         '''see issue 116'''
-        input_filename = os.path.join(DATADIR, "ex1_csi.bam")
+        input_filename = os.path.join(BAM_DATADIR, "ex1_csi.bam")
         samfile = pysam.AlignmentFile(input_filename,
                                       "rb",
                                       check_sq=False)
         samfile.fetch('chr2')
 
     def test_fetch_by_tid(self):
-        with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), "rb") as samfile:
+        with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"), "rb") as samfile:
             self.assertEqual(len(list(samfile.fetch('chr1'))),
                              len(list(samfile.fetch(tid=0))))
             self.assertEqual(len(list(samfile.fetch('chr2'))),
@@ -861,7 +857,7 @@ class TestIO(unittest.TestCase):
                 tid=-1)
             self.assertEqual(len(list(samfile.fetch('chr1',start=1000, end=2000))),
                              len(list(samfile.fetch(tid=0, start=1000, end=2000))))
-            
+
 
 class TestAutoDetect(unittest.TestCase):
 
@@ -869,7 +865,7 @@ class TestAutoDetect(unittest.TestCase):
         """test SAM autodetection."""
 
         with pysam.AlignmentFile(
-                os.path.join(DATADIR, "ex3.sam")) as inf:
+                os.path.join(BAM_DATADIR, "ex3.sam")) as inf:
             self.assertFalse(inf.is_bam)
             self.assertFalse(inf.is_cram)
 
@@ -879,7 +875,7 @@ class TestAutoDetect(unittest.TestCase):
         """test BAM autodetection."""
 
         with pysam.AlignmentFile(
-                os.path.join(DATADIR, "ex3.bam")) as inf:
+                os.path.join(BAM_DATADIR, "ex3.bam")) as inf:
             self.assertTrue(inf.is_bam)
             self.assertFalse(inf.is_cram)
             self.assertEqual(len(list(inf.fetch('chr1'))), 1)
@@ -889,7 +885,7 @@ class TestAutoDetect(unittest.TestCase):
         """test CRAM autodetection."""
 
         with pysam.AlignmentFile(
-                os.path.join(DATADIR, "ex3.cram")) as inf:
+                os.path.join(BAM_DATADIR, "ex3.cram")) as inf:
             self.assertFalse(inf.is_bam)
             self.assertTrue(inf.is_cram)
             self.assertEqual(len(list(inf.fetch('chr1'))), 1)
@@ -903,7 +899,7 @@ class TestAutoDetect(unittest.TestCase):
 ##################################################
 class TestIteratorRowBAM(unittest.TestCase):
 
-    filename = os.path.join(DATADIR, "ex2.bam")
+    filename = os.path.join(BAM_DATADIR, "ex2.bam")
     mode = "rb"
     reference_filename = None
 
@@ -967,7 +963,7 @@ class TestIteratorRowBAM(unittest.TestCase):
 
 class TestIteratorRowAllBAM(unittest.TestCase):
 
-    filename = os.path.join(DATADIR, "ex2.bam")
+    filename = os.path.join(BAM_DATADIR, "ex2.bam")
     mode = "rb"
 
     def setUp(self):
@@ -1009,7 +1005,7 @@ class TestIteratorColumnBAM(unittest.TestCase):
                   }
 
     def setUp(self):
-        self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex4.bam"),
+        self.samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex4.bam"),
                                            "rb")
 
     def checkRange(self, contig, start=None, end=None, truncate=False):
@@ -1080,12 +1076,12 @@ class TestIteratorColumnBAM(unittest.TestCase):
 
 
 class TestIteratorRowCRAM(TestIteratorRowBAM):
-    filename = os.path.join(DATADIR, "ex2.cram")
+    filename = os.path.join(BAM_DATADIR, "ex2.cram")
     mode = "rc"
 
 
 class TestIteratorRowCRAMWithReferenceFilename(TestIteratorRowCRAM):
-    reference_filename = os.path.join(DATADIR, "ex1.fa")
+    reference_filename = os.path.join(BAM_DATADIR, "ex1.fa")
 
 
 ##########################################################
@@ -1105,7 +1101,7 @@ class TestIteratorColumn2(unittest.TestCase):
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex1.bam"),
+            os.path.join(BAM_DATADIR, "ex1.bam"),
             "rb")
 
     def testStart(self):
@@ -1152,7 +1148,7 @@ class TestFloatTagBug(unittest.TestCase):
 
         Fixed in 0.1.19
         '''
-        samfile = pysam.AlignmentFile(os.path.join(DATADIR, "tag_bug.bam"))
+        samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "tag_bug.bam"))
         read = next(samfile.fetch(until_eof=True))
         self.assertTrue(('XC', 1) in read.tags)
         self.assertEqual(read.opt('XC'), 1)
@@ -1167,7 +1163,7 @@ class TestLargeFieldBug(unittest.TestCase):
         causes an error:
             NotImplementedError: tags field too large
         '''
-        samfile = pysam.AlignmentFile(os.path.join(DATADIR, "issue100.bam"))
+        samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "issue100.bam"))
         read = next(samfile.fetch(until_eof=True))
         new_read = pysam.AlignedSegment()
         new_read.tags = read.tags
@@ -1205,14 +1201,19 @@ class TestTagParsing(unittest.TestCase):
     def testNegativeIntegers2(self):
         x = -2
         r = self.makeRead()
-        r.tags = [("XD", int(x))]
+        r.tags = [("XD", x)]
         outfile = pysam.AlignmentFile(
-            "test.bam",
+            "tests/test.bam",
             "wb",
             referencenames=("chr1",),
             referencelengths = (1000,))
         outfile.write(r)
         outfile.close()
+        infile = pysam.AlignmentFile("tests/test.bam")
+        r = next(infile)
+        self.assertEqual(r.tags, [("XD", x)])
+        infile.close()
+        os.unlink("tests/test.bam")
 
     def testCigarString(self):
         r = self.makeRead()
@@ -1283,7 +1284,7 @@ class TestClipping(unittest.TestCase):
     def testClipping(self):
 
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "softclip.bam"),
+            os.path.join(BAM_DATADIR, "softclip.bam"),
             "rb")
 
         for read in self.samfile:
@@ -1354,7 +1355,7 @@ class TestHeaderSAM(unittest.TestCase):
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex3.sam"),
+            os.path.join(BAM_DATADIR, "ex3.sam"),
             "r")
 
     def testHeaders(self):
@@ -1379,7 +1380,7 @@ class TestHeaderBAM(TestHeaderSAM):
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex3.bam"),
+            os.path.join(BAM_DATADIR, "ex3.bam"),
             "rb")
 
 
@@ -1387,7 +1388,7 @@ class TestHeaderCRAM(TestHeaderSAM):
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex3.cram"),
+            os.path.join(BAM_DATADIR, "ex3.cram"),
             "rc")
 
     def compareHeaders(self, a, b):
@@ -1484,7 +1485,7 @@ class TestHeaderWriteRead(unittest.TestCase):
                 fn,
                 flag_write,
                 header=header,
-                reference_filename="pysam_data/ex1.fa") as outf:
+                reference_filename=os.path.join(BAM_DATADIR, "ex1.fa")) as outf:
             a = pysam.AlignedSegment()
             a.query_name = "abc"
             outf.write(a)
@@ -1512,13 +1513,13 @@ class TestUnmappedReads(unittest.TestCase):
 
     # TODO
     # def testSAM(self):
-    #     samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex5.sam"),
+    #     samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex5.sam"),
     #                             "r")
     #     self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2)
     #     samfile.close()
 
     def testBAM(self):
-        samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex5.bam"),
+        samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex5.bam"),
                                       "rb")
         self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2)
         samfile.close()
@@ -1527,7 +1528,7 @@ class TestUnmappedReads(unittest.TestCase):
 class TestPileupObjects(unittest.TestCase):
 
     def setUp(self):
-        self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
+        self.samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"),
                                            "rb")
 
     def testPileupColumn(self):
@@ -1588,7 +1589,7 @@ class TestPileupObjects(unittest.TestCase):
 class TestContextManager(unittest.TestCase):
 
     def testManager(self):
-        with pysam.AlignmentFile(os.path.join(DATADIR, 'ex1.bam'),
+        with pysam.AlignmentFile(os.path.join(BAM_DATADIR, 'ex1.bam'),
                                  'rb') as samfile:
             samfile.fetch()
         self.assertEqual(samfile.closed, True)
@@ -1597,7 +1598,7 @@ class TestContextManager(unittest.TestCase):
 class TestExceptions(unittest.TestCase):
 
     def setUp(self):
-        self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
+        self.samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"),
                                            "rb")
 
     def testMissingFile(self):
@@ -1674,14 +1675,14 @@ class TestWrongFormat(unittest.TestCase):
     def testOpenSamAsBam(self):
         self.assertRaises(ValueError,
                           pysam.AlignmentFile,
-                          os.path.join(DATADIR, 'ex1.sam'),
+                          os.path.join(BAM_DATADIR, 'ex1.sam'),
                           'rb')
 
     def testOpenBamAsSam(self):
         # test fails, needs to be implemented.
         # sam.fetch() fails on reading, not on opening
         #self.assertRaises(ValueError, pysam.AlignmentFile,
-        #                  os.path.join(DATADIR, 'ex1.bam'),
+        #                  os.path.join(BAM_DATADIR, 'ex1.bam'),
         #                  'r')
         pass
 
@@ -1694,7 +1695,7 @@ class TestWrongFormat(unittest.TestCase):
     def testOpenFastaAsBam(self):
         self.assertRaises(ValueError,
                           pysam.AlignmentFile,
-                          os.path.join(DATADIR, 'ex1.fa'),
+                          os.path.join(BAM_DATADIR, 'ex1.fa'),
                           'rb')
 
 
@@ -1712,8 +1713,8 @@ class TestDeNovoConstruction(unittest.TestCase):
               'SQ': [{'LN': 1575, 'SN': 'chr1'},
                      {'LN': 1584, 'SN': 'chr2'}], }
 
-    bamfile = os.path.join(DATADIR, "ex6.bam")
-    samfile = os.path.join(DATADIR, "ex6.sam")
+    bamfile = os.path.join(BAM_DATADIR, "ex6.bam")
+    samfile = os.path.join(BAM_DATADIR, "ex6.sam")
 
     def setUp(self):
 
@@ -1816,8 +1817,8 @@ class TestDeNovoConstructionUserTags(TestDeNovoConstruction):
               'x3': {'A': 6, 'B': 5},
               'x2': {'A': 4, 'B': 5}}
 
-    bamfile = os.path.join(DATADIR, "example_user_header.bam")
-    samfile = os.path.join(DATADIR, "example_user_header.sam")
+    bamfile = os.path.join(BAM_DATADIR, "example_user_header.bam")
+    samfile = os.path.join(BAM_DATADIR, "example_user_header.sam")
 
 
 class TestEmptyHeader(unittest.TestCase):
@@ -1825,7 +1826,7 @@ class TestEmptyHeader(unittest.TestCase):
     '''see issue 84.'''
 
     def testEmptyHeader(self):
-        s = pysam.AlignmentFile(os.path.join(DATADIR,
+        s = pysam.AlignmentFile(os.path.join(BAM_DATADIR,
                                              'example_empty_header.bam'))
         self.assertEqual(s.header, {'SQ': [{'LN': 1000, 'SN': 'chr1'}]})
 
@@ -1835,7 +1836,7 @@ class TestHeaderWithProgramOptions(unittest.TestCase):
     '''see issue 39.'''
 
     def testHeader(self):
-        s = pysam.AlignmentFile(os.path.join(DATADIR,
+        s = pysam.AlignmentFile(os.path.join(BAM_DATADIR,
                                              'rg_with_tab.bam'))
         self.assertEqual(
             s.header,
@@ -1856,10 +1857,10 @@ class TestTruncatedBAM(unittest.TestCase):
     def testTruncatedBam2(self):
         self.assertRaises(IOError,
                           pysam.AlignmentFile,
-                          os.path.join(DATADIR, 'ex2_truncated.bam'))
+                          os.path.join(BAM_DATADIR, 'ex2_truncated.bam'))
 
     def testTruncatedBam2(self):
-        s = pysam.AlignmentFile(os.path.join(DATADIR, 'ex2_truncated.bam'),
+        s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, 'ex2_truncated.bam'),
                                 ignore_truncation=True)
         iterall = lambda x: len([a for a in x])
         self.assertRaises(IOError, iterall, s)
@@ -1899,7 +1900,7 @@ class TestBTagSam(unittest.TestCase):
                [12, 15],
                [-1.0, 5.0, 2.5]]
 
-    filename = os.path.join(DATADIR, 'example_btag.sam')
+    filename = os.path.join(BAM_DATADIR, 'example_btag.sam')
 
     read0 = [('RG', 'QW85I'),
              ('PG', 'tmap'),
@@ -1920,14 +1921,14 @@ class TestBTagSam(unittest.TestCase):
             tags = read.tags
             if x == 0:
                 self.assertEqual(tags, self.read0)
-            
+
             fz = list(dict(tags)["FZ"])
             self.assertEqual(fz, self.compare[x])
             self.assertEqual(list(read.opt("FZ")), self.compare[x])
             self.assertEqual(tags, read.get_tags())
             for tag, value in tags:
                 self.assertEqual(value, read.get_tag(tag))
-            
+
     def testReadWriteTags(self):
 
         s = pysam.AlignmentFile(self.filename)
@@ -1935,7 +1936,7 @@ class TestBTagSam(unittest.TestCase):
             before = read.tags
             read.tags = before
             self.assertEqual(read.tags, before)
-            
+
             read.set_tags(before)
             self.assertEqual(read.tags, before)
 
@@ -1945,13 +1946,13 @@ class TestBTagSam(unittest.TestCase):
 
 
 class TestBTagBam(TestBTagSam):
-    filename = os.path.join(DATADIR, 'example_btag.bam')
+    filename = os.path.join(BAM_DATADIR, 'example_btag.bam')
 
 
 class TestDoubleFetchBAM(unittest.TestCase):
     '''check if two iterators on the same bamfile are independent.'''
 
-    filename = os.path.join(DATADIR, 'ex1.bam')
+    filename = os.path.join(BAM_DATADIR, 'ex1.bam')
     mode = "rb"
 
     def testDoubleFetch(self):
@@ -1988,14 +1989,14 @@ class TestDoubleFetchBAM(unittest.TestCase):
 
 
 class TestDoubleFetchCRAM(TestDoubleFetchBAM):
-    filename = os.path.join(DATADIR, 'ex2.cram')
+    filename = os.path.join(BAM_DATADIR, 'ex2.cram')
     mode = "rc"
 
 
 class TestDoubleFetchCRAMWithReference(TestDoubleFetchBAM):
-    filename = os.path.join(DATADIR, 'ex2.cram')
+    filename = os.path.join(BAM_DATADIR, 'ex2.cram')
     mode = "rc"
-    reference_filename = os.path.join(DATADIR, 'ex1.fa')
+    reference_filename = os.path.join(BAM_DATADIR, 'ex1.fa')
 
 
 class TestRemoteFileFTP(unittest.TestCase):
@@ -2032,7 +2033,7 @@ class TestRemoteFileHTTP(unittest.TestCase):
 
     url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/ex1.bam"
     region = "chr1:1-1000"
-    local = os.path.join(DATADIR, "ex1.bam")
+    local = os.path.join(BAM_DATADIR, "ex1.bam")
 
     def testView(self):
         if not checkURL(self.url):
@@ -2110,13 +2111,13 @@ class TestLargeOptValues(unittest.TestCase):
 
     def testSAM(self):
         samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex10.sam"),
+            os.path.join(BAM_DATADIR, "ex10.sam"),
             "r")
         self.check(samfile)
 
     def testBAM(self):
         samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex10.bam"),
+            os.path.join(BAM_DATADIR, "ex10.bam"),
             "rb")
         self.check(samfile)
 
@@ -2125,8 +2126,8 @@ class TestPileup(unittest.TestCase):
 
     '''test pileup functionality.'''
 
-    samfilename = "pysam_data/ex1.bam"
-    fastafilename = "pysam_data/ex1.fa"
+    samfilename = os.path.join(BAM_DATADIR, "ex1.bam")
+    fastafilename = os.path.join(BAM_DATADIR, "ex1.fa")
 
     def setUp(self):
 
@@ -2174,8 +2175,8 @@ class TestPileup(unittest.TestCase):
 
 class TestCountCoverage(unittest.TestCase):
 
-    samfilename = "pysam_data/ex1.bam"
-    fastafilename = "pysam_data/ex1.fa"
+    samfilename = os.path.join(BAM_DATADIR, "ex1.bam")
+    fastafilename = os.path.join(BAM_DATADIR, "ex1.fa")
 
     def setUp(self):
 
@@ -2183,7 +2184,7 @@ class TestCountCoverage(unittest.TestCase):
         self.fastafile = pysam.Fastafile(self.fastafilename)
 
         samfile = pysam.AlignmentFile(
-            "test_count_coverage_read_all.bam", 'wb',
+            "tests/test_count_coverage_read_all.bam", 'wb',
             template=self.samfile)
         for ii, read in enumerate(self.samfile.fetch()):
             # if ii % 2 == 0: # setting BFUNMAP makes no sense...
@@ -2196,11 +2197,13 @@ class TestCountCoverage(unittest.TestCase):
                 read.flag = read.flag | 0x400
             samfile.write(read)
         samfile.close()
-        pysam.samtools.index("test_count_coverage_read_all.bam")
+        pysam.samtools.index("tests/test_count_coverage_read_all.bam")
 
     def tearDown(self):
         self.samfile.close()
         self.fastafile.close()
+        os.unlink("tests/test_count_coverage_read_all.bam")
+        os.unlink("tests/test_count_coverage_read_all.bam.bai")
 
     def count_coverage_python(self, bam, chrom, start, stop,
                               read_callback,
@@ -2301,7 +2304,7 @@ class TestCountCoverage(unittest.TestCase):
         def filter(read):
             return not (read.flag & (0x4 | 0x100 | 0x200 | 0x400))
 
-        with pysam.AlignmentFile("test_count_coverage_read_all.bam") as samfile:
+        with pysam.AlignmentFile("tests/test_count_coverage_read_all.bam") as samfile:
 
             fast_counts = samfile.count_coverage(
                 chrom, start, stop,
@@ -2314,9 +2317,6 @@ class TestCountCoverage(unittest.TestCase):
                     read.flag & (0x4 | 0x100 | 0x200 | 0x400)),
                 quality_threshold=0)
 
-        os.unlink("test_count_coverage_read_all.bam")
-        os.unlink("test_count_coverage_read_all.bam.bai")
-
         self.assertEqual(fast_counts[0], manual_counts[0])
         self.assertEqual(fast_counts[1], manual_counts[1])
         self.assertEqual(fast_counts[2], manual_counts[2])
@@ -2324,7 +2324,7 @@ class TestCountCoverage(unittest.TestCase):
 
     def test_count_coverage_nofilter(self):
         samfile = pysam.AlignmentFile(
-            "test_count_coverage_nofilter.bam", 'wb', template=self.samfile)
+            "tests/test_count_coverage_nofilter.bam", 'wb', template=self.samfile)
         for ii, read in enumerate(self.samfile.fetch()):
             # if ii % 2 == 0: # setting BFUNMAP makes no sense...
                 #read.flag = read.flag | 0x4
@@ -2336,12 +2336,12 @@ class TestCountCoverage(unittest.TestCase):
                 read.flag = read.flag | 0x400
             samfile.write(read)
         samfile.close()
-        pysam.samtools.index("test_count_coverage_nofilter.bam")
+        pysam.samtools.index("tests/test_count_coverage_nofilter.bam")
         chr = 'chr1'
         start = 0
         stop = 2000
 
-        with pysam.AlignmentFile("test_count_coverage_nofilter.bam") as samfile:
+        with pysam.AlignmentFile("tests/test_count_coverage_nofilter.bam") as samfile:
 
             fast_counts = samfile.count_coverage(chr, start, stop,
                                                  read_callback='nofilter',
@@ -2351,8 +2351,8 @@ class TestCountCoverage(unittest.TestCase):
                                                        read_callback=lambda x: True,
                                                        quality_threshold=0)
 
-        os.unlink("test_count_coverage_nofilter.bam")
-        os.unlink("test_count_coverage_nofilter.bam.bai")
+        os.unlink("tests/test_count_coverage_nofilter.bam")
+        os.unlink("tests/test_count_coverage_nofilter.bam.bai")
         self.assertEqual(fast_counts[0], manual_counts[0])
         self.assertEqual(fast_counts[1], manual_counts[1])
         self.assertEqual(fast_counts[2], manual_counts[2])
@@ -2365,7 +2365,7 @@ class TestPileupQueryPosition(unittest.TestCase):
 
     def testPileup(self):
         last = {}
-        with pysam.AlignmentFile(os.path.join(DATADIR, self.filename)) as inf:
+        with pysam.AlignmentFile(os.path.join(BAM_DATADIR, self.filename)) as inf:
             for col in inf.pileup():
                 for r in col.pileups:
                     # print r.alignment.query_name
@@ -2380,7 +2380,7 @@ class TestPileupQueryPosition(unittest.TestCase):
 
 
 class TestFindIntrons(unittest.TestCase):
-    samfilename = "pysam_data/ex_spliced.bam"
+    samfilename = os.path.join(BAM_DATADIR, "ex_spliced.bam")
 
     def setUp(self):
         self.samfile = pysam.AlignmentFile(self.samfilename)
@@ -2392,7 +2392,7 @@ class TestFindIntrons(unittest.TestCase):
         all_read_counts = self.samfile.count()
         splice_sites = self.samfile.find_introns(self.samfile.fetch())
         self.assertEqual(sum(splice_sites.values()), all_read_counts -1)  # there is a single unspliced read in there
-         
+
     def test_first(self):
         reads = list(self.samfile.fetch())[:10]
         splice_sites = self.samfile.find_introns(reads)
@@ -2442,21 +2442,21 @@ class TestLogging(unittest.TestCase):
         self.assertTrue(True)
 
     def testFail1(self):
-        self.check(os.path.join(DATADIR, "ex9_fail.bam"),
+        self.check(os.path.join(BAM_DATADIR, "ex9_fail.bam"),
                    False)
-        self.check(os.path.join(DATADIR, "ex9_fail.bam"),
+        self.check(os.path.join(BAM_DATADIR, "ex9_fail.bam"),
                    True)
 
     def testNoFail1(self):
-        self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
+        self.check(os.path.join(BAM_DATADIR, "ex9_nofail.bam"),
                    False)
-        self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
+        self.check(os.path.join(BAM_DATADIR, "ex9_nofail.bam"),
                    True)
 
     def testNoFail2(self):
-        self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
+        self.check(os.path.join(BAM_DATADIR, "ex9_nofail.bam"),
                    True)
-        self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
+        self.check(os.path.join(BAM_DATADIR, "ex9_nofail.bam"),
                    True)
 
 # TODOS
@@ -2470,7 +2470,7 @@ class TestAlignmentFileUtilityFunctions(unittest.TestCase):
     def testCount(self):
 
         with pysam.AlignmentFile(
-                os.path.join(DATADIR, "ex1.bam"),
+                os.path.join(BAM_DATADIR, "ex1.bam"),
                 "rb") as samfile:
 
             for contig in ("chr1", "chr2"):
@@ -2508,7 +2508,7 @@ class TestAlignmentFileUtilityFunctions(unittest.TestCase):
     def testMate(self):
         '''test mate access.'''
 
-        with open(os.path.join(DATADIR, "ex1.sam"), "rb") as inf:
+        with open(os.path.join(BAM_DATADIR, "ex1.sam"), "rb") as inf:
             readnames = [x.split(b"\t")[0] for x in inf.readlines()]
         if sys.version_info[0] >= 3:
             readnames = [name.decode('ascii') for name in readnames]
@@ -2517,7 +2517,7 @@ class TestAlignmentFileUtilityFunctions(unittest.TestCase):
         for x in readnames:
             counts[x] += 1
 
-        with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
+        with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"),
                                  "rb") as samfile:
 
             for read in samfile.fetch():
@@ -2541,7 +2541,7 @@ class TestAlignmentFileUtilityFunctions(unittest.TestCase):
     def testIndexStats(self):
         '''test if total number of mapped/unmapped reads is correct.'''
 
-        with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"),
+        with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"),
                                  "rb") as samfile:
             self.assertEqual(samfile.mapped, 3235)
             self.assertEqual(samfile.unmapped, 35)
@@ -2551,9 +2551,9 @@ class TestAlignmentFileUtilityFunctions(unittest.TestCase):
 class TestMappedUnmapped(unittest.TestCase):
     filename = "test_mapped_unmapped.bam"
 
-    def testMapped(self):
+    def test_counts_of_mapped_and_unmapped_are_correct(self):
 
-        with pysam.AlignmentFile(os.path.join(DATADIR,
+        with pysam.AlignmentFile(os.path.join(BAM_DATADIR,
                                               self.filename)) as inf:
             unmapped_flag = 0
             unmapped_nopos = 0
@@ -2586,6 +2586,31 @@ class TestMappedUnmapped(unittest.TestCase):
             self.assertEqual(inf.count(until_eof=True, read_callback="all"),
                              inf.mapped)
 
+    def test_counts_of_mapped_and_unmapped_are_correct_per_chromosome(self):
+
+        with pysam.AlignmentFile(os.path.join(BAM_DATADIR,
+                                              self.filename)) as inf:
+
+            counts = inf.get_index_statistics()
+
+            counts_contigs = [x.contig for x in counts]
+            self.assertEqual(sorted(counts_contigs),
+                             sorted(inf.references))
+            
+            for contig in inf.references:
+                unmapped_flag = 0
+                unmapped_nopos = 0
+                mapped_flag = 0
+                for x in inf.fetch(contig=contig):
+                    if x.is_unmapped:
+                        unmapped_flag += 1
+                    else:
+                        mapped_flag += 1
+
+                cc = [c for c in counts if c.contig == contig][0]
+                self.assertEqual(cc.mapped, mapped_flag)
+                self.assertEqual(cc.unmapped, unmapped_flag)
+                self.assertEqual(cc.total, mapped_flag + unmapped_flag)
 
 class TestSamtoolsProxy(unittest.TestCase):
 
@@ -2610,7 +2635,7 @@ class TestAlignmentFileIndex(unittest.TestCase):
 
     def testIndex(self):
         samfile = pysam.AlignmentFile(
-            os.path.join(DATADIR, "ex1.bam"),
+            os.path.join(BAM_DATADIR, "ex1.bam"),
             "rb")
         index = pysam.IndexedReads(samfile)
         index.build()
@@ -2630,16 +2655,16 @@ class TestExplicitIndex(unittest.TestCase):
 
     def testExplicitIndexBAM(self):
         with pysam.AlignmentFile(
-                os.path.join(DATADIR, "explicit_index.bam"),
+                os.path.join(BAM_DATADIR, "explicit_index.bam"),
                 "rb",
-                filepath_index=os.path.join(DATADIR, 'ex1.bam.bai')) as samfile:
+                filepath_index=os.path.join(BAM_DATADIR, 'ex1.bam.bai')) as samfile:
             samfile.fetch("chr1")
 
     def testExplicitIndexCRAM(self):
         with pysam.AlignmentFile(
-                os.path.join(DATADIR, "explicit_index.cram"),
+                os.path.join(BAM_DATADIR, "explicit_index.cram"),
                 "rc",
-                filepath_index=os.path.join(DATADIR, 'ex1.cram.crai')) as samfile:
+                filepath_index=os.path.join(BAM_DATADIR, 'ex1.cram.crai')) as samfile:
             samfile.fetch("chr1")
 
     def testRemoteExplicitIndexBAM(self):
@@ -2650,7 +2675,7 @@ class TestExplicitIndex(unittest.TestCase):
         with pysam.AlignmentFile(
                 "http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam",
                 "rb",
-                filepath_index=os.path.join(DATADIR, 'ex1.bam.bai')) as samfile:
+                filepath_index=os.path.join(BAM_DATADIR, 'ex1.bam.bai')) as samfile:
             samfile.fetch("chr1")
 
 
@@ -2668,16 +2693,16 @@ class TestVerbosity(unittest.TestCase):
 
 
 class TestSanityCheckingBAM(unittest.TestCase):
-    
+
     mode = "wb"
 
     def check_write(self, read):
-        
+
         fn = "tmp_test_sanity_check.bam"
         names = ["chr1"]
         lengths = [10000]
         with pysam.AlignmentFile(
-                fn, 
+                fn,
                 self.mode,
                 reference_names=names,
                 reference_lengths=lengths) as outf:
@@ -2685,7 +2710,7 @@ class TestSanityCheckingBAM(unittest.TestCase):
 
         if os.path.exists(fn):
             os.unlink(fn)
-            
+
     def test_empty_read_gives_value_error(self):
         read = pysam.AlignedSegment()
         self.check_write(read)
@@ -2693,12 +2718,12 @@ class TestSanityCheckingBAM(unittest.TestCase):
 # SAM writing fails, as query length is 0
 # class TestSanityCheckingSAM(TestSanityCheckingSAM):
 #     mode = "w"
-    
+
 
 if __name__ == "__main__":
     # build data files
     print ("building data files")
-    subprocess.call("make -C %s" % DATADIR, shell=True)
+    subprocess.call("make -C %s" % BAM_DATADIR, shell=True)
     print ("starting tests")
     unittest.main()
     print ("completed tests")
diff --git a/tests/StreamFiledescriptors_test.py b/tests/StreamFiledescriptors_test.py
index de54de5..f6c5ced 100644
--- a/tests/StreamFiledescriptors_test.py
+++ b/tests/StreamFiledescriptors_test.py
@@ -9,9 +9,7 @@ from pysam import AlignmentFile
 
 IS_PYTHON2 = sys.version_info[0] == 2
 
-DATADIR = os.path.abspath(os.path.join(
-    os.path.dirname(__file__),
-    "pysam_data"))
+from TestUtils import BAM_DATADIR
 
 
 def alignmentfile_writer_thread(infile, outfile):
@@ -52,7 +50,7 @@ class StreamTest(unittest.TestCase):
                               stdout=subprocess.PIPE,
                               shell=True) as proc:
 
-            in_stream = AlignmentFile('pysam_data/ex1.bam')
+            in_stream = AlignmentFile(os.path.join(BAM_DATADIR, 'ex1.bam'))
             out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header)
             writer = alignmentfile_writer_thread(in_stream,
                                                  out_stream)
@@ -75,7 +73,7 @@ class StreamTest(unittest.TestCase):
                               stdout=subprocess.PIPE,
                               shell=True) as proc:
         
-            in_stream = AlignmentFile('pysam_data/ex1.bam')
+            in_stream = AlignmentFile(os.path.join(BAM_DATADIR, 'ex1.bam'))
             out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header)
             writer = alignmentfile_writer_thread(in_stream,
                                                  out_stream)
diff --git a/tests/TestUtils.py b/tests/TestUtils.py
index 1168926..dc95e09 100644
--- a/tests/TestUtils.py
+++ b/tests/TestUtils.py
@@ -6,6 +6,21 @@ import gzip
 import inspect
 import tempfile
 
+WORKDIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                       "pysam_test_work"))
+
+BAM_DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                             "pysam_data"))
+
+TABIX_DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                             "tabix_data"))
+
+CBCF_DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                             "cbcf_data"))
+
+LINKDIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "linker_tests"))
+
+
 IS_PYTHON3 = sys.version_info[0] >= 3
 
 if IS_PYTHON3:
@@ -181,7 +196,7 @@ def get_temp_filename(suffix=""):
         prefix="tmp_{}_".format(caller_name),
         suffix=suffix,
         delete=False,
-        dir=".")
+        dir="tests")
     f.close()
     return f.name
 
diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py
index 93307e9..dd8df5b 100644
--- a/tests/VariantFile_test.py
+++ b/tests/VariantFile_test.py
@@ -10,9 +10,7 @@ try:
 except ImportError:
     Path = None
 
-from TestUtils import get_temp_filename, check_lines_equal, load_and_convert
-
-DATADIR="cbcf_data"
+from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR
 
 
 def read_header(filename):
@@ -37,12 +35,12 @@ class TestMissingGenotypes(unittest.TestCase):
 
     def setUp(self):
         self.compare = load_and_convert(
-            os.path.join(DATADIR, self.filename),
+            os.path.join(CBCF_DATADIR, self.filename),
             encode=False)
 
     def check(self, filename):
         """see issue 203 - check for segmentation fault"""
-        fn = os.path.join(DATADIR, filename)
+        fn = os.path.join(CBCF_DATADIR, filename)
         self.assertEqual(True, os.path.exists(fn))
         v = pysam.VariantFile(fn)
         for site in v:
@@ -73,67 +71,67 @@ class TestOpening(unittest.TestCase):
                           "missing_file.vcf.gz")
 
     def testEmptyFileVCF(self):
-        with open("tmp_testEmptyFile.vcf", "w"):
+        with open("tests/tmp_testEmptyFile.vcf", "w"):
             pass
 
         self.assertRaises(ValueError, pysam.VariantFile,
-                          "tmp_testEmptyFile.vcf")
+                          "tests/tmp_testEmptyFile.vcf")
 
-        os.unlink("tmp_testEmptyFile.vcf")
+        os.unlink("tests/tmp_testEmptyFile.vcf")
 
 
     if Path and sys.version_info >= (3,6):
         def testEmptyFileVCFFromPath(self):
-            with open("tmp_testEmptyFile.vcf", "w"):
+            with open("tests/tmp_testEmptyFile.vcf", "w"):
                 pass
 
             self.assertRaises(ValueError, pysam.VariantFile,
-                              Path("tmp_testEmptyFile.vcf"))
+                              Path("tests/tmp_testEmptyFile.vcf"))
 
-            os.unlink("tmp_testEmptyFile.vcf")
+            os.unlink("tests/tmp_testEmptyFile.vcf")
 
     def testEmptyFileVCFGZWithIndex(self):
-        with open("tmp_testEmptyFile.vcf", "w"):
+        with open("tests/tmp_testEmptyFile.vcf", "w"):
             pass
 
-        pysam.tabix_index("tmp_testEmptyFile.vcf",
+        pysam.tabix_index("tests/tmp_testEmptyFile.vcf",
                           preset="vcf",
                           force=True)
 
         self.assertRaises(ValueError, pysam.VariantFile,
-                          "tmp_testEmptyFile.vcf.gz")
+                          "tests/tmp_testEmptyFile.vcf.gz")
 
-        os.unlink("tmp_testEmptyFile.vcf.gz")
-        os.unlink("tmp_testEmptyFile.vcf.gz.tbi")
+        os.unlink("tests/tmp_testEmptyFile.vcf.gz")
+        os.unlink("tests/tmp_testEmptyFile.vcf.gz.tbi")
 
     def testEmptyFileVCFGZWithoutIndex(self):
-        with open("tmp_testEmptyFileWithoutIndex.vcf", "w"):
+        with open("tests/tmp_testEmptyFileWithoutIndex.vcf", "w"):
             pass
 
-        pysam.tabix_compress("tmp_testEmptyFileWithoutIndex.vcf",
-                             "tmp_testEmptyFileWithoutIndex.vcf.gz",
+        pysam.tabix_compress("tests/tmp_testEmptyFileWithoutIndex.vcf",
+                             "tests/tmp_testEmptyFileWithoutIndex.vcf.gz",
                              force=True)
 
         self.assertRaises(ValueError, pysam.VariantFile,
-                          "tmp_testEmptyFileWithoutIndex.vcf.gz")
+                          "tests/tmp_testEmptyFileWithoutIndex.vcf.gz")
 
-        os.unlink("tmp_testEmptyFileWithoutIndex.vcf")
-        os.unlink("tmp_testEmptyFileWithoutIndex.vcf.gz")
+        os.unlink("tests/tmp_testEmptyFileWithoutIndex.vcf")
+        os.unlink("tests/tmp_testEmptyFileWithoutIndex.vcf.gz")
 
     def testEmptyFileVCFOnlyHeader(self):
         with pysam.VariantFile(os.path.join(
-                DATADIR,
+                CBCF_DATADIR,
                 "example_vcf42_only_header.vcf")) as inf:
             self.assertEqual(len(list(inf.fetch())), 0)
 
     def testEmptyFileVCFGZOnlyHeader(self):
         with pysam.VariantFile(os.path.join(
-                DATADIR,
+                CBCF_DATADIR,
                 "example_vcf42_only_header.vcf")) as inf:
             self.assertEqual(len(list(inf.fetch())), 0)
 
     def testDetectVCF(self):
-        with pysam.VariantFile(os.path.join(DATADIR,
+        with pysam.VariantFile(os.path.join(CBCF_DATADIR,
             "example_vcf40.vcf")) as inf:
             self.assertEqual(inf.category, 'VARIANTS')
             self.assertEqual(inf.format, 'VCF')
@@ -143,7 +141,7 @@ class TestOpening(unittest.TestCase):
             self.assertEqual(len(list(inf.fetch())), 5)
 
     def testDetectVCFGZ(self):
-        with pysam.VariantFile(os.path.join(DATADIR,
+        with pysam.VariantFile(os.path.join(CBCF_DATADIR,
             "example_vcf40.vcf.gz")) as inf:
             self.assertEqual(inf.category, 'VARIANTS')
             self.assertEqual(inf.format, 'VCF')
@@ -154,7 +152,7 @@ class TestOpening(unittest.TestCase):
 
     def testDetectBCF(self):
         with pysam.VariantFile(os.path.join(
-                DATADIR,
+                CBCF_DATADIR,
                 "example_vcf40.bcf")) as inf:
             self.assertEqual(inf.category, 'VARIANTS')
             self.assertEqual(inf.format, 'BCF')
@@ -170,7 +168,7 @@ class TestHeader(unittest.TestCase):
 
     def testStr(self):
 
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
 
         ref = read_header(fn)
@@ -181,7 +179,7 @@ class TestHeader(unittest.TestCase):
 
     def testIterator(self):
 
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
 
         ref = read_header(fn)
@@ -207,74 +205,74 @@ class TestParsing(unittest.TestCase):
     filename = "example_vcf40.vcf.gz"
 
     def testChrom(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         chrom = [rec.chrom for rec in v]
         self.assertEqual(chrom, ['M', '17', '20', '20', '20'])
 
     if Path and sys.version_info >= (3,6):
         def testChromFromPath(self):
-            fn = os.path.join(DATADIR, self.filename)
+            fn = os.path.join(CBCF_DATADIR, self.filename)
             v = pysam.VariantFile(Path(fn))
             chrom = [rec.chrom for rec in v]
             self.assertEqual(chrom, ['M', '17', '20', '20', '20'])
 
     def testPos(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         pos = [rec.pos for rec in v]
         self.assertEqual(pos, [1230237, 14370, 17330, 1110696, 1234567])
 
     def testStart(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         start = [rec.start for rec in v]
         self.assertEqual(start, [1230236, 14369, 17329, 1110695, 1234566])
 
     def testStop(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         stop = [rec.stop for rec in v]
         self.assertEqual(stop, [1230237, 14370, 17330, 1110696, 1234570])
 
     def testId(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         ids = [rec.id for rec in v]
         self.assertEqual(ids, [None, 'rs6054257', None, 'rs6040355', 'microsat1'])
 
     def testRef(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         ref = [rec.ref for rec in v]
         self.assertEqual(ref, ['T', 'G', 'T', 'A', 'GTCT'])
 
     def testAlt(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         alts = [rec.alts for rec in v]
         self.assertEqual(alts, [None, ('A',), ('A',), ('G', 'T'), ('G', 'GTACT')])
 
     def testAlleles(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         alleles = [rec.alleles for rec in v]
         self.assertEqual(alleles, [('T',), ('G', 'A'), ('T', 'A'), ('A', 'G', 'T'), ('GTCT', 'G', 'GTACT')])
 
     def testQual(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         qual = [rec.qual for rec in v]
         self.assertEqual(qual, [47.0, 29.0, 3.0, 67.0, 50.0])
 
     def testFilter(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         filter = [rec.filter.keys() for rec in v]
         self.assertEqual(filter, [['PASS'], ['PASS'], ['q10'], ['PASS'], ['PASS']])
 
     def testInfo(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         info = [rec.info.items() for rec in v]
         self.assertEqual(info, [[('NS', 3), ('DP', 13), ('AA', 'T')],
@@ -285,7 +283,7 @@ class TestParsing(unittest.TestCase):
                                 [('NS', 3), ('DP', 9), ('AA', 'G')]])
 
     def testFormat(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         format = [rec.format.keys() for rec in v]
         self.assertEqual(format, [['GT', 'GQ', 'DP', 'HQ'],
@@ -295,7 +293,7 @@ class TestParsing(unittest.TestCase):
                                   ['GT', 'GQ', 'DP']])
 
     def testSampleAlleles(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         alleles = [s.alleles for rec in v for s in rec.samples.values()]
         self.assertEqual(alleles, [('T', 'T'), ('T', 'T'), ('T', 'T'),
@@ -306,7 +304,7 @@ class TestParsing(unittest.TestCase):
                                    ('G', 'G')])
 
     def testSampleFormats(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         format = [s.items() for rec in v for s in rec.samples.values()]
         self.assertEqual(format, [[('GT', (0, 0)), ('GQ', 54), ('DP', 7), ('HQ', (56, 60))],
@@ -326,7 +324,7 @@ class TestParsing(unittest.TestCase):
                                   [('GT', (1, 1)), ('GQ', 40), ('DP', 3)]])
 
     def testSampleAlleleIndices(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(CBCF_DATADIR, self.filename)
         v = pysam.VariantFile(fn)
         indices = [s.allele_indices for rec in v for s in rec.samples.values()]
         self.assertEqual(indices, [(0, 0), (0, 0), (0, 0), (0, 0), (1, 0),
@@ -342,8 +340,8 @@ class TestIndexFilename(unittest.TestCase):
 
     def testOpen(self):
         for fn, idx_fn in self.filenames:
-            fn = os.path.join(DATADIR, fn)
-            idx_fn = os.path.join(DATADIR, idx_fn)
+            fn = os.path.join(CBCF_DATADIR, fn)
+            idx_fn = os.path.join(CBCF_DATADIR, idx_fn)
 
             v = pysam.VariantFile(fn, index_filename=idx_fn)
 
@@ -358,7 +356,7 @@ class TestConstructionVCFWithContigs(unittest.TestCase):
     description = 'VCF version 4.2 variant calling text'
 
     def testBase(self):
-        with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+        with pysam.VariantFile(os.path.join(CBCF_DATADIR, self.filename)) as inf:
             self.assertEqual(inf.category, 'VARIANTS')
             self.assertEqual(inf.format, 'VCF')
             self.assertEqual(inf.version, (4, 2))
@@ -377,7 +375,7 @@ class TestConstructionVCFWithContigs(unittest.TestCase):
 
     def testConstructionWithRecords(self):
 
-        fn_in = os.path.join(DATADIR, self.filename)
+        fn_in = os.path.join(CBCF_DATADIR, self.filename)
         fn_out = get_temp_filename(suffix=".vcf")
         vcf_in = pysam.VariantFile(fn_in)
 
@@ -400,7 +398,7 @@ class TestConstructionVCFWithContigs(unittest.TestCase):
 
     def testConstructionFromCopy(self):
 
-        fn_in = os.path.join(DATADIR, self.filename)
+        fn_in = os.path.join(CBCF_DATADIR, self.filename)
         fn_out = get_temp_filename(suffix=".vcf")
         vcf_in = pysam.VariantFile(fn_in)
 
@@ -415,7 +413,7 @@ class TestConstructionVCFWithContigs(unittest.TestCase):
 
     def testConstructionWithLines(self):
 
-        fn_in = os.path.join(DATADIR, self.filename)
+        fn_in = os.path.join(CBCF_DATADIR, self.filename)
         fn_out = get_temp_filename(suffix=".vcf")
         vcf_in = pysam.VariantFile(fn_in)
 
@@ -463,7 +461,7 @@ class TestSettingRecordValues(unittest.TestCase):
     filename = "example_vcf40.vcf"
 
     def testBase(self):
-        with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+        with pysam.VariantFile(os.path.join(CBCF_DATADIR, self.filename)) as inf:
             self.assertEqual(inf.category, 'VARIANTS')
             self.assertEqual(inf.format, 'VCF')
             self.assertEqual(inf.version, (4, 0))
@@ -474,7 +472,7 @@ class TestSettingRecordValues(unittest.TestCase):
             self.assertEqual(inf.is_write, False)
 
     def testSetQual(self):
-        with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+        with pysam.VariantFile(os.path.join(CBCF_DATADIR, self.filename)) as inf:
             record = next(inf)
             self.assertEqual(record.qual, 47)
             record.qual = record.qual
@@ -484,7 +482,7 @@ class TestSettingRecordValues(unittest.TestCase):
             self.assertEqual(str(record).split("\t")[5], "10")
 
     def testGenotype(self):
-        with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+        with pysam.VariantFile(os.path.join(CBCF_DATADIR, self.filename)) as inf:
             record = next(inf)
             sample = record.samples["NA00001"]
             print (sample["GT"])
@@ -496,7 +494,7 @@ class TestSubsetting(unittest.TestCase):
     filename = "example_vcf42.vcf.gz"
     
     def testSubsetting(self):
-        with pysam.VariantFile(os.path.join(DATADIR,
+        with pysam.VariantFile(os.path.join(CBCF_DATADIR,
                                             self.filename)) as inf:
             inf.subset_samples(["NA00001"])
 
@@ -504,7 +502,7 @@ class TestSubsetting(unittest.TestCase):
 if __name__ == "__main__":
     # build data files
     print ("building data files")
-    subprocess.call("make -C %s" % DATADIR, shell=True)
+    subprocess.call("make -C %s" % CBCF_DATADIR, shell=True)
     print ("starting tests")
     unittest.main()
     print ("completed tests")
diff --git a/tests/compile_test.py b/tests/compile_test.py
index 5744dbe..f91e180 100644
--- a/tests/compile_test.py
+++ b/tests/compile_test.py
@@ -1,15 +1,16 @@
 '''
-compile_test.py - check pyximport
-=================================
+compile_test.py - check pyximport functionality with pysam
+==========================================================
 
 test script for checking if compilation against
 pysam and tabix works.
 '''
+
 # clean up previous compilation
 import os
 try:
-    os.unlink('_compile_test.c')
-    os.unlink('_compile_test.pyxbldc')
+    os.unlink('tests/_compile_test.c')
+    os.unlink('tests/_compile_test.pyxbldc')
 except OSError:
     pass
 
@@ -20,11 +21,12 @@ import _compile_test
 
 import unittest
 import pysam
+from TestUtils import BAM_DATADIR, TABIX_DATADIR
 
 
 class BAMTest(unittest.TestCase):
 
-    input_filename = "pysam_data/ex1.bam"
+    input_filename = os.path.join(BAM_DATADIR, "ex1.bam")
 
     def testCount(self):
 
@@ -35,12 +37,13 @@ class BAMTest(unittest.TestCase):
 
 class GTFTest(unittest.TestCase):
 
-    input_filename = "tabix_data/example.gtf.gz"
+    input_filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
 
     def testCount(self):
         nread = _compile_test.testCountGTF(
             pysam.Tabixfile(self.input_filename))
         self.assertEqual(nread, 237)
+        
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/faidx_test.py b/tests/faidx_test.py
index c87394d..9df34b6 100644
--- a/tests/faidx_test.py
+++ b/tests/faidx_test.py
@@ -2,11 +2,10 @@ import pysam
 import unittest
 import os
 import gzip
+import copy
 import shutil
 
-from TestUtils import checkURL
-
-DATADIR = "pysam_data"
+from TestUtils import checkURL, BAM_DATADIR
 
 
 class TestFastaFile(unittest.TestCase):
@@ -19,7 +18,7 @@ class TestFastaFile(unittest.TestCase):
     }
 
     def setUp(self):
-        self.file = pysam.FastaFile(os.path.join(DATADIR, "ex1.fa"))
+        self.file = pysam.FastaFile(os.path.join(BAM_DATADIR, "ex1.fa"))
 
     def testFetch(self):
         for id, seq in list(self.sequences.items()):
@@ -59,7 +58,7 @@ class TestFastaFile(unittest.TestCase):
 
 class TestFastaFilePathIndex(unittest.TestCase):
 
-    filename = os.path.join(DATADIR, "ex1.fa")
+    filename = os.path.join(BAM_DATADIR, "ex1.fa")
 
     def testGarbageIndex(self):
         self.assertRaises(NotImplementedError,
@@ -101,7 +100,7 @@ class TestFastaFilePathIndex(unittest.TestCase):
 
 class TestFastaFilePathIndexCompressed(TestFastaFilePathIndex):
     
-    filename = os.path.join(DATADIR, "ex1.fa.gz")
+    filename = os.path.join(BAM_DATADIR, "ex1.fa.gz")
 
 
 class TestFastxFileFastq(unittest.TestCase):
@@ -111,7 +110,7 @@ class TestFastxFileFastq(unittest.TestCase):
     persist = True
 
     def setUp(self):
-        self.file = self.filetype(os.path.join(DATADIR, self.filename),
+        self.file = self.filetype(os.path.join(BAM_DATADIR, self.filename),
                                   persist=self.persist)
         self.has_quality = self.filename.endswith('.fq')
 
@@ -170,7 +169,7 @@ class TestFastxFileFastq(unittest.TestCase):
             self.checkLast(first)
 
     def testManager(self):
-        with self.filetype(os.path.join(DATADIR, self.filename),
+        with self.filetype(os.path.join(BAM_DATADIR, self.filename),
                            persist=self.persist) as inf:
             first = inf.__next__()
             self.checkFirst(first)
@@ -206,7 +205,7 @@ class TestFastxFileWithEmptySequence(unittest.TestCase):
     filename = "faidx_empty_seq.fq.gz"
 
     def testIteration(self):
-        fn = os.path.join(DATADIR, self.filename)
+        fn = os.path.join(BAM_DATADIR, self.filename)
 
         with gzip.open(fn) as inf:
             ref_num = len(list(inf)) / 4
@@ -242,7 +241,80 @@ class TestRemoteFileFTP(unittest.TestCase):
                              248956422)
             self.assertEqual(f.get_reference_length("chr1"),
                              248956422)
+
+class TestFastqRecord(unittest.TestCase):
+
+    filetype = pysam.FastxFile
+    filename = "faidx_ex1.fq"
+    
+    def setUp(self):
+
+        with self.filetype(os.path.join(BAM_DATADIR, self.filename), persist=True) as inf:
+            self.record = next(inf)
         
+    def test_fastx_record_sequence_can_be_modified(self):
+        old_sequence = self.record.sequence
+        new_record = copy.copy(self.record)
+        new_sequence = "AAAC"
+        new_record.set_sequence(new_sequence)
+        self.assertEqual(str(new_record), ">{}\n{}".format(self.record.name, new_sequence))
+        self.assertEqual(self.record.sequence, old_sequence)
+        self.assertEqual(new_record.sequence, new_sequence)
+
+    def test_fastx_record_name_can_be_modified(self):
+        old_name = self.record.name
+        new_name = "new_name"
+        new_record = copy.copy(self.record)
+        new_record.set_name(new_name)
+        self.assertEqual(new_record.name, new_name)
+        self.assertEqual(self.record.name, old_name)
+
+    def test_fastx_record_fail_if_name_is_None(self):
+        self.assertRaises(ValueError,
+                          self.record.set_name,
+                          None)
+        
+    def test_fastx_record_comment_can_be_modified(self):
+        old_comment = self.record.comment
+        new_comment = "this is  a new comment"
+        new_record = copy.copy(self.record)
+        new_record.set_comment(new_comment)
+        self.assertEqual(new_record.comment, new_comment)
+        self.assertEqual(self.record.comment, old_comment)
+
+    def test_fastx_record_comment_can_be_None(self):
+        old_comment = self.record.comment
+        new_comment = None
+        new_record = copy.copy(self.record)
+        new_record.set_comment(new_comment)
+        self.assertEqual(new_record.comment, new_comment)
+        self.assertEqual(self.record.comment, old_comment)
+        
+    def test_fastx_record_quality_can_be_modified(self):
+        old_quality = self.record.quality
+        new_quality = "A" * len(old_quality)
+        new_record = copy.copy(self.record)
+        new_record.set_sequence(self.record.sequence, new_quality)
+        self.assertEqual(new_record.quality, new_quality)
+        self.assertEqual(self.record.quality, old_quality)
+
+    def test_fastx_record_fail_if_quality_is_wrong_length(self):
+        self.assertRaises(ValueError,
+                          self.record.set_sequence,
+                          self.record.sequence, self.record.quality * 2)
 
+    def test_fastx_record_can_be_created_from_scratch(self):
+        fastx_record = pysam.FastxRecord()
+        self.assertRaises(ValueError,
+                          str,
+                          fastx_record)
+        fastx_record.set_name("name")
+        self.assertRaises(ValueError,
+                          str,
+                          fastx_record)
+        fastx_record.set_sequence("sequence")
+        self.assertEqual(str(fastx_record), ">name\nsequence")
+        
+        
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/linking_test.py b/tests/linking_test.py
new file mode 100644
index 0000000..623c3a2
--- /dev/null
+++ b/tests/linking_test.py
@@ -0,0 +1,79 @@
+"""test linking against pysam.
+"""
+
+import unittest
+import os
+import subprocess
+import pysam
+
+from TestUtils import LINKDIR
+
+
+def check_import(statement):
+    try:
+        output = subprocess.check_output(
+            statement, stderr=subprocess.STDOUT, shell=True)
+    except subprocess.CalledProcessError as exc:
+        if b"ImportError" in exc.output:
+            raise ImportError("module could not be imported: {}".format(str(exc.output)))
+        else:
+            raise
+
+
+def check_tests_pass(statement):
+    try:
+        output = subprocess.check_output(
+            statement, stderr=subprocess.STDOUT, shell=True)
+    except subprocess.CalledProcessError as exc:
+        raise ValueError("{}: {}".format(exc, exc.output))
+    if b"FAILED" in output:
+        raise ValueError("module tests failed")
+    return True
+
+
+class TestLinking(unittest.TestCase):
+
+    package_name = "link_with_rpath"
+
+    def setUp(self):
+        self.workdir = os.path.join(LINKDIR, self.package_name)
+        
+    def test_package_can_be_installed(self):
+        subprocess.check_output(
+            "cd {} && rm -rf build && python setup.py install".format(self.workdir),
+                shell=True)
+
+class TestLinkWithRpath(TestLinking):
+
+    package_name = "link_with_rpath"
+    
+    def test_package_tests_pass(self):
+        self.assertTrue(check_tests_pass(
+            "cd {} && python test_module.py".format(os.path.join(self.workdir, "tests"))))
+
+
+class TestLinkWithoutRpath(TestLinking):
+
+    package_name = "link_without_rpath"
+
+    def test_package_tests_fail_on_import(self):
+
+        self.assertRaises(
+            ImportError,
+            check_import,
+            "cd {} && python test_module.py".format(os.path.join(self.workdir, "tests")))
+
+    def test_package_tests_pass_if_ld_library_path_set(self):
+
+        pysam_libraries = pysam.get_libraries()
+        pysam_libdirs, pysam_libs = zip(*[os.path.split(x) for x in pysam_libraries])
+        pysam_libdir = pysam_libdirs[0]
+
+        self.assertTrue(check_tests_pass(
+            "export LD_LIBRARY_PATH={}:$PATH && cd {} && python test_module.py".format(
+                pysam_libdir,
+                os.path.join(self.workdir, "tests"))))
+        
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/pysam_data/test_mapped_unmapped.sam b/tests/pysam_data/test_mapped_unmapped.sam
index c0b6230..f5ca805 100644
--- a/tests/pysam_data/test_mapped_unmapped.sam
+++ b/tests/pysam_data/test_mapped_unmapped.sam
@@ -1,5 +1,6 @@
 @HD	VN:1.0
 @SQ	SN:chr1	LN:100
+@SQ	SN:chr2	LN:100
 @CO	Test counting of mapped/unmapped reads
 read1_mapped	0	chr1	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
 read2_unmapped	4	chr1	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
@@ -11,6 +12,16 @@ pair2b_mapped	139	chr1	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGG
 pair3a_unmapped	77	chr1	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
 pair3b_unmapped	141	chr1	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
 noseq2b_mapped	139	chr1	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
+bread1_mapped	0	chr2	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
+bread2_unmapped	4	chr2	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
+bread3_unmapped	20	chr2	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
+bpair1a_mapped	67	chr2	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
+bpair1b_mapped	131	chr2	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
+bpair2a_unmapped	71	chr2	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
+bpair2b_mapped	139	chr2	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
+bpair3a_unmapped	77	chr2	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
+bpair3b_unmapped	141	chr2	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
+bnoseq2b_mapped	139	chr2	21	20	10M1D25M	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
 noseq1_unmapped	4	*	0	20	*	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
 noseq2a_unmapped	71	*	0	20	*	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
 pair3a_unmapped	77	*	0	20	*	=	200	167	AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG	<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<	NM:i:1
diff --git a/tests/samtools_test.py b/tests/samtools_test.py
index 7eec832..5494e1b 100644
--- a/tests/samtools_test.py
+++ b/tests/samtools_test.py
@@ -16,12 +16,11 @@ import sys
 import subprocess
 import shutil
 from TestUtils import checkBinaryEqual, check_lines_equal, \
-    check_samtools_view_equal, get_temp_filename, force_bytes
+    check_samtools_view_equal, get_temp_filename, force_bytes, WORKDIR, \
+    BAM_DATADIR
 
-IS_PYTHON3 = sys.version_info[0] >= 3
 
-WORKDIR = "pysam_test_work"
-DATADIR = "pysam_data"
+IS_PYTHON3 = sys.version_info[0] >= 3
 
 
 def run_command(cmd):
@@ -152,7 +151,7 @@ class SamtoolsTest(unittest.TestCase):
             os.makedirs(WORKDIR)
 
         for f in self.requisites:
-            shutil.copy(os.path.join(DATADIR, f),
+            shutil.copy(os.path.join(BAM_DATADIR, f),
                         os.path.join(WORKDIR, f))
 
         self.savedir = os.getcwd()
@@ -184,6 +183,7 @@ class SamtoolsTest(unittest.TestCase):
         pysam_targets = [x % r_pysam for x in targets]
 
         pysam_method = getattr(self.module, command)
+        
         # run samtools
         full_statement = re.sub("%\(out\)s", self.executable, statement)
         run_command(" ".join((self.executable, full_statement)))
@@ -204,7 +204,6 @@ class SamtoolsTest(unittest.TestCase):
             with open(pysam_targets[-1], "wb") as outfile:
                 if output is not None:
                     outfile.write(force_bytes(output))
-
         for samtools_target, pysam_target in zip(samtools_targets,
                                                  pysam_targets):
             if os.path.isdir(samtools_target):
@@ -236,7 +235,11 @@ class SamtoolsTest(unittest.TestCase):
 
     def testStatements(self):
         for statement in self.statements:
-            if (statement.startswith("calmd") and 
+            command = self.get_command(statement, map_to_internal=False)
+            if command in ("bedcov", "stats", "dict"):
+                continue
+            
+            if (command == "calmd" and 
                 list(sys.version_info[:2]) == [3, 3]):
                 # skip calmd test, fails only on python 3.3.5
                 # in linux (empty output). Works in OsX and passes
@@ -245,14 +248,18 @@ class SamtoolsTest(unittest.TestCase):
             self.check_statement(statement)
 
     @unittest.skipIf(sys.platform == "darwin", "not supported, pattern does not match")
+    @unittest.skipIf(not sys.stdin.isatty(), "skipping usage tests, stdin is not a tty")
     def testUsage(self):
         if self.executable == "bcftools":
             # bcftools usage messages end with exit(1)
             return
-
+        
         for statement in self.statements:
             command = self.get_command(statement, map_to_internal=False)
-            if command == "bam2fq":
+            # ignore commands that exit or cause other failures
+            # TODO: check - if reheader or phase is run in testStatements, sort fails
+            # here
+            if command in ("view", "sort", "bam2fq", "flagstat", "reheader", "stats"):
                 continue
             mapped_command = self.get_command(statement, map_to_internal=True)
             pysam_method = getattr(self.module, mapped_command)
@@ -272,121 +279,124 @@ class EmptyIndexTest(unittest.TestCase):
         self.assertRaises(IOError, pysam.samtools.index,
                           "exdoesntexist.bam")
 
-class TestReturnType(unittest.TestCase):
-    
-    def testReturnValueString(self):
-        retval = pysam.idxstats(os.path.join(DATADIR, "ex1.bam"))
-        if IS_PYTHON3:
-            self.assertFalse(isinstance(retval, bytes))
-            self.assertTrue(isinstance(retval, str))
-        else:
-            self.assertTrue(isinstance(retval, bytes))
-            self.assertTrue(isinstance(retval, basestring))
-
-    def testReturnValueData(self):
-        args = "-O BAM {}".format(os.path.join(DATADIR, "ex1.bam")).split(" ")
-        retval = pysam.view(*args)
-
-        if IS_PYTHON3:
-            self.assertTrue(isinstance(retval, bytes))
-            self.assertFalse(isinstance(retval, str))
-        else:
-            self.assertTrue(isinstance(retval, bytes))
-            self.assertTrue(isinstance(retval, basestring))
-
-
-class StdoutTest(unittest.TestCase):
-    '''test if stdout can be redirected.'''
-
-    def testWithRedirectedStdout(self):
-        r = pysam.samtools.flagstat(
-            os.path.join(DATADIR, "ex1.bam"))
-        self.assertTrue(len(r) > 0)
-
-    def testWithoutRedirectedStdout(self):
-        r = pysam.samtools.flagstat(
-            os.path.join(DATADIR, "ex1.bam"),
-            catch_stdout=False)
-        self.assertEqual(r, None)
-
-    def testDoubleCalling(self):
-        # The following would fail if there is an
-        # issue with stdout being improperly caught.
-        retvals = pysam.idxstats(
-            os.path.join(DATADIR, "ex1.bam"))
-        retvals = pysam.idxstats(
-            os.path.join(DATADIR, "ex1.bam"))
-
-    def testSaveStdout(self):
-        outfile = get_temp_filename(suffix=".tsv")
-        r = pysam.samtools.flagstat(
-            os.path.join(DATADIR, "ex1.bam"),
-            save_stdout=outfile)
-        self.assertEqual(r, None)
-        with open(outfile) as inf:
-            r = inf.read()
-        self.assertTrue(len(r) > 0)
 
+if sys.platform != "darwin":
+    # fails with segfault with htslib 1.5 on Osx, an issue with flockfile
+    # issue seems to be with repeated calls to interface
 
-class PysamTest(SamtoolsTest):
-    """check access to samtools command in the pysam 
-    main package.
+    class TestReturnType(unittest.TestCase):
 
-    This is for backwards capability.
-    """
-
-    module = pysam
-
-
-class BcftoolsTest(SamtoolsTest):
-
-    requisites = [
-        "ex1.fa",
-        "ex1.vcf.gz",
-        "ex1.vcf.gz.tbi",
-    ]
-    # a list of statements to test
-    # should contain at least one %(out)s component indicating
-    # an output file.
-    statements = [
-        # "index -n ex1.vcf.gz > %(out)s_ex1.index",
-
-        "annotate -x ID ex1.vcf.gz > %(out)s_ex1.annotate",
-        "concat -a ex1.vcf.gz ex1.vcf.gz > %(out)s_ex1.concat",
-        "isec -p %(out)s_ex1.isec ex1.vcf.gz ex1.vcf.gz",
-        "merge --force-samples ex1.vcf.gz ex1.vcf.gz > %(out)s_ex1.norm",
-        "norm -m +both ex1.vcf.gz > %(out)s_ex1.norm",
-
-        # "plugin",
-        # "query -f '%CHROM\n' ex1.vcf.gz > %(out)s_ex1.query",
-        # "reheader -s A > %(out)s_ex1.reheader",
-        # "view ex1.vcf.gz > %(out)s_ex1.view",
-        # "call -m ex1.vcf.gz > %(out)s_ex1.call",
-        # bad file descriptor
-        # "consensus -f ex1.fa ex1.vcf.gz  > %(out)s_ex1.consensus"
-        # need appropriate VCF file
-        # "cnv",
-        # segfault
-        # "filter -s A ex1.vcf.gz  > %(out)s_ex1.filter",
-        # exit
-        # "gtcheck -s A ex1.vcf.gz  > %(out)s_ex1.gtcheck",
-        # segfauld, used to work wit bcftools 1.3
-        # "roh -s A ex1.vcf.gz > %(out)s_ex1.roh",
-        "stats ex1.vcf.gz > %(out)s_ex1.stats",
-    ]
-
-    map_command = {
-        "import": "samimport"}
+        def testReturnValueString(self):
+            retval = pysam.idxstats(os.path.join(BAM_DATADIR, "ex1.bam"))
+            if IS_PYTHON3:
+                self.assertFalse(isinstance(retval, bytes))
+                self.assertTrue(isinstance(retval, str))
+            else:
+                self.assertTrue(isinstance(retval, bytes))
+                self.assertTrue(isinstance(retval, basestring))
 
-    executable = "bcftools"
+        def testReturnValueData(self):
+            args = "-O BAM {}".format(os.path.join(BAM_DATADIR, "ex1.bam")).split(" ")
+            retval = pysam.view(*args)
 
-    module = pysam.bcftools
+            if IS_PYTHON3:
+                self.assertTrue(isinstance(retval, bytes))
+                self.assertFalse(isinstance(retval, str))
+            else:
+                self.assertTrue(isinstance(retval, bytes))
+                self.assertTrue(isinstance(retval, basestring))
+
+
+    class StdoutTest(unittest.TestCase):
+        '''test if stdout can be redirected.'''
+
+        def testWithRedirectedStdout(self):
+            r = pysam.samtools.flagstat(
+                os.path.join(BAM_DATADIR, "ex1.bam"))
+            self.assertTrue(len(r) > 0)
+
+        def testWithoutRedirectedStdout(self):
+            r = pysam.samtools.flagstat(
+                os.path.join(BAM_DATADIR, "ex1.bam"),
+                catch_stdout=False)
+            self.assertEqual(r, None)
+
+        def testDoubleCalling(self):
+            # The following would fail if there is an
+            # issue with stdout being improperly caught.
+            retvals = pysam.idxstats(
+                os.path.join(BAM_DATADIR, "ex1.bam"))
+            retvals = pysam.idxstats(
+                os.path.join(BAM_DATADIR, "ex1.bam"))
+
+        def testSaveStdout(self):
+            outfile = get_temp_filename(suffix=".tsv")
+            r = pysam.samtools.flagstat(
+                os.path.join(BAM_DATADIR, "ex1.bam"),
+                save_stdout=outfile)
+            self.assertEqual(r, None)
+            with open(outfile) as inf:
+                r = inf.read()
+            self.assertTrue(len(r) > 0)
+
+    class PysamTest(SamtoolsTest):
+        """check access to samtools command in the pysam 
+        main package.
+        
+        This is for backwards capability.
+        """
+
+        module = pysam
+
+# class BcftoolsTest(SamtoolsTest):
+
+#     requisites = [
+#         "ex1.fa",
+#         "ex1.vcf.gz",
+#         "ex1.vcf.gz.tbi",
+#     ]
+#     # a list of statements to test
+#     # should contain at least one %(out)s component indicating
+#     # an output file.
+#     statements = [
+#         # "index -n ex1.vcf.gz > %(out)s_ex1.index",
+
+#         "annotate -x ID ex1.vcf.gz > %(out)s_ex1.annotate",
+#         "concat -a ex1.vcf.gz ex1.vcf.gz > %(out)s_ex1.concat",
+#         "isec -p %(out)s_ex1.isec ex1.vcf.gz ex1.vcf.gz",
+#         "merge --force-samples ex1.vcf.gz ex1.vcf.gz > %(out)s_ex1.norm",
+#         "norm -m +both ex1.vcf.gz > %(out)s_ex1.norm",
+
+#         # "plugin",
+#         # "query -f '%CHROM\n' ex1.vcf.gz > %(out)s_ex1.query",
+#         # "reheader -s A > %(out)s_ex1.reheader",
+#         # "view ex1.vcf.gz > %(out)s_ex1.view",
+#         # "call -m ex1.vcf.gz > %(out)s_ex1.call",
+#         # bad file descriptor
+#         # "consensus -f ex1.fa ex1.vcf.gz  > %(out)s_ex1.consensus"
+#         # need appropriate VCF file
+#         # "cnv",
+#         # segfault
+#         # "filter -s A ex1.vcf.gz  > %(out)s_ex1.filter",
+#         # exit
+#         # "gtcheck -s A ex1.vcf.gz  > %(out)s_ex1.gtcheck",
+#         # segfauld, used to work wit bcftools 1.3
+#         # "roh -s A ex1.vcf.gz > %(out)s_ex1.roh",
+#         "stats ex1.vcf.gz > %(out)s_ex1.stats",
+#     ]
+
+#     map_command = {
+#         "import": "samimport"}
+
+#     executable = "bcftools"
+
+#     module = pysam.bcftools
 
 
 if __name__ == "__main__":
     # build data files
-    print ("building data files")
-    subprocess.call("make -C %s" % DATADIR, shell=True)
-    print ("starting tests")
+    print("building data files")
+    subprocess.call("make -C %s" % BAM_DATADIR, shell=True)
+    print("starting tests")
     unittest.main()
-    print ("completed tests")
+    print("completed tests")
diff --git a/tests/tabix_test.py b/tests/tabix_test.py
index 87de282..1b6d450 100644
--- a/tests/tabix_test.py
+++ b/tests/tabix_test.py
@@ -14,9 +14,8 @@ import unittest
 import glob
 import re
 import copy
-from TestUtils import checkURL, load_and_convert
-
-DATADIR = 'tabix_data'
+import tempfile
+from TestUtils import checkURL, load_and_convert, TABIX_DATADIR, get_temp_filename
 
 IS_PYTHON3 = sys.version_info[0] >= 3
 
@@ -47,10 +46,13 @@ def checkBinaryEqual(filename1, filename2):
 
     with open(filename1, "rb") as infile:
         d1 = infile.read()
-       
+
     with open(filename2, "rb") as infile:
         d2 = infile.read()
- 
+
+    if len(d1) != len(d2):
+        return False
+        
     found = False
     for c1, c2 in zip(d1, d2):
         if c1 != c2:
@@ -62,33 +64,53 @@ def checkBinaryEqual(filename1, filename2):
 
 
 class TestIndexing(unittest.TestCase):
-    filename = os.path.join(DATADIR, "example.gtf.gz")
-    filename_idx = os.path.join(DATADIR, "example.gtf.gz.tbi")
+    filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
+    filename_idx = os.path.join(TABIX_DATADIR, "example.gtf.gz.tbi")
 
     def setUp(self):
 
-        self.tmpfilename = "tmp_%i.gtf.gz" % id(self)
+        self.tmpfilename = get_temp_filename(suffix="gtf.gz")
         shutil.copyfile(self.filename, self.tmpfilename)
 
-    def testIndexPreset(self):
+    def test_indexing_with_preset_works(self):
         '''test indexing via preset.'''
 
         pysam.tabix_index(self.tmpfilename, preset="gff")
-        checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx)
+        self.assertTrue(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
 
+    def test_indexing_with_explict_columns_works(self):
+        '''test indexing via preset.'''
+
+        pysam.tabix_index(self.tmpfilename,
+                          seq_col=0,
+                          start_col=3,
+                          end_col=4,
+                          line_skip=0,
+                          zerobased=False)
+        self.assertTrue(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
+
+    def test_indexing_with_lineskipping_works(self):
+        '''test indexing via preset and lineskip.'''
+        pysam.tabix_index(self.tmpfilename,
+                          seq_col=0,
+                          start_col=3,
+                          end_col=4,
+                          line_skip=1,
+                          zerobased=False)
+        self.assertFalse(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
+        
     def tearDown(self):
         os.unlink(self.tmpfilename)
         os.unlink(self.tmpfilename + ".tbi")
 
 
 class TestCompression(unittest.TestCase):
-    filename = os.path.join(DATADIR, "example.gtf.gz")
-    filename_idx = os.path.join(DATADIR, "example.gtf.gz.tbi")
+    filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
+    filename_idx = os.path.join(TABIX_DATADIR, "example.gtf.gz.tbi")
     preset = "gff"
 
     def setUp(self):
-
-        self.tmpfilename = "tmp_TestCompression_%i" % id(self)
+        self.tmpfilename = get_temp_filename(suffix="gtf")
         with gzip.open(self.filename, "rb") as infile, \
              open(self.tmpfilename, "wb") as outfile:
             outfile.write(infile.read())
@@ -116,30 +138,29 @@ class TestCompression(unittest.TestCase):
         checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)
 
     def tearDown(self):
-
-        try:
+        if os.path.exists(self.tmpfilename):
             os.unlink(self.tmpfilename)
+        if os.path.exists(self.tmpfilename + ".gz"):
             os.unlink(self.tmpfilename + ".gz")
+        if os.path.exists(self.tmpfilename + ".gz.tbi"):
             os.unlink(self.tmpfilename + ".gz.tbi")
-        except OSError:
-            pass
 
 
 class TestCompressionSam(TestCompression):
-    filename = os.path.join(DATADIR, "example.sam.gz")
-    filename_index = os.path.join(DATADIR, "example.sam.gz.tbi")
+    filename = os.path.join(TABIX_DATADIR, "example.sam.gz")
+    filename_index = os.path.join(TABIX_DATADIR, "example.sam.gz.tbi")
     preset = "sam"
 
 
 class TestCompressionBed(TestCompression):
-    filename = os.path.join(DATADIR, "example.bed.gz")
-    filename_index = os.path.join(DATADIR, "example.bed.gz.tbi")
+    filename = os.path.join(TABIX_DATADIR, "example.bed.gz")
+    filename_index = os.path.join(TABIX_DATADIR, "example.bed.gz.tbi")
     preset = "bed"
 
 
 class TestCompressionVCF(TestCompression):
-    filename = os.path.join(DATADIR, "example.vcf.gz")
-    filename_index = os.path.join(DATADIR, "example.vcf.gz.tbi")
+    filename = os.path.join(TABIX_DATADIR, "example.vcf.gz")
+    filename_index = os.path.join(TABIX_DATADIR, "example.vcf.gz.tbi")
     preset = "vcf"
 
 
@@ -219,7 +240,7 @@ class IterationTest(unittest.TestCase):
 
 class TestGZFile(IterationTest):
 
-    filename = os.path.join(DATADIR, "example.gtf.gz")
+    filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
     with_comments = True
 
     def setUp(self):
@@ -238,7 +259,7 @@ class TestIterationWithoutComments(IterationTest):
     '''test iterating with TabixFile.fetch() when
     there are no comments in the file.'''
 
-    filename = os.path.join(DATADIR,
+    filename = os.path.join(TABIX_DATADIR,
                             "example.gtf.gz")
 
     def setUp(self):
@@ -365,15 +386,14 @@ class TestIterationWithComments(TestIterationWithoutComments):
     Tests will create plenty of warnings on stderr.
     '''
 
-    filename = os.path.join(DATADIR, "example_comments.gtf.gz")
+    filename = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz")
 
     def setUp(self):
         TestIterationWithoutComments.setUp(self)
 
-
             
 class TestIterators(unittest.TestCase):
-    filename = os.path.join(DATADIR, "example.gtf.gz")
+    filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
 
     iterator = pysam.tabix_generic_iterator
     parser = pysam.asTuple
@@ -461,7 +481,7 @@ class TestIterationMalformattedGTFFiles(unittest.TestCase):
     def testGTFTooManyFields(self):
 
         with gzip.open(os.path.join(
-                DATADIR,
+                TABIX_DATADIR,
                 "gtf_toomany_fields.gtf.gz")) as infile:
             iterator = self.iterator(
                 infile,
@@ -471,7 +491,7 @@ class TestIterationMalformattedGTFFiles(unittest.TestCase):
     def testGTFTooFewFields(self):
 
         with gzip.open(os.path.join(
-                DATADIR,
+                TABIX_DATADIR,
                 "gtf_toofew_fields.gtf.gz")) as infile:
             iterator = self.iterator(
                 infile,
@@ -480,7 +500,7 @@ class TestIterationMalformattedGTFFiles(unittest.TestCase):
 
 
 class TestBed(unittest.TestCase):
-    filename = os.path.join(DATADIR, "example.bed.gz")
+    filename = os.path.join(TABIX_DATADIR, "example.bed.gz")
 
     def setUp(self):
 
@@ -525,10 +545,10 @@ class TestBed(unittest.TestCase):
 
 class TestVCF(unittest.TestCase):
 
-    filename = os.path.join(DATADIR, "example.vcf40")
+    filename = os.path.join(TABIX_DATADIR, "example.vcf40")
 
     def setUp(self):
-        self.tmpfilename = "tmp_%s.vcf" % id(self)
+        self.tmpfilename = get_temp_filename(suffix="vcf")
         shutil.copyfile(self.filename, self.tmpfilename)
         pysam.tabix_index(self.tmpfilename, preset="vcf")
 
@@ -543,13 +563,18 @@ if IS_PYTHON3:
 
         '''test reading from a file with non-ascii characters.'''
 
-        filename = os.path.join(DATADIR, "example_unicode.vcf")
+        filename = os.path.join(TABIX_DATADIR, "example_unicode.vcf")
 
         def setUp(self):
-            self.tmpfilename = "tmp_%s.vcf" % id(self)
+            self.tmpfilename = get_temp_filename(suffix="vcf")
             shutil.copyfile(self.filename, self.tmpfilename)
             pysam.tabix_index(self.tmpfilename, preset="vcf")
 
+        def tearDown(self):
+            os.unlink(self.tmpfilename + ".gz")
+            if os.path.exists(self.tmpfilename + ".gz.tbi"):
+                os.unlink(self.tmpfilename + ".gz.tbi")
+
         def testFromTabix(self):
 
             # use ascii encoding - should raise error
@@ -587,7 +612,8 @@ class TestVCFFromTabix(TestVCF):
 
     def tearDown(self):
         self.tabix.close()
-
+        TestVCF.tearDown(self)
+        
     def testRead(self):
 
         ncolumns = len(self.columns)
@@ -668,6 +694,7 @@ class TestVCFFromVCF(TestVCF):
     fail_on_parsing = (
         (5, "Flag fields should not have a value"),
         (9, "aouao"),
+        (12, "Error BAD_NUMBER_OF_PARAMETERS"),
         (13, "aoeu"),
         (18, "Error BAD_NUMBER_OF_PARAMETERS"),
         (24, "Error HEADING_NOT_SEPARATED_BY_TABS"))
@@ -693,6 +720,7 @@ class TestVCFFromVCF(TestVCF):
         self.compare = load_and_convert(self.filename, encode=False)
 
     def tearDown(self):
+        TestVCF.tearDown(self)
         self.vcf.close()
 
     def open_vcf(self, fn):
@@ -891,7 +919,7 @@ class TestVCFFromVCF(TestVCF):
 # Two samples are created -
 # 1. Testing pysam/tabix access
 # 2. Testing the VCF class
-vcf_files = glob.glob(os.path.join(DATADIR, "vcf", "*.vcf"))
+vcf_files = glob.glob(os.path.join(TABIX_DATADIR, "vcf", "*.vcf"))
 
 for vcf_file in vcf_files:
     n = "VCFFromTabixTest_%s" % os.path.basename(vcf_file[:-4])
@@ -973,6 +1001,7 @@ class TestVCFFromVariantFile(TestVCFFromVCF):
         if self.vcf:
             self.vcf.close()
         self.vcf = None
+        TestVCF.tearDown(self)
 
     def get_iterator(self):
         self.vcf = pysam.VariantFile(self.filename)
@@ -996,7 +1025,7 @@ class TestRemoteFileHTTP(unittest.TestCase):
 
     url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/example_htslib.gtf.gz"
     region = "chr1:1-1000"
-    local = os.path.join(DATADIR, "example.gtf.gz")
+    local = os.path.join(TABIX_DATADIR, "example.gtf.gz")
 
     def setUp(self):
         if not checkURL(self.url):
@@ -1037,9 +1066,9 @@ class TestRemoteFileHTTP(unittest.TestCase):
 
 class TestIndexArgument(unittest.TestCase):
 
-    filename_src = os.path.join(DATADIR, "example.vcf.gz")
+    filename_src = os.path.join(TABIX_DATADIR, "example.vcf.gz")
     filename_dst = "tmp_example.vcf.gz"
-    index_src = os.path.join(DATADIR, "example.vcf.gz.tbi")
+    index_src = os.path.join(TABIX_DATADIR, "example.vcf.gz.tbi")
     index_dst = "tmp_index_example.vcf.gz.tbi"
     preset = "vcf"
 
@@ -1086,34 +1115,34 @@ class TestBackwardsCompatibility(unittest.TestCase):
                 self.assertRaises(raises, tf.fetch)
 
     def testVCF0v23(self):
-        self.check(os.path.join(DATADIR, "example_0v23.vcf.gz"),
+        self.check(os.path.join(TABIX_DATADIR, "example_0v23.vcf.gz"),
                    ValueError)
 
     def testBED0v23(self):
-        self.check(os.path.join(DATADIR, "example_0v23.bed.gz"),
+        self.check(os.path.join(TABIX_DATADIR, "example_0v23.bed.gz"),
                    ValueError)
 
     def testVCF0v26(self):
-        self.check(os.path.join(DATADIR, "example_0v26.vcf.gz"),
+        self.check(os.path.join(TABIX_DATADIR, "example_0v26.vcf.gz"),
                    ValueError)
 
     def testBED0v26(self):
-        self.check(os.path.join(DATADIR, "example_0v26.bed.gz"),
+        self.check(os.path.join(TABIX_DATADIR, "example_0v26.bed.gz"),
                    ValueError)
 
     def testVCF(self):
-        self.check(os.path.join(DATADIR, "example.vcf.gz"))
+        self.check(os.path.join(TABIX_DATADIR, "example.vcf.gz"))
 
     def testBED(self):
-        self.check(os.path.join(DATADIR, "example.bed.gz"))
+        self.check(os.path.join(TABIX_DATADIR, "example.bed.gz"))
 
     def testEmpty(self):
-        self.check(os.path.join(DATADIR, "empty.bed.gz"))
+        self.check(os.path.join(TABIX_DATADIR, "empty.bed.gz"))
 
 
 class TestMultipleIterators(unittest.TestCase):
 
-    filename = os.path.join(DATADIR, "example.gtf.gz")
+    filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
 
     def testJoinedIterators(self):
 
@@ -1155,7 +1184,7 @@ class TestMultipleIterators(unittest.TestCase):
 
 class TestContextManager(unittest.TestCase):
 
-    filename = os.path.join(DATADIR, "example.gtf.gz")
+    filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
 
     def testManager(self):
 
diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py
index cff0e59..ff68c81 100644
--- a/tests/tabixproxies_test.py
+++ b/tests/tabixproxies_test.py
@@ -5,14 +5,12 @@ import sys
 import re
 import copy
 import gzip
-from TestUtils import load_and_convert
-
-DATADIR = 'tabix_data'
+from TestUtils import load_and_convert, TABIX_DATADIR
 
 
 class TestParser(unittest.TestCase):
 
-    filename = os.path.join(DATADIR, "example.gtf.gz")
+    filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
 
     def setUp(self):
 
@@ -126,6 +124,13 @@ class TestGTF(TestParser):
 
     parser = pysam.asGTF
 
+    def build_attribute_string(self, d):
+        """build attribute string from dictionary d"""
+        s = "; ".join(["{} \"{}\"".format(x, y) for (x, y) in d.items()]) + ";"
+        # remove quotes around numeric values
+        s = re.sub("\"(\d+)\"", r"\1", s)
+        return s
+    
     def testRead(self):
 
         for x, r in enumerate(self.tabix.fetch(parser=self.parser())):
@@ -266,11 +271,34 @@ class TestGTF(TestParser):
         r.score = -12
         self.assertEqual(str(r).split("\t")[5], "-12")
 
+    def test_asdict_contains_attributes(self):
+        r = self.tabix.fetch(parser=self.parser()).next()
+        d = r.as_dict()
+        c = self.compare[0]
+        s = self.build_attribute_string(d)
+        self.assertEqual(s, c[8])
+
+    def test_asdict_can_be_modified(self):
+        r = self.tabix.fetch(parser=self.parser()).next()
+        d = r.as_dict()
+        d["gene_id"] = "new_gene_id"
+        self.assertTrue("gene_id \"new_gene_id\"", str(r))
+
 
 class TestGFF3(TestGTF):
 
     parser = pysam.asGFF3
-    filename = os.path.join(DATADIR, "example.gff3.gz")
+    filename = os.path.join(TABIX_DATADIR, "example.gff3.gz")
+
+    def build_attribute_string(self, d):
+        """build attribute string from dictionary d"""
+        s = ";".join(["{}={}".format(x, y) for (x, y) in d.items()]) + ";"
+        return s
+
+    def build_attribute_string(self, d):
+        """build attribute string from dictionary d"""
+        s = ";".join(["{}={}".format(x, y) for (x, y) in d.items()]) + ";"
+        return s
 
     def testRead(self):
         for x, r in enumerate(self.tabix.fetch(parser=self.parser())):
@@ -313,6 +341,6 @@ class TestGFF3(TestGTF):
         r.new_text_attribute = "abc"
         self.assertTrue("new_text_attribute=abc" in str(r).split("\t")[8])
 
-
+        
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_samtools_python.py b/tests/test_samtools_python.py
index 1b915fd..f7a351b 100644
--- a/tests/test_samtools_python.py
+++ b/tests/test_samtools_python.py
@@ -1,24 +1,32 @@
 import pysam
+import os
+from TestUtils import BAM_DATADIR
+
 
 def test_idxstats_parse_split_lines():
-    bam_filename = "./pysam_data/ex2.bam"
-    lines = pysam.idxstats(bam_filename, split_lines=True)  # Test pysam 0.8.X style output, which returns a list of lines
+    bam_filename = os.path.join(BAM_DATADIR, "ex2.bam")
+    # Test pysam 0.8.X style output, which returns a list of lines
+    lines = pysam.idxstats(bam_filename, split_lines=True)  
     for line in lines:
         _seqname, _seqlen, nmapped, _nunmapped = line.split()
 
 
 def test_bedcov_split_lines():
-    bam_filename = "./pysam_data/ex1.bam"
-    bed_filename = "./pysam_data/ex1.bed"
-    lines = pysam.bedcov(bed_filename, bam_filename, split_lines=True)  # Test pysam 0.8.X style output, which returns a list of lines
+    bam_filename = os.path.join(BAM_DATADIR, "ex1.bam")
+    bed_filename = os.path.join(BAM_DATADIR, "ex1.bed")
+    # Test pysam 0.8.X style output, which returns a list of lines
+    lines = pysam.bedcov(bed_filename, bam_filename, split_lines=True)
     for line in lines:
         fields = line.split('\t')
-        assert len(fields) in [4, 5], "bedcov should give tab delimited output with 4 or 5 fields.  Split line (%s) gives %d fields." % (fields, len(fields))
+        assert len(fields) in [4, 5], \
+            ("bedcov should give tab delimited output with 4 or 5 fields. "
+             "Split line (%s) gives %d fields." % (fields, len(fields)))
 
 
 def test_idxstats_parse():
-    bam_filename = "./pysam_data/ex2.bam"
-    idxstats_string = pysam.idxstats(bam_filename, split_lines=False)  # Test pysam 0.9.X style output, which returns a string that needs to be split by \n
+    bam_filename = os.path.join(BAM_DATADIR, "ex2.bam")
+    # Test pysam 0.9.X style output, which returns a string that needs to be split by \n
+    idxstats_string = pysam.idxstats(bam_filename, split_lines=False)
     lines = idxstats_string.splitlines()
     for line in lines:
         splt = line.split("\t")
@@ -26,10 +34,13 @@ def test_idxstats_parse():
 
 
 def test_bedcov():
-    bam_filename = "./pysam_data/ex1.bam"
-    bed_filename = "./pysam_data/ex1.bed"
-    bedcov_string = pysam.bedcov(bed_filename, bam_filename, split_lines=False)  # Test pysam 0.9.X style output, which returns a string that needs to be split by \n
+    bam_filename = os.path.join(BAM_DATADIR, "ex1.bam")
+    bed_filename = os.path.join(BAM_DATADIR, "ex1.bed")
+    # Test pysam 0.9.X style output, which returns a string that needs to be split by \n
+    bedcov_string = pysam.bedcov(bed_filename, bam_filename, split_lines=False)  
     lines = bedcov_string.splitlines()
     for line in lines:
         fields = line.split('\t')
-        assert len(fields) in [4, 5], "bedcov should give tab delimited output with 4 or 5 fields.  Split line (%s) gives %d fields." % (fields, len(fields))
+        assert len(fields) in [4, 5], \
+            ("bedcov should give tab delimited output with 4 or 5 fields. "
+             "Split line (%s) gives %d fields." % (fields, len(fields)))

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pysam.git



More information about the debian-med-commit mailing list