[med-svn] [python-pysam] 01/04: Imported Upstream version 0.10.0+ds
Afif Elghraoui
afif at moszumanska.debian.org
Tue Jan 24 07:36:01 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository python-pysam.
commit dadf905780b2c683a23346e1eece6aa56717ca88
Author: Afif Elghraoui <afif at debian.org>
Date: Mon Jan 23 19:09:05 2017 -0800
Imported Upstream version 0.10.0+ds
---
.travis.yml | 4 +-
bcftools/vcfisec.c | 2 +-
bcftools/vcfisec.c.pysam.c | 2 +-
benchmark/cython_flagstat.py | 23 +
benchmark/python_flagstat.py | 23 +
buildwheels.sh | 69 +
cy_build.py | 2 -
doc/api.rst | 2 +-
doc/release.rst | 29 +
doc/usage.rst | 14 +-
pysam/__init__.py | 77 +-
pysam/chtslib.pyx | 19 -
...{calignedsegment.pxd => libcalignedsegment.pxd} | 4 +-
...{calignedsegment.pyx => libcalignedsegment.pyx} | 88 +-
.../{calignmentfile.pxd => libcalignmentfile.pxd} | 40 +-
.../{calignmentfile.pyx => libcalignmentfile.pyx} | 343 ++---
pysam/{cbcf.pxd => libcbcf.pxd} | 33 +-
pysam/{cbcf.pyx => libcbcf.pyx} | 1505 +++++++++++---------
pysam/libcbgzf.pyx | 209 +++
pysam/{cfaidx.pxd => libcfaidx.pxd} | 4 +-
pysam/{cfaidx.pyx => libcfaidx.pyx} | 13 +-
pysam/{chtslib.pxd => libchtslib.pxd} | 18 +
pysam/libchtslib.pyx | 265 ++++
pysam/{csamfile.pxd => libcsamfile.pxd} | 4 +-
pysam/{csamfile.pyx => libcsamfile.pyx} | 2 +-
pysam/{ctabix.pxd => libctabix.pxd} | 28 +-
pysam/{ctabix.pyx => libctabix.pyx} | 94 +-
pysam/{ctabixproxies.pxd => libctabixproxies.pxd} | 0
pysam/{ctabixproxies.pyx => libctabixproxies.pyx} | 4 +-
pysam/{cutils.pxd => libcutils.pxd} | 0
pysam/{cutils.pyx => libcutils.pyx} | 44 +-
pysam/{cvcf.pxd => libcvcf.pxd} | 0
pysam/{cvcf.pyx => libcvcf.pyx} | 12 +-
pysam/utils.py | 2 +-
pysam/version.py | 6 +-
requirements.txt | 2 +-
run_tests_travis.sh | 82 +-
samtools/sam_view.c.pysam.c | 4 +-
setup.py | 76 +-
tests/AlignedSegment_test.py | 39 +-
tests/AlignmentFile_test.py | 130 +-
tests/SamFile_test.py | 12 +-
tests/StreamFiledescriptors_test.py | 82 ++
tests/VariantFile_test.py | 80 +-
tests/_compile_test.pyx | 4 +-
tests/_cython_flagstat.pyx | 6 +-
tests/cython_flagstat.py | 11 -
tests/pysam_data/Makefile | 8 +-
tests/pysam_data/ex3.sam | 4 +-
tests/pysam_data/ex_spliced.sam | 297 ++++
tests/python_flagstat.py | 11 -
tests/samtools_test.py | 1 +
52 files changed, 2539 insertions(+), 1294 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 1482ed7..bfc5d1c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,14 +3,14 @@ os:
- osx
language: c
-sudo: required
+sudo: false
env:
matrix:
- CONDA_PY=2.7
- - CONDA_PY=3.3
- CONDA_PY=3.4
- CONDA_PY=3.5
+ - CONDA_PY=3.6
addons:
apt:
diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c
index 9afe620..9eb3a7c 100644
--- a/bcftools/vcfisec.c
+++ b/bcftools/vcfisec.c
@@ -317,7 +317,7 @@ static void init_data(args_t *args)
while (*p && *p!=',') p++;
if ( *p==',' ) p++;
}
- if ( args->nwrite>1 && !args->prefix ) error("Expected -p when mutliple output files given: --write %s\n", args->write_files);
+ if ( args->nwrite>1 && !args->prefix ) error("Expected -p when multiple output files given: --write %s\n", args->write_files);
if ( args->isec_op==OP_COMPLEMENT && args->nwrite )
{
if ( args->nwrite>1 ) error("Multiple files to -w make no sense with -C\n");
diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c
index 758d475..e3890d5 100644
--- a/bcftools/vcfisec.c.pysam.c
+++ b/bcftools/vcfisec.c.pysam.c
@@ -319,7 +319,7 @@ static void init_data(args_t *args)
while (*p && *p!=',') p++;
if ( *p==',' ) p++;
}
- if ( args->nwrite>1 && !args->prefix ) error("Expected -p when mutliple output files given: --write %s\n", args->write_files);
+ if ( args->nwrite>1 && !args->prefix ) error("Expected -p when multiple output files given: --write %s\n", args->write_files);
if ( args->isec_op==OP_COMPLEMENT && args->nwrite )
{
if ( args->nwrite>1 ) error("Multiple files to -w make no sense with -C\n");
diff --git a/benchmark/cython_flagstat.py b/benchmark/cython_flagstat.py
new file mode 100644
index 0000000..6a9b7df
--- /dev/null
+++ b/benchmark/cython_flagstat.py
@@ -0,0 +1,23 @@
+"""compute number of reads/alignments from BAM file
+===================================================
+
+This is a benchmarking utility script with limited functionality.
+
+Compute simple flag stats on a BAM-file using
+the pysam cython interface.
+
+"""
+
+import sys
+import pysam
+import pyximport
+pyximport.install()
+import _cython_flagstat
+
+assert len(sys.argv) == 2, "USAGE: {} filename.bam".format(sys.argv[0])
+
+is_paired, is_proper = _cython_flagstat.count(
+ pysam.AlignmentFile(sys.argv[1], "rb"))
+
+print ("there are alignments of %i paired reads" % is_paired)
+print ("there are %i proper paired alignments" % is_proper)
diff --git a/benchmark/python_flagstat.py b/benchmark/python_flagstat.py
new file mode 100644
index 0000000..0a14d80
--- /dev/null
+++ b/benchmark/python_flagstat.py
@@ -0,0 +1,23 @@
+"""compute number of reads/alignments from BAM file
+===================================================
+
+This is a benchmarking utility script with limited functionality.
+
+Compute simple flag stats on a BAM-file using
+the pysam python interface.
+"""
+
+import sys
+import pysam
+
+assert len(sys.argv) == 2, "USAGE: {} filename.bam".format(sys.argv[0])
+
+is_paired = 0
+is_proper = 0
+
+for read in pysam.AlignmentFile(sys.argv[1], "rb"):
+ is_paired += read.is_paired
+ is_proper += read.is_proper_pair
+
+print ("there are alignments of %i paired reads" % is_paired)
+print ("there are %i proper paired alignments" % is_proper)
diff --git a/buildwheels.sh b/buildwheels.sh
new file mode 100755
index 0000000..a5987f1
--- /dev/null
+++ b/buildwheels.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+#
+# Build manylinux1 wheels for pysam. Based on the example at
+# <https://github.com/pypa/python-manylinux-demo>
+#
+# It is best to run this in a fresh clone of the repository!
+#
+# Run this within the repository root:
+# docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh
+#
+# The wheels will be put into the wheelhouse/ subdirectory.
+#
+# For interactive tests:
+# docker run -it -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /bin/bash
+
+set -xeuo pipefail
+
+# For convenience, if this script is called from outside of a docker container,
+# it starts a container and runs itself inside of it.
+if ! grep -q docker /proc/1/cgroup; then
+ # We are not inside a container
+ exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0
+fi
+
+yum install -y zlib-devel
+
+# Python 2.6 is not supported
+rm -r /opt/python/cp26*
+
+# Python 3.3 builds fail with:
+# /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-CentOS-linux/4.8.2/ld: cannot find -lchtslib
+rm -r /opt/python/cp33*
+
+# Without libcurl support, htslib can open files from HTTP and FTP URLs.
+# With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a
+# current version of OpenSSL, and we do not want to be responsible for
+# updating the wheels as soon as there are any security issues. So disable
+# libcurl for now.
+# See also <https://github.com/pypa/manylinux/issues/74>.
+#
+export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
+
+PYBINS="/opt/python/*/bin"
+for PYBIN in ${PYBINS}; do
+ ${PYBIN}/pip install -r /io/requirements.txt
+ ${PYBIN}/pip wheel /io/ -w wheelhouse/
+done
+
+# Bundle external shared libraries into the wheels
+#
+# The '-L .' option is a workaround. By default, auditwheel puts all external
+# libraries (.so files) into a .libs directory and sets the RUNPATH to $ORIGIN/.libs.
+# When HTSLIB_MODE is 'shared' (now the default), then all so libraries part of
+# pysam require that the RUNPATH is set to $ORIGIN (without the .libs). It seems
+# auditwheel overwrites $ORIGIN with $ORIGIN/.libs. This workaround makes
+# auditwheel set the RUNPATH to "$ORIGIN/." and it will work as desired.
+#
+for whl in wheelhouse/*.whl; do
+ auditwheel repair -L . $whl -w /io/wheelhouse/
+done
+
+# Created files are owned by root, so fix permissions.
+chown -R --reference=/io/setup.py /io/wheelhouse/
+
+# TODO Install packages and test them
+#for PYBIN in ${PYBINS}; do
+# ${PYBIN}/pip install pysam --no-index -f /io/wheelhouse
+# (cd $HOME; ${PYBIN}/nosetests ...)
+#done
diff --git a/cy_build.py b/cy_build.py
index 880b5cc..29af588 100644
--- a/cy_build.py
+++ b/cy_build.py
@@ -16,7 +16,6 @@ if sys.platform == 'darwin':
config_vars = get_config_vars()
config_vars['LDSHARED'] = config_vars['LDSHARED'].replace('-bundle', '')
config_vars['SHLIB_EXT'] = '.so'
- config_vars['SO'] = '.so'
def is_pip_install():
@@ -61,7 +60,6 @@ class cy_build_ext(build_ext):
ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
if sys.platform == 'darwin':
-
relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"]
if "develop" in sys.argv or "test" in sys.argv:
diff --git a/doc/api.rst b/doc/api.rst
index 671fe4e..686c60d 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -201,7 +201,7 @@ Fastq files
:members:
-.. autoclass:: pysam.cfaidx.FastqProxy
+.. autoclass:: pysam.FastqProxy
:members:
diff --git a/doc/release.rst b/doc/release.rst
index f49b8f0..1d378f3 100644
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,6 +2,35 @@
Release notes
=============
+Release 0.10.0
+==============
+
+This release implements further functionality in the VariantFile API
+and includes several bugfixes:
+
+* treat special case -c option in samtools view outputs to stdout even
+ if -o given, fixes #315
+* permit reading BAM files with CSI index, closes #370
+* raise Error if query name exceeds maximum length, fixes #373
+* new method to compute hash value for AlignedSegment
+* AlignmentFile, VariantFile and TabixFile all inherit from HTSFile
+* Avoid segfault by detecting out of range reference_id and
+ next_reference in AlignedSegment.tostring
+* Issue #355: Implement streams using file descriptors for VariantFile
+* upgrade to htslib 1.3.2
+* fix compilation with musl libc
+* Issue #316, #360: Rename all Cython modules to have lib as a prefix
+* Issue #332, hardclipped bases in cigar included by
+ pysam.AlignedSegment.infer_query_length()
+* Added support for Python 3.6 filename encoding protocol
+* Issue #371, fix incorrect parsing of scalar INFO and FORMAT fields in VariantRecord
+* Issue #331, fix failure in VariantFile.reset() method
+* Issue #314, add VariantHeader.new_record(), VariantFile.new_record() and
+ VariantRecord.copy() methods to create new VariantRecord objects
+* Added VariantRecordFilter.add() method to allow setting new VariantRecord filters
+* Preliminary (potentially unsafe) support for removing and altering header metadata
+* Many minor fixes and improvements to VariantFile and related objects
+
Release 0.9.1
=============
diff --git a/doc/usage.rst b/doc/usage.rst
index 90e7688..936f3bd 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -197,12 +197,22 @@ be retrieved by numeric index:
for row in tbx.fetch("chr1", 1000, 2000):
print ("chromosome is", row[0])
-By providing a parser argument to :class:`~pysam.AlignmentFile.fetch`
+By providing a parser to :class:`~pysam.AlignmentFile.fetch`
or :class:`~pysam.TabixFile`, the data will be presented in parsed
-form:
+form::
for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asTuple()):
print ("chromosome is", row.contig)
+ print ("first field (chrom)=", row[0])
+
+Pre-built parsers are available for :term:`bed`
+(:class:`~pysam.asBed`) formatted files and :term:`gtf`
+(:class:`~pysam.asGTF`) formatted files. Thus, additional fields
+become available through named access, for example::
+
+ for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asBed()):
+ print ("name is", row.name)
+
.. Currently inactivated as pileup deprecated
.. Using the samtools SNP caller
diff --git a/pysam/__init__.py b/pysam/__init__.py
index d1b5d41..ed17e04 100644
--- a/pysam/__init__.py
+++ b/pysam/__init__.py
@@ -3,22 +3,24 @@ import sys
import sysconfig
from pysam.libchtslib import *
-from pysam.cutils import *
-import pysam.cutils as cutils
-import pysam.cfaidx as cfaidx
-from pysam.cfaidx import *
-import pysam.ctabix as ctabix
-from pysam.ctabix import *
-import pysam.csamfile as csamfile
-from pysam.csamfile import *
-import pysam.calignmentfile as calignmentfile
-from pysam.calignmentfile import *
-import pysam.calignedsegment as calignedsegment
-from pysam.calignedsegment import *
-import pysam.cvcf as cvcf
-from pysam.cvcf import *
-import pysam.cbcf as cbcf
-from pysam.cbcf import *
+from pysam.libcutils import *
+import pysam.libcutils as libcutils
+import pysam.libcfaidx as libcfaidx
+from pysam.libcfaidx import *
+import pysam.libctabix as libctabix
+from pysam.libctabix import *
+import pysam.libcsamfile as libcsamfile
+from pysam.libcsamfile import *
+import pysam.libcalignmentfile as libcalignmentfile
+from pysam.libcalignmentfile import *
+import pysam.libcalignedsegment as libcalignedsegment
+from pysam.libcalignedsegment import *
+import pysam.libcvcf as libcvcf
+from pysam.libcvcf import *
+import pysam.libcbcf as libcbcf
+from pysam.libcbcf import *
+import pysam.libcbgzf as libcbgzf
+from pysam.libcbgzf import *
from pysam.utils import SamtoolsError
import pysam.Pileup as Pileup
from pysam.samtools import *
@@ -28,14 +30,15 @@ import pysam.config
# export all the symbols from separate modules
__all__ = \
libchtslib.__all__ +\
- cutils.__all__ +\
- ctabix.__all__ +\
- cvcf.__all__ +\
- cbcf.__all__ +\
- cfaidx.__all__ +\
- calignmentfile.__all__ +\
- calignedsegment.__all__ +\
- csamfile.__all__ +\
+ libcutils.__all__ +\
+ libctabix.__all__ +\
+ libcvcf.__all__ +\
+ libcbcf.__all__ +\
+ libcbgzf.__all__ +\
+ libcfaidx.__all__ +\
+ libcalignmentfile.__all__ +\
+ libcalignedsegment.__all__ +\
+ libcsamfile.__all__ +\
["SamtoolsError"] +\
["Pileup"]
@@ -75,25 +78,17 @@ def get_defines():
def get_libraries():
'''return a list of libraries to link against.'''
- # Note that this list does not include csamtools.so as there are
+ # Note that this list does not include libcsamtools.so as there are
# numerous name conflicts with libchtslib.so.
dirname = os.path.abspath(os.path.join(os.path.dirname(__file__)))
- pysam_libs = ['ctabixproxies',
- 'cfaidx',
- 'csamfile',
- 'cvcf',
- 'cbcf',
- 'ctabix']
+ pysam_libs = ['libctabixproxies',
+ 'libcfaidx',
+ 'libcsamfile',
+ 'libcvcf',
+ 'libcbcf',
+ 'libctabix']
if pysam.config.HTSLIB == "builtin":
pysam_libs.append('libchtslib')
- if sys.version_info.major >= 3:
- if sys.version_info.minor >= 5:
- return [os.path.join(dirname, x + ".{}.so".format(
- sysconfig.get_config_var('SOABI'))) for x in pysam_libs]
- else:
- return [os.path.join(dirname, x + ".{}{}.so".format(
- sys.implementation.cache_tag,
- sys.abiflags)) for x in pysam_libs]
- else:
- return [os.path.join(dirname, x + ".so") for x in pysam_libs]
+ so = sysconfig.get_config_var('SO')
+ return [os.path.join(dirname, x + so) for x in pysam_libs]
diff --git a/pysam/chtslib.pyx b/pysam/chtslib.pyx
deleted file mode 100644
index eab229f..0000000
--- a/pysam/chtslib.pyx
+++ /dev/null
@@ -1,19 +0,0 @@
-# cython: embedsignature=True
-# cython: profile=True
-# adds doc-strings for sphinx
-from pysam.chtslib cimport *
-
-cpdef set_verbosity(int verbosity):
- u"""Set htslib's hts_verbose global variable to the specified value.
- """
- return hts_set_verbosity(verbosity)
-
-cpdef get_verbosity():
- u"""Return the value of htslib's hts_verbose global variable.
- """
- return hts_get_verbosity()
-
-__all__ = [
- "get_verbosity",
- "set_verbosity"]
-
diff --git a/pysam/calignedsegment.pxd b/pysam/libcalignedsegment.pxd
similarity index 97%
rename from pysam/calignedsegment.pxd
rename to pysam/libcalignedsegment.pxd
index 0880bef..f1d59d1 100644
--- a/pysam/calignedsegment.pxd
+++ b/pysam/libcalignedsegment.pxd
@@ -1,4 +1,4 @@
-from pysam.chtslib cimport *
+from pysam.libchtslib cimport *
cdef extern from "htslib_util.h":
@@ -32,7 +32,7 @@ cdef extern from "htslib_util.h":
void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
-from pysam.calignmentfile cimport AlignmentFile
+from pysam.libcalignmentfile cimport AlignmentFile
ctypedef AlignmentFile AlignmentFile_t
diff --git a/pysam/calignedsegment.pyx b/pysam/libcalignedsegment.pyx
similarity index 97%
rename from pysam/calignedsegment.pyx
rename to pysam/libcalignedsegment.pyx
index f4e0750..c95bb13 100644
--- a/pysam/calignedsegment.pyx
+++ b/pysam/libcalignedsegment.pyx
@@ -65,9 +65,9 @@ from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize
from libc.string cimport strchr
from cpython cimport array as c_array
-from pysam.cutils cimport force_bytes, force_str, \
+from pysam.libcutils cimport force_bytes, force_str, \
charptr_to_str, charptr_to_bytes
-from pysam.cutils cimport qualities_to_qualitystring, qualitystring_to_array, \
+from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \
array_to_qualitystring
# Constants for binary tag conversion
@@ -88,6 +88,12 @@ else:
CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
#####################################################################
+# C multiplication with wrapping around
+cdef inline uint32_t c_mul(uint32_t a, uint32_t b):
+ return (a * b) & 0xffffffff
+
+
+#####################################################################
# typecode guessing
cdef inline char map_typecode_htslib_to_python(uint8_t s):
"""map an htslib typecode to the corresponding python typecode
@@ -230,7 +236,7 @@ cdef inline packTags(tags):
to be used in a call to struct.pack_into.
"""
fmts, args = ["<"], []
-
+
cdef char array_typecode
datatype2format = {
@@ -284,7 +290,7 @@ cdef inline packTags(tags):
.format(value.typecode))
valuetype = force_bytes(chr(array_typecode))
-
+
if valuetype not in datatype2format:
raise ValueError("invalid value type '%s' (%s)" %
(valuetype, type(valuetype)))
@@ -337,9 +343,12 @@ cdef inline int32_t calculateQueryLength(bam1_t * src):
for k from 0 <= k < pysam_get_n_cigar(src):
op = cigar_p[k] & BAM_CIGAR_MASK
- if op == BAM_CMATCH or op == BAM_CINS or \
+ if op == BAM_CMATCH or \
+ op == BAM_CINS or \
op == BAM_CSOFT_CLIP or \
- op == BAM_CEQUAL or op == BAM_CDIFF:
+ op == BAM_CHARD_CLIP or \
+ op == BAM_CEQUAL or \
+ op == BAM_CDIFF:
qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
return qpos
@@ -513,7 +522,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
Positions corresponding to `N` (skipped region from the reference)
in the CIGAR string will not appear in the returned sequence. The
MD should correspondingly not contain these. Thus proper tags are::
-
+
Deletion from the reference: cigar=5M1D5M MD=5^C5
Skipped region from reference: cigar=5M1N5M MD=10
@@ -555,7 +564,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
l = cigar_p[k] >> BAM_CIGAR_SHIFT
if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
for i from 0 <= i < l:
- s[s_idx] = read_sequence[r_idx]
+ s[s_idx] = read_sequence[r_idx]
r_idx += 1
s_idx += 1
elif op == BAM_CDEL:
@@ -579,7 +588,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
"Padding (BAM_CPAD, 6) is currently not supported. "
"Please implement. Sorry about that.")
- cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
+ cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
if md_tag_ptr == NULL:
seq = PyBytes_FromStringAndSize(s, s_idx)
free(s)
@@ -665,6 +674,13 @@ cdef class AlignedSegment:
self._delegate.data = <uint8_t *>calloc(
self._delegate.m_data, 1)
self._delegate.l_data = 0
+ # set some data to make read approximately legit.
+ # Note, SAM writing fails with q_name of length 0
+ self._delegate.core.l_qname = 0
+ self._delegate.core.tid = -1
+ self._delegate.core.pos = -1
+ self._delegate.core.mtid = -1
+ self._delegate.core.mpos = -1
# caching for selected fields
self.cache_query_qualities = None
@@ -751,18 +767,17 @@ cdef class AlignedSegment:
return NotImplemented
def __hash__(self):
- cdef bam1_t * src
- src = self._delegate
- # shift and xor values in the core structure
- # make sure tid and mtid are shifted by different amounts
- # should variable length data be included?
- cdef uint32_t hash_value = src.core.tid << 24 ^ \
- src.core.pos << 16 ^ \
- src.core.qual << 8 ^ \
- src.core.flag ^ \
- src.core.isize << 24 ^ \
- src.core.mtid << 16 ^ \
- src.core.mpos << 8
+ cdef bam1_t * src = self._delegate
+ cdef int x
+
+ # see http://effbot.org/zone/python-hash.htm
+ cdef uint8_t * c = <uint8_t *>&src.core
+ cdef uint32_t hash_value = c[0]
+ for x from 1 <= x < sizeof(bam1_core_t):
+ hash_value = c_mul(hash_value, 1000003) ^ c[x]
+ c = <uint8_t *>src.data
+ for x from 0 <= x < src.l_data:
+ hash_value = c_mul(hash_value, 1000003) ^ c[x]
return hash_value
@@ -775,8 +790,13 @@ cdef class AlignedSegment:
----------
htsfile -- AlignmentFile object to map numerical
- identifers to chromosome names.
+ identifiers to chromosome names.
"""
+ cdef int n_targets = htsfile.header.n_targets
+
+ if self._delegate.core.tid >= n_targets \
+ or self._delegate.core.mtid >= n_targets:
+ raise ValueError('htsfile does not match aligned segment')
cdef kstring_t line
line.l = line.m = 0
@@ -788,7 +808,7 @@ cdef class AlignedSegment:
raise ValueError('sam_format failed')
ret = force_str(line.s[:line.l])
-
+
if line.m:
free(line.s)
@@ -808,6 +828,11 @@ cdef class AlignedSegment:
def __set__(self, qname):
if qname is None or len(qname) == 0:
return
+
+ if len(qname) >= 255:
+ raise ValueError("query length out of range {} > 254".format(
+ len(qname)))
+
qname = force_bytes(qname)
cdef bam1_t * src
cdef int l
@@ -823,7 +848,6 @@ cdef class AlignedSegment:
l,
<uint8_t*>p)
-
pysam_set_l_qname(src, l)
# re-acquire pointer to location in memory
@@ -955,7 +979,7 @@ cdef class AlignedSegment:
in the BAM/SAM file. The length of a query is 0 if there is no
sequence in the BAM/SAM file. In those cases, the read length
can be inferred from the CIGAR alignment, see
- :meth:`pysam.AlignmentFile.infer_query_length.`.
+ :meth:`pysam.AlignedSegment.infer_query_length`.
The length includes soft-clipped bases and is equal to
``len(query_sequence)``.
@@ -1669,7 +1693,7 @@ cdef class AlignedSegment:
each cigar operation.
"""
-
+
cdef int nfields = NCIGAR_CODES + 1
cdef c_array.array base_counts = array.array(
@@ -2288,7 +2312,7 @@ cdef class AlignedSegment:
cdef class PileupColumn:
- '''A pileup of reads at a particular reference sequence postion
+ '''A pileup of reads at a particular reference sequence position
(:term:`column`). A pileup column contains all the reads that map
to a certain target base.
@@ -2416,11 +2440,11 @@ cdef class PileupRead:
return self._qpos
property indel:
- """indel length for the position follwing the current pileup site.
+ """indel length for the position following the current pileup site.
This quantity peeks ahead to the next cigar operation in this
- alignment. If the next operation is and insertion, indel will
- be positve. If the next operation is a deletion, it will be
+ alignment. If the next operation is an insertion, indel will
+ be positive. If the next operation is a deletion, it will be
negative. 0 if the next operation is not an indel.
"""
@@ -2428,7 +2452,8 @@ cdef class PileupRead:
return self._indel
property level:
- """the level of the read in the "viewer" mode"""
+ """the level of the read in the "viewer" mode. Note that this value
+ is currently not computed."""
def __get__(self):
return self._level
@@ -2448,6 +2473,7 @@ cdef class PileupRead:
return self._is_tail
property is_refskip:
+ """1 iff the base on the padded read is part of CIGAR N op."""
def __get__(self):
return self._is_refskip
diff --git a/pysam/calignmentfile.pxd b/pysam/libcalignmentfile.pxd
similarity index 83%
rename from pysam/calignmentfile.pxd
rename to pysam/libcalignmentfile.pxd
index 3384e7e..6f32f47 100644
--- a/pysam/calignmentfile.pxd
+++ b/pysam/libcalignmentfile.pxd
@@ -4,9 +4,9 @@ from libc.stdlib cimport malloc, calloc, realloc, free
from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
from libc.stdio cimport FILE, printf
-from pysam.cfaidx cimport faidx_t, Fastafile
-from pysam.calignedsegment cimport AlignedSegment
-from pysam.chtslib cimport *
+from pysam.libcfaidx cimport faidx_t, Fastafile
+from pysam.libcalignedsegment cimport AlignedSegment
+from pysam.libchtslib cimport *
from cpython cimport array
cimport cython
@@ -36,33 +36,16 @@ ctypedef struct __iterdata:
int seq_len
-cdef class AlignmentFile:
-
- cdef object _filename
- cdef object _reference_filename
-
- # pointer to htsFile structure
- cdef htsFile * htsfile
+cdef class AlignmentFile(HTSFile):
+ cdef readonly object reference_filename
# pointer to index
cdef hts_idx_t *index
# header structure
cdef bam_hdr_t * header
- # true if file is bam format
- cdef readonly bint is_bam
- # true if file is bam format
- cdef readonly bint is_cram
- # true if not a file but a stream
- cdef readonly bint is_stream
- # true if file is not on the local filesystem
- cdef readonly bint is_remote
+
# current read within iteration
cdef bam1_t * b
- # file opening mode
- cdef char * mode
-
- # beginning of read section
- cdef int64_t start_offset
cdef bam1_t * getCurrent(self)
cdef int cnext(self)
@@ -70,12 +53,14 @@ cdef class AlignmentFile:
# write an aligned read
cpdef int write(self, AlignedSegment read) except -1
+
cdef class PileupColumn:
cdef bam_pileup1_t ** plp
cdef int tid
cdef int pos
cdef int n_pu
+
cdef class PileupRead:
cdef AlignedSegment _alignment
cdef int32_t _qpos
@@ -86,6 +71,7 @@ cdef class PileupRead:
cdef uint32_t _is_tail
cdef uint32_t _is_refskip
+
cdef class IteratorRow:
cdef int retval
cdef bam1_t * b
@@ -94,6 +80,7 @@ cdef class IteratorRow:
cdef bam_hdr_t * header
cdef int owns_samfile
+
cdef class IteratorRowRegion(IteratorRow):
cdef hts_itr_t * iter
cdef bam1_t * getCurrent(self)
@@ -109,16 +96,19 @@ cdef class IteratorRowAll(IteratorRow):
cdef bam1_t * getCurrent(self)
cdef int cnext(self)
+
cdef class IteratorRowAllRefs(IteratorRow):
cdef int tid
cdef IteratorRowRegion rowiter
+
cdef class IteratorRowSelection(IteratorRow):
cdef int current_pos
cdef positions
cdef bam1_t * getCurrent(self)
cdef int cnext(self)
+
cdef class IteratorColumn:
# result of the last plbuf_push
@@ -147,18 +137,20 @@ cdef class IteratorColumn:
cdef reset(self, tid, start, end)
cdef _free_pileup_iter(self)
+
cdef class IteratorColumnRegion(IteratorColumn):
cdef int start
cdef int end
cdef int truncate
+
cdef class IteratorColumnAllRefs(IteratorColumn):
pass
+
cdef class IndexedReads:
cdef AlignmentFile samfile
cdef htsFile * htsfile
cdef index
cdef int owns_samfile
cdef bam_hdr_t * header
-
diff --git a/pysam/calignmentfile.pyx b/pysam/libcalignmentfile.pyx
similarity index 92%
rename from pysam/calignmentfile.pyx
rename to pysam/libcalignmentfile.pyx
index ed5e584..2161f87 100644
--- a/pysam/calignmentfile.pyx
+++ b/pysam/libcalignmentfile.pyx
@@ -58,13 +58,15 @@ import re
import warnings
import array
+from libc.errno cimport errno, EPIPE
+from libc.string cimport strcmp, strpbrk, strerror
from cpython cimport array as c_array
from cpython.version cimport PY_MAJOR_VERSION
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-from pysam.calignedsegment cimport makeAlignedSegment, makePileupColumn
-from pysam.chtslib cimport hisremote
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
+from pysam.libchtslib cimport HTSFile, hisremote
if PY_MAJOR_VERSION >= 3:
from io import StringIO
@@ -97,7 +99,8 @@ VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
# default type conversions within SAM header records
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
"SQ" : {"SN" : str, "LN" : int, "AS" : str,
- "M5" : str, "SP" : str, "UR" : str,},
+ "M5" : str, "SP" : str, "UR" : str,
+ "AH" : str,},
"RG" : {"ID" : str, "CN" : str, "DS" : str,
"DT" : str, "FO" : str, "KS" : str,
"LB" : str, "PG" : str, "PI" : str,
@@ -110,7 +113,7 @@ KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
# the end as parsing a CL will ignore any subsequent records.
VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"),
"SQ" : ("SN", "LN", "AS", "M5",
- "UR", "SP"),
+ "UR", "SP", "AH"),
"RG" : ("ID", "CN", "SM", "LB",
"PU", "PI", "DT", "DS",
"PL", "FO", "KS", "PG",
@@ -216,11 +219,11 @@ cdef bam_hdr_t * build_header(new_header):
return dest
-cdef class AlignmentFile:
+cdef class AlignmentFile(HTSFile):
"""AlignmentFile(filepath_or_object, mode=None, template=None,
reference_names=None, reference_lengths=None, text=NULL,
header=None, add_sq_text=False, check_header=True, check_sq=True,
- reference_filename=None, filename=None)
+ reference_filename=None, filename=None, duplicate_filehandle=True)
A :term:`SAM`/:term:`BAM` formatted file.
@@ -323,16 +326,23 @@ cdef class AlignmentFile:
Alternative to filepath_or_object. Filename of the file
to be opened.
+ duplicate_filehandle: bool
+ By default, file handles passed either directly or through
+ File-like objects will be duplicated before passing them to
+ htslib. The duplication prevents issues where the same stream
+ will be closed by htslib and through destruction of the
+ high-level python object. Set to False to turn off
+ duplication.
+
"""
def __cinit__(self, *args, **kwargs):
-
self.htsfile = NULL
- self._filename = None
- self.is_bam = False
+ self.filename = None
+ self.mode = None
self.is_stream = False
- self.is_cram = False
self.is_remote = False
+ self.index = NULL
if "filename" in kwargs:
args = [kwargs["filename"]]
@@ -343,10 +353,6 @@ cdef class AlignmentFile:
# allocate memory for iterator
self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
- def is_open(self):
- '''return true if htsfile has been opened.'''
- return self.htsfile != NULL
-
def has_index(self):
"""return true if htsfile has an existing (and opened) index.
"""
@@ -365,7 +371,7 @@ cdef class AlignmentFile:
if htsfile is closed or index could not be opened.
"""
- if not self.is_open():
+ if not self.is_open:
raise ValueError("I/O operation on closed file")
if not self.is_bam and not self.is_cram:
raise AttributeError(
@@ -391,16 +397,17 @@ cdef class AlignmentFile:
check_sq=True,
filepath_index=None,
referencenames=None,
- referencelengths=None):
+ referencelengths=None,
+ duplicate_filehandle=True):
'''open a sam, bam or cram formatted file.
If _open is called on an existing file, the current file
will be closed and a new file will be opened.
'''
- cdef char *cfilename
- cdef char *creference_filename
- cdef char *cindexname
- cdef char *cmode
+ cdef char *cfilename = NULL
+ cdef char *creference_filename = NULL
+ cdef char *cindexname = NULL
+ cdef char *cmode = NULL
# for backwards compatibility:
if referencenames is not None:
@@ -408,6 +415,10 @@ cdef class AlignmentFile:
if referencelengths is not None:
reference_lengths = referencelengths
+ # close a previously opened file
+ if self.is_open:
+ self.close()
+
# autodetection for read
if mode is None:
mode = "r"
@@ -417,38 +428,46 @@ cdef class AlignmentFile:
"rc", "wc"), \
"invalid file opening mode `%s`" % mode
- # close a previously opened file
- if self.htsfile != NULL:
- self.close()
+ self.duplicate_filehandle = duplicate_filehandle
# StringIO not supported
if isinstance(filepath_or_object, StringIO):
- filename = "stringio"
raise NotImplementedError(
"access from StringIO objects not supported")
- if filepath_or_object.closed:
- raise ValueError('I/O operation on closed StringIO object')
- # check if we are working with a File object
+ # reading from a file descriptor
+ elif isinstance(filepath_or_object, int):
+ self.filename = filepath_or_object
+ filename = None
+ self.is_remote = False
+ self.is_stream = True
+ # reading from a File object or other object with fileno
elif hasattr(filepath_or_object, "fileno"):
- filename = filepath_or_object.name
if filepath_or_object.closed:
raise ValueError('I/O operation on closed file')
+ self.filename = filepath_or_object
+ # .name can be TextIOWrapper
+ try:
+ filename = encode_filename(str(filepath_or_object.name))
+ cfilename = filename
+ except AttributeError:
+ filename = None
+ self.is_remote = False
+ self.is_stream = True
+ # what remains is a filename
else:
- filename = filepath_or_object
+ self.filename = filename = encode_filename(filepath_or_object)
+ cfilename = filename
+ self.is_remote = hisremote(cfilename)
+ self.is_stream = self.filename == b'-'
# for htslib, wbu seems to not work
if mode == "wbu":
mode = "wb0"
- cdef bytes bmode = mode.encode('ascii')
- self._filename = filename = encode_filename(filename)
- self._reference_filename = reference_filename = encode_filename(
+ self.mode = force_bytes(mode)
+ self.reference_filename = reference_filename = encode_filename(
reference_filename)
- # FIXME: Use htsFormat when it is available
- self.is_stream = filename == b"-"
- self.is_remote = hisremote(filename)
-
cdef char * ctext
cdef hFILE * fp
ctext = NULL
@@ -477,10 +496,8 @@ cdef class AlignmentFile:
n = 0
for x in reference_names:
n += len(x) + 1
- self.header.target_name = <char**>calloc(
- n, sizeof(char*))
- self.header.target_len = <uint32_t*>calloc(
- n, sizeof(uint32_t))
+ self.header.target_name = <char**>calloc(n, sizeof(char*))
+ self.header.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
for x from 0 <= x < self.header.n_targets:
self.header.target_len[x] = reference_lengths[x]
name = reference_names[x]
@@ -507,57 +524,34 @@ cdef class AlignmentFile:
strlen(ctext), sizeof(char))
memcpy(self.header.text, ctext, strlen(ctext))
- # open file (hts_open is synonym with sam_open)
- cfilename, cmode = filename, bmode
- if hasattr(filepath_or_object, "fileno"):
- fp = hdopen(filepath_or_object.fileno(), cmode)
- with nogil:
- self.htsfile = hts_hopen(fp, cfilename, cmode)
- else:
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- # htsfile.format does not get set until writing, so use
- # the format specifier explicitely given by the user.
- self.is_bam = "b" in mode
- self.is_cram = "c" in mode
+ self.htsfile = self._open_htsfile()
# set filename with reference sequences. If no filename
# is given, the CRAM reference arrays will be built from
# the @SQ header in the header
- if self.is_cram and reference_filename:
+ if "c" in mode and reference_filename:
# note that fn_aux takes ownership, so create a copy
- self.htsfile.fn_aux = strdup(self._reference_filename)
+ self.htsfile.fn_aux = strdup(self.reference_filename)
# write header to htsfile
- if self.is_bam or self.is_cram or "h" in mode:
+ if "b" in mode or "c" in mode or "h" in mode:
with nogil:
sam_hdr_write(self.htsfile, self.header)
elif mode[0] == "r":
# open file for reading
- if (filename != b"-"
- and not self.is_remote
- and not os.path.exists(filename)):
- raise IOError("file `%s` not found" % filename)
-
- # open file (hts_open is synonym with sam_open)
- cfilename, cmode = filename, bmode
- if hasattr(filepath_or_object, "fileno"):
- fp = hdopen(filepath_or_object.fileno(), cmode)
- with nogil:
- self.htsfile = hts_hopen(fp, cfilename, cmode)
- else:
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
+ if not self._exists():
+ raise IOError("file `%s` not found" % self.filename)
+
+ self.htsfile = self._open_htsfile()
if self.htsfile == NULL:
raise ValueError(
"could not open file (mode='%s') - "
"is it SAM/BAM format?" % mode)
- self.is_bam = self.htsfile.format.format == bam
- self.is_cram = self.htsfile.format.format == cram
+ if self.htsfile.format.category != sequence_data:
+ raise ValueError("file does not contain alignment data")
# bam files require a valid header
if self.is_bam or self.is_cram:
@@ -581,7 +575,7 @@ cdef class AlignmentFile:
# set filename with reference sequences
if self.is_cram and reference_filename:
- creference_filename = self._reference_filename
+ creference_filename = self.reference_filename
hts_set_opt(self.htsfile,
CRAM_OPT_REFERENCE,
creference_filename)
@@ -592,8 +586,7 @@ cdef class AlignmentFile:
"is it SAM/BAM format? Consider opening with "
"check_sq=False") % mode)
- if self.htsfile == NULL:
- raise IOError("could not open file `%s`" % filename )
+ assert self.htsfile != NULL
# check for index and open if present
cdef int format_index = -1
@@ -603,11 +596,8 @@ cdef class AlignmentFile:
format_index = HTS_FMT_CRAI
if mode[0] == "r" and (self.is_bam or self.is_cram):
-
# open index for remote files
if self.is_remote and not filepath_index:
- cfilename = filename
-
with nogil:
self.index = hts_idx_load(cfilename, format_index)
if self.index == NULL:
@@ -615,17 +605,18 @@ cdef class AlignmentFile:
"unable to open remote index for '%s'" % cfilename)
else:
has_index = True
- cfilename = filename
if filepath_index:
if not os.path.exists(filepath_index):
warnings.warn(
"unable to open index at %s" % cfilename)
self.index = NULL
has_index = False
- else:
+ elif filename is not None:
if self.is_bam \
and not os.path.exists(filename + b".bai") \
- and not os.path.exists(filename[:-4] + b".bai"):
+ and not os.path.exists(filename[:-4] + b".bai") \
+ and not os.path.exists(filename + b".csi") \
+ and not os.path.exists(filename[:-4] + b".csi"):
self.index = NULL
has_index = False
elif self.is_cram \
@@ -633,6 +624,9 @@ cdef class AlignmentFile:
and not os.path.exists(filename[:-5] + b".crai"):
self.index = NULL
has_index = False
+ else:
+ self.index = NULL
+ has_index = False
if has_index:
# returns NULL if there is no index or index could
@@ -643,7 +637,6 @@ cdef class AlignmentFile:
self.index = sam_index_load2(self.htsfile,
cfilename,
cindexname)
-
else:
with nogil:
self.index = sam_index_load(self.htsfile,
@@ -664,7 +657,7 @@ cdef class AlignmentFile:
returns -1 if reference is not known.
"""
- if not self.is_open():
+ if not self.is_open:
raise ValueError("I/O operation on closed file")
reference = force_bytes(reference)
return bam_name2id(self.header, reference)
@@ -673,78 +666,13 @@ cdef class AlignmentFile:
"""
return :term:`reference` name corresponding to numerical :term:`tid`
"""
- if not self.is_open():
+ if not self.is_open:
raise ValueError("I/O operation on closed file")
if not 0 <= tid < self.header.n_targets:
raise ValueError("reference_id %i out of range 0<=tid<%i" %
(tid, self.header.n_targets))
return charptr_to_str(self.header.target_name[tid])
- def reset(self):
- """reset file position to beginning of file just after
- the header.
-
- Returns
- -------
-
- The file position after moving the file pointer.
-
- """
- return self.seek(self.start_offset, 0)
-
- def seek(self, uint64_t offset, int where=0):
- """move file pointer to position `offset`, see
- :meth:`pysam.AlignmentFile.tell`.
-
- Parameters
- ----------
-
- offset : int
-
- position of the read/write pointer within the file.
-
- where : int
-
- optional and defaults to 0 which means absolute file
- positioning, other values are 1 which means seek relative to
- the current position and 2 means seek relative to the file's
- end.
-
- Returns
- -------
-
- the file position after moving the file pointer
-
- """
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not self.is_bam:
- raise NotImplementedError(
- "seek only available in bam files")
- if self.is_stream:
- raise OSError("seek no available in streams")
-
- cdef uint64_t pos
- with nogil:
- pos = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, where)
- return pos
-
- def tell(self):
- """
- return current file position.
- """
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not (self.is_bam or self.is_cram):
- raise NotImplementedError(
- "seek only available in bam files")
-
- cdef uint64_t pos
- with nogil:
- pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
- return pos
-
def parse_region(self,
reference=None,
start=None,
@@ -891,7 +819,7 @@ cdef class AlignmentFile:
"""
cdef int rtid, rstart, rend, has_coord
- if not self.is_open():
+ if not self.is_open:
raise ValueError( "I/O operation on closed file" )
has_coord, rtid, rstart, rend = self.parse_region(
@@ -1055,7 +983,7 @@ cdef class AlignmentFile:
----------
stepper : string
- The stepper controlls how the iterator advances.
+ The stepper controls how the iterator advances.
Possible options for the stepper are
``all``
@@ -1092,7 +1020,7 @@ cdef class AlignmentFile:
"""
cdef int rtid, rstart, rend, has_coord
- if not self.is_open():
+ if not self.is_open:
raise ValueError("I/O operation on closed file")
has_coord, rtid, rstart, rend = self.parse_region(
@@ -1177,8 +1105,8 @@ cdef class AlignmentFile:
cdef AlignedSegment read
cdef long counter = 0
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
cdef int filter_method = 0
if read_callback == "all":
@@ -1329,13 +1257,48 @@ cdef class AlignmentFile:
return count_a, count_c, count_g, count_t
+ def find_introns(self, read_iterator):
+ """Return a dictionary {(start, stop): count}
+ listing the intronic sites in the reads (identified by 'N' in the cigar strings),
+ and their support (= number of reads).
+
+ read_iterator can be the result of a .fetch(...) call.
+ Or it can be a generator filtering such reads. Example:
+ samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse))
+ """
+ import collections
+ res = collections.Counter()
+ for r in read_iterator:
+ if 'N' in r.cigarstring:
+ last_read_pos = False
+ for read_loc, genome_loc in r.get_aligned_pairs():
+ if read_loc is None and last_read_pos:
+ start = genome_loc
+ elif read_loc and last_read_pos is None:
+ stop = genome_loc # we are right-exclusive, so this is correct
+ res[(start, stop)] += 1
+ del start
+ del stop
+ last_read_pos = read_loc
+ return res
+
def close(self):
'''
closes the :class:`pysam.AlignmentFile`.'''
- if self.htsfile != NULL:
- hts_close(self.htsfile)
- hts_idx_destroy(self.index);
- self.htsfile = NULL
+
+ if self.htsfile == NULL:
+ return
+
+ cdef int ret = hts_close(self.htsfile)
+ hts_idx_destroy(self.index)
+ self.htsfile = NULL
+
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
def __dealloc__(self):
# remember: dealloc cannot call other methods
@@ -1349,14 +1312,24 @@ cdef class AlignmentFile:
# AH: I have removed the call to close. Even though it is working,
# it seems to be dangerous according to the documentation as the
# object be partially deconstructed already.
+ cdef int ret = 0
+
if self.htsfile != NULL:
- hts_close(self.htsfile)
+ ret = hts_close(self.htsfile)
hts_idx_destroy(self.index);
self.htsfile = NULL
bam_destroy1(self.b)
if self.header != NULL:
bam_hdr_destroy(self.header)
+
+
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
cpdef int write(self, AlignedSegment read) except -1:
'''
@@ -1373,7 +1346,7 @@ cdef class AlignmentFile:
int : the number of bytes written. If the file is closed,
this will be 0.
'''
- if not self.is_open():
+ if not self.is_open:
return 0
cdef int ret
@@ -1387,7 +1360,8 @@ cdef class AlignmentFile:
# when ret == -1 we get a "SystemError: error return without
# exception set".
if ret < 0:
- raise ValueError('sam write failed')
+ raise IOError(
+ "sam_write1 failed with error code {}".format(ret))
return ret
@@ -1404,23 +1378,11 @@ cdef class AlignmentFile:
###############################################################
## properties
###############################################################
- property closed:
- """bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
- """
- def __get__(self):
- return not self.is_open()
-
- property filename:
- """filename associated with this object. This is a read-only attribute."""
- def __get__(self):
- return self._filename
-
property nreferences:
""""int with the number of :term:`reference` sequences in the file.
This is a read-only attribute."""
def __get__(self):
- if not self.is_open():
+ if not self.is_open:
raise ValueError("I/O operation on closed file")
return self.header.n_targets
@@ -1428,7 +1390,7 @@ cdef class AlignmentFile:
"""tuple with the names of :term:`reference` sequences. This is a
read-only attribute"""
def __get__(self):
- if not self.is_open(): raise ValueError( "I/O operation on closed file" )
+ if not self.is_open: raise ValueError( "I/O operation on closed file" )
t = []
for x from 0 <= x < self.header.n_targets:
t.append(charptr_to_str(self.header.target_name[x]))
@@ -1441,7 +1403,7 @@ cdef class AlignmentFile:
"""
def __get__(self):
- if not self.is_open():
+ if not self.is_open:
raise ValueError("I/O operation on closed file")
t = []
for x from 0 <= x < self.header.n_targets:
@@ -1491,13 +1453,6 @@ cdef class AlignmentFile:
n = hts_idx_get_n_no_coor(self.index)
return n
- property format:
- '''string describing the file format'''
- def __get__(self):
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
- return hts_format_description(&self.htsfile.format)
-
property text:
'''string with the full contents of the :term:`sam file` header as a
string.
@@ -1508,7 +1463,7 @@ cdef class AlignmentFile:
representation of the header.
'''
def __get__(self):
- if not self.is_open():
+ if not self.is_open:
raise ValueError( "I/O operation on closed file" )
return from_string_and_size(self.header.text, self.header.l_text)
@@ -1535,7 +1490,7 @@ cdef class AlignmentFile:
"""
def __get__(self):
- if not self.is_open():
+ if not self.is_open:
raise ValueError( "I/O operation on closed file" )
result = {}
@@ -1613,7 +1568,7 @@ cdef class AlignmentFile:
## and multiple_iterators)
## Possible solutions: deprecate or open new file handle
def __iter__(self):
- if not self.is_open():
+ if not self.is_open:
raise ValueError("I/O operation on closed file")
if not self.is_bam and self.header.n_targets == 0:
@@ -1683,7 +1638,7 @@ cdef class IteratorRow:
cdef char *cfilename
cdef char *creference_filename
- if not samfile.is_open():
+ if not samfile.is_open:
raise ValueError("I/O operation on closed file")
# makes sure that samfile stays alive as long as the
@@ -1693,7 +1648,7 @@ cdef class IteratorRow:
# reopen the file - note that this makes the iterator
# slow and causes pileup to slow down significantly.
if multiple_iterators:
- cfilename = samfile._filename
+ cfilename = samfile.filename
with nogil:
self.htsfile = hts_open(cfilename, 'r')
assert self.htsfile != NULL
@@ -1704,8 +1659,8 @@ cdef class IteratorRow:
assert self.header != NULL
self.owns_samfile = True
# options specific to CRAM files
- if samfile.is_cram and samfile._reference_filename:
- creference_filename = samfile._reference_filename
+ if samfile.is_cram and samfile.reference_filename:
+ creference_filename = samfile.reference_filename
hts_set_opt(self.htsfile,
CRAM_OPT_REFERENCE,
creference_filename)
@@ -2462,7 +2417,7 @@ cdef class IndexedReads:
# multiple_iterators the file - note that this makes the iterator
# slow and causes pileup to slow down significantly.
if multiple_iterators:
- cfilename = samfile._filename
+ cfilename = samfile.filename
with nogil:
self.htsfile = hts_open(cfilename, 'r')
assert self.htsfile != NULL
diff --git a/pysam/cbcf.pxd b/pysam/libcbcf.pxd
similarity index 72%
rename from pysam/cbcf.pxd
rename to pysam/libcbcf.pxd
index b6e210a..fc7f56c 100644
--- a/pysam/cbcf.pxd
+++ b/pysam/libcbcf.pxd
@@ -3,18 +3,9 @@
## Cython wrapper for htslib VCF/BCF reader/writer
###############################################################################
#
-# NOTICE: This code is incomplete and preliminary. It is nearly complete as
-# an immutable interface, but has no capability (yet) to mutate the
-# resulting data (beyond dropping all samples). Documentation still
-# needs to be written and a unit test suite is in the works. The
-# code is also specific to Python 2 and will require a bit of work
-# to properly adapt to Python 3.
-#
-###############################################################################
-#
# The MIT License
#
-# Copyright (c) 2015 Kevin Jacobs (jacobs at bioinformed.com)
+# Copyright (c) 2015, 2016 Kevin Jacobs (jacobs at bioinformed.com)
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
@@ -39,14 +30,15 @@
from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+from libc.string cimport memcpy, memcmp, memmove, strncpy, strlen, strdup
-from pysam.chtslib cimport *
+from pysam.libchtslib cimport *
cdef class VariantHeader(object):
cdef bcf_hdr_t *ptr
+ cpdef VariantRecord new_record(self)
cdef _subset_samples(self, include_samples)
@@ -137,23 +129,16 @@ cdef class TabixIterator(BaseIterator):
cdef kstring_t line_buffer
-cdef class VariantFile(object):
- cdef htsFile *htsfile # pointer to htsFile structure
- cdef int64_t start_offset # BGZF offset of first record
-
- cdef readonly object filename # filename as supplied by user
- cdef readonly object mode # file opening mode
- cdef readonly object index_filename # filename of index, if supplied by user
-
+cdef class VariantFile(HTSFile):
cdef readonly VariantHeader header
cdef readonly BaseIndex index
cdef readonly bint drop_samples # true if sample information is to be ignored
# FIXME: Temporary, use htsFormat when it is available
- cdef readonly bint is_bcf # true if file is a bcf file
- cdef readonly bint is_stream # true if not a seekable file but a stream
- cdef readonly bint is_remote # true if file is not on the local filesystem
- cdef readonly bint is_reading # true if file has begun reading records
+ cdef readonly bint is_reading # true if file has begun reading records
+ cdef readonly bint header_written # true if header has already been written
+
+ cpdef VariantRecord new_record(self)
cpdef int write(self, VariantRecord record) except -1
diff --git a/pysam/cbcf.pyx b/pysam/libcbcf.pyx
similarity index 76%
rename from pysam/cbcf.pyx
rename to pysam/libcbcf.pyx
index 41fd44f..8f40451 100644
--- a/pysam/cbcf.pyx
+++ b/pysam/libcbcf.pyx
@@ -7,8 +7,7 @@
#
# NOTICE: This code is incomplete and preliminary. It offers a nearly
# complete Pythonic interface to VCF/BCF metadata and data with
-# reading and writing capability. It has limited capability to to
-# mutate the resulting data. Documentation and a unit test suite
+# reading and writing capability. Documentation and a unit test suite
# are in the works. The code is best tested under Python 2, but
# should also work with Python 3. Please report any remaining
# str/bytes issues on the github site when using Python 3 and I'll
@@ -43,126 +42,22 @@
# user 1m3.502s
# sys 0m3.459s
#
-# Here is a quick tour through the API::
-#
-# VariantFile(filename, mode=None, header=None, drop_samples=False)
-#
-# Attributes / Properties
-#
-# htsfile: htsFile* [private]
-# start_offset: BGZF offset of first record [private]
-# filename: filename [read only]
-# mode: mode [read only]
-# header: VariantHeader object [read only]
-# index: TabixIndex, BCFIndex or None [read only]
-# drop_samples: sample information is to be ignored [read only]
-#
-# is_stream: file is stdin/stdout [read only]
-# is_remote: file is not on the local filesystem [read only]
-# is_reading: file has begun reading records [read only]
-# category: file format general category [read only]
-# format: file format [read only]
-# version: tuple of (major, minor) format version [read only]
-# compression: file compression
-# description: vaguely human readable description of [read only]
-# file format.
-#
-# Methods:
-# copy()
-# close()
-# open(filename, mode=None, header=None, drop_samples=False)
-# reset()
-# seek(offset)
-# tell()
-# fetch(contig=None, start=None, stop=None, region=None, reopen=False)
-# subset_samples(include_samples)
-#
-# VariantHeader()
-#
-# version: VCF version
-# samples: sequence-like access to samples
-# records: sequence-like access to partially parsed headers
-# contigs: mapping-like object for contig name -> VariantContig
-#
-# filters: mapping-like object for filter name -> VariantMetadata
-# info: mapping-like object for info name -> VariantMetadata
-# formats: mapping-like object for formats name -> VariantMetadata
-#
-# VariantRecord(...)
-#
-# header: VariantHeader object
-# rid: reference id (i.e. tid)
-# chrom: chromosome/contig string
-# contig: synonym for chrom
-# pos: 1-based start position (inclusive)
-# start: 0-based start position (inclusive)
-# stop: 0-based stop position (exclusive)
-# rlen: reference length (stop - start)
-# id: record identifier
-# ref: reference allele
-# alleles: alleles (ref followed by alts)
-# alts: alt alleles
-# qual: quality (float)
-# filter: mapping-like object for filter name -> type info
-# info: mapping-like object for info name -> value
-# format: mapping-like object for format name -> type info
-# samples: mapping-like object of sample genotypes & attrs
-#
-# VariantRecordSample(...)
-#
-# name: sample name
-# index: sample index
-# allele_indices: tuple of allele indices (ref=0, alt=1..len(alts), missing=-1)
-# alleles: tuple of alleles (missing=None)
-#
-# VariantRecordSample is also a mapping object from formats to values
-#
-# VariantContig(...)
-#
-# id: reference id (i.e. tid)
-# name: chromosome/contig string
-# length: contig length if provided, else None
-# header: defining VariantHeaderRecord
-#
-# VariantMetadata(...) # for FILTER, INFO and FORMAT metadata
-#
-# id: internal id
-# name: metadata name
-# type: value data type
-# number: number of values
-# header: defining VariantHeaderRecord
-#
-# VariantHeaderRecord(...) # replace with single tuple of key/value pairs?
-#
-# type: record type
-# key: first record key
-# value: first record value
-# attrs: remaining key/value pairs
-#
###############################################################################
#
-# TODO list for next major sprint:
+# TODO list:
#
# * more genotype methods
# * unit test suite (perhaps py.test based)
# * documentation
-# * htslib 1.2 format info
-#
-# For later sprints:
-#
-# * ability to create indices
-# * mutable header and record data
# * pickle support
-# * Python 3 support
# * left/right locus normalization
-# * parallel iteration (like synced_bcf_reader)
# * fix reopen to re-use fd
#
###############################################################################
#
# The MIT License
#
-# Copyright (c) 2015 Kevin Jacobs (jacobs at bioinformed.com)
+# Copyright (c) 2015,2016 Kevin Jacobs (jacobs at bioinformed.com)
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
@@ -189,7 +84,8 @@ from __future__ import division, print_function
import os
import sys
-from libc.string cimport strcmp, strpbrk
+from libc.errno cimport errno, EPIPE
+from libc.string cimport strcmp, strpbrk, strerror
from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
cimport cython
@@ -202,7 +98,7 @@ from cpython.bytes cimport PyBytes_FromStringAndSize
from cpython.unicode cimport PyUnicode_DecodeASCII
from cpython.version cimport PY_MAJOR_VERSION
-from pysam.chtslib cimport hisremote
+from pysam.libchtslib cimport HTSFile, hisremote
from warnings import warn
@@ -223,18 +119,14 @@ cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
-cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
-cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
- 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
-cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
########################################################################
########################################################################
## Python 3 compatibility functions
########################################################################
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
-from pysam.cutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
+from pysam.libcutils cimport encode_filename, from_string_and_size
########################################################################
@@ -268,6 +160,10 @@ cdef inline bcf_str_cache_get_charptr(const char* s):
########################################################################
+cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
+ return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)
+
+
cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0
@@ -497,8 +393,15 @@ cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar):
+ if record is None:
+ raise ValueError('record must not be None')
+
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf1_t *r = record.ptr
+
+ if not check_header_id(hdr, hl_type, id):
+ raise ValueError('Invalid header')
+
cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
@@ -523,6 +426,9 @@ cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *cou
cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
+ if record is None:
+ raise ValueError('record must not be None')
+
cdef bcf_hdr_t *hdr = record.header.ptr
cdef char *s
@@ -540,33 +446,13 @@ cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
value = ()
elif z.len == 1:
if z.type == BCF_BT_INT8:
- if z.v1.i == bcf_int8_missing:
- value = None
- elif z.v1.i == bcf_int8_vector_end:
- value = ()
- else:
- value = z.v1.i
+ value = z.v1.i if z.v1.i != bcf_int8_missing else None
elif z.type == BCF_BT_INT16:
- if z.v1.i == bcf_int16_missing:
- value = None
- elif z.v1.i == bcf_int16_vector_end:
- value = ()
- else:
- value = z.v1.i
+ value = z.v1.i if z.v1.i != bcf_int16_missing else None
elif z.type == BCF_BT_INT32:
- if z.v1.i == bcf_int32_missing:
- value = None
- elif z.v1.i == bcf_int32_vector_end:
- value = ()
- else:
- value = z.v1.i
+ value = z.v1.i if z.v1.i != bcf_int32_missing else None
elif z.type == BCF_BT_FLOAT:
- if bcf_float_is_missing(z.v1.f):
- value = None
- elif bcf_float_is_vector_end(z.v1.f):
- value = ()
- else:
- value = z.v1.f
+ value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
elif z.type == BCF_BT_CHAR:
value = force_str(chr(z.v1.i))
else:
@@ -581,17 +467,23 @@ cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type,
- int id, int bt_type, ssize_t bt_len, ssize_t *value_count,
- int *scalar, int *realloc):
+ int id, int bt_type, ssize_t bt_len,
+ ssize_t *value_count, int *scalar, int *realloc):
+
+ if record is None:
+ raise ValueError('record must not be None')
bcf_get_value_count(record, hl_type, id, value_count, scalar)
# Validate values now that we know the type and size
- values = (value,) if not isinstance(value, tuple) else value
+ values = (value,) if not isinstance(value, (list, tuple)) else value
# Validate values now that we know the type and size
if ht_type == BCF_HT_FLAG:
value_count[0] = 1
+ elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
+ # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
+ value_count[0] = -1
if value_count[0] != -1 and value_count[0] != len(values):
if scalar[0]:
@@ -638,13 +530,16 @@ cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_ty
cdef bcf_encode_alleles(VariantRecord record, values):
+ if record is None:
+ raise ValueError('record must not be None')
+
cdef bcf1_t *r = record.ptr
cdef int32_t nalleles = r.n_allele
cdef list gt_values = []
cdef char *s
cdef int i
- if not values:
+ if values is None:
return ()
if not isinstance(values, (list, tuple)):
@@ -652,7 +547,7 @@ cdef bcf_encode_alleles(VariantRecord record, values):
for value in values:
if value is None:
- gt_values.append(None)
+ gt_values.append(bcf_gt_missing)
elif isinstance(value, (str, bytes)):
bvalue = force_bytes(value)
s = bvalue
@@ -672,6 +567,9 @@ cdef bcf_encode_alleles(VariantRecord record, values):
cdef bcf_info_set_value(VariantRecord record, key, value):
+ if record is None:
+ raise ValueError('record must not be None')
+
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf1_t *r = record.ptr
cdef vdict_t *d
@@ -696,9 +594,13 @@ cdef bcf_info_set_value(VariantRecord record, key, value):
info_id = kh_val_vdict(d, k).id
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id,
- info.type if info else -1, info.len if info else -1,
+ info.type if info else -1,
+ info.len if info else -1,
&value_count, &scalar, &realloc)
if info_type == BCF_HT_FLAG:
@@ -752,6 +654,9 @@ cdef bcf_info_set_value(VariantRecord record, key, value):
cdef bcf_info_del_value(VariantRecord record, key):
+ if record is None:
+ raise ValueError('record must not be None')
+
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf1_t *r = record.ptr
cdef ssize_t value_count
@@ -779,6 +684,9 @@ cdef bcf_info_del_value(VariantRecord record, key):
cdef bcf_format_get_value(VariantRecordSample sample, key):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef ssize_t count
@@ -809,6 +717,9 @@ cdef bcf_format_get_value(VariantRecordSample sample, key):
cdef bcf_format_set_value(VariantRecordSample sample, key, value):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int fmt_id
@@ -834,6 +745,9 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
fmt_id = kh_val_vdict(d, k).id
+ if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
+ raise ValueError('Invalid header')
+
fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
if fmt_type == BCF_HT_FLAG:
@@ -841,9 +755,12 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
if is_gt_fmt(hdr, fmt_id):
value = bcf_encode_alleles(sample.record, value)
+ # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
+ fmt_type = BCF_HT_INT
values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id,
- fmt.type if fmt else -1, fmt.n if fmt else -1,
+ fmt.type if fmt else -1,
+ fmt.n if fmt else -1,
&value_count, &scalar, &realloc)
vlen = value_count < 0
@@ -888,6 +805,9 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
cdef bcf_format_del_value(VariantRecordSample sample, key):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef ssize_t value_count
@@ -915,6 +835,9 @@ cdef bcf_format_del_value(VariantRecordSample sample, key):
cdef bcf_format_get_allele_indices(VariantRecordSample sample):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int32_t n = bcf_hdr_nsamples(hdr)
@@ -942,7 +865,7 @@ cdef bcf_format_get_allele_indices(VariantRecordSample sample):
for i in range(fmt0.n):
if data8[i] == bcf_int8_vector_end:
break
- elif data8[i] == bcf_int8_missing:
+ elif data8[i] == bcf_gt_missing:
a = -1
else:
a = bcf_gt_allele(data8[i])
@@ -952,7 +875,7 @@ cdef bcf_format_get_allele_indices(VariantRecordSample sample):
for i in range(fmt0.n):
if data16[i] == bcf_int16_vector_end:
break
- elif data16[i] == bcf_int16_missing:
+ elif data16[i] == bcf_gt_missing:
a = -1
else:
a = bcf_gt_allele(data16[i])
@@ -962,7 +885,7 @@ cdef bcf_format_get_allele_indices(VariantRecordSample sample):
for i in range(fmt0.n):
if data32[i] == bcf_int32_vector_end:
break
- elif data32[i] == bcf_int32_missing:
+ elif data32[i] == bcf_gt_missing:
a = -1
else:
a = bcf_gt_allele(data32[i])
@@ -972,6 +895,9 @@ cdef bcf_format_get_allele_indices(VariantRecordSample sample):
cdef bcf_format_get_alleles(VariantRecordSample sample):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int32_t nsamples = bcf_hdr_nsamples(hdr)
@@ -1020,6 +946,9 @@ cdef bcf_format_get_alleles(VariantRecordSample sample):
cdef bint bcf_sample_get_phased(VariantRecordSample sample):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int32_t n = bcf_hdr_nsamples(hdr)
@@ -1080,6 +1009,9 @@ cdef bint bcf_sample_get_phased(VariantRecordSample sample):
cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int32_t n = bcf_hdr_nsamples(hdr)
@@ -1134,59 +1066,89 @@ cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
## Variant Header objects
########################################################################
+
+cdef bcf_header_remove_hrec(VariantHeader header, int i):
+ if header is None:
+ raise ValueError('header must not be None')
+
+ cdef bcf_hdr_t *hdr = header.ptr
+
+ if i < 0 or i >= hdr.nhrec:
+ raise ValueError('Invalid header record index')
+
+ cdef bcf_hrec_t *hrec = hdr.hrec[i]
+ hdr.nhrec -= 1
+
+ if i < hdr.nhrec:
+ memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))
+
+ bcf_hrec_destroy(hrec)
+ hdr.hrec[hdr.nhrec] = NULL
+ hdr.dirty = 1
+
+
#FIXME: implement a full mapping interface
-#FIXME: passing bcf_hrec_t* may not be the safest approach once mutating
-# operations are allowed.
+#FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
+# object lifetime.
cdef class VariantHeaderRecord(object):
"""header record from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
- property type:
+ @property
+ def type(self):
"""header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- return METADATA_TYPES[r.type]
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return None
+ return METADATA_TYPES[r.type]
- property key:
+ @property
+ def key(self):
"""header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- return bcf_str_cache_get_charptr(r.key) if r.key else None
+ cdef bcf_hrec_t *r = self.ptr
+ return bcf_str_cache_get_charptr(r.key) if r and r.key else None
- property value:
+ @property
+ def value(self):
"""header value. Set only for generic lines, None for FILTER/INFO, etc."""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- return charptr_to_str(r.value) if r.value else None
+ cdef bcf_hrec_t *r = self.ptr
+ return charptr_to_str(r.value) if r and r.value else None
- property attrs:
+ @property
+ def attrs(self):
"""sequence of additional header attributes"""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
- charptr_to_str(r.vals[i]) if r.vals[i] else None)
- for i in range(r.nkeys))
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return ()
+ cdef int i
+ return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
+ charptr_to_str(r.vals[i]) if r.vals[i] else None)
+ for i in range(r.nkeys))
def __len__(self):
cdef bcf_hrec_t *r = self.ptr
- return r.nkeys
+ return r.nkeys if r else 0
def __bool__(self):
cdef bcf_hrec_t *r = self.ptr
- return r.nkeys != 0
+ return r != NULL and r.nkeys != 0
def __getitem__(self, key):
"""get attribute value"""
cdef bcf_hrec_t *r = self.ptr
cdef int i
- bkey = force_bytes(key)
- for i in range(r.nkeys):
- if r.keys[i] and r.keys[i] == bkey:
- return charptr_to_str(r.vals[i]) if r.vals[i] else None
+ if r:
+ bkey = force_bytes(key)
+ for i in range(r.nkeys):
+ if r.keys[i] and r.keys[i] == bkey:
+ return charptr_to_str(r.vals[i]) if r.vals[i] else None
raise KeyError('cannot find metadata key')
def __iter__(self):
cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
cdef int i
for i in range(r.nkeys):
if r.keys[i]:
@@ -1214,6 +1176,8 @@ cdef class VariantHeaderRecord(object):
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
cdef int i
for i in range(r.nkeys):
if r.keys[i]:
@@ -1222,6 +1186,8 @@ cdef class VariantHeaderRecord(object):
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
cdef int i
for i in range(r.nkeys):
if r.keys[i]:
@@ -1246,11 +1212,34 @@ cdef class VariantHeaderRecord(object):
def __str__(self):
cdef bcf_hrec_t *r = self.ptr
- if r.type == BCF_HL_GEN:
- return '##{}={}'.format(self.key, self.value)
- else:
- attrs = ','.join('{}={}'.format(k, v) for k,v in self.attrs if k != 'IDX')
- return '##{}=<{}>'.format(self.key or self.type, attrs)
+
+ if not r:
+ raise ValueError('cannot convert deleted record to str')
+
+ cdef kstring_t hrec_str
+ hrec_str.l = hrec_str.m = 0
+ hrec_str.s = NULL
+
+ bcf_hrec_format(r, &hrec_str)
+
+ ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)
+
+ if hrec_str.m:
+ free(hrec_str.s)
+
+ return ret
+
+ # FIXME: Not safe -- causes trivial segfaults at the moment
+ def remove(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
+ assert(r.key)
+ cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
+ print('Removing header type={} key={} value={} hdr={}'.format(METADATA_TYPES[r.type], r.key, r.value, key))
+ bcf_hdr_remove(hdr, r.type, key)
+ self.ptr = NULL
cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
@@ -1269,6 +1258,8 @@ cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_
cdef class VariantHeaderRecords(object):
"""sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
return self.header.ptr.nhrec
@@ -1300,63 +1291,75 @@ cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
cdef class VariantMetadata(object):
- """filter, info or format metadata record from a :class:`VariantHeader`
- object"""
+ """filter, info or format metadata record from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
- property name:
+ @property
+ def name(self):
"""metadata name"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
# Q: Should this be exposed?
- property id:
+ @property
+ def id(self):
"""metadata internal header id number"""
- def __get__(self):
- return self.id
+ return self.id
- property number:
+ @property
+ def number(self):
"""metadata number (i.e. cardinality)"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or self.type == BCF_HL_FLT:
- return None
- cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
- if l == BCF_VL_FIXED:
- return bcf_hdr_id2number(hdr, self.type, self.id)
- elif l == BCF_VL_VAR:
- return '.'
- else:
- return METADATA_LENGTHS[l]
+ cdef bcf_hdr_t *hdr = self.header.ptr
+
+ if not check_header_id(hdr, self.type, self.id):
+ raise ValueError('Invalid header id')
+
+ if self.type == BCF_HL_FLT:
+ return None
- property type:
+ cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
+ if l == BCF_VL_FIXED:
+ return bcf_hdr_id2number(hdr, self.type, self.id)
+ elif l == BCF_VL_VAR:
+ return '.'
+ else:
+ return METADATA_LENGTHS[l]
+
+ @property
+ def type(self):
"""metadata value type"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or \
- self.type == BCF_HL_FLT:
- return None
- return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
-
- property description:
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ if not check_header_id(hdr, self.type, self.id):
+ raise ValueError('Invalid header id')
+
+ if self.type == BCF_HL_FLT:
+ return None
+ return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
+
+ @property
+ def description(self):
"""metadata description (or None if not set)"""
- def __get__(self):
- descr = self.record.get('Description')
- if descr:
- descr = descr.strip('"')
- return force_str(descr)
-
- property record:
- """:class:`VariantHeaderRecord` associated with this
- :class:`VariantMetadata` object"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- if not bcf_hdr_idinfo_exists(hdr, self.type, self.id):
- return None
- cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
- if not hrec:
- return None
- return makeVariantHeaderRecord(self.header, hrec)
+ descr = self.record.get('Description')
+ if descr:
+ descr = descr.strip('"')
+ return force_str(descr)
+
+ @property
+ def record(self):
+ """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ if not check_header_id(hdr, self.type, self.id):
+ raise ValueError('Invalid header id')
+ cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
+ if not hrec:
+ return None
+ return makeVariantHeaderRecord(self.header, hrec)
+
+ def remove_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef const char *bkey = hdr.id[BCF_DT_ID][self.id].key
+ bcf_hdr_remove(hdr, self.type, bkey)
cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
@@ -1379,6 +1382,8 @@ cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id)
cdef class VariantHeaderMetadata(object):
"""mapping from filter, info or format name to :class:`VariantMetadata` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
def add(self, id, number, type, description, **kwargs):
"""Add a new filter, info or format record"""
@@ -1436,10 +1441,28 @@ cdef class VariantHeaderMetadata(object):
cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
- raise KeyError('invalid filter')
+ raise KeyError('invalid key')
return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
+ def remove_header(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
+
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
+ raise KeyError('invalid key')
+
+ bcf_hdr_remove(hdr, self.type, bkey)
+ #bcf_hdr_sync(hdr)
+
+ def clear_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ bcf_hdr_remove(hdr, self.type, NULL)
+ #bcf_hdr_sync(hdr)
+
def __iter__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef bcf_idpair_t *idpair
@@ -1510,31 +1533,38 @@ cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32
cdef class VariantContig(object):
"""contig metadata from a :class:`VariantHeader`"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
- property name:
+ @property
+ def name(self):
"""contig name"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
- property id:
+ @property
+ def id(self):
"""contig internal id number"""
- def __get__(self):
- return self.id
+ return self.id
- property length:
+ @property
+ def length(self):
"""contig length or None if not available"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
- return length if length else None
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
+ return length if length else None
- property header:
+ @property
+ def header(self):
""":class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
- return makeVariantHeaderRecord(self.header, hrec)
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
+ return makeVariantHeaderRecord(self.header, hrec)
+
+ def remove_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef const char *bkey = hdr.id[BCF_DT_CTG][self.id].key
+ bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
cdef VariantContig makeVariantContig(VariantHeader header, int id):
@@ -1553,6 +1583,8 @@ cdef VariantContig makeVariantContig(VariantHeader header, int id):
cdef class VariantHeaderContigs(object):
"""mapping from contig name or index to :class:`VariantContig` object."""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
@@ -1585,6 +1617,32 @@ cdef class VariantHeaderContigs(object):
return makeVariantContig(self.header, id)
+ def remove_header(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int index
+ cdef const char *bkey
+ cdef vdict_t *d
+ cdef khiter_t k
+
+ if isinstance(key, int):
+ index = key
+ if index < 0 or index >= hdr.n[BCF_DT_CTG]:
+ raise IndexError('invalid contig index')
+ bkey = hdr.id[BCF_DT_CTG][self.id].key
+ else:
+ d = <vdict_t *>hdr.dict[BCF_DT_CTG]
+ key = force_bytes(key)
+ if kh_get_vdict(d, key) == kh_end(d):
+ raise KeyError('invalid contig')
+ bkey = key
+
+ bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
+
+ def clear_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
+ #bcf_hdr_sync(hdr)
+
def __iter__(self):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
@@ -1662,6 +1720,8 @@ cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
cdef class VariantHeaderSamples(object):
"""sequence of sample names from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
return bcf_hdr_nsamples(self.header.ptr)
@@ -1716,7 +1776,6 @@ cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
cdef class VariantHeader(object):
"""header information for a :class:`VariantFile` object"""
-
#FIXME: Add structured proxy
#FIXME: Add generic proxy
#FIXME: Add mutable methods
@@ -1743,42 +1802,48 @@ cdef class VariantHeader(object):
def copy(self):
return makeVariantHeader(bcf_hdr_dup(self.ptr))
- property version:
+ def merge(self, VariantHeader header):
+ if header is None:
+ raise ValueError('header must not be None')
+ bcf_hdr_merge(self.ptr, header.ptr)
+
+ @property
+ def version(self):
"""VCF version"""
- def __get__(self):
- return force_str(bcf_hdr_get_version(self.ptr))
+ return force_str(bcf_hdr_get_version(self.ptr))
- property samples:
+ @property
+ def samples(self):
"""samples (:class:`VariantHeaderSamples`)"""
- def __get__(self):
- return makeVariantHeaderSamples(self)
+ return makeVariantHeaderSamples(self)
- property records:
+ @property
+ def records(self):
"""header records (:class:`VariantHeaderRecords`)"""
- def __get__(self):
- return makeVariantHeaderRecords(self)
+ return makeVariantHeaderRecords(self)
- property contigs:
+ @property
+ def contigs(self):
"""contig information (:class:`VariantHeaderContigs`)"""
- def __get__(self):
- return makeVariantHeaderContigs(self)
+ return makeVariantHeaderContigs(self)
- property filters:
+ @property
+ def filters(self):
"""filter metadata (:class:`VariantHeaderMetadata`)"""
- def __get__(self):
- return makeVariantHeaderMetadata(self, BCF_HL_FLT)
+ return makeVariantHeaderMetadata(self, BCF_HL_FLT)
- property info:
+ @property
+ def info(self):
"""info metadata (:class:`VariantHeaderMetadata`)"""
- def __get__(self):
- return makeVariantHeaderMetadata(self, BCF_HL_INFO)
+ return makeVariantHeaderMetadata(self, BCF_HL_INFO)
- property formats:
+ @property
+ def formats(self):
"""format metadata (:class:`VariantHeaderMetadata`)"""
- def __get__(self):
- return makeVariantHeaderMetadata(self, BCF_HL_FMT)
+ return makeVariantHeaderMetadata(self, BCF_HL_FMT)
- property alts:
+ @property
+ def alts(self):
"""alt metadata (:class:`dict` ID->record).
The data returned just a snapshot of alt records, is created
@@ -1787,12 +1852,9 @@ cdef class VariantHeader(object):
i.e. it is just a dict that reflects the state of alt records
at the time it is created.
-
"""
- def __get__(self):
- return {record['ID']:record for record in self.records
- if record.key.upper() == 'ALT' }
-
+ return {record['ID']:record for record in self.records
+ if record.key.upper() == 'ALT' }
# only safe to do when opening an htsfile
cdef _subset_samples(self, include_samples):
@@ -1824,15 +1886,23 @@ cdef class VariantHeader(object):
finally:
free(hstr)
+ cpdef VariantRecord new_record(self):
+ """Create a new empty VariantRecord"""
+ r = makeVariantRecord(self, bcf_init())
+ r.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
+ return r
+
def add_record(self, VariantHeaderRecord record):
"""Add an existing :class:`VariantHeaderRecord` to this header"""
- cdef bcf_hrec_t *r = record.ptr
+ if record is None:
+ raise ValueError('record must not be None')
- if r.type == BCF_HL_GEN:
- self.add_meta(r.key, r.value)
- else:
- items = [(k,v) for k,v in record.attrs if k != 'IDX']
- self.add_meta(r.key, items=items)
+ cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr)
+
+ bcf_hdr_add_hrec(self.ptr, hrec)
+
+ if self.ptr.dirty:
+ bcf_hdr_sync(self.ptr)
def add_line(self, line):
"""Add a metadata line to this header"""
@@ -1901,6 +1971,8 @@ cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
cdef class VariantRecordFilter(object):
"""Filters set on a :class:`VariantRecord` object, presented as a mapping from
filter index or name to :class:`VariantMetadata` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
return self.record.ptr.d.n_flt
@@ -1928,12 +2000,28 @@ cdef class VariantRecordFilter(object):
bkey = force_bytes(key)
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
- if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \
- or not bcf_has_filter(hdr, self.record.ptr, bkey):
+ if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
raise KeyError('Invalid filter')
return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
+ def add(self, key):
+ """Add a new filter"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int id
+
+ if key == '.':
+ key = 'PASS'
+
+ bkey = force_bytes(key)
+ id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
+
+ if not check_header_id(hdr, BCF_HL_FLT, id):
+ raise KeyError('Invalid filter')
+
+ bcf_add_filter(hdr, r, id)
+
def __delitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
@@ -1954,8 +2042,7 @@ cdef class VariantRecordFilter(object):
bkey = force_bytes(key)
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
- if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \
- or not bcf_has_filter(hdr, self.record.ptr, bkey):
+ if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
raise KeyError('Invalid filter')
bcf_remove_filter(hdr, r, id, 0)
@@ -2032,6 +2119,8 @@ cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
cdef class VariantRecordFormat(object):
"""Format data present for each sample in a :class:`VariantRecord` object,
presented as mapping from format name to :class:`VariantMetadata` object."""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
@@ -2155,8 +2244,7 @@ cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
if not record:
raise ValueError('invalid VariantRecord')
- cdef VariantRecordFormat format = VariantRecordFormat.__new__(
- VariantRecordFormat)
+ cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
format.record = record
return format
@@ -2167,6 +2255,9 @@ cdef class VariantRecordInfo(object):
"""Info data stored in a :class:`VariantRecord` object, presented as a
mapping from info metadata name to value."""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
def __len__(self):
return self.record.ptr.n_info
@@ -2197,6 +2288,9 @@ cdef class VariantRecordInfo(object):
else:
info_id = info.key
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
return info != NULL and info.vptr != NULL
@@ -2331,6 +2425,8 @@ cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
cdef class VariantRecordSamples(object):
"""mapping from sample index or name to :class:`VariantRecordSample` object."""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
return bcf_hdr_nsamples(self.record.header.ptr)
@@ -2445,226 +2541,279 @@ cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
cdef class VariantRecord(object):
"""Variant record"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
def __dealloc__(self):
if self.ptr:
bcf_destroy1(self.ptr)
self.ptr = NULL
- property rid:
+ def copy(self):
+ """return a copy of this VariantRecord object"""
+ return makeVariantRecord(self.header, bcf_dup(self.ptr))
+
+ def translate(self, VariantHeader dst_header):
+ if dst_header is None:
+ raise ValueError('dst_header must not be None')
+
+ cdef bcf_hdr_t *src_hdr = self.header.ptr
+ cdef bcf_hdr_t *dst_hdr = dst_header.ptr
+
+ if src_hdr != dst_hdr:
+ if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
+ msg = 'Cannot translate record. Number of samples does not match header ({} vs {})'
+ raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
+
+ bcf_translate(dst_hdr, src_hdr, self.ptr)
+
+ @property
+ def rid(self):
"""internal reference id number"""
- def __get__(self):
- return self.ptr.rid
- def __set__(self, rid):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef int r = rid
- if rid < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
- raise ValueError('invalid reference id')
- self.ptr.rid = r
-
- property chrom:
+ return self.ptr.rid
+
+ @rid.setter
+ def rid(self, value):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int r = value
+ if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
+ raise ValueError('invalid reference id')
+ self.ptr.rid = r
+
+ @property
+ def chrom(self):
"""chromosome/contig name"""
- def __get__(self):
- return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid))
- def __set__(self, chrom):
- cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
- bchrom = force_bytes(chrom)
- cdef khint_t k = kh_get_vdict(d, bchrom)
- if k == kh_end(d):
- raise ValueError('Invalid chromosome/contig')
- self.ptr.rid = kh_val_vdict(d, k).id
-
- property contig:
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int rid = self.ptr.rid
+ if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
+ raise ValueError('Invalid header')
+ return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
+
+ @chrom.setter
+ def chrom(self, value):
+ cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
+ bchrom = force_bytes(value)
+ cdef khint_t k = kh_get_vdict(d, bchrom)
+ if k == kh_end(d):
+ raise ValueError('Invalid chromosome/contig')
+ self.ptr.rid = kh_val_vdict(d, k).id
+
+ @property
+ def contig(self):
"""chromosome/contig name"""
- def __get__(self):
- return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid))
- def __set__(self, chrom):
- cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
- bchrom = force_bytes(chrom)
- cdef khint_t k = kh_get_vdict(d, bchrom)
- if k == kh_end(d):
- raise ValueError('Invalid chromosome/contig')
- self.ptr.rid = kh_val_vdict(d, k).id
-
- property pos:
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int rid = self.ptr.rid
+ if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
+ raise ValueError('Invalid header')
+ return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
+
+ @contig.setter
+ def contig(self, value):
+ cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
+ bchrom = force_bytes(value)
+ cdef khint_t k = kh_get_vdict(d, bchrom)
+ if k == kh_end(d):
+ raise ValueError('Invalid chromosome/contig')
+ self.ptr.rid = kh_val_vdict(d, k).id
+
+ @property
+ def pos(self):
"""record start position on chrom/contig (1-based inclusive)"""
- def __get__(self):
- return self.ptr.pos + 1
- def __set__(self, pos):
- if pos < 1:
- raise ValueError('Position must be positive')
- # FIXME: check start <= stop?
- # KBJ: Can't or else certain mutating operations will become
- # difficult or impossible. e.g. having to delete
- # info['END'] before being able to reset pos is going to
- # create subtle bugs. Better to check this when writing
- # records.
- self.ptr.pos = pos - 1
-
- property start:
+ return self.ptr.pos + 1
+
+ @pos.setter
+ def pos(self, value):
+ cdef int p = value
+ if p < 1:
+ raise ValueError('Position must be positive')
+ self.ptr.pos = p - 1
+
+ @property
+ def start(self):
"""record start position on chrom/contig (0-based inclusive)"""
- def __get__(self):
- return self.ptr.pos
- def __set__(self, start):
- if start < 0:
- raise ValueError('Start coordinate must be non-negative')
- # FIXME: check start <= stop?
- # KBJ: See above.
- self.ptr.pos = start
-
- property stop:
+ return self.ptr.pos
+
+ @start.setter
+ def start(self, value):
+ cdef int s = value
+ if s < 0:
+ raise ValueError('Start coordinate must be non-negative')
+ self.ptr.pos = s
+
+ @property
+ def stop(self):
"""record stop position on chrom/contig (0-based exclusive)"""
- def __get__(self):
- return self.ptr.pos + self.ptr.rlen
- def __set__(self, stop):
- if stop < self.ptr.pos:
- raise ValueError('Stop coordinate must be greater than or equal to start')
- self.ptr.rlen = stop - self.ptr.pos
-
- property rlen:
+ return self.ptr.pos + self.ptr.rlen
+
+ @stop.setter
+ def stop(self, value):
+ cdef int s = value
+ if s < self.ptr.pos:
+ raise ValueError('Stop coordinate must be greater than or equal to start')
+ self.ptr.rlen = s - self.ptr.pos
+ if self.ptr.rlen != len(self.ref) or 'END' in self.info:
+ self.info['END'] = s
+
+ @property
+ def rlen(self):
"""record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)"""
- def __get__(self):
- return self.ptr.rlen
- def __set__(self, rlen):
- if rlen < 0:
- raise ValueError('Reference length must be non-negative')
- self.ptr.rlen = rlen
-
- property qual:
+ return self.ptr.rlen
+
+ @rlen.setter
+ def rlen(self, value):
+ cdef int r = value
+ if r < 0:
+ raise ValueError('Reference length must be non-negative')
+ self.ptr.rlen = r
+ if r != len(self.ref) or 'END' in self.info:
+ self.info['END'] = self.ptr.pos + r
+
+ @property
+ def qual(self):
"""phred scaled quality score or None if not available"""
- def __get__(self):
- return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
- def __set__(self, qual):
- if qual is not None:
- self.ptr.qual = qual
- else:
- bcf_float_set(&self.ptr.qual, bcf_float_missing)
+ return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
+
+ @qual.setter
+ def qual(self, value):
+ if value is not None:
+ self.ptr.qual = value
+ else:
+ bcf_float_set(&self.ptr.qual, bcf_float_missing)
+
-# property n_allele:
-# def __get__(self):
-# return self.ptr.n_allele
+# @property
+# def n_allele(self):
+# return self.ptr.n_allele
-# property n_sample:
-# def __get__(self):
-# return self.ptr.n_sample
+# @property
+# def n_sample(self):
+# return self.ptr.n_sample
- property id:
+ @property
+ def id(self):
"""record identifier or None if not available"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
- def __set__(self, id):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- cdef char *idstr = NULL
- if id is not None:
- bid = force_bytes(id)
- idstr = bid
- if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
- raise ValueError('Error updating id')
-
- property ref:
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
+
+ @id.setter
+ def id(self, value):
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ cdef char *idstr = NULL
+ if value is not None:
+ bid = force_bytes(value)
+ idstr = bid
+ if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
+ raise ValueError('Error updating id')
+
+ @property
+ def ref(self):
"""reference allele"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return charptr_to_str(r.d.allele[0]) if r.d.allele else None
- def __set__(self, ref):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- #FIXME: Set alleles directly -- this is stupid
- if not ref:
- raise ValueError('ref allele cannot be null')
- ref = force_bytes(ref)
- if r.d.allele and r.n_allele:
- alleles = [r.d.allele[i] for i in range(r.n_allele)]
- alleles[0] = ref
- else:
- alleles = [ref]
- self.alleles = alleles
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return charptr_to_str(r.d.allele[0]) if r.d.allele else None
- property alleles:
+ @ref.setter
+ def ref(self, value):
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ #FIXME: Set alleles directly -- this is stupid
+ if not value:
+ raise ValueError('ref allele must not be null')
+ value = force_bytes(value)
+ if r.d.allele and r.n_allele:
+ alleles = [r.d.allele[i] for i in range(r.n_allele)]
+ alleles[0] = value
+ else:
+ alleles = [value]
+ self.alleles = alleles
+
+ @property
+ def alleles(self):
"""tuple of reference allele followed by alt alleles"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- if not r.d.allele:
- return None
- cdef tuple res = PyTuple_New(r.n_allele)
- for i in range(r.n_allele):
- a = charptr_to_str(r.d.allele[i])
- PyTuple_SET_ITEM(res, i, a)
- Py_INCREF(a)
- return res
- def __set__(self, values):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- values = [force_bytes(v) for v in values]
- if b'' in values:
- raise ValueError('cannot set null allele')
- values = b','.join(values)
- if bcf_update_alleles_str(self.header.ptr, r, values) < 0:
- raise ValueError('Error updating alleles')
-
- property alts:
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ if not r.d.allele:
+ return None
+ cdef tuple res = PyTuple_New(r.n_allele)
+ for i in range(r.n_allele):
+ a = charptr_to_str(r.d.allele[i])
+ PyTuple_SET_ITEM(res, i, a)
+ Py_INCREF(a)
+ return res
+
+ @alleles.setter
+ def alleles(self, value):
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ value = [force_bytes(v) for v in value]
+ if b'' in value:
+ raise ValueError('cannot set null allele')
+ value = b','.join(value)
+ if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
+ raise ValueError('Error updating alleles')
+
+ @property
+ def alts(self):
"""tuple of alt alleles"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- if r.n_allele < 2 or not r.d.allele:
- return None
- cdef tuple res = PyTuple_New(r.n_allele - 1)
- for i in range(1, r.n_allele):
- a = charptr_to_str(r.d.allele[i])
- PyTuple_SET_ITEM(res, i - 1, a)
- Py_INCREF(a)
- return res
- def __set__(self, values):
- #FIXME: Set alleles directly -- this is stupid
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- values = [force_bytes(v) for v in values]
- if b'' in values:
- raise ValueError('cannot set null alt allele')
- ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
- self.alleles = ref + values
-
- property filter:
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ if r.n_allele < 2 or not r.d.allele:
+ return None
+ cdef tuple res = PyTuple_New(r.n_allele - 1)
+ for i in range(1, r.n_allele):
+ a = charptr_to_str(r.d.allele[i])
+ PyTuple_SET_ITEM(res, i - 1, a)
+ Py_INCREF(a)
+ return res
+
+ @alts.setter
+ def alts(self, value):
+ #FIXME: Set alleles directly -- this is stupid
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ value = [force_bytes(v) for v in value]
+ if b'' in value:
+ raise ValueError('cannot set null alt allele')
+ ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
+ self.alleles = ref + value
+
+ @property
+ def filter(self):
"""filter information (see :class:`VariantRecordFilter`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordFilter(self)
+ if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordFilter(self)
- property info:
+ @property
+ def info(self):
"""info data (see :class:`VariantRecordInfo`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordInfo(self)
+ if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordInfo(self)
- property format:
+ @property
+ def format(self):
"""sample format metadata (see :class:`VariantRecordFormat`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordFormat(self)
+ if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordFormat(self)
- property samples:
+ @property
+ def samples(self):
"""sample data (see :class:`VariantRecordSamples`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordSamples(self)
+ if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordSamples(self)
def __str__(self):
cdef kstring_t line
@@ -2700,6 +2849,27 @@ cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
if not r:
raise ValueError('cannot create VariantRecord')
+ if r.errcode:
+ msg = []
+ #if r.errcode & BCF_ERR_CTG_UNDEF:
+ # msg.append('undefined contig')
+ #if r.errcode & BCF_ERR_TAG_UNDEF:
+ # msg.append('undefined tag')
+ if r.errcode & BCF_ERR_NCOLS:
+ msg.append('invalid number of columns')
+ if r.errcode & BCF_ERR_LIMITS:
+ msg.append('limits violated')
+ if r.errcode & BCF_ERR_CHAR:
+ msg.append('invalid character found')
+ if r.errcode & BCF_ERR_CTG_INVALID:
+ msg.append('invalid contig')
+ if r.errcode & BCF_ERR_TAG_INVALID:
+ msg.append('invalid tag')
+
+ if msg:
+ msg = ', '.join(msg)
+ raise ValueError('Error(s) reading record: {}'.format(msg))
+
cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
record.header = header
record.ptr = r
@@ -2718,43 +2888,55 @@ cdef class VariantRecordSample(object):
Provides data accessors for genotypes and a mapping interface
from format name to values.
"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
- property name:
+ @property
+ def name(self):
"""sample name"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
- if self.index < 0 or self.index >= n:
- raise ValueError('invalid sample index')
+ if self.index < 0 or self.index >= n:
+ raise ValueError('invalid sample index')
- return charptr_to_str(hdr.samples[self.index])
+ return charptr_to_str(hdr.samples[self.index])
- property allele_indices:
+ @property
+ def allele_indices(self):
"""allele indices for called genotype, if present. Otherwise None"""
- def __get__(self):
- return bcf_format_get_allele_indices(self)
- def __set__(self, values):
- self['GT'] = values
- def __del__(self):
- self['GT'] = ()
-
- property alleles:
+ return bcf_format_get_allele_indices(self)
+
+ @allele_indices.setter
+ def allele_indices(self, value):
+ self['GT'] = value
+
+ @allele_indices.deleter
+ def allele_indices(self):
+ self['GT'] = ()
+
+ @property
+ def alleles(self):
"""alleles for called genotype, if present. Otherwise None"""
- def __get__(self):
- return bcf_format_get_alleles(self)
- def __set__(self, values):
- self['GT'] = values
- def __del__(self):
- self['GT'] = ()
-
- property phased:
+ return bcf_format_get_alleles(self)
+
+ @alleles.setter
+ def alleles(self, value):
+ self['GT'] = value
+
+ @alleles.deleter
+ def alleles(self):
+ self['GT'] = ()
+
+ @property
+ def phased(self):
"""False if genotype is missing or any allele is unphased. Otherwise True."""
- def __get__(self):
- return bcf_sample_get_phased(self)
- def __set__(self, value):
- bcf_sample_set_phased(self, value)
+ return bcf_sample_get_phased(self)
+
+ @phased.setter
+ def phased(self, value):
+ bcf_sample_set_phased(self, value)
def __len__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
@@ -2956,10 +3138,7 @@ cdef class BCFIndex(object):
cdef int n
cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
- if not refs:
- raise ValueError('Cannot retrieve reference sequence names')
-
- self.refs = char_array_to_tuple(refs, n, free_after=1)
+ self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
self.refmap = { r:i for i,r in enumerate(self.refs) }
def __dealloc__(self):
@@ -2998,10 +3177,7 @@ cdef class TabixIndex(BaseIndex):
cdef int n
cdef const char **refs = tbx_seqnames(self.ptr, &n)
- if not refs:
- raise ValueError('Cannot retrieve reference sequence names')
-
- self.refs = char_array_to_tuple(refs, n, free_after=1)
+ self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
self.refmap = { r:i for i,r in enumerate(self.refs) }
def __dealloc__(self):
@@ -3046,6 +3222,8 @@ cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
cdef class BCFIterator(BaseIterator):
def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
+ if bcf is None:
+ raise ValueError('bcf must not be None')
if not isinstance(bcf.index, BCFIndex):
raise ValueError('bcf index required')
@@ -3075,7 +3253,7 @@ cdef class BCFIterator(BaseIterator):
try:
rid = index.refmap[contig]
except KeyError:
- raise('Unknown contig specified')
+ raise ValueError('Unknown contig specified')
if start is None:
start = 0
@@ -3138,6 +3316,9 @@ cdef class TabixIterator(BaseIterator):
self.line_buffer.s = NULL
def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
+ if bcf is None:
+ raise ValueError('bcf must not be None')
+
if not isinstance(bcf.index, TabixIndex):
raise ValueError('tabix index required')
@@ -3226,34 +3407,58 @@ cdef class TabixIterator(BaseIterator):
########################################################################
-cdef class VariantFile(object):
- """*(filename, mode=None, index_filename=None, header=None, drop_samples=False)*
+cdef class VariantFile(HTSFile):
+ """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
+ duplicate_filehandle=True)*
A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
opened.
- *mode* should be ``r`` for reading or ``w`` for writing. The default is
- text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
- ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
+ If an index for a variant file exists (.csi or .tbi), it will be
+ opened automatically. Without an index random access to records
+ via :meth:`fetch` is disabled.
+
+ For writing, a :class:`VariantHeader` object must be provided,
+ typically obtained from another :term:`VCF` file/:term:`BCF`
+ file.
+
+ Parameters
+ ----------
+ mode : string
+ *mode* should be ``r`` for reading or ``w`` for writing. The default is
+ text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
+ ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
+
+ If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
+ modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
+ For instance, to open a :term:`BCF` formatted file for reading, type::
- If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
- modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
- For instance, to open a :term:`BCF` formatted file for reading, type::
+ f = pysam.VariantFile('ex1.bcf','r')
- f = pysam.VariantFile('ex1.bcf','rb')
+ If mode is not specified, we will try to auto-detect the file type. All
+ of the following should work::
- If mode is not specified, we will try to auto-detect in the order 'rb',
- 'r', thus both the following should work::
+ f1 = pysam.VariantFile('ex1.bcf')
+ f2 = pysam.VariantFile('ex1.vcf')
+ f3 = pysam.VariantFile('ex1.vcf.gz')
- f1 = pysam.VariantFile('ex1.bcf')
- f2 = pysam.VariantFile('ex1.vcf')
+ index_filename : string
+ Explicit path to an index file.
- If an index for a variant file exists (.csi or .tbi), it will be opened
- automatically. Without an index random access to records via
- :meth:`fetch` is disabled.
+ header : VariantHeader
+ :class:`VariantHeader` object required for writing.
+
+ drop_samples: bool
+ Ignore sample information when reading.
+
+ duplicate_filehandle: bool
+ By default, file handles passed either directly or through
+ File-like objects will be duplicated before passing them to
+ htslib. The duplication prevents issues where the same stream
+ will be closed by htslib and through destruction of the
+ high-level python object. Set to False to turn off
+ duplication.
- For writing, a :class:`VariantHeader` object must be provided, typically
- obtained from another :term:`VCF` file/:term:`BCF` file.
"""
def __cinit__(self, *args, **kwargs):
self.htsfile = NULL
@@ -3268,88 +3473,38 @@ cdef class VariantFile(object):
self.is_remote = False
self.is_reading = False
self.drop_samples = False
+ self.header_written = False
self.start_offset = -1
self.open(*args, **kwargs)
- def __dealloc__(self):
- if self.htsfile:
- hts_close(self.htsfile)
- self.htsfile = NULL
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
- property category:
- """General file format category. One of UNKNOWN, ALIGNMENTS,
- VARIANTS, INDEX, REGIONS"""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return FORMAT_CATEGORIES[self.htsfile.format.category]
-
- property format:
- """File format.
-
- One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
- BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
- """
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return FORMATS[self.htsfile.format.format]
-
- property version:
- """Tuple of file format version numbers (major, minor)"""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return (self.htsfile.format.version.major,
- self.htsfile.format.version.minor)
-
- property compression:
- """File compression.
-
- One of NONE, GZIP, BGZF, CUSTOM."""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return COMPRESSION[self.htsfile.format.compression]
-
- property description:
- """Vaguely human readable description of the file format"""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- cdef char *desc = hts_format_description(&self.htsfile.format)
- try:
- return charptr_to_str(desc)
- finally:
- free(desc)
-
def close(self):
"""closes the :class:`pysam.VariantFile`."""
+ cdef int ret = 0
+ self.header = self.index = None
if self.htsfile:
- hts_close(self.htsfile)
+ # Write header if no records were written
+ if self.htsfile.is_write and not self.header_written:
+ self.header_written = True
+ with nogil:
+ bcf_hdr_write(self.htsfile, self.header.ptr)
+
+ ret = hts_close(self.htsfile)
self.htsfile = NULL
- self.header = self.index = None
- property is_open:
- def __get__(self):
- """return True if VariantFile is open and in a valid state."""
- return self.htsfile != NULL
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
def __iter__(self):
if not self.is_open:
raise ValueError('I/O operation on closed file')
- if not self.mode.startswith(b'r'):
- raise ValueError(
- 'cannot iterate over Variantfile opened for writing')
+ if self.htsfile.is_write:
+ raise ValueError('cannot iterate over Variantfile opened for writing')
self.is_reading = 1
return self
@@ -3382,13 +3537,9 @@ cdef class VariantFile(object):
cdef VariantFile vars = VariantFile.__new__(VariantFile)
cdef bcf_hdr_t *hdr
- cdef char *cfilename
- cdef char *cmode
# FIXME: re-open using fd or else header and index could be invalid
- cfilename, cmode = self.filename, self.mode
- with nogil:
- vars.htsfile = hts_open(cfilename, cmode)
+ vars.htsfile = self._open_htsfile()
if not vars.htsfile:
raise ValueError('Cannot re-open htsfile')
@@ -3406,6 +3557,7 @@ cdef class VariantFile(object):
vars.is_remote = self.is_remote
vars.is_reading = self.is_reading
vars.start_offset = self.start_offset
+ vars.header_written = self.header_written
if self.htsfile.is_bin:
vars.seek(self.tell())
@@ -3416,10 +3568,11 @@ cdef class VariantFile(object):
return vars
- def open(self, filename, mode='rb',
+ def open(self, filename, mode='r',
index_filename=None,
VariantHeader header=None,
- drop_samples=False):
+ drop_samples=False,
+ duplicate_filehandle=True):
"""open a vcf/bcf file.
If open is called on an existing VariantFile, the current file will be
@@ -3437,24 +3590,51 @@ cdef class VariantFile(object):
if self.is_open:
self.close()
- if mode not in ('r','w','rb','wb', 'wh', 'wbu', 'rU', 'wb0'):
- raise ValueError('invalid file opening mode `{}`'.format(mode))
+ if not mode or mode[0] not in 'rwa':
+ raise ValueError('mode must begin with r, w or a')
+
+ self.duplicate_filehandle = duplicate_filehandle
+
+ format_modes = [m for m in mode[1:] if m in 'bcguz']
+ if len(format_modes) > 1:
+ raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
+
+ invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
+ if invalid_modes:
+ raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
+
+ # Autodetect mode from filename
+ if mode == 'w' and isinstance(filename, str):
+ if filename.endswith('.gz'):
+ mode = 'wz'
+ elif filename.endswith('.bcf'):
+ mode = 'wb'
# for htslib, wbu seems to not work
if mode == 'wbu':
mode = 'wb0'
self.mode = mode = force_bytes(mode)
- self.filename = filename = encode_filename(filename)
+ try:
+ filename = encode_filename(filename)
+ self.is_remote = hisremote(filename)
+ self.is_stream = filename == b'-'
+ except TypeError:
+ filename = filename
+ self.is_remote = False
+ self.is_stream = True
+
+ self.filename = filename
+
if index_filename is not None:
self.index_filename = index_filename = encode_filename(index_filename)
else:
self.index_filename = None
+
self.drop_samples = bool(drop_samples)
self.header = None
- self.is_remote = hisremote(filename)
- self.is_stream = filename == b'-'
+ self.header_written = False
if mode.startswith(b'w'):
# open file for writing
@@ -3465,29 +3645,22 @@ cdef class VariantFile(object):
if header:
self.header = header.copy()
else:
- raise ValueError('a VariantHeader must be specified')
+ self.header = VariantHeader()
+ #raise ValueError('a VariantHeader must be specified')
- # open file. Header gets written to file at the same time
- # for bam files and sam files (in the latter case, the
- # mode needs to be wh)
- cfilename, cmode = filename, mode
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
+ # Header is not written until the first write or on close
+ self.htsfile = self._open_htsfile()
if not self.htsfile:
- raise ValueError("could not open file `{}` (mode='{}')".format((filename, mode)))
-
- with nogil:
- bcf_hdr_write(self.htsfile, self.header.ptr)
+ raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
elif mode.startswith(b'r'):
# open file for reading
- if filename != b'-' and not self.is_remote and not os.path.exists(filename):
+
+ if not self._exists():
raise IOError('file `{}` not found'.format(filename))
- cfilename, cmode = filename, mode
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
+ self.htsfile = self._open_htsfile()
if not self.htsfile:
raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
@@ -3508,15 +3681,20 @@ cdef class VariantFile(object):
except ValueError:
raise ValueError("file `{}` does not have valid header (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+ if isinstance(self.filename, bytes):
+ cfilename = self.filename
+ else:
+ cfilename = NULL
+
# check for index and open if present
- if self.htsfile.format.format == bcf:
+ if self.htsfile.format.format == bcf and cfilename:
if index_filename is not None:
cindex_filename = index_filename
with nogil:
idx = bcf_index_load2(cfilename, cindex_filename)
self.index = makeBCFIndex(self.header, idx)
- elif self.htsfile.format.compression == bgzf:
+ elif self.htsfile.format.compression == bgzf and cfilename:
if index_filename is not None:
cindex_filename = index_filename
with nogil:
@@ -3530,40 +3708,8 @@ cdef class VariantFile(object):
def reset(self):
"""reset file position to beginning of file just after the header."""
- return self.seek(self.start_offset, 0)
+ return self.seek(self.start_offset)
- def seek(self, uint64_t offset):
- """move file pointer to position *offset*, see
- :meth:`pysam.VariantFile.tell`."""
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
- if self.is_stream:
- raise OSError('seek not available in streams')
-
- cdef int64_t ret
- if self.htsfile.format.compression != no_compression:
- with nogil:
- ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
- else:
- with nogil:
- ret = hts_useek(self.htsfile, <int>offset, SEEK_SET)
- return ret
-
- def tell(self):
- """return current file position, see :meth:`pysam.VariantFile.seek`."""
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
- if self.is_stream:
- raise OSError('tell not available in streams')
-
- cdef int64_t ret
- if self.htsfile.format.compression != no_compression:
- with nogil:
- ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
- else:
- with nogil:
- ret = hts_utell(self.htsfile)
- return ret
def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
"""fetch records in a :term:`region` using 0-based indexing. The
@@ -3589,9 +3735,8 @@ cdef class VariantFile(object):
if not self.is_open:
raise ValueError('I/O operation on closed file')
- if not self.mode.startswith(b'r'):
- raise ValueError('cannot fetch from Variantfile opened '
- 'for writing')
+ if self.htsfile.is_write:
+ raise ValueError('cannot fetch from Variantfile opened for writing')
if contig is None and region is None:
self.is_reading = 1
@@ -3605,28 +3750,45 @@ cdef class VariantFile(object):
self.is_reading = 1
return self.index.fetch(self, contig, start, stop, region, reopen)
+ cpdef VariantRecord new_record(self):
+ """Create a new empty VariantRecord"""
+ return self.header.new_record()
+
cpdef int write(self, VariantRecord record) except -1:
"""
write a single :class:`pysam.VariantRecord` to disk.
returns the number of bytes written.
"""
+ if record is None:
+ raise ValueError('record must not be None')
+
if not self.is_open:
return ValueError('I/O operation on closed file')
- if not self.mode.startswith(b'w'):
+ if not self.htsfile.is_write:
raise ValueError('cannot write to a Variantfile opened for reading')
+ if not self.header_written:
+ self.header_written = True
+ with nogil:
+ bcf_hdr_write(self.htsfile, self.header.ptr)
+
#if record.header is not self.header:
+ # record.translate(self.header)
# raise ValueError('Writing records from a different VariantFile is not yet supported')
+ if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
+ msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
+ raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
+
cdef int ret
with nogil:
ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
if ret < 0:
- raise ValueError('write failed')
+ raise IOError(errno, strerror(errno))
return ret
@@ -3638,9 +3800,8 @@ cdef class VariantFile(object):
if not self.is_open:
raise ValueError('I/O operation on closed file')
- if not self.mode.startswith(b'r'):
- raise ValueError('cannot subset samples from Variantfile '
- 'opened for writing')
+ if self.htsfile.is_write:
+ raise ValueError('cannot subset samples from Variantfile opened for writing')
if self.is_reading:
raise ValueError('cannot subset samples after fetching records')
diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx
new file mode 100644
index 0000000..558ceff
--- /dev/null
+++ b/pysam/libcbgzf.pyx
@@ -0,0 +1,209 @@
+"""Functions that read and write block gzipped files.
+
+The user of the file doesn't have to worry about the compression
+and random access is allowed if an index file is present."""
+
+# based on Python 3.5's gzip module
+
+import io
+
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+
+from cpython.object cimport PyObject
+from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
+from pysam.libchtslib cimport *
+
+
+__all__ = ["BGZFile"]
+
+
+BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE
+
+
+cdef class BGZFile(object):
+ """The BGZFile class simulates most of the methods of a file object with
+ the exception of the truncate() method.
+
+ This class only supports opening files in binary mode. If you need to open a
+ compressed file in text mode, use the gzip.open() function.
+ """
+ cdef BGZF* bgzf
+ cdef bytes name, index
+
+ def __init__(self, filename, mode=None, index=None):
+ """Constructor for the BGZFile class.
+
+ The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
+ 'xb' depending on whether the file will be read or written. The default
+ is the mode of fileobj if discernible; otherwise, the default is 'rb'.
+ A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
+ 'wb', 'a' and 'ab', and 'x' and 'xb'.
+ """
+ if mode and ('t' in mode or 'U' in mode):
+ raise ValueError("Invalid mode: {!r}".format(mode))
+ if not mode:
+ mode = 'rb'
+ if mode and 'b' not in mode:
+ mode += 'b'
+ self.name = force_bytes(filename)
+ self.index = force_bytes(index) if index is not None else None
+ self.bgzf = bgzf_open(self.name, mode)
+
+ if self.bgzf.is_write and index is not None and bgzf_index_build_init(self.bgzf) < 0:
+ raise IOError('Error building bgzf index')
+
+ def __dealloc__(self):
+ self.close()
+
+ def write(self,data):
+ if not self.bgzf:
+ raise ValueError("write() on closed BGZFile object")
+
+ if not self.bgzf.is_write:
+ import errno
+ raise OSError(errno.EBADF, "write() on read-only BGZFile object")
+
+ if isinstance(data, bytes):
+ length = len(data)
+ else:
+ # accept any data that supports the buffer protocol
+ data = memoryview(data)
+ length = data.nbytes
+
+ if length > 0 and bgzf_write(self.bgzf, <char *>data, length) < 0:
+ raise IOError('BGZFile write failed')
+
+ return length
+
+ def read(self, size=-1):
+ cdef ssize_t read_size
+
+ if not self.bgzf:
+ raise ValueError("read() on closed BGZFile object")
+
+ if self.bgzf.is_write:
+ import errno
+ raise OSError(errno.EBADF, "read() on write-only BGZFile object")
+
+ if size < 0:
+ chunks = []
+ while 1:
+ chunk = PyBytes_FromStringAndSize(NULL, BUFFER_SIZE)
+ cdata = <bytes>chunk
+ read_size = bgzf_read(self.bgzf, <char *>chunk, BUFFER_SIZE)
+ if read_size < 0:
+ raise IOError('Error reading from BGZFile')
+ elif not read_size:
+ break
+ elif read_size < BUFFER_SIZE:
+ chunk = chunk[:read_size]
+ chunks.append(chunk)
+ return b''.join(chunks)
+
+ elif size > 0:
+ chunk = PyBytes_FromStringAndSize(NULL, size)
+ read_size = bgzf_read(self.bgzf, <char *>chunk, size)
+ if read_size < 0:
+ raise IOError('Error reading from BGZFile')
+ elif read_size < size:
+ chunk = chunk[:read_size]
+ return chunk
+ else:
+ return b''
+
+ @property
+ def closed(self):
+ return self.bgzf == NULL
+
+ def close(self):
+ if not self.bgzf:
+ return
+
+ if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0:
+ raise IOError('Error flushing BGZFile object')
+
+ if self.index and bgzf_index_dump(self.bgzf, self.index, NULL) < 0:
+ raise IOError('Cannot write index')
+
+ cdef ret = bgzf_close(self.bgzf)
+ self.bgzf = NULL
+
+ if ret < 0:
+ raise IOError('Error closing BGZFile object')
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, tb):
+ self.close()
+
+ def flush(self):
+ if not self.bgzf:
+ return
+
+ if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0:
+ raise IOError('Error flushing BGZFile object')
+
+ def fileno(self):
+ """Invoke the underlying file object's fileno() method.
+
+ This will raise AttributeError if the underlying file object
+ doesn't support fileno().
+ """
+ raise AttributeError('fileno')
+
+ def rewind(self):
+ '''Return the uncompressed stream file position indicator to the
+ beginning of the file'''
+ if not self.bgzf:
+ raise ValueError("rewind() on closed BGZFile object")
+ if self.bgzf.is_write:
+ raise OSError("Can't rewind in write mode")
+ if bgzf_seek(self.bgzf, 0, SEEK_SET) < 0:
+ raise IOError('Error seeking BGZFFile object')
+
+ def readable(self):
+ if not self.bgzf:
+ raise ValueError("readable() on closed BGZFile object")
+ return self.bgzf != NULL and not self.bgzf.is_write
+
+ def writable(self):
+ return self.bgzf != NULL and self.bgzf.is_write
+
+ def seekable(self):
+ return True
+
+ def seek(self, offset, whence=io.SEEK_SET):
+ if not self.bgzf:
+ raise ValueError("seek() on closed BGZFile object")
+ if whence != io.SEEK_SET:
+ raise ValueError('Seek from end not supported')
+
+ cdef int64_t off = bgzf_seek(self.bgzf, offset, SEEK_SET)
+ if off < 0:
+ raise IOError('Error seeking BGZFFile object')
+
+ return off
+
+ def readline(self, size=-1):
+ if not self.bgzf:
+ raise ValueError("readline() on closed BGZFile object")
+
+ cdef kstring_t line
+ cdef char c
+
+ line.l = line.m = 0
+ line.s = NULL
+ if bgzf_getline(self.bgzf, '\n', &line) < 0:
+ raise IOError('Error reading line in BGZFFile object')
+
+ ret = charptr_to_str_w_len(line.s, line.l)
+
+ if line.m:
+ free(line.s)
+
+ return ret
diff --git a/pysam/cfaidx.pxd b/pysam/libcfaidx.pxd
similarity index 94%
rename from pysam/cfaidx.pxd
rename to pysam/libcfaidx.pxd
index 7749274..2f5f44b 100644
--- a/pysam/cfaidx.pxd
+++ b/pysam/libcfaidx.pxd
@@ -6,7 +6,7 @@ from libc.stdio cimport FILE, printf
cimport cython
from cpython cimport array
-from pysam.chtslib cimport faidx_t, kstring_t, BGZF
+from pysam.libchtslib cimport faidx_t, kstring_t, BGZF
# These functions are put here and not in chtslib.pxd in order
# to avoid warnings for unused functions.
@@ -50,7 +50,7 @@ cdef class FastqProxy:
cdef class PersistentFastqProxy:
"""
- Python container for pysam.cfaidx.FastqProxy with persistence.
+ Python container for pysam.libcfaidx.FastqProxy with persistence.
"""
cdef public str comment, quality, sequence, name
cdef cython.str tostring(self)
diff --git a/pysam/cfaidx.pyx b/pysam/libcfaidx.pyx
similarity index 97%
rename from pysam/cfaidx.pyx
rename to pysam/libcfaidx.pyx
index 78f9aac..774152d 100644
--- a/pysam/cfaidx.pyx
+++ b/pysam/libcfaidx.pyx
@@ -57,15 +57,15 @@ from cpython cimport PyErr_SetString, \
from cpython.version cimport PY_MAJOR_VERSION
-from pysam.chtslib cimport \
+from pysam.libchtslib cimport \
faidx_nseq, fai_load, fai_destroy, fai_fetch, \
faidx_seq_len, \
faidx_fetch_seq, hisremote, \
bgzf_open, bgzf_close
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-from pysam.cutils cimport qualitystring_to_array, parse_region
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport qualitystring_to_array, parse_region
cdef class FastqProxy
cdef makeFastqProxy(kseq_t * src):
@@ -371,7 +371,7 @@ cdef class FastqProxy:
cdef class PersistentFastqProxy:
"""
- Python container for pysam.cfaidx.FastqProxy with persistence.
+ Python container for pysam.libcfaidx.FastqProxy with persistence.
Needed to compare multiple fastq records from the same file.
"""
def __init__(self, FastqProxy FastqRead):
@@ -568,4 +568,5 @@ cdef class Fastafile(FastaFile):
__all__ = ["FastaFile",
"FastqFile",
"FastxFile",
- "Fastafile"]
+ "Fastafile",
+ "FastqProxy"]
diff --git a/pysam/chtslib.pxd b/pysam/libchtslib.pxd
similarity index 99%
rename from pysam/chtslib.pxd
rename to pysam/libchtslib.pxd
index 33c1559..657a754 100644
--- a/pysam/chtslib.pxd
+++ b/pysam/libchtslib.pxd
@@ -1290,6 +1290,9 @@ cdef extern from "htslib/vcf.h" nogil:
uint8_t BCF_ERR_TAG_UNDEF
uint8_t BCF_ERR_NCOLS
uint8_t BCF_ERR_LIMITS
+ uint8_t BCF_ERR_CHAR
+ uint8_t BCF_ERR_CTG_INVALID
+ uint8_t BCF_ERR_TAG_INVALID
# The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
# is slower because the string is first to be parsed, packed into BCF line
@@ -1896,3 +1899,18 @@ cdef extern from "htslib/vcfutils.h" nogil:
# @i,j: allele indexes, 0-based, i<=j
# Returns index to the Number=G diploid array
uint32_t bcf_ij2G(uint32_t i, uint32_t j)
+
+
+cdef class HTSFile(object):
+ cdef htsFile *htsfile # pointer to htsFile structure
+ cdef int64_t start_offset # BGZF offset of first record
+
+ cdef readonly object filename # filename as supplied by user
+ cdef readonly object mode # file opening mode
+ cdef readonly object index_filename # filename of index, if supplied by user
+
+ cdef readonly bint is_stream # Is htsfile a non-seekable stream
+ cdef readonly bint is_remote # Is htsfile a remote stream
+ cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh
+
+ cdef htsFile *_open_htsfile(self) except? NULL
diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx
new file mode 100644
index 0000000..7eea059
--- /dev/null
+++ b/pysam/libchtslib.pyx
@@ -0,0 +1,265 @@
+# cython: embedsignature=True
+# cython: profile=True
+# adds doc-strings for sphinx
+import os
+
+from posix.unistd cimport dup
+
+from pysam.libchtslib cimport *
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+
+__all__ = ["get_verbosity", "set_verbosity"]
+
+
+########################################################################
+########################################################################
+## Constants
+########################################################################
+
+cdef int MAX_POS = 2 << 29
+cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
+cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
+ 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
+cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
+
+
+cpdef set_verbosity(int verbosity):
+ """Set htslib's hts_verbose global variable to the specified value."""
+ return hts_set_verbosity(verbosity)
+
+cpdef get_verbosity():
+ """Return the value of htslib's hts_verbose global variable."""
+ return hts_get_verbosity()
+
+
+class CallableValue(object):
+ def __init__(self, value):
+ self.value = value
+ def __call__(self):
+ return self.value
+ def __bool__(self):
+ return self.value
+ def __nonzero__(self):
+ return self.value
+ def __eq__(self, other):
+ return self.value == other
+ def __ne__(self, other):
+ return self.value != other
+
+
+CTrue = CallableValue(True)
+CFalse = CallableValue(False)
+
+
+cdef class HTSFile(object):
+ """
+ Base class for HTS file types
+ """
+ def __cinit__(self, *args, **kwargs):
+ self.htsfile = NULL
+ self.duplicate_filehandle = True
+
+ def __dealloc__(self):
+ if self.htsfile:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ @property
+ def category(self):
+ """General file format category. One of UNKNOWN, ALIGNMENTS,
+ VARIANTS, INDEX, REGIONS"""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return FORMAT_CATEGORIES[self.htsfile.format.category]
+
+ @property
+ def format(self):
+ """File format.
+
+ One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
+ BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
+ """
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return FORMATS[self.htsfile.format.format]
+
+ @property
+ def version(self):
+ """Tuple of file format version numbers (major, minor)"""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return self.htsfile.format.version.major, self.htsfile.format.version.minor
+
+ @property
+ def compression(self):
+ """File compression.
+
+ One of NONE, GZIP, BGZF, CUSTOM."""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return COMPRESSION[self.htsfile.format.compression]
+
+ @property
+ def description(self):
+ """Vaguely human readable description of the file format"""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ cdef char *desc = hts_format_description(&self.htsfile.format)
+ try:
+ return charptr_to_str(desc)
+ finally:
+ free(desc)
+
+ @property
+ def is_open(self):
+ """return True if HTSFile is open and in a valid state."""
+ return CTrue if self.htsfile != NULL else CFalse
+
+ @property
+ def is_closed(self):
+ """return True if HTSFile is closed."""
+ return self.htsfile == NULL
+
+ @property
+ def closed(self):
+ """return True if HTSFile is closed."""
+ return self.htsfile == NULL
+
+ @property
+ def is_write(self):
+ """return True if HTSFile is open for writing"""
+ return self.htsfile != NULL and self.htsfile.is_write != 0
+
+ @property
+ def is_read(self):
+ """return True if HTSFile is open for reading"""
+ return self.htsfile != NULL and self.htsfile.is_write == 0
+
+ @property
+ def is_sam(self):
+ """return True if HTSFile is reading or writing a SAM alignment file"""
+ return self.htsfile != NULL and self.htsfile.format.format == sam
+
+ @property
+ def is_bam(self):
+ """return True if HTSFile is reading or writing a BAM alignment file"""
+ return self.htsfile != NULL and self.htsfile.format.format == bam
+
+ @property
+ def is_cram(self):
+ """return True if HTSFile is reading or writing a CRAM alignment file"""
+ return self.htsfile != NULL and self.htsfile.format.format == cram
+
+ @property
+ def is_vcf(self):
+ """return True if HTSFile is reading or writing a VCF variant file"""
+ return self.htsfile != NULL and self.htsfile.format.format == vcf
+
+ @property
+ def is_bcf(self):
+ """return True if HTSFile is reading or writing a BCF variant file"""
+ return self.htsfile != NULL and self.htsfile.format.format == bcf
+
+ def reset(self):
+ """reset file position to beginning of file just after the header.
+
+ Returns
+ -------
+
+ The file position after moving the file pointer.
+
+ """
+ return self.seek(self.start_offset)
+
+ def seek(self, uint64_t offset):
+ """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`."""
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+ if self.is_stream:
+ raise OSError('seek not available in streams')
+
+ cdef int64_t ret
+ if self.htsfile.format.compression != no_compression:
+ with nogil:
+ ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
+ else:
+ with nogil:
+ ret = hts_useek(self.htsfile, <int>offset, SEEK_SET)
+ return ret
+
+ def tell(self):
+ """return current file position, see :meth:`pysam.HTSFile.seek`."""
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+ if self.is_stream:
+ raise OSError('tell not available in streams')
+
+ cdef int64_t ret
+ if self.htsfile.format.compression != no_compression:
+ with nogil:
+ ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
+ else:
+ with nogil:
+ ret = hts_utell(self.htsfile)
+ return ret
+
+ cdef htsFile *_open_htsfile(self) except? NULL:
+ cdef char *cfilename
+ cdef char *cmode = self.mode
+ cdef int fd, dup_fd
+
+ if isinstance(self.filename, bytes):
+ cfilename = self.filename
+ with nogil:
+ return hts_open(cfilename, cmode)
+ else:
+ if isinstance(self.filename, int):
+ fd = self.filename
+ else:
+ fd = self.filename.fileno()
+
+ if self.duplicate_filehandle:
+ dup_fd = dup(fd)
+ else:
+ dup_fd = fd
+
+ # Replicate mode normalization done in hts_open_format
+ smode = self.mode.replace(b'b', b'').replace(b'c', b'')
+ if b'b' in self.mode:
+ smode += b'b'
+ elif b'c' in self.mode:
+ smode += b'c'
+ cmode = smode
+
+ hfile = hdopen(dup_fd, cmode)
+ if hfile == NULL:
+ raise IOError('Cannot create hfile')
+
+ try:
+ # filename.name can be an int
+ filename = str(self.filename.name)
+ except AttributeError:
+ filename = '<fd:{}>'.format(fd)
+
+ filename = encode_filename(filename)
+ cfilename = filename
+ with nogil:
+ return hts_hopen(hfile, cfilename, cmode)
+
+ def _exists(self):
+ """return False iff file is local, a file and does not exist.
+ """
+ return (not isinstance(self.filename, (str, bytes)) or
+ self.filename == b'-' or
+ self.is_remote or
+ os.path.exists(self.filename))
diff --git a/pysam/csamfile.pxd b/pysam/libcsamfile.pxd
similarity index 93%
rename from pysam/csamfile.pxd
rename to pysam/libcsamfile.pxd
index a76a599..de36998 100644
--- a/pysam/csamfile.pxd
+++ b/pysam/libcsamfile.pxd
@@ -1,10 +1,10 @@
-from pysam.calignmentfile cimport AlignedSegment, AlignmentFile
+from pysam.libcalignmentfile cimport AlignedSegment, AlignmentFile
#################################################
# Compatibility Layer for pysam < 0.8
# import all declarations from htslib
-from pysam.chtslib cimport *
+from pysam.libchtslib cimport *
cdef class AlignedRead(AlignedSegment):
pass
diff --git a/pysam/csamfile.pyx b/pysam/libcsamfile.pyx
similarity index 92%
rename from pysam/csamfile.pyx
rename to pysam/libcsamfile.pyx
index ed9d79b..bde93d8 100644
--- a/pysam/csamfile.pyx
+++ b/pysam/libcsamfile.pyx
@@ -19,7 +19,7 @@ from cpython cimport PyErr_SetString, \
from cpython.version cimport PY_MAJOR_VERSION
-from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
cdef class Samfile(AlignmentFile):
diff --git a/pysam/ctabix.pxd b/pysam/libctabix.pxd
similarity index 89%
rename from pysam/ctabix.pxd
rename to pysam/libctabix.pxd
index 028090e..12cd9dd 100644
--- a/pysam/ctabix.pxd
+++ b/pysam/libctabix.pxd
@@ -13,8 +13,9 @@ cdef extern from "unistd.h" nogil:
ssize_t read(int fd, void *buf, size_t count)
int close(int fd)
-from pysam.chtslib cimport hts_idx_t, hts_itr_t, htsFile, \
- tbx_t, kstring_t, BGZF
+from pysam.libchtslib cimport hts_idx_t, hts_itr_t, htsFile, \
+ tbx_t, kstring_t, BGZF, HTSFile
+
# These functions are put here and not in chtslib.pxd in order
# to avoid warnings for unused functions.
@@ -55,40 +56,39 @@ cdef class tabix_file_iterator:
cdef __cnext__(self)
-cdef class TabixFile:
- # pointer to tabixfile
- cdef htsFile * tabixfile
+cdef class TabixFile(HTSFile):
# pointer to index structure
cdef tbx_t * index
- # flag indicating whether file is remote
- cdef int is_remote
-
- cdef object _filename
- cdef object _filename_index
+ cdef readonly object filename_index
cdef Parser parser
cdef encoding
+
cdef class Parser:
cdef encoding
-
cdef parse(self, char * buffer, int len)
+
cdef class asTuple(Parser):
cdef parse(self, char * buffer, int len)
+
cdef class asGTF(Parser):
pass
+
cdef class asBed(Parser):
pass
+
cdef class asVCF(Parser):
pass
+
cdef class TabixIterator:
cdef hts_itr_t * iterator
cdef TabixFile tabixfile
@@ -96,9 +96,11 @@ cdef class TabixIterator:
cdef encoding
cdef int __cnext__(self)
+
cdef class TabixIteratorParsed(TabixIterator):
cdef Parser parser
+
cdef class GZIterator:
cdef object _filename
cdef BGZF * gzipfile
@@ -107,13 +109,15 @@ cdef class GZIterator:
cdef int __cnext__(self)
cdef encoding
+
cdef class GZIteratorHead(GZIterator):
pass
+
cdef class GZIteratorParsed(GZIterator):
cdef Parser parser
+
# Compatibility Layer for pysam < 0.8
cdef class Tabixfile(TabixFile):
pass
-
diff --git a/pysam/ctabix.pyx b/pysam/libctabix.pyx
similarity index 94%
rename from pysam/ctabix.pyx
rename to pysam/libctabix.pyx
index a23fa87..10dc23b 100644
--- a/pysam/ctabix.pyx
+++ b/pysam/libctabix.pyx
@@ -66,16 +66,16 @@ from cpython cimport PyErr_SetString, PyBytes_Check, \
from cpython.version cimport PY_MAJOR_VERSION
-cimport pysam.ctabixproxies as ctabixproxies
+cimport pysam.libctabixproxies as ctabixproxies
-from pysam.chtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
+from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
- tbx_destroy, hisremote
+ tbx_destroy, hisremote, region_list
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
cdef class Parser:
@@ -290,14 +290,16 @@ cdef class TabixFile:
"""
def __cinit__(self,
filename,
- mode = 'r',
+ mode='r',
parser=None,
index=None,
encoding="ascii",
*args,
**kwargs ):
- self.tabixfile = NULL
+ self.htsfile = NULL
+ self.is_remote = False
+ self.is_stream = False
self.parser = parser
self._open(filename, mode, index, *args, **kwargs)
self.encoding = encoding
@@ -307,21 +309,22 @@ cdef class TabixFile:
mode='r',
index=None,
):
- '''open a :term:`tabix file` for reading.
- '''
+ '''open a :term:`tabix file` for reading.'''
- assert mode in ("r",), "invalid file opening mode `%s`" % mode
+ if mode != 'r':
+ raise ValueError("invalid file opening mode `%s`" % mode)
- if self.tabixfile != NULL:
+ if self.htsfile != NULL:
self.close()
- self.tabixfile = NULL
+ self.htsfile = NULL
filename_index = index or (filename + ".tbi")
# encode all the strings to pass to tabix
- self._filename = encode_filename(filename)
- self._filename_index = encode_filename(filename_index)
+ self.filename = encode_filename(filename)
+ self.filename_index = encode_filename(filename_index)
- self.is_remote = hisremote(self._filename)
+ self.is_stream = self.filename == b'-'
+ self.is_remote = hisremote(self.filename)
if not self.is_remote:
if not os.path.exists(filename):
@@ -331,36 +334,37 @@ cdef class TabixFile:
raise IOError("index `%s` not found" % filename_index)
# open file
- cdef char *cfilename = self._filename
+ cdef char *cfilename = self.filename
with nogil:
- self.tabixfile = hts_open(cfilename, 'r')
+ self.htsfile = hts_open(cfilename, 'r')
- if self.tabixfile == NULL:
+ if self.htsfile == NULL:
raise IOError("could not open file `%s`" % filename)
- cfilename = self._filename_index
+ #if self.htsfile.format.category != region_list:
+ # raise ValueError("file does not contain region data")
+
+ cfilename = self.filename_index
with nogil:
self.index = tbx_index_load(cfilename)
if self.index == NULL:
raise IOError("could not open index for `%s`" % filename)
+ if not self.is_stream:
+ self.start_offset = self.tell()
+
def _dup(self):
'''return a copy of this tabix file.
The file is being re-opened.
'''
- return TabixFile(self._filename,
+ return TabixFile(self.filename,
mode="r",
parser=self.parser,
- index=self._filename_index,
+ index=self.filename_index,
encoding=self.encoding)
- def is_open(self):
- '''return true if samfile has been opened.'''
- return self.tabixfile != NULL
-
-
def fetch(self,
reference=None,
start=None,
@@ -472,33 +476,11 @@ cdef class TabixFile:
return a
- # context manager interface
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
###############################################################
###############################################################
###############################################################
## properties
###############################################################
- property closed:
- """"bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
- """
- def __get__(self):
- return not self.is_open()
-
- property filename:
- '''filename associated with this object.'''
- def __get__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- return self._filename
-
property header:
'''the file header.
@@ -543,9 +525,9 @@ cdef class TabixFile:
def close(self):
'''
closes the :class:`pysam.TabixFile`.'''
- if self.tabixfile != NULL:
- hts_close(self.tabixfile)
- self.tabixfile = NULL
+ if self.htsfile != NULL:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
if self.index != NULL:
tbx_destroy(self.index)
self.index = NULL
@@ -554,9 +536,9 @@ cdef class TabixFile:
# remember: dealloc cannot call other python methods
# note: no doc string
# note: __del__ is not called.
- if self.tabixfile != NULL:
- hts_close(self.tabixfile)
- self.tabixfile = NULL
+ if self.htsfile != NULL:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
if self.index != NULL:
tbx_destroy(self.index)
@@ -582,7 +564,7 @@ cdef class TabixIterator:
Return -5 if file has been closed when this function
was called.
'''
- if self.tabixfile.tabixfile == NULL:
+ if self.tabixfile.htsfile == NULL:
return -5
cdef int retval
@@ -590,7 +572,7 @@ cdef class TabixIterator:
while 1:
with nogil:
retval = tbx_itr_next(
- self.tabixfile.tabixfile,
+ self.tabixfile.htsfile,
self.tabixfile.index,
self.iterator,
&self.buffer)
diff --git a/pysam/ctabixproxies.pxd b/pysam/libctabixproxies.pxd
similarity index 100%
rename from pysam/ctabixproxies.pxd
rename to pysam/libctabixproxies.pxd
diff --git a/pysam/ctabixproxies.pyx b/pysam/libctabixproxies.pyx
similarity index 99%
rename from pysam/ctabixproxies.pyx
rename to pysam/libctabixproxies.pyx
index f5288cc..9a8a678 100644
--- a/pysam/ctabixproxies.pyx
+++ b/pysam/libctabixproxies.pyx
@@ -5,8 +5,8 @@ from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr
from libc.stdlib cimport free, malloc, calloc, realloc
from libc.stdlib cimport atoi, atol, atof
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
import collections
diff --git a/pysam/cutils.pxd b/pysam/libcutils.pxd
similarity index 100%
rename from pysam/cutils.pxd
rename to pysam/libcutils.pxd
diff --git a/pysam/cutils.pyx b/pysam/libcutils.pyx
similarity index 92%
rename from pysam/cutils.pyx
rename to pysam/libcutils.pyx
index 7510727..80bd9e4 100644
--- a/pysam/cutils.pyx
+++ b/pysam/libcutils.pyx
@@ -7,7 +7,7 @@ import os
import io
from contextlib import contextmanager
-from cpython.version cimport PY_MAJOR_VERSION
+from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION
from cpython cimport PyBytes_Check, PyUnicode_Check
from cpython cimport array as c_array
from libc.stdlib cimport calloc, free
@@ -36,10 +36,10 @@ cpdef array_to_qualitystring(c_array.array qualities, int offset=33):
if qualities is None:
return None
cdef int x
-
+
cdef c_array.array result
result = c_array.clone(qualities, len(qualities), zero=False)
-
+
for x from 0 <= x < len(qualities):
result[x] = qualities[x] + offset
return force_str(result.tostring())
@@ -76,6 +76,7 @@ cpdef qualities_to_qualitystring(qualities, int offset=33):
########################################################################
## Python 3 compatibility functions
########################################################################
+
cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
cdef from_string_and_size(const char* s, size_t length):
@@ -84,42 +85,39 @@ cdef from_string_and_size(const char* s, size_t length):
else:
return s[:length]
-# filename encoding (copied from lxml.etree.pyx)
-cdef str _FILENAME_ENCODING
-_FILENAME_ENCODING = sys.getfilesystemencoding()
-if _FILENAME_ENCODING is None:
- _FILENAME_ENCODING = sys.getdefaultencoding()
-if _FILENAME_ENCODING is None:
- _FILENAME_ENCODING = 'ascii'
-#cdef char* _C_FILENAME_ENCODING
-#_C_FILENAME_ENCODING = <char*>_FILENAME_ENCODING
+# filename encoding (adapted from lxml.etree.pyx)
+cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii'
+
cdef bytes encode_filename(object filename):
"""Make sure a filename is 8-bit encoded (or None)."""
if filename is None:
return None
+ elif PY_MAJOR_VERSION >= 3 and PY_MINOR_VERSION >= 2:
+ # Added to support path-like objects
+ return os.fsencode(filename)
elif PyBytes_Check(filename):
return filename
elif PyUnicode_Check(filename):
- return filename.encode(_FILENAME_ENCODING)
+ return filename.encode(FILENAME_ENCODING)
else:
- raise TypeError(u"Argument must be string or unicode.")
+ raise TypeError("Argument must be string or unicode.")
+
cdef bytes force_bytes(object s, encoding="ascii"):
- u"""convert string or unicode object to bytes, assuming
+ """convert string or unicode object to bytes, assuming
ascii encoding.
"""
- if not IS_PYTHON3:
- return s
- elif s is None:
+ if s is None:
return None
elif PyBytes_Check(s):
return s
elif PyUnicode_Check(s):
return s.encode(encoding)
else:
- raise TypeError(u"Argument must be string, bytes or unicode.")
+ raise TypeError("Argument must be string, bytes or unicode.")
+
cdef charptr_to_str(const char* s, encoding="ascii"):
if s == NULL:
@@ -129,6 +127,7 @@ cdef charptr_to_str(const char* s, encoding="ascii"):
else:
return s.decode(encoding)
+
cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
if s == NULL:
return None
@@ -137,12 +136,14 @@ cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
else:
return s[:n].decode(encoding)
+
cdef bytes charptr_to_bytes(const char* s, encoding="ascii"):
if s == NULL:
return None
else:
return s
+
cdef force_str(object s, encoding="ascii"):
"""Return s converted to str type of current Python
(bytes in Py2, unicode in Py3)"""
@@ -156,6 +157,7 @@ cdef force_str(object s, encoding="ascii"):
# assume unicode
return s
+
cpdef parse_region(reference=None,
start=None,
end=None,
@@ -283,7 +285,9 @@ def _pysam_dispatch(collection,
if method not in ("index", "roh", "stats"):
stdout_option = "-o {}"
elif method in MAP_STDOUT_OPTIONS[collection]:
- stdout_option = MAP_STDOUT_OPTIONS[collection][method]
+ # special case - samtools view -c outputs on stdout
+ if not(method == "view" and "-c" in args):
+ stdout_option = MAP_STDOUT_OPTIONS[collection][method]
if stdout_option is not None:
os.close(stdout_h)
diff --git a/pysam/cvcf.pxd b/pysam/libcvcf.pxd
similarity index 100%
rename from pysam/cvcf.pxd
rename to pysam/libcvcf.pxd
diff --git a/pysam/cvcf.pyx b/pysam/libcvcf.pyx
similarity index 99%
rename from pysam/cvcf.pyx
rename to pysam/libcvcf.pyx
index 5e2fda2..956f8a5 100644
--- a/pysam/cvcf.pyx
+++ b/pysam/libcvcf.pyx
@@ -54,10 +54,10 @@ from libc.stdlib cimport atoi
from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-cimport pysam.ctabix as ctabix
-cimport pysam.ctabixproxies as ctabixproxies
+cimport pysam.libctabix as libctabix
+cimport pysam.libctabixproxies as libctabixproxies
-from pysam.cutils cimport force_str
+from pysam.libcutils cimport force_str
import pysam
@@ -101,7 +101,7 @@ FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue
#
###########################################################################################################
-cdef class VCFRecord( ctabixproxies.TupleProxy):
+cdef class VCFRecord(libctabixproxies.TupleProxy):
'''vcf record.
initialized from data and vcf meta
@@ -141,7 +141,7 @@ cdef class VCFRecord( ctabixproxies.TupleProxy):
nbytes does not include the terminal '\0'.
'''
- ctabixproxies.TupleProxy.update(self, buffer, nbytes)
+ libctabixproxies.TupleProxy.update(self, buffer, nbytes)
self.contig = self.fields[0]
# vcf counts from 1 - correct here
@@ -238,7 +238,7 @@ cdef class VCFRecord( ctabixproxies.TupleProxy):
return result
-cdef class asVCFRecord(ctabix.Parser):
+cdef class asVCFRecord(libctabix.Parser):
'''converts a :term:`tabix row` into a VCF record.'''
cdef vcffile
def __init__(self, vcffile):
diff --git a/pysam/utils.py b/pysam/utils.py
index c5bb539..5c045df 100644
--- a/pysam/utils.py
+++ b/pysam/utils.py
@@ -1,4 +1,4 @@
-from pysam.cutils import _pysam_dispatch
+from pysam.libcutils import _pysam_dispatch
class SamtoolsError(Exception):
diff --git a/pysam/version.py b/pysam/version.py
index 0a985de..facb3bb 100644
--- a/pysam/version.py
+++ b/pysam/version.py
@@ -1,7 +1,9 @@
# pysam versioning information
-__version__ = "0.9.1.4"
+__version__ = "0.10.0"
__samtools_version__ = "1.3.1"
-__htslib_version__ = "1.3.1"
+__bcftools_version__ = "1.3.1"
+
+__htslib_version__ = "1.3.2"
diff --git a/requirements.txt b/requirements.txt
index 687929a..6e8fc44 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-cython>=0.22
+cython>=0.24.1
diff --git a/run_tests_travis.sh b/run_tests_travis.sh
index 414043e..a229ff5 100755
--- a/run_tests_travis.sh
+++ b/run_tests_travis.sh
@@ -6,75 +6,36 @@ WORKDIR=`pwd`
#Install miniconda python
if [ $TRAVIS_OS_NAME == "osx" ]; then
- curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
- bash Miniconda3-latest-MacOSX-x86_64.sh -b
+ wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O Miniconda3.sh
else
- curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
- bash Miniconda3-latest-Linux-x86_64.sh -b
+ wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3.sh --no-check-certificate # Default OS versions are old and have SSL / CERT issues
fi
+bash Miniconda3.sh -b
+
# Create a new conda environment with the target python version
~/miniconda3/bin/conda install conda-build -y
-~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose
+~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose psutil pip
+
+# activate testenv environment
+source ~/miniconda3/bin/activate testenv
-# Add new conda environment to PATH
-export PATH=~/miniconda3/envs/testenv/bin/:$PATH
+conda config --add channels conda-forge
+conda config --add channels defaults
+conda config --add channels r
+conda config --add channels bioconda
-# Hack to force linking to anaconda libraries rather than system libraries
-#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/
-#export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/
+conda install -y samtools bcftools htslib
# Need to make C compiler and linker use the anaconda includes and libraries:
export PREFIX=~/miniconda3/
export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib"
export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
-# create a new folder to store external tools
-mkdir -p $WORKDIR/external-tools
-
-# install htslib
-cd $WORKDIR/external-tools
-curl -L https://github.com/samtools/htslib/releases/download/1.3.1/htslib-1.3.1.tar.bz2 > htslib-1.3.1.tar.bz2
-tar xjvf htslib-1.3.1.tar.bz2
-cd htslib-1.3.1
-make
-PATH=$PATH:$WORKDIR/external-tools/htslib-1.3.1
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$WORKDIR/external-tools/htslib-1.3.1
-
-# install samtools, compile against htslib
-cd $WORKDIR/external-tools
-curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3.1/samtools-1.3.1.tar.bz2 > samtools-1.3.1.tar.bz2
-tar xjvf samtools-1.3.1.tar.bz2
-cd samtools-1.3.1
-./configure --with-htslib=../htslib-1.3.1
-make
-PATH=$PATH:$WORKDIR/external-tools/samtools-1.3.1
-
-echo "installed samtools"
samtools --version
-
-if [ $? != 0 ]; then
- exit 1
-fi
-
-# install bcftools
-cd $WORKDIR/external-tools
-curl -L https://github.com/samtools/bcftools/releases/download/1.3.1/bcftools-1.3.1.tar.bz2 > bcftools-1.3.1.tar.bz2
-tar xjf bcftools-1.3.1.tar.bz2
-cd bcftools-1.3.1
-./configure --with-htslib=../htslib-1.3.1
-make
-PATH=$PATH:$WORKDIR/external-tools/bcftools-1.3.1
-
-echo "installed bcftools"
+htslib --version
bcftools --version
-if [ $? != 0 ]; then
- exit 1
-fi
-
-popd
-
# Try building conda recipe first
~/miniconda3/bin/conda-build ci/conda-recipe/ --python=$CONDA_PY
@@ -105,9 +66,10 @@ if [ $? != 0 ]; then
exit 1
fi
-# build source tar-ball
+# build source tar-ball. Make sure to build so that .pyx files
+# are cythonized.
cd ..
-python setup.py sdist
+python setup.py build sdist
if [ $? != 0 ]; then
exit 1
@@ -123,7 +85,7 @@ fi
# test pip installation from tar-ball with cython
echo "pip installing with cython"
-pip install --verbose --no-deps --no-use-wheel dist/pysam-*.tar.gz
+pip install --verbose --no-deps --no-binary=:all: dist/pysam-*.tar.gz
if [ $? != 0 ]; then
exit 1
@@ -131,10 +93,10 @@ fi
# attempt pip installation without cython
echo "pip installing without cython"
-~/miniconda3/bin/conda remove cython
+~/miniconda3/bin/conda remove -y cython
~/miniconda3/bin/conda list
-echo "pthyon is" `which python`
-pip install --verbose --no-deps --no-use-wheel --force-reinstall --upgrade dist/pysam-*.tar.gz
+echo "python is" `which python`
+pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz
if [ $? != 0 ]; then
exit 1
@@ -144,7 +106,7 @@ fi
# command line options
echo "pip installing without cython and no configure options"
export HTSLIB_CONFIGURE_OPTIONS=""
-pip install --verbose --no-deps --no-use-wheel --force-reinstall --upgrade dist/pysam-*.tar.gz
+pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz
if [ $? != 0 ]; then
exit 1
diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c
index 3d5ffa5..8c883b0 100644
--- a/samtools/sam_view.c.pysam.c
+++ b/samtools/sam_view.c.pysam.c
@@ -489,9 +489,9 @@ int main_samview(int argc, char *argv[])
}
view_end:
- if (is_count && ret == 0)
+ if (is_count && ret == 0)
fprintf(pysam_stdout, "%" PRId64 "\n", count);
-
+
// close files, free and return
if (in) check_sam_close("view", in, fn_in, "standard input", &ret);
if (out) check_sam_close("view", out, fn_out, "standard output", &ret);
diff --git a/setup.py b/setup.py
index e301f11..6d52617 100644
--- a/setup.py
+++ b/setup.py
@@ -60,13 +60,18 @@ def run_configure(option):
def run_make_print_config():
- stdout = subprocess.check_output(["make", "print-config"])
+ stdout = subprocess.check_output(["make", "-s", "print-config"])
if IS_PYTHON3:
stdout = stdout.decode("ascii")
- result = dict([[x.strip() for x in line.split("=")]
- for line in stdout.splitlines()])
- return result
+ make_print_config = {}
+ for line in stdout.splitlines():
+ if "=" in line:
+ row = line.split("=")
+ if len(row) == 2:
+ make_print_config.update(
+ {row[0].strip(): row[1].strip()})
+ return make_print_config
def configure_library(library_dir, env_options=None, options=[]):
@@ -139,16 +144,12 @@ try:
import cython
HAVE_CYTHON = True
print ("# pysam: cython is available - using cythonize if necessary")
- source_pattern = "pysam/c%s.pyx"
- if HTSLIB_MODE != "external":
- HTSLIB_MODE = "shared"
+ source_pattern = "pysam/libc%s.pyx"
except ImportError:
HAVE_CYTHON = False
print ("# pysam: no cython available - using pre-compiled C")
# no Cython available - use existing C code
- source_pattern = "pysam/c%s.c"
- if HTSLIB_MODE != "external":
- HTSLIB_MODE = "shared"
+ source_pattern = "pysam/libc%s.c"
# collect pysam version
sys.path.insert(0, "pysam")
@@ -230,7 +231,6 @@ if HTSLIB_LIBRARY_DIR:
chtslib_sources = []
htslib_library_dirs = [HTSLIB_LIBRARY_DIR]
htslib_include_dirs = [HTSLIB_INCLUDE_DIR]
- internal_htslib_libraries = []
external_htslib_libraries = ['z', 'hts']
elif HTSLIB_MODE == 'separate':
@@ -240,7 +240,6 @@ elif HTSLIB_MODE == 'separate':
shared_htslib_sources = htslib_sources
htslib_library_dirs = []
htslib_include_dirs = ['htslib']
- internal_htslib_libraries = []
elif HTSLIB_MODE == 'shared':
# link each pysam component against the same
@@ -249,30 +248,15 @@ elif HTSLIB_MODE == 'shared':
htslib_library_dirs = [
'pysam',
".",
- os.path.join("build",
- distutils_dir_name("lib"),
- "pysam")]
+ os.path.join("build", distutils_dir_name("lib"), "pysam")]
htslib_include_dirs = ['htslib']
- if IS_PYTHON3:
- if sys.version_info.minor >= 5:
- internal_htslib_libraries = ["chtslib.{}".format(
- sysconfig.get_config_var('SOABI'))]
- else:
- if sys.platform == "darwin":
- # On OSX, python 3.3 and 3.4 Libs have no platform tags.
- internal_htslib_libraries = ["chtslib"]
- else:
- internal_htslib_libraries = ["chtslib.{}{}".format(
- sys.implementation.cache_tag,
- sys.abiflags)]
- else:
- internal_htslib_libraries = ["chtslib"]
-
else:
raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE)
+internal_htslib_libraries = [os.path.splitext("chtslib{}".format(sysconfig.get_config_var('SO')))[0]]
+
# build config.py
with open(os.path.join("pysam", "config.py"), "w") as outf:
outf.write('HTSLIB = "{}"\n'.format(HTSLIB_SOURCE))
@@ -382,7 +366,7 @@ chtslib = Extension(
# Selected ones have been copied into samfile_utils.c
# Needs to be devolved somehow.
csamfile = Extension(
- "pysam.csamfile",
+ "pysam.libcsamfile",
[source_pattern % "samfile",
"pysam/htslib_util.c",
"pysam/samfile_util.c",
@@ -402,7 +386,7 @@ csamfile = Extension(
# Selected ones have been copied into samfile_utils.c
# Needs to be devolved somehow.
calignmentfile = Extension(
- "pysam.calignmentfile",
+ "pysam.libcalignmentfile",
[source_pattern % "alignmentfile",
"pysam/htslib_util.c",
"pysam/samfile_util.c",
@@ -422,7 +406,7 @@ calignmentfile = Extension(
# Selected ones have been copied into samfile_utils.c
# Needs to be devolved somehow.
calignedsegment = Extension(
- "pysam.calignedsegment",
+ "pysam.libcalignedsegment",
[source_pattern % "alignedsegment",
"pysam/htslib_util.c",
"pysam/samfile_util.c",
@@ -438,7 +422,7 @@ calignedsegment = Extension(
)
ctabix = Extension(
- "pysam.ctabix",
+ "pysam.libctabix",
[source_pattern % "tabix",
"pysam/tabix_util.c"] +
htslib_sources +
@@ -452,7 +436,7 @@ ctabix = Extension(
)
cutils = Extension(
- "pysam.cutils",
+ "pysam.libcutils",
[source_pattern % "utils", "pysam/pysam_util.c"] +
glob.glob(os.path.join("samtools", "*.pysam.c")) +
# glob.glob(os.path.join("samtools", "*", "*.pysam.c")) +
@@ -470,7 +454,7 @@ cutils = Extension(
)
cfaidx = Extension(
- "pysam.cfaidx",
+ "pysam.libcfaidx",
[source_pattern % "faidx"] +
htslib_sources +
os_c_files,
@@ -483,7 +467,7 @@ cfaidx = Extension(
)
ctabixproxies = Extension(
- "pysam.ctabixproxies",
+ "pysam.libctabixproxies",
[source_pattern % "tabixproxies"] +
os_c_files,
library_dirs=htslib_library_dirs,
@@ -495,7 +479,7 @@ ctabixproxies = Extension(
)
cvcf = Extension(
- "pysam.cvcf",
+ "pysam.libcvcf",
[source_pattern % "vcf"] +
os_c_files,
library_dirs=htslib_library_dirs,
@@ -507,7 +491,7 @@ cvcf = Extension(
)
cbcf = Extension(
- "pysam.cbcf",
+ "pysam.libcbcf",
[source_pattern % "bcf"] +
htslib_sources +
os_c_files,
@@ -519,6 +503,19 @@ cbcf = Extension(
define_macros=define_macros
)
+cbgzf = Extension(
+ "pysam.libcbgzf",
+ [source_pattern % "bgzf"] +
+ htslib_sources +
+ os_c_files,
+ library_dirs=htslib_library_dirs,
+ include_dirs=["htslib", "."] + include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
+ language="c",
+ extra_compile_args=extra_compile_args,
+ define_macros=define_macros
+)
+
metadata = {
'name': "pysam",
'version': version,
@@ -539,6 +536,7 @@ metadata = {
ctabixproxies,
cvcf,
cbcf,
+ cbgzf,
cfaidx,
cutils],
'cmdclass': cmdclass,
diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py
index 94b2eb3..b0a3466 100644
--- a/tests/AlignedSegment_test.py
+++ b/tests/AlignedSegment_test.py
@@ -46,19 +46,19 @@ class TestAlignedSegment(ReadTest):
self.assertEqual(a.query_sequence, None)
self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None)
self.assertEqual(a.flag, 0)
- self.assertEqual(a.reference_id, 0)
+ self.assertEqual(a.reference_id, -1)
self.assertEqual(a.mapping_quality, 0)
self.assertEqual(a.cigartuples, None)
self.assertEqual(a.tags, [])
- self.assertEqual(a.next_reference_id, 0)
- self.assertEqual(a.next_reference_start, 0)
+ self.assertEqual(a.next_reference_id, -1)
+ self.assertEqual(a.next_reference_start, -1)
self.assertEqual(a.template_length, 0)
def testStrOfEmptyRead(self):
a = pysam.AlignedSegment()
s = str(a)
self.assertEqual(
- "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]",
+ "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]",
s)
def testSettingTagInEmptyRead(self):
@@ -231,6 +231,24 @@ class TestAlignedSegment(ReadTest):
self.assertEqual(a.get_blocks(),
[(20, 30), (31, 40), (40, 60)])
+ def test_infer_query_length(self):
+ '''Test infer_query_length on M|=|X|I|D|H|S cigar ops'''
+ a = self.buildRead()
+ a.cigarstring = '15M'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '15='
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '15X'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '5M5I5M'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '5M5D5M'
+ self.assertEqual(a.infer_query_length(), 10)
+ a.cigarstring = '5H10M'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '5S10M'
+ self.assertEqual(a.infer_query_length(), 15)
+
def test_get_aligned_pairs_soft_clipping(self):
a = self.buildRead()
a.cigartuples = ((4, 2), (0, 35), (4, 3))
@@ -375,6 +393,18 @@ class TestAlignedSegment(ReadTest):
a.cigarstring = "1S20M1S"
self.assertEqual(a.query_alignment_length, 20)
+ def test_query_length_is_limited(self):
+
+ a = self.buildRead()
+ a.query_name = "A" * 1
+ a.query_name = "A" * 254
+ self.assertRaises(
+ ValueError,
+ setattr,
+ a,
+ "query_name",
+ "A" * 255)
+
class TestCigarStats(ReadTest):
@@ -754,5 +784,6 @@ class TestAsString(unittest.TestCase):
for s, p in zip(reference, pysamf):
self.assertEqual(s, p.tostring(pysamf))
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py
index c042f4f..18fb05b 100644
--- a/tests/AlignmentFile_test.py
+++ b/tests/AlignmentFile_test.py
@@ -29,7 +29,8 @@ from TestUtils import checkBinaryEqual, checkURL, \
get_temp_filename
-DATADIR = "pysam_data"
+DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
+ "pysam_data"))
##################################################
@@ -353,26 +354,53 @@ class BasicTestBAMFromFilename(BasicTestBAMFromFetch):
class BasicTestBAMFromFile(BasicTestBAMFromFetch):
def setUp(self):
- f = open(os.path.join(DATADIR, "ex3.bam"))
- self.samfile = pysam.AlignmentFile(
- f, "rb")
+ with open(os.path.join(DATADIR, "ex3.bam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f, "rb")
+ self.reads = [r for r in self.samfile]
+
+
+class BasicTestBAMFromFileNo(BasicTestBAMFromFetch):
+
+ def setUp(self):
+ with open(os.path.join(DATADIR, "ex3.bam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f.fileno(), "rb")
self.reads = [r for r in self.samfile]
class BasicTestSAMFromFile(BasicTestBAMFromFetch):
def setUp(self):
- f = open(os.path.join(DATADIR, "ex3.sam"))
- self.samfile = pysam.AlignmentFile(
- f, "r")
+ with open(os.path.join(DATADIR, "ex3.sam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f, "r")
+ self.reads = [r for r in self.samfile]
+
+
+class BasicTestSAMFromFileNo(BasicTestBAMFromFetch):
+
+ def setUp(self):
+ with open(os.path.join(DATADIR, "ex3.sam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f.fileno(), "r")
self.reads = [r for r in self.samfile]
class BasicTestCRAMFromFile(BasicTestCRAMFromFetch):
def setUp(self):
- f = open(os.path.join(DATADIR, "ex3.cram"))
- self.samfile = pysam.AlignmentFile(f, "rc")
+ with open(os.path.join(DATADIR, "ex3.cram")) as f:
+ self.samfile = pysam.AlignmentFile(f, "rc")
+ self.reads = [r for r in self.samfile]
+
+
+class BasicTestCRAMFromFileNo(BasicTestCRAMFromFetch):
+
+ def setUp(self):
+ with open(os.path.join(DATADIR, "ex3.cram")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f.fileno(), "rc")
self.reads = [r for r in self.samfile]
@@ -690,7 +718,7 @@ class TestIO(unittest.TestCase):
samfile = pysam.AlignmentFile(f, "rb")
f.close()
self.assertTrue(f.closed)
- # access to Samfile should still work
+ # access to Samfile still works
self.checkEcho("ex1.bam",
"ex1.bam",
"tmp_ex1.bam",
@@ -818,6 +846,15 @@ class TestIO(unittest.TestCase):
mode="rb")
self.assertEqual(len(list(samfile.fetch())), 3270)
+ def testBAMWithCSIIndex(self):
+ '''see issue 116'''
+ input_filename = os.path.join(DATADIR, "ex1_csi.bam")
+ samfile = pysam.AlignmentFile(input_filename,
+ "rb",
+ check_sq=False)
+ samfile.fetch('chr2')
+
+
class TestAutoDetect(unittest.TestCase):
@@ -1291,8 +1328,8 @@ class TestHeaderSAM(unittest.TestCase):
"""testing header manipulation"""
- header = {'SQ': [{'LN': 1575, 'SN': 'chr1'},
- {'LN': 1584, 'SN': 'chr2'}],
+ header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'},
+ {'LN': 1584, 'SN': 'chr2', 'AH': '*'}],
'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891',
'PU': 'SC_1_10', "CN": "name:with:colon"},
{'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891',
@@ -2343,6 +2380,46 @@ class TestPileupQueryPosition(unittest.TestCase):
last[r.alignment.query_name] = r.query_position
+class TestFindIntrons(unittest.TestCase):
+ samfilename = "pysam_data/ex_spliced.bam"
+
+ def setUp(self):
+ self.samfile = pysam.AlignmentFile(self.samfilename)
+
+ def tearDown(self):
+ self.samfile.close()
+
+ def test_total(self):
+ all_read_counts = self.samfile.count()
+ splice_sites = self.samfile.find_introns(self.samfile.fetch())
+ self.assertEqual(sum(splice_sites.values()), all_read_counts -1) # there is a single unspliced read in there
+
+ def test_first(self):
+ reads = list(self.samfile.fetch())[:10]
+ splice_sites = self.samfile.find_introns(reads)
+ starts = [14792+38 - 1]
+ stops = [14792+38 + 140 - 1]
+ self.assertEqual(len(splice_sites), 1)
+ self.assertTrue((starts[0], stops[0]) in splice_sites)
+ self.assertEqual(splice_sites[(starts[0], stops[0])], 9) # first one is the unspliced read
+
+ def test_all(self):
+ reads = list(self.samfile.fetch())
+ splice_sites = self.samfile.find_introns(reads)
+ should = collections.Counter({
+ (14829, 14969): 33,
+ (15038, 15795): 24,
+ (15947, 16606): 3,
+ (16765, 16857): 9,
+ (16765, 16875): 1,
+ (17055, 17232): 19,
+ (17055, 17605): 3,
+ (17055, 17914): 1,
+ (17368, 17605): 7,
+ })
+ self.assertEqual(should, splice_sites)
+
+
class TestLogging(unittest.TestCase):
'''test around bug issue 42,
@@ -2511,7 +2588,6 @@ class TestMappedUnmapped(unittest.TestCase):
inf.mapped)
-
class TestSamtoolsProxy(unittest.TestCase):
'''tests for sanity checking access to samtools functions.'''
@@ -2592,6 +2668,34 @@ class TestVerbosity(unittest.TestCase):
self.assertEqual(pysam.get_verbosity(), 3)
+class TestSanityCheckingBAM(unittest.TestCase):
+
+ mode = "wb"
+
+ def check_write(self, read):
+
+ fn = "tmp_test_sanity_check.bam"
+ names = ["chr1"]
+ lengths = [10000]
+ with pysam.AlignmentFile(
+ fn,
+ self.mode,
+ reference_names=names,
+ reference_lengths=lengths) as outf:
+ outf.write(read)
+
+ if os.path.exists(fn):
+ os.unlink(fn)
+
+ def test_empty_read_gives_value_error(self):
+ read = pysam.AlignedSegment()
+ self.check_write(read)
+
+# SAM writing fails, as query length is 0
+# class TestSanityCheckingSAM(TestSanityCheckingSAM):
+# mode = "w"
+
+
if __name__ == "__main__":
# build data files
print ("building data files")
diff --git a/tests/SamFile_test.py b/tests/SamFile_test.py
index 1fc88f3..ff13045 100644
--- a/tests/SamFile_test.py
+++ b/tests/SamFile_test.py
@@ -941,8 +941,8 @@ class TestIteratorColumn2(unittest.TestCase):
class TestHeaderSam(unittest.TestCase):
- header = {'SQ': [{'LN': 1575, 'SN': 'chr1'},
- {'LN': 1584, 'SN': 'chr2'}],
+ header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'},
+ {'LN': 1584, 'SN': 'chr2', 'AH': '*'}],
'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN": "name:with:colon"},
{'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', 'PU': 'SC_2_12', "CN": "name:with:colon"}],
'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}],
@@ -1231,19 +1231,19 @@ class TestAlignedRead(ReadTest):
self.assertEqual(a.seq, None)
self.assertEqual(a.qual, None)
self.assertEqual(a.flag, 0)
- self.assertEqual(a.rname, 0)
+ self.assertEqual(a.rname, -1)
self.assertEqual(a.mapq, 0)
self.assertEqual(a.cigar, [])
self.assertEqual(a.tags, [])
- self.assertEqual(a.mrnm, 0)
- self.assertEqual(a.mpos, 0)
+ self.assertEqual(a.mrnm, -1)
+ self.assertEqual(a.mpos, -1)
self.assertEqual(a.isize, 0)
def testStrOfEmptyRead(self):
a = pysam.AlignedRead()
s = str(a)
self.assertEqual(
- "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]",
+ "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]",
s)
def buildRead(self):
diff --git a/tests/StreamFiledescriptors_test.py b/tests/StreamFiledescriptors_test.py
new file mode 100644
index 0000000..ce59da7
--- /dev/null
+++ b/tests/StreamFiledescriptors_test.py
@@ -0,0 +1,82 @@
+import os
+import subprocess
+import threading
+import errno
+import unittest
+
+from pysam import AlignmentFile
+
+DATADIR = os.path.abspath(os.path.join(
+ os.path.dirname(__file__),
+ "pysam_data"))
+
+
+def alignmentfile_writer_thread(infile, outfile):
+ def _writer_thread(infile, outfile):
+ """read from infile and write to outfile"""
+ try:
+ i = 0
+ for record in infile:
+ outfile.write(record)
+ i += 1
+ except IOError as e:
+ if e.errno != errno.EPIPE:
+ pass
+ finally:
+ outfile.close()
+
+ writer = threading.Thread(target=_writer_thread, args=(infile, outfile))
+ writer.daemon = True
+ writer.start()
+ return writer
+
+
+class StreamTest(unittest.TestCase):
+
+ def stream_process(self, proc, in_stream, out_stream, writer):
+
+ with AlignmentFile(proc.stdout) as infile:
+ read = 0
+ for record in infile:
+ read += 1
+ return 0, read
+
+ def test_text_processing(self):
+
+ proc = subprocess.Popen('head -n200',
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ shell=True)
+
+ in_stream = AlignmentFile('pysam_data/ex1.bam')
+ out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header)
+ writer = alignmentfile_writer_thread(in_stream,
+ out_stream)
+
+ written, read = self.stream_process(proc,
+ in_stream,
+ out_stream,
+ writer)
+ self.assertEqual(read, 198)
+
+ def test_samtools_processing(self):
+
+ proc = subprocess.Popen('samtools view -b -f 4',
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ shell=True)
+
+ in_stream = AlignmentFile('pysam_data/ex1.bam')
+ out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header)
+ writer = alignmentfile_writer_thread(in_stream,
+ out_stream)
+
+ written, read = self.stream_process(proc,
+ in_stream,
+ out_stream,
+ writer)
+ self.assertEqual(read, 35)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py
index ef21245..aa82c66 100644
--- a/tests/VariantFile_test.py
+++ b/tests/VariantFile_test.py
@@ -1,8 +1,15 @@
import os
+import sys
import unittest
import pysam
import gzip
import subprocess
+
+try:
+ from pathlib import Path
+except ImportError:
+ Path = None
+
from TestUtils import get_temp_filename, check_lines_equal
DATADIR="cbcf_data"
@@ -75,6 +82,17 @@ class TestOpening(unittest.TestCase):
os.unlink("tmp_testEmptyFile.vcf")
+
+ if Path and sys.version_info >= (3,6):
+ def testEmptyFileVCFFromPath(self):
+ with open("tmp_testEmptyFile.vcf", "w"):
+ pass
+
+ self.assertRaises(ValueError, pysam.VariantFile,
+ Path("tmp_testEmptyFile.vcf"))
+
+ os.unlink("tmp_testEmptyFile.vcf")
+
def testEmptyFileVCFGZWithIndex(self):
with open("tmp_testEmptyFile.vcf", "w"):
pass
@@ -171,12 +189,12 @@ class TestHeader(unittest.TestCase):
# remove last header line starting with #CHROM
ref.pop()
ref = sorted(ref)
- comp = sorted([str(x) for x in v.header.records])
+ comp = sorted(str(x) for x in v.header.records)
self.assertEqual(len(ref), len(comp))
for x, y in zip(ref, comp):
- self.assertEqual(x[:-1], str(y))
+ self.assertEqual(x, y)
# These tests need to be separate and start from newly opened files. This
@@ -195,6 +213,13 @@ class TestParsing(unittest.TestCase):
chrom = [rec.chrom for rec in v]
self.assertEqual(chrom, ['M', '17', '20', '20', '20'])
+ if Path and sys.version_info >= (3,6):
+ def testChromFromPath(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(Path(fn))
+ chrom = [rec.chrom for rec in v]
+ self.assertEqual(chrom, ['M', '17', '20', '20', '20'])
+
def testPos(self):
fn = os.path.join(DATADIR, self.filename)
v = pysam.VariantFile(fn)
@@ -330,9 +355,22 @@ class TestConstructionVCFWithContigs(unittest.TestCase):
"""construct VariantFile from scratch."""
filename = "example_vcf42_withcontigs.vcf"
+ compression = 'NONE'
+ description = 'VCF version 4.2 variant calling text'
- def complete_check(self, fn_in, fn_out):
+ def testBase(self):
+ with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+ self.assertEqual(inf.category, 'VARIANTS')
+ self.assertEqual(inf.format, 'VCF')
+ self.assertEqual(inf.version, (4, 2))
+ self.assertEqual(inf.compression, self.compression)
+ self.assertEqual(inf.description, self.description)
+ self.assertTrue(inf.is_open)
+ self.assertEqual(inf.is_read, True)
+ self.assertEqual(inf.is_write, False)
+ def complete_check(self, fn_in, fn_out):
+ self.maxDiff = None
check_lines_equal(
self, fn_in, fn_out, sort=True,
filter_f=lambda x: x.startswith("##contig"))
@@ -349,14 +387,15 @@ class TestConstructionVCFWithContigs(unittest.TestCase):
for record in vcf_in.header.records:
header.add_record(record)
- fn = str("tmp_VariantFileTest_testConstructionWithRecords") + ".vcf"
- vcf_out = pysam.VariantFile(fn, "w", header=header)
+ for sample in vcf_in.header.samples:
+ header.add_sample(sample)
+
+ vcf_out = pysam.VariantFile(fn_out, "w", header=header)
for record in vcf_in:
- # currently segfaults here:
- # vcf_out.write(record)
- pass
- return
+ record.translate(header)
+ vcf_out.write(record)
+ vcf_in.close()
vcf_out.close()
self.complete_check(fn_in, fn_out)
@@ -370,6 +409,7 @@ class TestConstructionVCFWithContigs(unittest.TestCase):
for record in vcf_in:
vcf_out.write(record)
+ vcf_in.close()
vcf_out.close()
self.complete_check(fn_in, fn_out)
@@ -397,8 +437,8 @@ class TestConstructionVCFWithContigs(unittest.TestCase):
self.complete_check(fn_in, fn_out)
-# Currently segfaults for VCFs without contigs
-# class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs):
+
+#class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs):
# """construct VariantFile from scratch."""
# filename = "example_vcf40.vcf"
@@ -407,18 +447,33 @@ class TestConstructionVCFGZWithContigs(TestConstructionVCFWithContigs):
"""construct VariantFile from scratch."""
filename = "example_vcf42_withcontigs.vcf.gz"
+ compression = 'BGZF'
+ description = 'VCF version 4.2 BGZF-compressed variant calling data'
class TestConstructionVCFGZWithoutContigs(TestConstructionVCFWithContigs):
"""construct VariantFile from scratch."""
filename = "example_vcf42.vcf.gz"
+ compression = 'BGZF'
+ description = 'VCF version 4.2 BGZF-compressed variant calling data'
class TestSettingRecordValues(unittest.TestCase):
filename = "example_vcf40.vcf"
+ def testBase(self):
+ with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+ self.assertEqual(inf.category, 'VARIANTS')
+ self.assertEqual(inf.format, 'VCF')
+ self.assertEqual(inf.version, (4, 0))
+ self.assertEqual(inf.compression, 'NONE')
+ self.assertEqual(inf.description, 'VCF version 4.0 variant calling text')
+ self.assertTrue(inf.is_open)
+ self.assertEqual(inf.is_read, True)
+ self.assertEqual(inf.is_write, False)
+
def testSetQual(self):
with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
record = next(inf)
@@ -435,8 +490,7 @@ class TestSettingRecordValues(unittest.TestCase):
sample = record.samples["NA00001"]
print (sample["GT"])
self.assertEqual(sample["GT"], (0, 0))
-# Fails with TypeError
-# sample["GT"] = sample["GT"]
+ sample["GT"] = sample["GT"]
class TestSubsetting(unittest.TestCase):
diff --git a/tests/_compile_test.pyx b/tests/_compile_test.pyx
index db6b5b6..dfe7937 100644
--- a/tests/_compile_test.pyx
+++ b/tests/_compile_test.pyx
@@ -1,5 +1,5 @@
-from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
-from pysam.ctabix cimport Tabixfile
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+from pysam.libctabix cimport Tabixfile
cdef AlignmentFile samfile
cdef Tabixfile tabixfile
diff --git a/tests/_cython_flagstat.pyx b/tests/_cython_flagstat.pyx
index f0f03bb..8e376b0 100644
--- a/tests/_cython_flagstat.pyx
+++ b/tests/_cython_flagstat.pyx
@@ -1,6 +1,6 @@
-from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
-from pysam.calignmentfile cimport pysam_get_flag
-from pysam.calignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+from pysam.libcalignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED
+from pysam.libcalignedsegment cimport pysam_get_flag
def count(AlignmentFile samfile):
cdef int is_proper = 0
diff --git a/tests/cython_flagstat.py b/tests/cython_flagstat.py
deleted file mode 100644
index 851157a..0000000
--- a/tests/cython_flagstat.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import pysam
-
-import pyximport
-pyximport.install()
-import _cython_flagstat
-
-is_paired, is_proper = _cython_flagstat.count(
- pysam.AlignmentFile("ex1.bam", "rb"))
-
-print ("there are alignments of %i paired reads" % is_paired)
-print ("there are %i proper paired alignments" % is_proper)
diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile
index 89a4a0c..2ccedd2 100644
--- a/tests/pysam_data/Makefile
+++ b/tests/pysam_data/Makefile
@@ -18,7 +18,8 @@ all: ex1.pileup.gz \
empty.bam empty.bam.bai \
explicit_index.bam explicit_index.cram \
faidx_empty_seq.fq.gz \
- ex1.fa.gz ex1.fa.gz.fai
+ ex1.fa.gz ex1.fa.gz.fai \
+ ex1_csi.bam
# ex2.sam - as ex1.sam, but with header
ex2.sam.gz: ex1.bam ex1.bam.bai
@@ -44,6 +45,7 @@ uncompressed.bam: ex2.sam
ex1.fa.fai:ex1.fa
samtools faidx ex1.fa
+
ex1.bam:ex1.sam.gz ex1.fa.fai
samtools import ex1.fa.fai ex1.sam.gz ex1.bam
@@ -56,6 +58,10 @@ ex1.pileup.gz:ex1.bam ex1.fa
ex2_truncated.bam: ex2.bam
head -c 124000 ex2.bam > ex2_truncated.bam
+ex1_csi.bam: ex1.bam
+ cp ex1.bam ex1_csi.bam
+ samtools index -c ex1_csi.bam
+
empty.bam: ex2.sam
grep "^@" $< | samtools view -Sb - > $@
diff --git a/tests/pysam_data/ex3.sam b/tests/pysam_data/ex3.sam
index 495d4fe..7a09188 100644
--- a/tests/pysam_data/ex3.sam
+++ b/tests/pysam_data/ex3.sam
@@ -1,6 +1,6 @@
@HD VN:1.0
- at SQ SN:chr1 LN:1575
- at SQ SN:chr2 LN:1584
+ at SQ SN:chr1 LN:1575 AH:chr1:5000000-5010000
+ at SQ SN:chr2 LN:1584 AH:*
@RG ID:L1 PU:SC_1_10 LB:SC_1 SM:NA12891 CN:name:with:colon
@RG ID:L2 PU:SC_2_12 LB:SC_2 SM:NA12891 CN:name:with:colon
@PG ID:P1 VN:1.0
diff --git a/tests/pysam_data/ex_spliced.sam b/tests/pysam_data/ex_spliced.sam
new file mode 100644
index 0000000..ae8086a
--- /dev/null
+++ b/tests/pysam_data/ex_spliced.sam
@@ -0,0 +1,297 @@
+ at HD VN:1.4 SO:coordinate
+ at SQ SN:1 LN:248956422
+ at SQ SN:2 LN:242193529
+ at SQ SN:3 LN:198295559
+ at SQ SN:4 LN:190214555
+ at SQ SN:5 LN:181538259
+ at SQ SN:6 LN:170805979
+ at SQ SN:7 LN:159345973
+ at SQ SN:8 LN:145138636
+ at SQ SN:9 LN:138394717
+ at SQ SN:10 LN:133797422
+ at SQ SN:11 LN:135086622
+ at SQ SN:12 LN:133275309
+ at SQ SN:13 LN:114364328
+ at SQ SN:14 LN:107043718
+ at SQ SN:15 LN:101991189
+ at SQ SN:16 LN:90338345
+ at SQ SN:17 LN:83257441
+ at SQ SN:18 LN:80373285
+ at SQ SN:19 LN:58617616
+ at SQ SN:20 LN:64444167
+ at SQ SN:21 LN:46709983
+ at SQ SN:22 LN:50818468
+ at SQ SN:X LN:156040895
+ at SQ SN:Y LN:57227415
+ at SQ SN:MT LN:16569
+ at SQ SN:GL000008.2 LN:209709
+ at SQ SN:GL000009.2 LN:201709
+ at SQ SN:GL000194.1 LN:191469
+ at SQ SN:GL000195.1 LN:182896
+ at SQ SN:GL000205.2 LN:185591
+ at SQ SN:GL000208.1 LN:92689
+ at SQ SN:GL000213.1 LN:164239
+ at SQ SN:GL000214.1 LN:137718
+ at SQ SN:GL000216.2 LN:176608
+ at SQ SN:GL000218.1 LN:161147
+ at SQ SN:GL000219.1 LN:179198
+ at SQ SN:GL000220.1 LN:161802
+ at SQ SN:GL000221.1 LN:155397
+ at SQ SN:GL000224.1 LN:179693
+ at SQ SN:GL000225.1 LN:211173
+ at SQ SN:GL000226.1 LN:15008
+ at SQ SN:KI270302.1 LN:2274
+ at SQ SN:KI270303.1 LN:1942
+ at SQ SN:KI270304.1 LN:2165
+ at SQ SN:KI270305.1 LN:1472
+ at SQ SN:KI270310.1 LN:1201
+ at SQ SN:KI270311.1 LN:12399
+ at SQ SN:KI270312.1 LN:998
+ at SQ SN:KI270315.1 LN:2276
+ at SQ SN:KI270316.1 LN:1444
+ at SQ SN:KI270317.1 LN:37690
+ at SQ SN:KI270320.1 LN:4416
+ at SQ SN:KI270322.1 LN:21476
+ at SQ SN:KI270329.1 LN:1040
+ at SQ SN:KI270330.1 LN:1652
+ at SQ SN:KI270333.1 LN:2699
+ at SQ SN:KI270334.1 LN:1368
+ at SQ SN:KI270335.1 LN:1048
+ at SQ SN:KI270336.1 LN:1026
+ at SQ SN:KI270337.1 LN:1121
+ at SQ SN:KI270338.1 LN:1428
+ at SQ SN:KI270340.1 LN:1428
+ at SQ SN:KI270362.1 LN:3530
+ at SQ SN:KI270363.1 LN:1803
+ at SQ SN:KI270364.1 LN:2855
+ at SQ SN:KI270366.1 LN:8320
+ at SQ SN:KI270371.1 LN:2805
+ at SQ SN:KI270372.1 LN:1650
+ at SQ SN:KI270373.1 LN:1451
+ at SQ SN:KI270374.1 LN:2656
+ at SQ SN:KI270375.1 LN:2378
+ at SQ SN:KI270376.1 LN:1136
+ at SQ SN:KI270378.1 LN:1048
+ at SQ SN:KI270379.1 LN:1045
+ at SQ SN:KI270381.1 LN:1930
+ at SQ SN:KI270382.1 LN:4215
+ at SQ SN:KI270383.1 LN:1750
+ at SQ SN:KI270384.1 LN:1658
+ at SQ SN:KI270385.1 LN:990
+ at SQ SN:KI270386.1 LN:1788
+ at SQ SN:KI270387.1 LN:1537
+ at SQ SN:KI270388.1 LN:1216
+ at SQ SN:KI270389.1 LN:1298
+ at SQ SN:KI270390.1 LN:2387
+ at SQ SN:KI270391.1 LN:1484
+ at SQ SN:KI270392.1 LN:971
+ at SQ SN:KI270393.1 LN:1308
+ at SQ SN:KI270394.1 LN:970
+ at SQ SN:KI270395.1 LN:1143
+ at SQ SN:KI270396.1 LN:1880
+ at SQ SN:KI270411.1 LN:2646
+ at SQ SN:KI270412.1 LN:1179
+ at SQ SN:KI270414.1 LN:2489
+ at SQ SN:KI270417.1 LN:2043
+ at SQ SN:KI270418.1 LN:2145
+ at SQ SN:KI270419.1 LN:1029
+ at SQ SN:KI270420.1 LN:2321
+ at SQ SN:KI270422.1 LN:1445
+ at SQ SN:KI270423.1 LN:981
+ at SQ SN:KI270424.1 LN:2140
+ at SQ SN:KI270425.1 LN:1884
+ at SQ SN:KI270429.1 LN:1361
+ at SQ SN:KI270435.1 LN:92983
+ at SQ SN:KI270438.1 LN:112505
+ at SQ SN:KI270442.1 LN:392061
+ at SQ SN:KI270448.1 LN:7992
+ at SQ SN:KI270465.1 LN:1774
+ at SQ SN:KI270466.1 LN:1233
+ at SQ SN:KI270467.1 LN:3920
+ at SQ SN:KI270468.1 LN:4055
+ at SQ SN:KI270507.1 LN:5353
+ at SQ SN:KI270508.1 LN:1951
+ at SQ SN:KI270509.1 LN:2318
+ at SQ SN:KI270510.1 LN:2415
+ at SQ SN:KI270511.1 LN:8127
+ at SQ SN:KI270512.1 LN:22689
+ at SQ SN:KI270515.1 LN:6361
+ at SQ SN:KI270516.1 LN:1300
+ at SQ SN:KI270517.1 LN:3253
+ at SQ SN:KI270518.1 LN:2186
+ at SQ SN:KI270519.1 LN:138126
+ at SQ SN:KI270521.1 LN:7642
+ at SQ SN:KI270522.1 LN:5674
+ at SQ SN:KI270528.1 LN:2983
+ at SQ SN:KI270529.1 LN:1899
+ at SQ SN:KI270530.1 LN:2168
+ at SQ SN:KI270538.1 LN:91309
+ at SQ SN:KI270539.1 LN:993
+ at SQ SN:KI270544.1 LN:1202
+ at SQ SN:KI270548.1 LN:1599
+ at SQ SN:KI270579.1 LN:31033
+ at SQ SN:KI270580.1 LN:1553
+ at SQ SN:KI270581.1 LN:7046
+ at SQ SN:KI270582.1 LN:6504
+ at SQ SN:KI270583.1 LN:1400
+ at SQ SN:KI270584.1 LN:4513
+ at SQ SN:KI270587.1 LN:2969
+ at SQ SN:KI270588.1 LN:6158
+ at SQ SN:KI270589.1 LN:44474
+ at SQ SN:KI270590.1 LN:4685
+ at SQ SN:KI270591.1 LN:5796
+ at SQ SN:KI270593.1 LN:3041
+ at SQ SN:KI270706.1 LN:175055
+ at SQ SN:KI270707.1 LN:32032
+ at SQ SN:KI270708.1 LN:127682
+ at SQ SN:KI270709.1 LN:66860
+ at SQ SN:KI270710.1 LN:40176
+ at SQ SN:KI270711.1 LN:42210
+ at SQ SN:KI270712.1 LN:176043
+ at SQ SN:KI270713.1 LN:40745
+ at SQ SN:KI270714.1 LN:41717
+ at SQ SN:KI270715.1 LN:161471
+ at SQ SN:KI270716.1 LN:153799
+ at SQ SN:KI270717.1 LN:40062
+ at SQ SN:KI270718.1 LN:38054
+ at SQ SN:KI270719.1 LN:176845
+ at SQ SN:KI270720.1 LN:39050
+ at SQ SN:KI270721.1 LN:100316
+ at SQ SN:KI270722.1 LN:194050
+ at SQ SN:KI270723.1 LN:38115
+ at SQ SN:KI270724.1 LN:39555
+ at SQ SN:KI270725.1 LN:172810
+ at SQ SN:KI270726.1 LN:43739
+ at SQ SN:KI270727.1 LN:448248
+ at SQ SN:KI270728.1 LN:1872759
+ at SQ SN:KI270729.1 LN:280839
+ at SQ SN:KI270730.1 LN:112551
+ at SQ SN:KI270731.1 LN:150754
+ at SQ SN:KI270732.1 LN:41543
+ at SQ SN:KI270733.1 LN:179772
+ at SQ SN:KI270734.1 LN:165050
+ at SQ SN:KI270735.1 LN:42811
+ at SQ SN:KI270736.1 LN:181920
+ at SQ SN:KI270737.1 LN:103838
+ at SQ SN:KI270738.1 LN:99375
+ at SQ SN:KI270739.1 LN:73985
+ at SQ SN:KI270740.1 LN:37240
+ at SQ SN:KI270741.1 LN:157432
+ at SQ SN:KI270742.1 LN:186739
+ at SQ SN:KI270743.1 LN:210658
+ at SQ SN:KI270744.1 LN:168472
+ at SQ SN:KI270745.1 LN:41891
+ at SQ SN:KI270746.1 LN:66486
+ at SQ SN:KI270747.1 LN:198735
+ at SQ SN:KI270748.1 LN:93321
+ at SQ SN:KI270749.1 LN:158759
+ at SQ SN:KI270750.1 LN:148850
+ at SQ SN:KI270751.1 LN:150742
+ at SQ SN:KI270752.1 LN:27745
+ at SQ SN:KI270753.1 LN:62944
+ at SQ SN:KI270754.1 LN:40191
+ at SQ SN:KI270755.1 LN:36723
+ at SQ SN:KI270756.1 LN:79590
+ at SQ SN:KI270757.1 LN:71251
+ at PG ID:STAR PN:STAR VN:STAR_2.4.1a
+HWI-C00113:131:HMHYWADXX:1:2202:17748:47494 272 1 14792 0 51M * 0 0 GGGCCTCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHFHIIJJIJAFHJJJJJGJIIHGIJGGIJJIIJIIJJJG NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2202:17748:47494 272 1 14792 0 38M140N13M * 0 0 GGGCCTCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHFHIIJJIJAFHJJJJJGJIIHGIJGGIJJIIJIIJJJG NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1214:7658:35836 272 1 14792 0 38M140N13M * 0 0 GGGCCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHJJJJJJJJCGHIJJIJJJJJJIJJGIJJIJIJIJJJJI NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2114:4116:44566 272 1 14794 0 36M140N15M * 0 0 GCCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGA <@@DDDDDDFHCFHEFGBE+2AFH at GIEGF=GGHII9F<GHHIIA at 6=48; NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1114:13704:81420 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC @@@DDDDFHBFHHGIGIE3CFHIIIIII<E at FHIGIIC?BFDHDHGIIIII NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2115:10483:10806 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC CCCFFFFFHHHHHJJJIIHHJIJJJJJJHHHIHGIIJJJHJIJJIJJJJJJ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2214:18560:59872 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC =;?BA at DAFFFFF?EAF;A?9CH?CB9E9?D9FGEGGCGGEHIDBE at FFH; NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1115:2028:49488 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC ???+4 at B?FHHFHFGIBEF9BDHCB??CEHGG*1C<FEHAF?(?(@@=B8@ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1115:2949:63319 272 1 14795 0 1S35M140N15M * 0 0 ACCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGA @@CDFDFFHHHDHGIIIEEFGIJJJGIJIIGCGIJJJJJJIGIJIJJJHGA NH:i:6 HI:i:3 AS:i:46 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1209:18680:84812 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC CC at FFFFFHHFHHJJJHHEHIIJJJJJIIEGFIJJJIIIIJJIJJJJJIII NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2205:10731:40239 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC @@<=A at DDFB:?FG<F;:3CCBEEBFGIIA?GIAD>B?BFF<BDF<8B8FF NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2207:18860:77945 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC C at CFFFFFHHHHHJJJIJGHIIJJJJJJJJJJJJIJJJJJIJJIJJJJIJJ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1102:4544:68832 272 1 14796 0 34M140N17M * 0 0 CCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACC ???DDDDDBDDD:CE:C2<CFBDDECCEDC>DEE??BD?D at DADD<CC=8B NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1203:5829:91963 272 1 14796 0 34M140N17M * 0 0 CCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACC CCCFFFFFHHHHHJJIJIHJIIJGIIJIJJJJJIJJIJGGIJJJGIJJHJI NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2110:3018:17806 272 1 14796 0 34M140N17M * 0 0 CCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACC CCCFFFFFHHHFHIJJIFGIJIHIJIIDHIIJGGIIJIJJJJJJIJJIIIJ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1111:17873:10434 272 1 14797 0 33M140N18M * 0 0 CCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCA ???DDDADDDDD8:2<2<FFFEIICEI;E@>DDBDDD@<?0@@D9=<.BBB NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1109:10709:93463 272 1 14801 0 29M140N22M * 0 0 ACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTT CC at DFFFFHHGH<CGDFHIIGIDAEGDGBBGFH at GEH:FHGGHIEFFDHII NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2214:12148:62454 272 1 14801 0 29M140N22M * 0 0 ACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTT ?@?DDDDD:FF=CFGGIGGBFGA;3<1:EEG>FGHFHFHIHGI@?DGC at CF NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1105:1515:82248 272 1 14802 0 28M140N23M * 0 0 CCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTG ??:ADBDDDDD:A<+C?AFDB at E?F4<*?:?1:??):??0009??9?(8BC NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1110:16355:5537 272 1 14802 0 28M140N23M * 0 0 CCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTG @CCFFFFFHH?ADHGIJIJJJJJIIEHIJJJJJIJIGIIJJIJJIIJIJJJ NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1102:17802:20689 272 1 14805 0 25M140N26M * 0 0 GCTCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG CCCFFFFFHHHHHJJJJJJIJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJI NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1104:7670:95815 272 1 14805 0 25M140N26M * 0 0 GCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @@@DBDDDHHBFDBFGEBBGHG at HIBHIDHBGGGEFBDDDFDGBBBGCHHI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1110:11368:101298 272 1 14805 0 25M140N26M * 0 0 GCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG BCCFFFFFCFHHHJJJJJIJJJJJJJJJJJJJGJJJJJJJJJJJJJJJIJJ NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1115:2363:85646 272 1 14805 0 25M140N26M * 0 0 GCTCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @C at FFFB?CFDFHDHGIIIIEGIIIIEDGIIIIIIIIIGGIIGIIGCGHIH NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:2213:6044:80821 272 1 14805 0 25M140N26M * 0 0 GCTCCGGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @@@FFFFFFFBFDGIIJIJGGFHIIIJIIJGIIEHI<FEGIIFEGIHIHGE NH:i:7 HI:i:4 AS:i:45 nM:i:2
+HWI-C00113:131:HMHYWADXX:1:1105:6336:76198 272 1 14807 0 23M140N28M * 0 0 CCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAA CCCFFFFFHHGHHGGHIIJEHIJIEGIJIJGIIIJICBFFIGAHFHHHJBH NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1108:3508:3794 272 1 14807 0 1S23M140N27M * 0 0 GCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGA CCCFFFFDHHHHHJJJJJJJJJJJJJJJJJJJJJJJJJIJJJJJJJJJIJJ NH:i:7 HI:i:4 AS:i:48 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2209:11671:4960 272 1 14811 0 19M140N32M * 0 0 GGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGA BCBFFFFFHHHHHJJGIIGIJJJJJJJJJJJJIIJIJJJJJJHIJIIJIJI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2105:5117:87572 272 1 14812 0 18M140N33M * 0 0 GTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGAT C at CFFFFFFFFHGFEFHIJJJJGIBHIJJJGGCHIEEGIJJFDGGGIGIGI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2202:9099:82513 272 1 14812 0 18M140N33M * 0 0 GTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGAT 8?;DDDDDDFB?A3:<EE<<CGA+<F:F1?D*:*1:))???99??<FB??B NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2214:5571:19703 272 1 14812 0 18M140N33M * 0 0 GTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGAT =:?7ADFDHHHDHHGIEF?CFFCBFEG at G>CHIGEGFG?FGHGA>9B8BF@ NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1215:4185:31561 272 1 14815 0 15M140N36M * 0 0 CTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCG CCCFFFFFHHHHGJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJIJJJH NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2108:1506:70629 272 1 14816 0 14M140N37M * 0 0 TTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCGA ?@@;BDDD=DFFDDDFGGFCA?)<CEG at C??C?FDFFB<FGIFDFFDFC;; NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1113:4051:71948 272 1 14818 0 12M140N39M * 0 0 TCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCGACA @@@FFDFFF:CDCFGGDHHGEFHIJIJIIGIGHIJDBBDHGI at 9BFGIEHI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1101:6114:49389 272 1 15001 0 38M757N13M * 0 0 ATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC CCCFFFFFHHHHHHIJJJJJJJJIJJJJJJIJJJJJJJJJJJJJGJJJJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1103:12497:23506 272 1 15001 0 38M757N13M * 0 0 ATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC CCCFFFFFHHHHGJIJJJJJJJJJJJJJJJJJJJJJIJJIJJJJJIJIJJI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2214:19931:4971 272 1 15002 0 37M757N14M * 0 0 TCCTACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCC ?;=+4ADDBBBDAFGGI>1 at F?F+AAEEEB<GAG;?DGFE>FFIIF at DE4= NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1108:6828:32713 272 1 15003 0 36M757N15M * 0 0 CCGGCATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT ?@@ADDDDD?CDD:CFB@:@G at ABGFGFGFBFEAFEEEFCFCF@F=)8=@> NH:i:8 HI:i:2 AS:i:45 nM:i:2
+HWI-C00113:131:HMHYWADXX:1:1111:7491:39504 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFDFFFHHHFIGEGHIGIGGDGIJFHEHGGIJJJIJIJJJJJIIIIGI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1212:16079:85811 272 1 15003 0 36M757N15M * 0 0 CCGGCATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @CCFFFFFHGHHHJJJJJJJIIJJJJJIJIJJHIIJJJJIJJJJJJJIJJJ NH:i:8 HI:i:2 AS:i:45 nM:i:2
+HWI-C00113:131:HMHYWADXX:1:2101:7167:50357 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@DD?DDFHHHD@?<AGHBEHFGAGIFHEH3??BFGBD at GGCHGGGCHI; NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2201:9548:48040 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHHJJJJJJJJJJJJIJHIIIGGIIJJJJHIJJJJIJIJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2201:14017:74222 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT =@?=DDDDDBDCFHB at CG?EF<BC>CG?FHGIIIIG@??BGHIE;8 at B<FB NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2204:7589:97905 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHDHIGGGJIJJIJJJJIJJJJIGIJIJJIJJJJJIJJIJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2212:18929:92726 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@DDDDDFFFF:AE<AFGFEHFFAF8:1:@8:DBBD9BB?/BDF<CDB<F NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2215:2615:12154 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFGHHHHJJJJJJJJIJJJJJJJJJJJJJJJJJIJJJJJJJJJJI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1106:7741:42827 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHHJJJJJJJJJIJJJIIJJJJIJJJJJJJJJJJIJGHIHH NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1201:8380:74978 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHFHHHIJJJJJJJJJJJJJFHIIGJJJJJJJIGGIIJEEDHHI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1205:11268:38021 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT 1:?7DDD?ACDFBGHEAE at FGB@;@@A at C@0?F9FBFCF at 48*9==3=CCF NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1208:17413:76793 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT C@@FFFFFGGDDDGHIGGIH<FHGEHB8CEIIJIIFG?FFHFHIJII>FEG NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1211:4828:84953 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @BB?DFFFHHHHHIJIJJJJJJJJJJJHIJJIIJJJJJJIIIJJJJJJIJI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2107:20905:80208 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @CCFFFFFHHHFBHIIEIIDIHGGGGG at GGHCFGHIIJIGGGGIJIGIGGH NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2112:6263:84991 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@?DDDDFBH?FHIGIGIIGG;GHBGCD?DCGIIGHEGBBFHGGIHBFIG NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:2202:10314:26844 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHHJJJJJJJJJJJJJJIJJJJIIJJJJJJJJJJJJJJJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2213:21028:90280 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@BDDDAD?FDF9GIBB@@FFG3CFF:DD)?BD*9D@@F4BDEEEFFF8= NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1216:14847:22529 272 1 15004 0 35M757N16M * 0 0 CGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTT @@@FFFFDHHBDHIIIJJJJIIIIIIJJIJJGIJIFIJJIDHHGBEHIJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2111:14281:81135 272 1 15007 0 32M757N19M * 0 0 CATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTTCTG @@@DDDBD42=:ACFFIE?FFGAFF at FFFDGEAG>D at DBB9BC3D@EDFFA NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2203:4824:93169 272 1 15008 0 31M757N20M * 0 0 ATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTTCTGC CCCFFFFFHHHHHJJJJIJJJJHIJIJJJJJJJJGIJJJJI?DFGFHIHJF NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1112:17298:87937 272 1 15925 1 23M659N28M * 0 0 CACTTCCCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTC ?@;;BA;3ABC?C?6EGDGIIBA+AAC<?D9CBGG@@FFFFAFCIIECC7= NH:i:4 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2210:17342:39133 272 1 15925 1 23M659N28M * 0 0 CACTTCCCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTC @?@DDDDDFAB::<EBFGIFG at FF9AECEFIFAGCD:F at F8=@E;7)77@@ NH:i:4 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2112:17740:2548 272 1 15931 1 17M659N34M * 0 0 CCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTCGTCCTG @@@FFD:ACFDCFCGGGDF?HHIBDEHFGHDHFIIGBDGEEHIFHGIIGHH NH:i:4 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2112:15228:46115 272 1 16727 0 39M92N12M * 0 0 GGGGCGGTGGGGGTGGTGTTAGTACCCCATCTTGTAGGTCTTGAGAGGCTC @CCDDFF:?FFHH- at B:AABCB at DDEEDCDCCDCCCCD>ACD>>:9:??2< NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2109:14386:93817 272 1 16728 0 38M92N13M * 0 0 GGGCGGTGGGGGTGGTGTTAGTACCCCATCTTGTAGGTCTTGAGAGGCTCG @CCFFFDDHHHHDHIFHIJJJGHHIIJHHHHHHFFFFEFEEEECDDDDDDB NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2203:14322:7218 272 1 16741 0 25M110N26M * 0 0 GGTGTTAGTACCCCATCTTGTAGGTCTCAGTGTGGAAGGTGGGCAGTTCTG ?@?DDD?BFFHHFB7EFGGGEFHIHA<CFHIGEHI<FEHH<=DEGG?DGEH NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1212:14242:21074 272 1 16751 0 15M92N36M * 0 0 CCCCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGG @@CDFFFFGGBFFECGGGGIIHCHCEG at FAEGII9?*?BB9BFGC at H)=FG NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1106:12278:45196 272 1 16752 0 14M92N37M * 0 0 CCCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGC CCCFFFFFHHHHHHIHIJIJIJJJJJIJFEHHIGGEGHGFHGBGGH8BGH; NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2212:12811:3161 272 1 16752 0 14M92N37M * 0 0 CCCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGC CCCFFFFFHFFHGEGHIJJJJJIIGHFHAFEGGDDCE:DBDGDHFH?CGH@ NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1102:2563:17519 272 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGCGGGCA ?<?D>DB8D>822<CC<<<+A;CE?1):C?))1:*0B<9*8*0*((7 at 4'3 NH:i:6 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2201:2398:40333 16 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCA CCCFFFFFHHHDHEHIHIJGF1FFHEBH at FHICHDD<B?DDA@?FD?FHFH NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2209:7506:25914 16 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCA @C at FDFFFFHHHHIIJIIIJJJJJJBHG=?DFBC<?:?9?FGHCG8BHHD7 NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1207:6786:56046 16 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCA @B at DFFFFHHHHHGIGIIIAGIJEGAHHEGHHBF>BDG?FHBGEH?FHGG3 NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2116:7403:96086 272 1 17020 0 36M177N15M * 0 0 GCCCAGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTG :?=DDD=AAAC:+<CA2C+::AFAC,9<9CA+::CEDDDD>BDIIIIIIA? NH:i:7 HI:i:5 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1209:11002:81132 272 1 17020 0 36M177N15M * 0 0 GCCCGGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTG @@@DD at A<@DDDF;BCGF<4CHCEG?EG at FGF9)?BB:?B?DBF>D?**9B NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1115:8064:78307 272 1 17021 0 35M177N16M * 0 0 CCCTGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGC 11844BBDD=FDFEFFFDFI?HEHAFBEHEEEFC?E:FDGDD<FE4:9??9 NH:i:7 HI:i:5 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1211:18547:26385 272 1 17027 0 29M177N22M * 0 0 TCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTC BCCFFFFFHHHHHJJIIJHJJJJJJJIJJJIJJJJJJJJJIJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2109:12204:47428 272 1 17028 0 28M177N23M * 0 0 CTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCT @@@DDBD:=ACDBFEGECFGIHD>DH9CBEHHHEEFB?F>GD at 3?FB?BB@ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1101:15891:42282 272 1 17028 0 28M177N23M * 0 0 CTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCT CCCFFFFFHHHHHJHHIIJJJJJJJJJJIIJJJJIJJJIJJJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1107:10929:6659 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHDHIHHJJGJJJJJJJIJJIJGIJJJIJJJIJJJJJIJJG NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1114:7098:71178 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG =?@BDEEFHBDHFBGIEGIHEHIGDHGEIIJIIIEHIHIIGHDGHIGIIH@ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1209:3383:100724 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG ?@@ADDDDDHDH?EEFH<CAHHGCHIF?GG>EHIGIIGHGHIFII>BFIH? NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2111:3771:31345 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG @@@DFFFFGHDHHHJGIHJJJJGIJJIJIJIIJJIIJJIGHIJJJIJJIJ< NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2205:14794:36455 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHHIJJJJJJJJJJJJJJJJJJIJJJJIJJIJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1107:19701:64552 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFDFFHHHHDGIIJIJJJIIJDGHGJJJJJJIJJJJJJJGIJJJJJJF NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1210:18711:88303 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHHJJJJJJJJIJFIJJJEIIHIIJJIIJJGJJJIJJJJJE NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2212:19113:15559 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG @@@7B>DDC=<AF@<CFB?<,?FFDBF3AD+?9*?EGCF>@BFBGBAF<FG NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2212:14258:59619 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG =?@DDD:=DADFAEFGEF<,AADE<F<?AAFCGG@?FD>CGBF<D<9B<D< NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2103:9695:24819 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC CCCFFFFFHHHHHJHIJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJGG NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1204:13994:2816 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC ?@@DDDDDHHHHFHEFAABA@?FGBEFHIIIHH>DB at DHIHIDD>@@GHID NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1212:15591:47491 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC @@C+ADDDDHFFDEGEGIIIDFHIFHIIIIIGEHIIBH>FGGGHGHFGGII NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2215:10125:81395 272 1 17031 0 25M859N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCAGGGCCCGCTCGTCCAGGGGGC CCCFFFFFGHHHHJJJJJJJJJHJJJJJJIJIIJJJHIJJJJJJJJJIJHE NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2102:9065:90529 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG C at CFFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJFHIFHIJIJJJJJJJJ NH:i:5 HI:i:2 AS:i:47 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2204:7767:77376 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@@FDFDDBFHADEHEIGIGIJIGHIHG?EDGHGGCFH:B?BD at FGFHGIH NH:i:5 HI:i:2 AS:i:47 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1212:6793:42000 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@?DADBD8CFADGFHIIIIE3A<EC:EHGGGIIB8?80?DDH>9?<FGCD NH:i:5 HI:i:2 AS:i:47 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1211:14829:37922 272 1 17036 0 20M177N31M * 0 0 TAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGCTGATG @<@BDBFDFFHAFHFF at GIIIHECFHFGFHICFHFIIIIGIIEGFF<FHII NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2216:19937:47046 16 1 17334 1 35M237N16M * 0 0 CAGCCAGGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGT CC at F?DA?FDHHHIIGI at DHHGGFHHHIIAG@F at GFHHGGHEHG7-;FEHE NH:i:4 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1111:7653:49738 16 1 17336 1 33M237N18M * 0 0 GCCAGGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTG ?8=4BDDDFDFFAGF@@GD?9?FBFDDDBDEFFIII?BDEFFI75F5;65C NH:i:4 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2205:11163:47820 16 1 17336 1 33M237N18M * 0 0 GCCAGGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTG C at CFFFFFHHHHHJJJJIIJJJIJIJJJJJJJJJJJJJJJJJJGHIAGHII NH:i:4 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2208:13311:23997 16 1 17340 1 29M237N22M * 0 0 GGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGA ?@8AD?@D?F>DH?FHGHH at EHGIEHGGIIIGGHIGHGFDEHGH=FHGIIH NH:i:3 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2207:3786:78354 16 1 17340 1 29M237N22M * 0 0 GGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGA CCCFFFFFHHHHHJJJJJIIJJJJJJJJJJJJHHIJIHHBFIHIIJJJJJI NH:i:3 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1115:8438:81914 16 1 17341 1 28M237N23M * 0 0 GGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGAG @@CFFFFDHH?HDGGHIIGIGHIGHGIDIIIFGIIGHHDG:?DFHEHIIII NH:i:3 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1114:13486:49038 16 1 17341 1 28M237N23M * 0 0 GGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGAG ?@@:D at DDFHAFFHGGFHFHH@CCHIIIII@:CFGFGGC?D)?8DHHGCGI NH:i:3 HI:i:1 AS:i:49 nM:i:0
diff --git a/tests/python_flagstat.py b/tests/python_flagstat.py
deleted file mode 100644
index b14e52d..0000000
--- a/tests/python_flagstat.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import pysam
-
-is_paired = 0
-is_proper = 0
-
-for read in pysam.AlignmentFile("ex1.bam", "rb"):
- is_paired += read.is_paired
- is_proper += read.is_proper_pair
-
-print ("there are alignments of %i paired reads" % is_paired)
-print ("there are %i proper paired alignments" % is_proper)
diff --git a/tests/samtools_test.py b/tests/samtools_test.py
index d5b2791..aa4c554 100644
--- a/tests/samtools_test.py
+++ b/tests/samtools_test.py
@@ -71,6 +71,7 @@ class SamtoolsTest(unittest.TestCase):
# an output file.
statements = [
"view ex1.bam > %(out)s_ex1.view",
+ "view -c ex1.bam > %(out)s_ex1.count",
# ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam",
"sort ex1.bam -o %(out)s_ex1.sort.bam",
"mpileup ex1.bam > %(out)s_ex1.pileup",
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pysam.git
More information about the debian-med-commit
mailing list