[med-svn] [Git][med-team/htseq][upstream] New upstream version 0.11.2
Andreas Tille
gitlab at salsa.debian.org
Fri Jan 11 22:40:05 GMT 2019
Andreas Tille pushed to branch upstream at Debian Med / htseq
Commits:
237b3f13 by Andreas Tille at 2019-01-11T22:31:44Z
New upstream version 0.11.2
- - - - -
15 changed files:
- PKG-INFO
- README.rst
- VERSION
- python2/HTSeq/__init__.py
- python2/HTSeq/_version.py
- python2/HTSeq/scripts/count.py
- python2/HTSeq/scripts/qa.py
- python2/src/HTSeq/_HTSeq.pxd
- python2/src/HTSeq/_HTSeq.pyx
- python3/HTSeq/__init__.py
- python3/HTSeq/_version.py
- python3/HTSeq/scripts/count.py
- python3/HTSeq/scripts/qa.py
- python3/src/HTSeq/_HTSeq.pxd
- python3/src/HTSeq/_HTSeq.pyx
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: HTSeq
-Version: 0.10.0
+Version: 0.11.2
Summary: A framework to process and analyze data from high-throughput sequencing (HTS) assays
Home-page: https://github.com/simon-anders/htseq
Author: Simon Anders
=====================================
README.rst
=====================================
@@ -29,13 +29,15 @@ and are transferring the binaries onto another machine with a compatible
environment (architecture, shared libraries). If you are not sure,
chances are you need them.
-Both ``Linux`` and ``OSX`` are supported and binaries are provided for virtually
-all ``Linux`` versions and for some ``OSX`` versions (the latter only for ``Python 2.7``
-and ``Python 3.6``). A source package which should not require ``Cython`` nor ``SWIG``
-is provided for all other cases. ``Windows`` is not officially supported as we don't
-have access to a Continuous Integration ``Windows`` machine that supports ``pysam``.
-However, if you have built ``HTSeq`` for ``Windows``, please open an issue and we'll
-try and include it in the release.
+Both **Linux** and **OSX** are supported and binaries are provided for virtually
+all Linux versions and for some OSX versions (the latter only for Python 2.7
+and 3.6). A source package which should not require ``Cython`` nor ``SWIG``
+is provided for all other cases.
+
+**Windows is not officially supported** as we don't have access to a Continuous
+Integration Windows machine that supports ``pysam``. However, if you have built
+``HTSeq`` for Windows, please open an issue and we'll try and include it in the
+release.
Installation
~~~~~~~~~~~~
@@ -57,6 +59,20 @@ To install directly from PyPI:
</div>
+To install a specific version (e.g. version 0.11.0):
+
+.. raw:: html
+
+ <div class="highlight highlight-source-shell">
+
+::
+
+ pip install 'HTSeq==0.11.0'
+
+.. raw:: html
+
+ </div>
+
If this fails, please install all dependencies first:
.. raw:: html
=====================================
VERSION
=====================================
@@ -1 +1 @@
-0.10.0
+0.11.2
=====================================
python2/HTSeq/__init__.py
=====================================
@@ -7,21 +7,14 @@ import itertools
import warnings
import os
import shlex
+import sys
-try:
- from _HTSeq import *
-except ImportError:
- if os.path.isfile("setup.py"):
- raise ImportError("Cannot import 'HTSeq' when working directory is HTSeq's own build directory.")
- else:
- raise
+from _HTSeq import *
from _version import __version__
-#from vcf_reader import *
-
#########################
-## Utils
+# Utils
#########################
@@ -82,7 +75,7 @@ class FileOrSequence(object):
#########################
-## Features
+# Features
#########################
class GenomicFeature(object):
@@ -137,9 +130,10 @@ class GenomicFeature(object):
sep = "="
else:
sep = " "
- attr_str = '; '.join(['%s%s\"%s\"' % (ak, sep, attr[ak]) for ak in attr])
+ attr_str = '; '.join(
+ ['%s%s\"%s\"' % (ak, sep, attr[ak]) for ak in attr])
return "\t".join(str(a) for a in (self.iv.chrom, source,
- self.type, self.iv.start+1, self.iv.end, score,
+ self.type, self.iv.start + 1, self.iv.end, score,
self.iv.strand, frame, attr_str)) + "\n"
@@ -151,24 +145,27 @@ def parse_GFF_attribute_string(attrStr, extra_return_first_value=False):
"""Parses a GFF attribute string and returns it as a dictionary.
If 'extra_return_first_value' is set, a pair is returned: the dictionary
- and the value of the first attribute. This might be useful if this is the ID.
+ and the value of the first attribute. This might be useful if this is the
+ ID.
"""
if attrStr.endswith("\n"):
attrStr = attrStr[:-1]
d = {}
first_val = "_unnamed_"
- for (i, attr) in itertools.izip(itertools.count(), _HTSeq.quotesafe_split(attrStr)):
+ for (i, attr) in itertools.izip(
+ itertools.count(),
+ _HTSeq.quotesafe_split(attrStr)):
if _re_attr_empty.match(attr):
continue
if attr.count('"') not in (0, 2):
- raise ValueError("The attribute string seems to contain mismatched quotes.")
+ raise ValueError(
+ "The attribute string seems to contain mismatched quotes.")
mo = _re_attr_main.match(attr)
if not mo:
raise ValueError("Failure parsing GFF attribute line")
val = mo.group(2)
if val.startswith('"') and val.endswith('"'):
val = val[1:-1]
- # val = urllib.unquote(val)
d[intern(mo.group(1))] = intern(val)
if extra_return_first_value and i == 0:
first_val = val
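
For reference, a minimal usage sketch of the attribute parser above (Python 3 spelling; the attribute string is illustrative):

    import HTSeq

    attrs = HTSeq.parse_GFF_attribute_string(
        'gene_id "ENSG00000223972"; gene_name "DDX11L1"')
    # attrs == {'gene_id': 'ENSG00000223972', 'gene_name': 'DDX11L1'}
    attrs, first = HTSeq.parse_GFF_attribute_string(
        'gene_id "ENSG00000223972"; gene_name "DDX11L1"', True)
    # first == 'ENSG00000223972', handy when the first attribute is the ID
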
@@ -210,9 +207,15 @@ class GFF_Reader(FileOrSequence):
strand, frame, attributeStr) = line.split("\t", 8)
(attr, name) = parse_GFF_attribute_string(attributeStr, True)
if self.end_included:
- iv = GenomicInterval(seqname, int(start)-1, int(end), strand)
+ iv = GenomicInterval(
+ seqname,
+ int(start) - 1, int(end),
+ strand)
else:
- iv = GenomicInterval(seqname, int(start)-1, int(end)-1, strand)
+ iv = GenomicInterval(
+ seqname,
+ int(start) - 1, int(end) - 1,
+ strand)
f = GenomicFeature(name, feature, iv)
if score != ".":
score = float(score)
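
The two branches implement the usual coordinate shift: GFF/GTF positions are 1-based with inclusive ends, while GenomicInterval is 0-based and half-open. A quick sketch:

    import HTSeq

    # end_included=True (the GFF/GTF convention): feature 1001..2000
    iv = HTSeq.GenomicInterval("chr1", 1001 - 1, 2000, "+")  # [1000, 2000)
    assert iv.length == 1000
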
@@ -224,6 +227,7 @@ class GFF_Reader(FileOrSequence):
f.attr = attr
yield f
+
def make_feature_dict(feature_sequence):
"""A feature dict is a convenient way to organize a sequence of Feature
object (which you have got, e.g., from parse_GFF).
@@ -260,16 +264,17 @@ def make_feature_dict(feature_sequence):
#########################
-## GenomicArray
+# GenomicArray
#########################
def read_chrom_lens(filename, delimiter="\t"):
- return dict(((chrom, int(len))
- for chrom, len in csv.reader(open(filename), delimiter=delimiter)))
+ return dict(
+ ((chrom, int(len))
+ for chrom, len in csv.reader(open(filename), delimiter=delimiter)))
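
read_chrom_lens expects a two-column, delimiter-separated file of chromosome names and lengths; a sketch with illustrative values:

    import HTSeq

    # chrom_lens.tsv (tab-separated):
    #   chr1    248956422
    #   chr2    242193529
    lens = HTSeq.read_chrom_lens("chrom_lens.tsv")
    # lens == {'chr1': 248956422, 'chr2': 242193529}
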
#########################
-## Sequence readers
+# Sequence readers
#########################
_re_fasta_header_line = re.compile(r'>\s*(\S+)\s*(.*)')
@@ -286,87 +291,96 @@ class FastaReader(FileOrSequence):
self.raw_iterator = raw_iterator
def __iter__(self):
- seq = None
- for line in FileOrSequence.__iter__(self):
- if line.startswith(">"):
- if seq:
- if self.raw_iterator:
- s = (seq, name, descr)
- else:
- s = Sequence(seq, name)
- s.descr = descr
- yield s
- mo = _re_fasta_header_line.match(line)
- name = mo.group(1)
- descr = mo.group(2)
- seq = ""
- else:
- assert seq is not None, "FASTA file does not start with '>'."
- seq += line[:-1]
- if seq is not None:
- if self.raw_iterator:
- s = (seq, name, descr)
- else:
- s = Sequence(seq, name)
- s.descr = descr
- yield s
+ seq = None
+ name = None
+ descr = None
+ for line in FileOrSequence.__iter__(self):
+ if line.startswith(">"):
+ if seq:
+ if self.raw_iterator:
+ s = (seq, name, descr)
+ else:
+ s = Sequence(seq, name)
+ s.descr = descr
+ yield s
+ mo = _re_fasta_header_line.match(line)
+ name = mo.group(1)
+ descr = mo.group(2)
+ seq = ""
+ else:
+ assert seq is not None, "FASTA file does not start with '>'."
+ seq += line[:-1]
+ if seq is not None:
+ if self.raw_iterator:
+ s = (seq, name, descr)
+ else:
+ s = Sequence(seq, name)
+ s.descr = descr
+ yield s
def get_sequence_lengths(self):
- seqname = None
- seqlengths = {}
- for line in FileOrSequence.__iter__(self):
- if line.startswith(">"):
- if seqname is not None:
- seqlengths[ seqname ] = length
- mo = _re_fasta_header_line.match(line)
- seqname = mo.group(1)
- length = 0
- else:
- assert seqname is not None, "FASTA file does not start with '>'."
- length += len(line.rstrip())
- if seqname is not None:
- seqlengths[ seqname ] = length
- return seqlengths
+ seqname = None
+ length = 0
+ seqlengths = {}
+ for line in FileOrSequence.__iter__(self):
+ if line.startswith(">"):
+ if seqname is not None:
+ seqlengths[seqname] = length
+ mo = _re_fasta_header_line.match(line)
+ seqname = mo.group(1)
+ length = 0
+ else:
+ assert seqname is not None, "FASTA file does not start with '>'."
+ length += len(line.rstrip())
+ if seqname is not None:
+ seqlengths[seqname] = length
+ return seqlengths
@staticmethod
def _import_pysam():
- global pysam
- try:
- import pysam
- except ImportError:
- sys.stderr.write("Please install the 'pysam' package to be able to use the Fasta indexing functionality.")
- raise
-
- def build_index(self, force = False):
- self._import_pysam()
- if not isinstance(self.fos, str):
- raise TypeError, "This function only works with FastaReader objects " + \
- "connected to a fasta file via file name"
- index_filename = self.fos + ".fai"
- if os.access(index_filename, os.R_OK):
- if (not force) and os.stat(self.filename_or_sequence).st_mtime <= \
- os.stat(index_filename).st_mtime:
- # index is up to date
- return
- pysam.faidx(self.fos)
- if not os.access(index_filename, os.R_OK):
- raise SystemError, "Building of Fasta index failed due to unknown error."
+ global pysam
+ try:
+ import pysam
+ except ImportError:
+ sys.stderr.write(
+ "Please install the 'pysam' package to be able to use the Fasta indexing functionality.")
+ raise
+
+ def build_index(self, force=False):
+ self._import_pysam()
+ if not isinstance(self.fos, str):
+ raise TypeError(
+ "This function only works with FastaReader objects " +
+ "connected to a fasta file via file name")
+ index_filename = self.fos + ".fai"
+ if os.access(index_filename, os.R_OK):
+ if (not force) and os.stat(self.filename_or_sequence).st_mtime <= \
+ os.stat(index_filename).st_mtime:
+ # index is up to date
+ return
+ pysam.faidx(self.fos)
+ if not os.access(index_filename, os.R_OK):
+ raise SystemError(
+ "Building of Fasta index failed due to unknown error.")
def __getitem__(self, iv):
- if not isinstance(iv, GenomicInterval):
- raise TypeError, "GenomicInterval expected as key."
- if not isinstance(self.fos, str):
- raise TypeError, "This function only works with FastaReader objects " + \
- "connected to a fasta file via file name"
- self._import_pysam()
- fasta = pysam.faidx(self.fos, "%s:%d-%d" % (iv.chrom, iv.start, iv.end-1))
- ans = list(FastaReader(fasta))
- assert len(ans) == 1
- ans[0].name = str(iv)
- if iv.strand != "-":
- return ans[0]
- else:
- return ans[0].get_reverse_complement()
+ if not isinstance(iv, GenomicInterval):
+ raise TypeError("GenomicInterval expected as key.")
+ if not isinstance(self.fos, str):
+ raise TypeError(
+ "This function only works with FastaReader objects " +
+ "connected to a fasta file via file name")
+ self._import_pysam()
+ fasta = pysam.faidx(
+ self.fos,
+ "%s:%d-%d" % (iv.chrom, iv.start, iv.end - 1))
+ ans = list(FastaReader(fasta))
+ assert len(ans) == 1
+ ans[0].name = str(iv)
+ if iv.strand != "-":
+ return ans[0]
+ else:
+ return ans[0].get_reverse_complement()
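
A usage sketch for the reader above (file name hypothetical; building the index and slicing by interval both require pysam):

    import HTSeq

    fa = HTSeq.FastaReader("genome.fa")
    for s in fa:                          # yields Sequence objects
        print(s.name, len(s.seq))
    lengths = fa.get_sequence_lengths()   # {name: length}
    piece = fa[HTSeq.GenomicInterval("chr1", 100, 200, "-")]
    # returned reverse-complemented, since the interval is on '-'
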
class FastqReader(FileOrSequence):
@@ -401,20 +415,26 @@ class FastqReader(FileOrSequence):
if not qual.endswith("\n"):
qual += "\n"
if not id1.startswith("@"):
- raise ValueError("Primary ID line in FASTQ file does"
- "not start with '@'. Either this is not FASTQ data or the parser got out of sync.")
+ raise ValueError(
+ "Primary ID line in FASTQ file does "
+ "not start with '@'. Either this is not FASTQ data or the "
+ "parser got out of sync.")
if not id2.startswith("+"):
- raise ValueError("Secondary ID line in FASTQ file does"
- "not start with '+'. Maybe got out of sync.")
+ raise ValueError(
+ "Secondary ID line in FASTQ file does"
+ "not start with '+'. Maybe got out of sync.")
if len(id2) > 2 and id1[1:] != id2[1:]:
- raise ValueError("Primary and secondary ID line in FASTQ"
- "disagree.")
+ raise ValueError(
+ "Primary and secondary ID line in FASTQ"
+ "disagree.")
if self.raw_iterator:
s = (seq[:-1], id1[1:-1], qual[:-1], self.qual_scale)
else:
- s = SequenceWithQualities(seq[:-1], id1[1:-1], qual[:-1],
- self.qual_scale)
+ s = SequenceWithQualities(
+ seq[:-1], id1[1:-1],
+ qual[:-1],
+ self.qual_scale)
yield s
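
The FASTQ counterpart, as a sketch (file name hypothetical):

    import HTSeq

    for read in HTSeq.FastqReader("reads.fastq"):
        print(read.name, read.seq[:10], read.qual[:10])  # qual: int array
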
@@ -430,119 +450,145 @@ class BowtieReader(FileOrSequence):
except ValueError:
if line.startswith("Reported "):
continue
- warnings.warn("BowtieReader: Ignoring the following line, which could not be parsed:\n%s\n" % line,
- RuntimeWarning)
+ warnings.warn(
+ "BowtieReader: Ignoring the following line, which could "
+ "not be parsed:\n%s\n" % line,
+ RuntimeWarning)
yield algnt
def bundle_multiple_alignments(sequence_of_alignments):
- """Some alignment programs, e.g., Bowtie, can output multiple alignments,
- i.e., the same read is reported consecutively with different alignments.
- This function takes an iterator over alignments and bundles consecutive
- alignments regarding the same read to a list of Alignment objects and
- returns an iterator over these.
- """
- alignment_iter = iter(sequence_of_alignments)
- algnt = alignment_iter.next()
- ma = [ algnt ]
- for algnt in alignment_iter:
- if algnt.read.name != ma[0].read.name:
- yield ma
- ma = [ algnt ]
- else:
- ma.append(algnt)
- yield ma
+ """Some alignment programs, e.g., Bowtie, can output multiple alignments,
+ i.e., the same read is reported consecutively with different alignments.
+ This function takes an iterator over alignments and bundles consecutive
+ alignments regarding the same read to a list of Alignment objects and
+ returns an iterator over these.
+ """
+ alignment_iter = iter(sequence_of_alignments)
+ algnt = alignment_iter.next()
+ ma = [algnt]
+ for algnt in alignment_iter:
+ if algnt.read.name != ma[0].read.name:
+ yield ma
+ ma = [algnt]
+ else:
+ ma.append(algnt)
+ yield ma
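
A sketch of the bundling helper in use, assuming Bowtie output in a hypothetical file 'hits.bwtout':

    import HTSeq

    for bundle in HTSeq.bundle_multiple_alignments(
            HTSeq.BowtieReader("hits.bwtout")):
        # every alignment in the bundle shares one read name
        print(bundle[0].read.name, len(bundle))
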
class SolexaExportAlignment(Alignment):
- """Iterating over SolexaExportReader objects will yield SoelxaExportRecord
- objects. These have four fields:
- read - a SequenceWithQualities object
- aligned - a boolean, indicating whether the object was aligned
- iv - a GenomicInterval giving the alignment (or None, if not aligned)
- passed_filter - a boolean, indicating whether the object passed the filter
- nomatch_code - a code indicating why no match was found (or None, if the
- read was aligned)
-
- As long as 'aligned' is True, a SolexaExportRecord can be treated as an
- Alignment object.
- """
-
- def __init__(self):
- # Data is filled in by SolexaExportRecord
- pass
-
- def __repr__(self):
- if self.aligned:
- return "< %s object: Read '%s', aligned to %s >" % (
- self.__class__.__name__, self.read.name, self.iv)
- else:
- return "< %s object: Non-aligned read '%s' >" % (
- self.__class__.__name__, self.read.name)
+ """Iterating over SolexaExportReader objects will yield SoelxaExportRecord
+ objects. These have four fields:
+ read - a SequenceWithQualities object
+ aligned - a boolean, indicating whether the object was aligned
+ iv - a GenomicInterval giving the alignment (or None, if not aligned)
+ passed_filter - a boolean, indicating whether the object passed the filter
+ nomatch_code - a code indicating why no match was found (or None, if the
+ read was aligned)
+
+ As long as 'aligned' is True, a SolexaExportRecord can be treated as an
+ Alignment object.
+ """
+
+ def __init__(self):
+ # Data is filled in by SolexaExportRecord
+ pass
+
+ def __repr__(self):
+ if self.aligned:
+ return "< %s object: Read '%s', aligned to %s >" % (
+ self.__class__.__name__, self.read.name, self.iv)
+ else:
+ return "< %s object: Non-aligned read '%s' >" % (
+ self.__class__.__name__, self.read.name)
+
class SolexaExportReader(FileOrSequence):
- """Parser for *_export.txt files from the SolexaPipeline software.
-
- Iterating over a SolexaExportReader yields SolexaExportRecord objects.
- """
-
- def __init__(self, filename_or_sequence, solexa_old = False):
- FileOrSequence.__init__(self, filename_or_sequence)
- if solexa_old:
- self.qualscale = "solexa-old"
- else:
- self.qualscale = "solexa"
-
- @classmethod
- def parse_line_bare(dummy, line):
- if line[-1] == "\n":
- line = line[:-1]
- res = {}
- (res['machine'], res['run_number'], res['lane'], res['tile'], res['x_coord'],
- res['y_coord'], res['index_string'], res['read_nbr'], res['read_seq'],
- res['qual_str'], res['chrom'], res['contig'], res['pos'], res['strand'],
- res['match_descr'], res['single_read_algnt_score'],
- res['paired_read_algnt_score'], res['partner_chrom'], res['partner_contig'],
- res['partner_offset'], res['partner_strand'], res['passed_filtering']) \
- = line.split("\t")
- return res
-
- def __iter__(self):
- for line in FileOrSequence.__iter__(self):
- record = SolexaExportAlignment()
- fields = SolexaExportReader.parse_line_bare(line)
- if fields['read_nbr'] != "1":
- warnings.warn("Paired-end read encountered. PE is so far supported only for " +
- "SAM files, not yet for SolexaExport. All PE-related fields are ignored. ")
- record.read = SequenceWithQualities(
- fields['read_seq'],
- "%s:%s:%s:%s:%s#0" % (fields['machine'], fields['lane'], fields['tile'],
- fields['x_coord'], fields['y_coord']),
- fields['qual_str'], self.qualscale)
- if fields['passed_filtering'] == 'Y':
- record.passed_filter = True
- elif fields['passed_filtering'] == 'N':
- record.passed_filter = False
- else:
- raise ValueError, "Illegal 'passed filter' value in Solexa export data: '%s'." % fields['passed_filtering']
- record.index_string = fields['index_string']
- if fields['pos'] == '':
- record.iv = None
- record.nomatch_code = fields['chrom']
- else:
- if fields['strand'] == 'F':
- strand = '+'
- elif fields['strand'] == 'R':
- strand = '-'
+ """Parser for *_export.txt files from the SolexaPipeline software.
+
+ Iterating over a SolexaExportReader yields SolexaExportRecord objects.
+ """
+
+ def __init__(self, filename_or_sequence, solexa_old=False):
+ FileOrSequence.__init__(self, filename_or_sequence)
+ if solexa_old:
+ self.qualscale = "solexa-old"
+ else:
+ self.qualscale = "solexa"
+
+ @classmethod
+ def parse_line_bare(dummy, line):
+ if line[-1] == "\n":
+ line = line[:-1]
+ res = {}
+ (res['machine'],
+ res['run_number'],
+ res['lane'],
+ res['tile'],
+ res['x_coord'],
+ res['y_coord'],
+ res['index_string'],
+ res['read_nbr'],
+ res['read_seq'],
+ res['qual_str'],
+ res['chrom'],
+ res['contig'],
+ res['pos'],
+ res['strand'],
+ res['match_descr'],
+ res['single_read_algnt_score'],
+ res['paired_read_algnt_score'],
+ res['partner_chrom'],
+ res['partner_contig'],
+ res['partner_offset'],
+ res['partner_strand'],
+ res['passed_filtering']) = line.split("\t")
+ return res
+
+ def __iter__(self):
+ for line in FileOrSequence.__iter__(self):
+ record = SolexaExportAlignment()
+ fields = SolexaExportReader.parse_line_bare(line)
+ if fields['read_nbr'] != "1":
+ warnings.warn(
+ "Paired-end read encountered. PE is so far supported only "
+ "for SAM files, not yet for SolexaExport. All PE-related "
+ "fields are ignored.")
+ record.read = SequenceWithQualities(
+ fields['read_seq'],
+ "%s:%s:%s:%s:%s#0" % (fields['machine'],
+ fields['lane'],
+ fields['tile'],
+ fields['x_coord'],
+ fields['y_coord']),
+ fields['qual_str'], self.qualscale)
+ if fields['passed_filtering'] == 'Y':
+ record.passed_filter = True
+ elif fields['passed_filtering'] == 'N':
+ record.passed_filter = False
+ else:
+ raise ValueError(
+ "Illegal 'passed filter' value in Solexa export data: '%s'." % fields['passed_filtering'])
+ record.index_string = fields['index_string']
+ if fields['pos'] == '':
+ record.iv = None
+ record.nomatch_code = fields['chrom']
else:
- raise ValueError, "Illegal strand value in Solexa export data."
- start = int(fields['pos'])
- chrom = fields['chrom']
- if fields['chrom'] == "":
- chrom = fields['contig']
- record.iv = GenomicInterval(chrom, start,
- start + len(fields['read_seq']), strand)
- yield record
+ if fields['strand'] == 'F':
+ strand = '+'
+ elif fields['strand'] == 'R':
+ strand = '-'
+ else:
+ raise ValueError(
+ "Illegal strand value in Solexa export data.")
+ start = int(fields['pos'])
+ chrom = fields['chrom']
+ if fields['chrom'] == "":
+ chrom = fields['contig']
+ record.iv = GenomicInterval(
+ chrom, start,
+ start + len(fields['read_seq']), strand)
+ yield record
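
A reading sketch for this legacy format (file name hypothetical):

    import HTSeq

    for rec in HTSeq.SolexaExportReader("s_1_export.txt"):
        if rec.passed_filter and rec.aligned:
            print(rec.read.name, rec.iv)
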
class SAM_Reader(FileOrSequence):
@@ -582,15 +628,53 @@ class GenomicArrayOfSets(GenomicArray):
###########################
-## paired-end handling
+# paired-end handling
###########################
-
-def pair_SAM_alignments(alignments, bundle=False):
+def pair_SAM_alignments(
+ alignments,
+ bundle=False,
+ primary_only=False):
+'''Iterate over SAM alignments, name-sorted paired-end
+
+ Args:
+ alignments (iterator of SAM/BAM alignments): the alignments to wrap
+ bundle (bool): if True, bundle all alignments from one read pair into a
+ single yield. If False (default), each pair of alignments is
+ yielded separately.
+ primary_only (bool): for each read, consider only the primary line
+ (SAM flag 0x900 = 0). The SAM specification requires one and only
+ one of those for each read.
+
+ Yields:
+ 2-tuples with each pair of alignments or, if bundle==True, each bundled
+ list of alignments.
+ '''
mate_missing_count = [0]
def process_list(almnt_list):
+ '''Transform a list of alignment with the same read name into pairs
+
+ Args:
+ almnt_list (list): alignments to process
+
+ Yields:
+ each pair of alignments.
+
+ This function is needed because each line of a BAM file is not a read
+ but an alignment. For uniquely mapped and unmapped reads, those two are
+ the same. For multimapped reads, however, there can be more than one
+ alignment for each read. Also, it is normal for a mapper to uniquely
+ map one read and multimap its mate.
+
+ This function goes down the list of alignments for a given read name
+ and tries to find the first mate. So if read 1 is uniquely mapped but
+ read 2 is mapped 4 times, only (read 1, read 2 - first occurrence) will
+ yield; the other 3 alignments of read 2 are ignored.
+ '''
+
+
while len(almnt_list) > 0:
a1 = almnt_list.pop(0)
# Find its mate
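
A sketch of the pairing iterator documented above, over a name-sorted BAM (file name hypothetical):

    import HTSeq

    bam = HTSeq.BAM_Reader("namesorted.bam")
    for first, second in HTSeq.pair_SAM_alignments(bam, primary_only=True):
        if first is None or second is None:
            continue                      # mate record was missing
        print(first.read.name)
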
@@ -608,7 +692,8 @@ def pair_SAM_alignments(alignments, bundle=False):
if a1.mate_aligned:
mate_missing_count[0] += 1
if mate_missing_count[0] == 1:
- warnings.warn("Read " + a1.read.name + " claims to have an aligned mate " +
+ warnings.warn(
+ "Read " + a1.read.name + " claims to have an aligned mate " +
"which could not be found in an adjacent line.")
a2 = None
if a2 is not None:
@@ -623,9 +708,14 @@ def pair_SAM_alignments(alignments, bundle=False):
current_name = None
for almnt in alignments:
if not almnt.paired_end:
- raise ValueError, "'pair_alignments' needs a sequence of paired-end alignments"
+ raise ValueError(
+ "'pair_alignments' needs a sequence of paired-end alignments")
if almnt.pe_which == "unknown":
- raise ValueError, "Paired-end read found with 'unknown' 'pe_which' status."
+ raise ValueError(
+ "Paired-end read found with 'unknown' 'pe_which' status.")
+ # FIXME: almnt.not_primary_alignment currently means secondary
+ if primary_only and (almnt.not_primary_alignment or almnt.supplementary):
+ continue
if almnt.read.name == current_name:
almnt_list.append(almnt)
else:
@@ -642,118 +732,144 @@ def pair_SAM_alignments(alignments, bundle=False):
for p in process_list(almnt_list):
yield p
if mate_missing_count[0] > 1:
- warnings.warn("%d reads with missing mate encountered." % mate_missing_count[0])
-
-
-def pair_SAM_alignments_with_buffer(alignments, max_buffer_size=30000000):
-
- almnt_buffer = {}
- ambiguous_pairing_counter = 0
- for almnt in alignments:
-
- if not almnt.paired_end:
- raise ValueError, "Sequence of paired-end alignments expected, but got single-end alignment."
- if almnt.pe_which == "unknown":
- raise ValueError, "Cannot process paired-end alignment found with 'unknown' 'pe_which' status."
-
- matekey = (
- almnt.read.name,
- "second" if almnt.pe_which == "first" else "first",
- almnt.mate_start.chrom if almnt.mate_aligned else None,
- almnt.mate_start.pos if almnt.mate_aligned else None,
- almnt.iv.chrom if almnt.aligned else None,
- almnt.iv.start if almnt.aligned else None,
- -almnt.inferred_insert_size if almnt.aligned and almnt.mate_aligned else None)
-
- if matekey in almnt_buffer:
- if len(almnt_buffer[ matekey ]) == 1:
- mate = almnt_buffer[ matekey ][ 0 ]
- del almnt_buffer[ matekey ]
- else:
- mate = almnt_buffer[ matekey ].pop(0)
- if ambiguous_pairing_counter == 0:
- ambiguous_pairing_first_occurance = matekey
- ambiguous_pairing_counter += 1
- if almnt.pe_which == "first":
- yield (almnt, mate)
- else:
- yield (mate, almnt)
- else:
- almntkey = (
- almnt.read.name, almnt.pe_which,
- almnt.iv.chrom if almnt.aligned else None,
- almnt.iv.start if almnt.aligned else None,
+ warnings.warn("%d reads with missing mate encountered." %
+ mate_missing_count[0])
+
+
+def pair_SAM_alignments_with_buffer(
+ alignments,
+ max_buffer_size=30000000,
+ primary_only=False):
+ '''Iterate over SAM alignments with buffer, position-sorted paired-end
+
+ Args:
+ alignments (iterator of SAM/BAM alignments): the alignments to wrap
+ max_buffer_size (int): maximal number of alignments to keep in memory.
+ primary_only (bool): for each read, consider only the primary line
+ (SAM flag 0x900 = 0). The SAM specification requires one and only
+ one of those for each read.
+
+ Yields:
+ 2-tuples with each pair of alignments.
+ '''
+
+ almnt_buffer = {}
+ ambiguous_pairing_counter = 0
+ for almnt in alignments:
+ if not almnt.paired_end:
+ raise ValueError(
+ "Sequence of paired-end alignments expected, but got single-end alignment.")
+ if almnt.pe_which == "unknown":
+ raise ValueError(
+ "Cannot process paired-end alignment found with 'unknown' 'pe_which' status.")
+ # FIXME: almnt.not_primary_alignment currently means secondary
+ if primary_only and (almnt.not_primary_alignment or almnt.supplementary):
+ continue
+
+ matekey = (
+ almnt.read.name,
+ "second" if almnt.pe_which == "first" else "first",
almnt.mate_start.chrom if almnt.mate_aligned else None,
almnt.mate_start.pos if almnt.mate_aligned else None,
- almnt.inferred_insert_size if almnt.aligned and almnt.mate_aligned else None)
- if almntkey not in almnt_buffer:
- almnt_buffer[ almntkey ] = [ almnt ]
- else:
- almnt_buffer[ almntkey ].append(almnt)
- if len(almnt_buffer) > max_buffer_size:
- raise ValueError, "Maximum alignment buffer size exceeded while pairing SAM alignments."
-
- if len(almnt_buffer) > 0:
- warnings.warn("Mate records missing for %d records; first such record: %s." %
- (len(almnt_buffer), str(almnt_buffer.values()[0][0])))
- for almnt_list in almnt_buffer.values():
- for almnt in almnt_list:
+ almnt.iv.chrom if almnt.aligned else None,
+ almnt.iv.start if almnt.aligned else None,
+ -almnt.inferred_insert_size if almnt.aligned and almnt.mate_aligned else None)
+
+ if matekey in almnt_buffer:
+ if len(almnt_buffer[matekey]) == 1:
+ mate = almnt_buffer[matekey][0]
+ del almnt_buffer[matekey]
+ else:
+ mate = almnt_buffer[matekey].pop(0)
+ if ambiguous_pairing_counter == 0:
+ ambiguous_pairing_first_occurance = matekey
+ ambiguous_pairing_counter += 1
if almnt.pe_which == "first":
- yield (almnt, None)
+ yield (almnt, mate)
+ else:
+ yield (mate, almnt)
+ else:
+ almntkey = (
+ almnt.read.name, almnt.pe_which,
+ almnt.iv.chrom if almnt.aligned else None,
+ almnt.iv.start if almnt.aligned else None,
+ almnt.mate_start.chrom if almnt.mate_aligned else None,
+ almnt.mate_start.pos if almnt.mate_aligned else None,
+ almnt.inferred_insert_size if almnt.aligned and almnt.mate_aligned else None)
+ if almntkey not in almnt_buffer:
+ almnt_buffer[almntkey] = [almnt]
else:
- yield (None, almnt)
+ almnt_buffer[almntkey].append(almnt)
+ if len(almnt_buffer) > max_buffer_size:
+ raise ValueError(
+ "Maximum alignment buffer size exceeded while pairing SAM alignments.")
+
+ if len(almnt_buffer) > 0:
+ warnings.warn(
+ "Mate records missing for %d records; first such record: %s." %
+ (len(almnt_buffer), str(almnt_buffer.values()[0][0])))
+ for almnt_list in almnt_buffer.values():
+ for almnt in almnt_list:
+ if almnt.pe_which == "first":
+ yield (almnt, None)
+ else:
+ yield (None, almnt)
- if ambiguous_pairing_counter > 0:
- warnings.warn("Mate pairing was ambiguous for %d records; mate key for first such record: %s." %
- (ambiguous_pairing_counter, str(ambiguous_pairing_first_occurance)))
+ if ambiguous_pairing_counter > 0:
+ warnings.warn(
+ "Mate pairing was ambiguous for %d records; mate key for first such record: %s." %
+ (ambiguous_pairing_counter, str(ambiguous_pairing_first_occurance)))
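
The buffered variant targets position-sorted input; a matching sketch (file name hypothetical):

    import HTSeq

    bam = HTSeq.BAM_Reader("possorted.bam")
    pairs = HTSeq.pair_SAM_alignments_with_buffer(
        bam, max_buffer_size=3000000, primary_only=True)
    for first, second in pairs:
        pass       # either side may be None if the mate never turns up
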
###########################
-## variant calls
+# variant calls
###########################
_re_vcf_meta_comment = re.compile("^##([a-zA-Z]+)\=(.*)$")
-_re_vcf_meta_descr = re.compile('ID=[^,]+,?|Number=[^,]+,?|Type=[^,]+,?|Description="[^"]+",?')
+_re_vcf_meta_descr = re.compile(
+ 'ID=[^,]+,?|Number=[^,]+,?|Type=[^,]+,?|Description="[^"]+",?')
_re_vcf_meta_types = re.compile("[INFO|FILTER|FORMAT]")
_vcf_typemap = {
- "Integer":int,
- "Float":float,
- "String":str,
- "Flag":bool
+ "Integer": int,
+ "Float": float,
+ "String": str,
+ "Flag": bool
}
+
class VariantCall(object):
- def __init__(self, chrom = None, pos = None, identifier = None, ref = None, alt = None, qual = None, filtr = None, info = None):
- self.chrom = chrom
- self.pos = pos
- self.id = identifier
- self.ref = ref
- self.alt = alt
- self.qual = qual
+ def __init__(self, chrom=None, pos=None, identifier=None, ref=None,
+ alt=None, qual=None, filtr=None, info=None):
+ self.chrom = chrom
+ self.pos = pos
+ self.id = identifier
+ self.ref = ref
+ self.alt = alt
+ self.qual = qual
self.filter = filtr
- self.info = info
+ self.info = info
self._original_line = None
@classmethod
def fromdict(cls, dictionary):
ret = cls()
- ret.chrom = dictionary["chrom"]
- ret.pos = dictionary["pos"]
- ret.id = dictionary["id"]
- ret.ref = dictionary["ref"]
- ret.alt = dictionary["alt"]
- ret.qual = dictionary["qual"]
- ret.filter = dictionary["filter"]
- ret.info = dictionary["info"]
+ ret.chrom = dictionary["chrom"]
+ ret.pos = dictionary["pos"]
+ ret.id = dictionary["id"]
+ ret.ref = dictionary["ref"]
+ ret.alt = dictionary["alt"]
+ ret.qual = dictionary["qual"]
+ ret.filter = dictionary["filter"]
+ ret.info = dictionary["info"]
ret._original_line = None
@classmethod
- def fromline(cls, line, nsamples = 0, sampleids = []):
+ def fromline(cls, line, nsamples=0, sampleids=[]):
ret = cls()
if nsamples == 0:
ret.format = None
@@ -763,9 +879,10 @@ class VariantCall(object):
ret.chrom, ret.pos, ret.id, ret.ref, ret.alt, ret.qual, ret.filter, ret.info = lsplit[:8]
ret.format = lsplit[8].split(":")
ret.samples = {}
- spos=9
+ spos = 9
for sid in sampleids:
- ret.samples[ sid ] = dict((name, value) for (name, value) in itertools.izip(ret.format, lsplit[spos].split(":")))
+ ret.samples[sid] = dict((name, value) for (
+ name, value) in itertools.izip(ret.format, lsplit[spos].split(":")))
spos += 1
ret.pos = GenomicPosition(ret.chrom, int(ret.pos))
ret.alt = ret.alt.split(",")
@@ -779,28 +896,29 @@ class VariantCall(object):
return self.info
def get_original_line(self):
- warnings.warn("Original line is empty, probably this object was created from scratch and not from a line in a .vcf file!")
- return self._original_line
+ warnings.warn(
+ "Original line is empty, probably this object was created from scratch and not from a line in a .vcf file!")
+ return self._original_line
def sampleline(self):
- if self.format == None:
- print >> sys.stderr, "No samples in this variant call!"
- return ""
- keys = self.format
- ret = [ ":".join(keys) ]
- for sid in self.samples:
- tmp = []
- for k in keys:
- if k in self.samples[sid]:
- tmp.append(self.samples[sid][k])
- ret.append(":".join(tmp))
- return "\t".join(ret)
+ if self.format == None:
+ sys.stderr.write("No samples in this variant call!\n")
+ return ""
+ keys = self.format
+ ret = [":".join(keys)]
+ for sid in self.samples:
+ tmp = []
+ for k in keys:
+ if k in self.samples[sid]:
+ tmp.append(self.samples[sid][k])
+ ret.append(":".join(tmp))
+ return "\t".join(ret)
def to_line(self):
- if self.format == None:
- return "\t".join(map(str, [ self.pos.chrom, self.pos.pos, self.id, self.ref, ",".join(self.alt), self.qual, self.filter, self.infoline() ])) + "\n"
- else:
- return "\t".join(map(str, [ self.pos.chrom, self.pos.pos, self.id, self.ref, ",".join(self.alt), self.qual, self.filter, self.infoline(), self.sampleline() ])) + "\n"
+ if self.format == None:
+ return "\t".join(map(str, [self.pos.chrom, self.pos.pos, self.id, self.ref, ",".join(self.alt), self.qual, self.filter, self.infoline()])) + "\n"
+ else:
+ return "\t".join(map(str, [self.pos.chrom, self.pos.pos, self.id, self.ref, ",".join(self.alt), self.qual, self.filter, self.infoline(), self.sampleline()])) + "\n"
def __descr__(self):
return "<VariantCall at %s, ref '%s', alt %s >" % (str(self.pos).rstrip("/."), self.ref, str(self.alt).strip("[]"))
@@ -817,9 +935,9 @@ class VariantCall(object):
tmp[token[0]] = map(infodict[token[0]], token[1].split(","))
else:
tmp[token[0]] = token[1].split(",")
- if len(tmp[ token[0] ]) == 1:
+ if len(tmp[token[0]]) == 1:
tmp[token[0]] = tmp[token[0]][0]
- else: #Flag attribute found
+ else: # Flag attribute found
tmp[token] = True
diff = set(infodict.keys()).difference(set(tmp.keys()))
for key in diff:
@@ -827,6 +945,7 @@ class VariantCall(object):
tmp[key] = False
self.info = tmp
+
class VCF_Reader(FileOrSequence):
def __init__(self, filename_or_sequence):
@@ -839,10 +958,12 @@ class VCF_Reader(FileOrSequence):
self.sampleids = []
def make_info_dict(self):
- self.infodict = dict((key, _vcf_typemap[self.info[key]["Type"]]) for key in self.info.keys())
+ self.infodict = dict(
+ (key,
+ _vcf_typemap[self.info[key]["Type"]]) for key in self.info.keys())
- def parse_meta(self, header_filename = None):
- if header_filename == None:
+ def parse_meta(self, header_filename=None):
+ if header_filename is None:
the_iter = FileOrSequence.__iter__(self)
else:
the_iter = open(header_filename, "r")
@@ -854,22 +975,25 @@ class VCF_Reader(FileOrSequence):
if mo:
value = mo.group(2)
if mo.group(1) == "INFO":
- value = dict(e.rstrip(",").split("=",1) for e in _re_vcf_meta_descr.findall(value))
+ value = dict(e.rstrip(",").split("=", 1)
+ for e in _re_vcf_meta_descr.findall(value))
key = value["ID"]
del value["ID"]
- self.info[ key ] = value
+ self.info[key] = value
elif mo.group(1) == "FILTER":
- value = dict(e.rstrip(",").split("=",1) for e in _re_vcf_meta_descr.findall(value))
+ value = dict(e.rstrip(",").split("=", 1)
+ for e in _re_vcf_meta_descr.findall(value))
key = value["ID"]
del value["ID"]
- self.filters[ key ] = value
+ self.filters[key] = value
elif mo.group(1) == "FORMAT":
- value = dict(e.rstrip(",").split("=",1) for e in _re_vcf_meta_descr.findall(value))
+ value = dict(e.rstrip(",").split("=", 1)
+ for e in _re_vcf_meta_descr.findall(value))
key = value["ID"]
del value["ID"]
- self.formats[ key ] = value
+ self.formats[key] = value
else:
- self.metadata[ mo.group(1) ] = mo.group(2)
+ self.metadata[mo.group(1)] = mo.group(2)
else:
self.sampleids = line.rstrip("\t\n").split("\t")[9:]
self.nsamples = len(self.sampleids)
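
Putting the VCF machinery together, a sketch (file name hypothetical):

    import HTSeq

    vcf = HTSeq.VCF_Reader("calls.vcf")
    vcf.parse_meta()        # fills .metadata, .info, .filters, .formats
    vcf.make_info_dict()    # maps INFO keys to Python types via _vcf_typemap
    for vc in vcf:          # yields VariantCall objects
        vc.unpack_info(vcf.infodict)
        print(vc.pos, vc.ref, vc.alt, vc.filter)
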
@@ -961,14 +1085,14 @@ class BAM_Reader(object):
try:
import pysam
except ImportError:
- sys.stderr.write("Please Install PySam to use the BAM_Reader Class (http://code.google.com/p/pysam/)")
+ sys.stderr.write(
+ "Please Install PySam to use the BAM_Reader Class (http://code.google.com/p/pysam/)")
raise
def __iter__(self):
sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
self.record_no = 0
for pa in sf:
- #yield SAM_Alignment.from_pysam_AlignedRead(pa, sf)
yield SAM_Alignment.from_pysam_AlignedSegment(pa, sf)
self.record_no += 1
@@ -996,13 +1120,15 @@ class BAM_Reader(object):
def __getitem__(self, iv):
if not isinstance(iv, GenomicInterval):
- raise TypeError("Use a HTSeq.GenomicInterval to access regions within .bam-file!")
+ raise TypeError(
+ "Use a HTSeq.GenomicInterval to access regions within .bam-file!")
if self.sf is None:
self.sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
# NOTE: pysam 0.9 has renames _hasIndex into has_index
if (hasattr(self.sf, '_hasIndex') and (not self.sf._hasIndex())) or (not self.sf.has_index()):
- raise ValueError("The .bam-file has no index, random-access is disabled!")
- for pa in self.sf.fetch(iv.chrom, iv.start+1, iv.end):
+ raise ValueError(
+ "The .bam-file has no index, random-access is disabled!")
+ for pa in self.sf.fetch(iv.chrom, iv.start + 1, iv.end):
yield SAM_Alignment.from_pysam_AlignedRead(pa, self.sf)
def get_header_dict(self):
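
Random access hinges on that index check; a sketch (file name hypothetical, with a .bai index next to the BAM):

    import HTSeq

    bam = HTSeq.BAM_Reader("sample.bam")
    window = HTSeq.GenomicInterval("chr1", 100000, 101000, ".")
    for almnt in bam[window]:
        print(almnt.read.name, almnt.iv)
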
@@ -1011,11 +1137,13 @@ class BAM_Reader(object):
class BAM_Writer(object):
- def __init__(self, filename, template=None, referencenames=None, referencelengths=None, text=None, header=None):
+ def __init__(self, filename, template=None, referencenames=None,
+ referencelengths=None, text=None, header=None):
try:
import pysam
except ImportError:
- sys.stderr.write("Please Install PySam to use the BAM_Writer Class (http://code.google.com/p/pysam/)")
+ sys.stderr.write(
+ "Please Install PySam to use the BAM_Writer Class (http://code.google.com/p/pysam/)")
raise
self.filename = filename
@@ -1038,7 +1166,6 @@ class BAM_Writer(object):
return BAM_Writer(filename=fn, header=br.get_header_dict())
def write(self, alnmt):
- #self.sf.write(alnmt.to_pysam_AlignedRead(self.sf))
self.sf.write(alnmt.to_pysam_AlignedSegment(self.sf))
def close(self):
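
A writer sketch built on the classmethod above (method name as in the HTSeq sources; file names hypothetical):

    import HTSeq

    br = HTSeq.BAM_Reader("in.bam")
    bw = HTSeq.BAM_Writer.from_BAM_Reader("out.bam", br)
    for almnt in br:
        bw.write(almnt)   # writes via to_pysam_AlignedSegment
    bw.close()
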
@@ -1059,9 +1186,21 @@ class BED_Reader(FileOrSequence):
raise ValueError("BED file line contains less than 3 fields")
if len(fields) > 9:
raise ValueError("BED file line contains more than 9 fields")
- iv = GenomicInterval(fields[0], int(fields[1]), int(fields[2]), fields[5] if len(fields) > 5 else ".")
- f = GenomicFeature(fields[3] if len(fields) > 3 else "unnamed", "BED line", iv)
+ iv = GenomicInterval(
+ fields[0],
+ int(fields[1]),
+ int(fields[2]),
+ fields[5] if len(fields) > 5 else ".")
+ f = GenomicFeature(
+ fields[3] if len(fields) > 3 else "unnamed",
+ "BED line",
+ iv)
f.score = float(fields[4]) if len(fields) > 4 else None
- f.thick = GenomicInterval(iv.chrom, int(fields[6]), int(fields[7]), iv.strand) if len(fields) > 7 else None
- f.itemRgb = [int(a) for a in fields[8].split(",")] if len(fields) > 8 else None
+ f.thick = GenomicInterval(
+ iv.chrom,
+ int(fields[6]),
+ int(fields[7]),
+ iv.strand) if len(fields) > 7 else None
+ f.itemRgb = [int(a) for a in fields[8].split(",")
+ ] if len(fields) > 8 else None
yield(f)
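
A BED sketch; fields past the third are optional and default as above (file name hypothetical):

    import HTSeq

    for f in HTSeq.BED_Reader("regions.bed"):
        print(f.name, f.iv, f.score, f.thick)   # score/thick may be None
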
=====================================
python2/HTSeq/_version.py
=====================================
@@ -1 +1 @@
-__version__ = "0.10.0"
\ No newline at end of file
+__version__ = "0.11.2"
\ No newline at end of file
=====================================
python2/HTSeq/scripts/count.py
=====================================
@@ -41,12 +41,13 @@ def count_reads_in_features(sam_filenames, gff_filename,
r = (r,)
for read in r:
if read is not None:
- samoutfile.write(read.original_sam_line.rstrip() +
- "\tXF:Z:" + assignment + "\n")
+ read.optional_fields.append(('XF', assignment))
+ samoutfile.write(read.get_sam_line() + "\n")
- if samouts != "":
+ if samouts != []:
if len(samouts) != len(sam_filenames):
- raise ValueError('Select the same number of SAM input and output files')
+ raise ValueError(
+ 'Select the same number of SAM input and output files')
# Try to open samout files early in case any of them has issues
for samout in samouts:
with open(samout, 'w'):
@@ -73,12 +74,14 @@ def count_reads_in_features(sam_filenames, gff_filename,
try:
feature_id = f.attr[id_attribute]
except KeyError:
- raise ValueError("Feature %s does not contain a '%s' attribute" %
- (f.name, id_attribute))
+ raise ValueError(
+ "Feature %s does not contain a '%s' attribute" %
+ (f.name, id_attribute))
if stranded != "no" and f.iv.strand == ".":
- raise ValueError("Feature %s at %s does not have strand information but you are "
- "running htseq-count in stranded mode. Use '--stranded=no'." %
- (f.name, f.iv))
+ raise ValueError(
+ "Feature %s at %s does not have strand information but you are "
+ "running htseq-count in stranded mode. Use '--stranded=no'." %
+ (f.name, f.iv))
features[f.iv] += feature_id
counts[f.attr[id_attribute]] = 0
attributes[f.attr[id_attribute]] = [
@@ -116,22 +119,28 @@ def count_reads_in_features(sam_filenames, gff_filename,
lowqual_all = []
nonunique_all = []
for isam, (sam_filename) in enumerate(sam_filenames):
- if samouts != '':
+ if samouts != []:
samoutfile = open(samouts[isam], 'w')
else:
samoutfile = None
try:
- if sam_filename != "-":
- read_seq_file = SAM_or_BAM_Reader(sam_filename)
- read_seq = read_seq_file
- first_read = next(iter(read_seq))
- else:
+ if sam_filename == "-":
read_seq_file = SAM_or_BAM_Reader(sys.stdin)
- read_seq_iter = iter(read_seq_file)
+ else:
+ read_seq_file = SAM_or_BAM_Reader(sam_filename)
+ read_seq_iter = iter(read_seq_file)
+ # Catch empty BAM files
+ try:
first_read = next(read_seq_iter)
+ pe_mode = first_read.paired_end
+ except:
+ first_read = None
+ pe_mode = False
+ if first_read is not None:
read_seq = itertools.chain([first_read], read_seq_iter)
- pe_mode = first_read.paired_end
+ else:
+ read_seq = []
except:
sys.stderr.write(
"Error occured when reading beginning of SAM/BAM file.\n")
@@ -175,7 +184,10 @@ def count_reads_in_features(sam_filenames, gff_filename,
try:
if r.optional_field("NH") > 1:
nonunique += 1
- write_to_samout(r, "__alignment_not_unique", samoutfile)
+ write_to_samout(
+ r,
+ "__alignment_not_unique",
+ samoutfile)
if multimapped_mode == 'none':
continue
except KeyError:
@@ -325,7 +337,8 @@ def count_reads_in_features(sam_filenames, gff_filename,
print('\t'.join(["__alignment_not_unique"] + pad + [str(c) for c in nonunique_all]))
-def my_showwarning(message, category, filename, lineno=None, file=None, line=None):
+def my_showwarning(message, category, filename, lineno=None, file=None,
+ line=None):
sys.stderr.write("Warning: %s\n" % message)
@@ -396,9 +409,11 @@ def main():
"suitable for Ensembl GTF files: gene_id)")
pa.add_argument(
- "--additional-attr", type=str, nargs='+',
- default=(), help="Additional feature attributes (default: none, " +
- "suitable for Ensembl GTF files: gene_name)")
+ "--additional-attr", type=str,
+ action='append',
+ default=[], help="Additional feature attributes (default: none, " +
+ "suitable for Ensembl GTF files: gene_name). Use multiple times " +
+ "for each different attribute")
pa.add_argument(
"-m", "--mode", dest="mode",
@@ -414,19 +429,20 @@ def main():
pa.add_argument(
"--secondary-alignments", dest="secondary_alignments", type=str,
- choices=("score", "ignore"), default="score",
+ choices=("score", "ignore"), default="ignore",
help="Whether to score secondary alignments (0x100 flag)")
pa.add_argument(
"--supplementary-alignments", dest="supplementary_alignments", type=str,
- choices=("score", "ignore"), default="score",
+ choices=("score", "ignore"), default="ignore",
help="Whether to score supplementary alignments (0x800 flag)")
pa.add_argument(
- "-o", "--samout", type=str, dest="samouts", nargs='+',
- default="", help="write out all SAM alignment records into an output " +
- "SAM file called SAMOUT, annotating each line with its feature assignment " +
- "(as an optional field with tag 'XF')")
+ "-o", "--samout", type=str, dest="samouts",
+ action='append',
+ default=[], help="write out all SAM alignment records into " +
+ "SAM files (one per input file needed), annotating each line " +
+ "with its feature assignment (as an optional field with tag 'XF')")
pa.add_argument(
"-q", "--quiet", action="store_true", dest="quiet",
=====================================
python2/HTSeq/scripts/qa.py
=====================================
@@ -5,18 +5,21 @@
# (c) Simon Anders, European Molecular Biology Laboratory, 2010
# released under GNU General Public License
-import sys, time, os.path, optparse
+import sys
+import os.path
+import optparse
from itertools import *
import numpy
import HTSeq
+
def main():
try:
import matplotlib
except ImportError:
sys.stderr.write("This script needs the 'matplotlib' library, which ")
- sys.stderr.write("was not found. Please install it." )
+ sys.stderr.write("was not found. Please install it.")
matplotlib.use('PDF')
from matplotlib import pyplot
@@ -26,10 +29,10 @@ def main():
except ImportError:
from matplotlib.pyplot import normalize as Normalize
-
# **** Parse command line ****
- optParser = optparse.OptionParser( usage = "%prog [options] read_file",
+ optParser = optparse.OptionParser(
+ usage="%prog [options] read_file",
description=
"This script take a file with high-throughput sequencing reads " +
"(supported formats: SAM, Solexa _export.txt, FASTQ, Solexa " +
@@ -40,62 +43,69 @@ def main():
epilog =
"Written by Simon Anders (sanders at fs.tum.de), European Molecular Biology " +
" Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " +
- " Public License v3. Part of the 'HTSeq' framework, version %s." % HTSeq.__version__ )
- optParser.add_option( "-t", "--type", type="choice", dest="type",
- choices = ("sam", "bam", "solexa-export", "fastq", "solexa-fastq"),
- default = "sam", help="type of read_file (one of: sam [default], bam, " +
- "solexa-export, fastq, solexa-fastq)" )
- optParser.add_option( "-o", "--outfile", type="string", dest="outfile",
- help="output filename (default is <read_file>.pdf)" )
- optParser.add_option( "-r", "--readlength", type="int", dest="readlen",
- help="the maximum read length (when not specified, the script guesses from the file" )
- optParser.add_option( "-g", "--gamma", type="float", dest="gamma",
+ " Public License v3. Part of the 'HTSeq' framework, version %s." % HTSeq.__version__)
+ optParser.add_option(
+ "-t", "--type", type="choice", dest="type",
+ choices=("sam", "bam", "solexa-export", "fastq", "solexa-fastq"),
+ default="sam", help="type of read_file (one of: sam [default], bam, " +
+ "solexa-export, fastq, solexa-fastq)")
+ optParser.add_option(
+ "-o", "--outfile", type="string", dest="outfile",
+ help="output filename (default is <read_file>.pdf)")
+ optParser.add_option(
+ "-r", "--readlength", type="int", dest="readlen",
+ help="the maximum read length (when not specified, the script guesses from the file")
+ optParser.add_option(
+ "-g", "--gamma", type="float", dest="gamma",
default = 0.3,
- help="the gamma factor for the contrast adjustment of the quality score plot" )
- optParser.add_option( "-n", "--nosplit", action="store_true", dest="nosplit",
- help="do not split reads in unaligned and aligned ones" )
- optParser.add_option( "-m", "--maxqual", type="int", dest="maxqual", default=41,
- help="the maximum quality score that appears in the data (default: 41)" )
-
- if len( sys.argv ) == 1:
+ help="the gamma factor for the contrast adjustment of the quality score plot")
+ optParser.add_option(
+ "-n", "--nosplit", action="store_true", dest="nosplit",
+ help="do not split reads in unaligned and aligned ones")
+ optParser.add_option(
+ "-m", "--maxqual", type="int", dest="maxqual", default=41,
+ help="the maximum quality score that appears in the data (default: 41)")
+
+ if len(sys.argv) == 1:
optParser.print_help()
sys.exit(1)
(opts, args) = optParser.parse_args()
- if len( args ) != 1:
- sys.stderr.write( sys.argv[0] + ": Error: Please provide one argument (the read_file).\n" )
- sys.stderr.write( " Call with '-h' to get usage information.\n" )
- sys.exit( 1 )
+ if len(args) != 1:
+ sys.stderr.write(
+ sys.argv[0] +
+ ": Error: Please provide one argument (the read_file).\n")
+ sys.stderr.write(" Call with '-h' to get usage information.\n")
+ sys.exit(1)
readfilename = args[0]
if opts.type == "sam":
- readfile = HTSeq.SAM_Reader( readfilename )
+ readfile = HTSeq.SAM_Reader(readfilename)
isAlnmntFile = True
elif opts.type == "bam":
- readfile = HTSeq.BAM_Reader( readfilename )
+ readfile = HTSeq.BAM_Reader(readfilename)
isAlnmntFile = True
elif opts.type == "solexa-export":
- readfile = HTSeq.SolexaExportReader( readfilename )
+ readfile = HTSeq.SolexaExportReader(readfilename)
isAlnmntFile = True
elif opts.type == "fastq":
- readfile = HTSeq.FastqReader( readfilename )
+ readfile = HTSeq.FastqReader(readfilename)
isAlnmntFile = False
elif opts.type == "solexa-fastq":
- readfile = HTSeq.FastqReader( readfilename, "solexa" )
+ readfile = HTSeq.FastqReader(readfilename, "solexa")
isAlnmntFile = False
else:
- sys.error( "Oops." )
+ sys.error("Oops.")
twoColumns = isAlnmntFile and not opts.nosplit
if opts.outfile is None:
- outfilename = os.path.basename( readfilename ) + ".pdf"
+ outfilename = os.path.basename(readfilename) + ".pdf"
else:
outfilename = opts.outfile
-
# **** Get read length ****
if opts.readlen is not None:
@@ -103,25 +113,23 @@ def main():
else:
readlen = 0
if isAlnmntFile:
- reads = ( a.read for a in readfile )
+ reads = (a.read for a in readfile)
else:
reads = readfile
- for r in islice( reads, 10000 ):
- if len( r ) > readlen:
- readlen = len( r )
+ for r in islice(reads, 10000):
+ if len(r) > readlen:
+ readlen = len(r)
max_qual = opts.maxqual
gamma = opts.gamma
-
# **** Initialize count arrays ****
- base_arr_U = numpy.zeros( ( readlen, 5 ), numpy.int )
- qual_arr_U = numpy.zeros( ( readlen, max_qual+1 ), numpy.int )
+ base_arr_U = numpy.zeros((readlen, 5), numpy.int)
+ qual_arr_U = numpy.zeros((readlen, max_qual+1), numpy.int)
if twoColumns:
- base_arr_A = numpy.zeros( ( readlen, 5 ), numpy.int )
- qual_arr_A = numpy.zeros( ( readlen, max_qual+1 ), numpy.int )
-
+ base_arr_A = numpy.zeros((readlen, 5), numpy.int)
+ qual_arr_A = numpy.zeros((readlen, max_qual+1), numpy.int)
# **** Main counting loop ****
@@ -130,110 +138,107 @@ def main():
for a in readfile:
if isAlnmntFile:
r = a.read
- else:
- r = a
- if twoColumns and (isAlnmntFile and a.aligned):
- r.add_bases_to_count_array( base_arr_A )
- r.add_qual_to_count_array( qual_arr_A )
- else:
- r.add_bases_to_count_array( base_arr_U )
- r.add_qual_to_count_array( qual_arr_U )
- i += 1
- if (i % 200000) == 0:
- print i, "reads processed"
+ else:
+ r = a
+ if twoColumns and (isAlnmntFile and a.aligned):
+ r.add_bases_to_count_array(base_arr_A)
+ r.add_qual_to_count_array(qual_arr_A)
+ else:
+ r.add_bases_to_count_array(base_arr_U)
+ r.add_qual_to_count_array(qual_arr_U)
+ i += 1
+ if (i % 200000) == 0:
+ print i, "reads processed"
except:
- sys.stderr.write( "Error occured in: %s\n" %
- readfile.get_line_number_string() )
+ sys.stderr.write("Error occured in: %s\n" %
+ readfile.get_line_number_string())
raise
print i, "reads processed"
-
# **** Normalize result ****
- def norm_by_pos( arr ):
- arr = numpy.array( arr, numpy.float )
- arr_n = ( arr.T / arr.sum( 1 ) ).T
- arr_n[ arr == 0 ] = 0
+ def norm_by_pos(arr):
+ arr = numpy.array(arr, numpy.float)
+ arr_n = (arr.T / arr.sum(1)).T
+ arr_n[arr == 0] = 0
return arr_n
- def norm_by_start( arr ):
- arr = numpy.array( arr, numpy.float )
- arr_n = ( arr.T / arr.sum( 1 )[ 0 ] ).T
- arr_n[ arr == 0 ] = 0
+ def norm_by_start(arr):
+ arr = numpy.array(arr, numpy.float)
+ arr_n = (arr.T / arr.sum(1)[0]).T
+ arr_n[arr == 0] = 0
return arr_n
-
- base_arr_U_n = norm_by_pos( base_arr_U )
- qual_arr_U_n = norm_by_start( qual_arr_U )
- nreads_U = base_arr_U[0,:].sum()
+ base_arr_U_n = norm_by_pos(base_arr_U)
+ qual_arr_U_n = norm_by_start(qual_arr_U)
+ nreads_U = base_arr_U[0, :].sum()
if twoColumns:
- base_arr_A_n = norm_by_pos( base_arr_A )
- qual_arr_A_n = norm_by_start( qual_arr_A )
- nreads_A = base_arr_A[0,:].sum()
-
+ base_arr_A_n = norm_by_pos(base_arr_A)
+ qual_arr_A_n = norm_by_start(qual_arr_A)
+ nreads_A = base_arr_A[0, :].sum()
# **** Make plot ****
- def plot_bases( arr ):
- xg = numpy.arange( readlen )
- pyplot.plot( xg, arr[ : , 0 ], marker='.', color='red')
- pyplot.plot( xg, arr[ : , 1 ], marker='.', color='darkgreen')
- pyplot.plot( xg, arr[ : , 2 ], marker='.',color='lightgreen')
- pyplot.plot( xg, arr[ : , 3 ], marker='.',color='orange')
- pyplot.plot( xg, arr[ : , 4 ], marker='.',color='grey')
- pyplot.axis( (0, readlen-1, 0, 1 ) )
- pyplot.text( readlen*.70, .9, "A", color="red" )
- pyplot.text( readlen*.75, .9, "C", color="darkgreen" )
- pyplot.text( readlen*.80, .9, "G", color="lightgreen" )
- pyplot.text( readlen*.85, .9, "T", color="orange" )
- pyplot.text( readlen*.90, .9, "N", color="grey" )
+ def plot_bases(arr):
+ xg = numpy.arange(readlen)
+ pyplot.plot(xg, arr[:, 0], marker='.', color='red')
+ pyplot.plot(xg, arr[:, 1], marker='.', color='darkgreen')
+ pyplot.plot(xg, arr[:, 2], marker='.', color='lightgreen')
+ pyplot.plot(xg, arr[:, 3], marker='.', color='orange')
+ pyplot.plot(xg, arr[:, 4], marker='.', color='grey')
+ pyplot.axis((0, readlen-1, 0, 1))
+ pyplot.text(readlen*.70, .9, "A", color="red")
+ pyplot.text(readlen*.75, .9, "C", color="darkgreen")
+ pyplot.text(readlen*.80, .9, "G", color="lightgreen")
+ pyplot.text(readlen*.85, .9, "T", color="orange")
+ pyplot.text(readlen*.90, .9, "N", color="grey")
pyplot.figure()
- pyplot.subplots_adjust( top=.85 )
- pyplot.suptitle( os.path.basename(readfilename), fontweight='bold' )
+ pyplot.subplots_adjust(top=.85)
+ pyplot.suptitle(os.path.basename(readfilename), fontweight='bold')
if twoColumns:
- pyplot.subplot( 221 )
- plot_bases( base_arr_U_n )
- pyplot.ylabel( "proportion of base" )
- pyplot.title( "non-aligned reads\n%.0f%% (%.3f million)" %
- ( 100. * nreads_U / (nreads_U+nreads_A), nreads_U / 1e6 ) )
-
- pyplot.subplot( 222 )
- plot_bases( base_arr_A_n )
- pyplot.title( "aligned reads\n%.0f%% (%.3f million)" %
- ( 100. * nreads_A / (nreads_U+nreads_A), nreads_A / 1e6 ) )
-
- pyplot.subplot( 223 )
- pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
- norm=Normalize( 0, 1 ) )
- pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
- pyplot.xlabel( "position in read" )
- pyplot.ylabel( "base-call quality score" )
-
- pyplot.subplot( 224 )
- pyplot.pcolor( qual_arr_A_n.T ** gamma, cmap=pyplot.cm.Greens,
- norm=Normalize( 0, 1 ) )
- pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
- pyplot.xlabel( "position in read" )
+ pyplot.subplot(221)
+ plot_bases(base_arr_U_n)
+ pyplot.ylabel("proportion of base")
+ pyplot.title("non-aligned reads\n%.0f%% (%.4f million)" %
+ (100. * nreads_U / (nreads_U+nreads_A), nreads_U / 1e6))
+
+ pyplot.subplot(222)
+ plot_bases(base_arr_A_n)
+ pyplot.title("aligned reads\n%.0f%% (%.4f million)" %
+ (100. * nreads_A / (nreads_U+nreads_A), nreads_A / 1e6))
+
+ pyplot.subplot(223)
+ pyplot.pcolor(qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
+ norm=Normalize(0, 1))
+ pyplot.axis((0, readlen-1, 0, max_qual+1))
+ pyplot.xlabel("position in read")
+ pyplot.ylabel("base-call quality score")
+
+ pyplot.subplot(224)
+ pyplot.pcolor(qual_arr_A_n.T ** gamma, cmap=pyplot.cm.Greens,
+ norm=Normalize(0, 1))
+ pyplot.axis((0, readlen-1, 0, max_qual+1))
+ pyplot.xlabel("position in read")
else:
- pyplot.subplot( 211 )
- plot_bases( base_arr_U_n )
- pyplot.ylabel( "proportion of base" )
- pyplot.title( "%.3f million reads" % ( nreads_U / 1e6 ) )
+ pyplot.subplot(211)
+ plot_bases(base_arr_U_n)
+ pyplot.ylabel("proportion of base")
+ pyplot.title("%.3f million reads" % (nreads_U / 1e6))
- pyplot.subplot( 212 )
- pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
- norm=Normalize( 0, 1 ) )
- pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
- pyplot.xlabel( "position in read" )
- pyplot.ylabel( "base-call quality score" )
+ pyplot.subplot(212)
+ pyplot.pcolor(qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
+ norm=Normalize(0, 1))
+ pyplot.axis((0, readlen-1, 0, max_qual+1))
+ pyplot.xlabel("position in read")
+ pyplot.ylabel("base-call quality score")
+ pyplot.savefig(outfilename)
- pyplot.savefig( outfilename )
if __name__ == "__main__":
main()
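
For reference, this script ships as the htseq-qa entry point; an illustrative invocation: htseq-qa -t bam -o sample.pdf sample.bam (without -o, the output defaults to <read_file>.pdf).
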
=====================================
python2/src/HTSeq/_HTSeq.pxd
=====================================
@@ -7,47 +7,47 @@ cdef class GenomicInterval:
cdef public long end
cdef str _strand
- cpdef is_contained_in( self, GenomicInterval iv )
- cpdef contains( self, GenomicInterval iv )
- cpdef overlaps( self, GenomicInterval iv )
- cpdef extend_to_include( GenomicInterval self, GenomicInterval iv )
+ cpdef is_contained_in(self, GenomicInterval iv)
+ cpdef contains(self, GenomicInterval iv)
+ cpdef overlaps(self, GenomicInterval iv)
+ cpdef extend_to_include(GenomicInterval self, GenomicInterval iv)
-cdef class GenomicPosition( GenomicInterval ):
+cdef class GenomicPosition(GenomicInterval):
pass
-cdef class Sequence( object ):
+cdef class Sequence(object):
cdef public bytes seq
cdef public str name
cdef public str descr
- cpdef Sequence get_reverse_complement( self )
- cpdef object add_bases_to_count_array( Sequence self, numpy.ndarray count_array_ )
- cpdef Sequence trim_left_end( Sequence self, Sequence pattern, float mismatch_prop = ? )
- cpdef Sequence trim_right_end( Sequence self, Sequence pattern, float mismatch_prop = ? )
+ cpdef Sequence get_reverse_complement(self, bint rename=?)
+ cpdef object add_bases_to_count_array(Sequence self, numpy.ndarray count_array_)
+ cpdef Sequence trim_left_end(Sequence self, Sequence pattern, float mismatch_prop=?)
+ cpdef Sequence trim_right_end(Sequence self, Sequence pattern, float mismatch_prop=?)
-cdef class SequenceWithQualities( Sequence ):
+cdef class SequenceWithQualities(Sequence):
cdef readonly bytes _qualstr
cdef readonly bytes _qualstr_phred
cdef readonly str _qualscale
cdef readonly object _qualarr
- cdef _fill_qual_arr( SequenceWithQualities self )
- cpdef object add_qual_to_count_array( SequenceWithQualities self, numpy.ndarray count_array_ )
- cpdef SequenceWithQualities trim_left_end_with_quals( SequenceWithQualities self,
- Sequence pattern, int max_mm_qual = ? )
- cpdef SequenceWithQualities trim_right_end_with_quals( SequenceWithQualities self,
- Sequence pattern, int max_mm_qual = ? )
+ cdef _fill_qual_arr(SequenceWithQualities self)
+ cpdef object add_qual_to_count_array(SequenceWithQualities self, numpy.ndarray count_array_)
+ cpdef SequenceWithQualities trim_left_end_with_quals(SequenceWithQualities self,
+ Sequence pattern, int max_mm_qual=?)
+ cpdef SequenceWithQualities trim_right_end_with_quals(SequenceWithQualities self,
+ Sequence pattern, int max_mm_qual=?)
-cdef class Alignment( object ):
+cdef class Alignment(object):
cdef public SequenceWithQualities _read
cdef public GenomicInterval iv
-cdef class AlignmentWithSequenceReversal( Alignment ):
+cdef class AlignmentWithSequenceReversal(Alignment):
cdef public SequenceWithQualities read_as_aligned
cdef public SequenceWithQualities _read_as_sequenced
-cdef class SAM_Alignment( AlignmentWithSequenceReversal ):
+cdef class SAM_Alignment(AlignmentWithSequenceReversal):
cdef public list cigar
cdef public int aQual
cdef public GenomicPosition mate_start
=====================================
python2/src/HTSeq/_HTSeq.pyx
=====================================
@@ -651,10 +651,15 @@ cdef class Sequence(object):
self.name = name
self.descr = None
- cpdef Sequence get_reverse_complement(self):
- return Sequence(
- reverse_complement(self.seq),
- "revcomp_of_" + self.name)
+ cpdef Sequence get_reverse_complement(self, bint rename=True):
+ if rename:
+ return Sequence(
+ reverse_complement(self.seq),
+ "revcomp_of_" + self.name)
+ else:
+ return Sequence(
+ reverse_complement(self.seq),
+ self.name)
def __str__(self):
return self.seq
@@ -903,13 +908,20 @@ cdef class SequenceWithQualities(Sequence):
self.write_to_fastq_file(sio, convert_to_phred)
return sio.getvalue()
- cpdef SequenceWithQualities get_reverse_complement(self):
+ cpdef SequenceWithQualities get_reverse_complement(self, bint rename=True):
cdef SequenceWithQualities res
- res = SequenceWithQualities(
- reverse_complement(self.seq),
- "revcomp_of_" + self.name,
- self._qualstr[::-1],
- self._qualscale)
+ if rename:
+ res = SequenceWithQualities(
+ reverse_complement(self.seq),
+ "revcomp_of_" + self.name,
+ self._qualstr[::-1],
+ self._qualscale)
+ else:
+ res = SequenceWithQualities(
+ reverse_complement(self.seq),
+ self.name,
+ self._qualstr[::-1],
+ self._qualscale)
if self._qualarr is not None:
res._qualarr = self._qualarr[::-1]
return res
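
The rename flag added above (mirrored in the python3 sources further down) controls
whether the reverse complement gets the "revcomp_of_" name prefix. A minimal sketch
with a made-up read:

    import HTSeq

    s = HTSeq.Sequence(b"ACGTN", "read_1")              # toy read
    print(s.get_reverse_complement().seq)               # b'NACGT'
    print(s.get_reverse_complement().name)              # revcomp_of_read_1
    print(s.get_reverse_complement(rename=False).name)  # read_1, name kept
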
=====================================
python3/HTSeq/__init__.py
=====================================
@@ -9,21 +9,10 @@ import os
import shlex
import sys
-try:
- import HTSeq
- from HTSeq._HTSeq import *
- #from _HTSeq import *
-except ImportError:
- if os.path.isfile("setup.py"):
- raise ImportError(
- "Cannot import 'HTSeq' when working directory is HTSeq's own build directory.")
- else:
- raise
-
+import HTSeq
+from HTSeq._HTSeq import *
from HTSeq._version import __version__
-#from vcf_reader import *
-
#########################
# Utils
#########################
@@ -32,14 +21,14 @@ from HTSeq._version import __version__
class FileOrSequence(object):
    """ The constructor takes one argument, which may either be a string,
which is interpreted as a file name (possibly with path), or a
- connection, by which we mean a text file opened for reading, or
- any other object that can provide an iterator over strings
+ connection, by which we mean a text file opened for reading, or
+ any other object that can provide an iterator over strings
(lines of the file).
The advantage of passing a file name instead of an already opened file
is that if an iterator is requested several times, the file will be
re-opened each time. If the file is already open, its lines can be read
- only once, and then, the iterator stays exhausted.
+ only once, and then, the iterator stays exhausted.
    Furthermore, if a file name is passed that ends in ".gz" or ".gzip"
(case insensitive), it is transparently gunzipped.
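
In other words, both a path and any iterable of lines are accepted; a sketch with a
placeholder file name:

    import HTSeq

    # Names ending in ".gz" or ".gzip" (any case) are gunzipped on the fly.
    for line in HTSeq.FileOrSequence("annotation.gtf.gz"):
        print(line, end="")
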
@@ -90,14 +79,13 @@ class FileOrSequence(object):
#########################
class GenomicFeature(object):
-
"""A genomic feature, i.e., an interval on a genome with metadata.
At minimum, the following information should be provided by slots:
name: a string identifying the feature (e.g., a gene symbol)
type: a string giving the feature type (e.g., "gene", "exon")
- iv: a GenomicInterval object specifying the feature locus
+ iv: a GenomicInterval object specifying the feature locus
"""
def __init__(self, name, type_, interval):
@@ -145,8 +133,8 @@ class GenomicFeature(object):
attr_str = '; '.join(
['%s%s\"%s\"' % (ak, sep, attr[ak]) for ak in attr])
return "\t".join(str(a) for a in (self.iv.chrom, source,
- self.type, self.iv.start + 1, self.iv.end, score,
- self.iv.strand, frame, attr_str)) + "\n"
+ self.type, self.iv.start + 1, self.iv.end, score,
+ self.iv.strand, frame, attr_str)) + "\n"
_re_attr_main = re.compile("\s*([^\s\=]+)[\s=]+(.*)")
@@ -157,13 +145,16 @@ def parse_GFF_attribute_string(attrStr, extra_return_first_value=False):
"""Parses a GFF attribute string and returns it as a dictionary.
If 'extra_return_first_value' is set, a pair is returned: the dictionary
- and the value of the first attribute. This might be useful if this is the ID.
+ and the value of the first attribute. This might be useful if this is the
+ ID.
"""
if attrStr.endswith("\n"):
attrStr = attrStr[:-1]
d = {}
first_val = "_unnamed_"
- for (i, attr) in zip(itertools.count(), _HTSeq.quotesafe_split(attrStr.encode())):
+ for (i, attr) in zip(
+ itertools.count(),
+ _HTSeq.quotesafe_split(attrStr.encode())):
attr = attr.decode()
if _re_attr_empty.match(attr):
continue
@@ -176,7 +167,6 @@ def parse_GFF_attribute_string(attrStr, extra_return_first_value=False):
val = mo.group(2)
if val.startswith('"') and val.endswith('"'):
val = val[1:-1]
- #val = urllib.unquote( val )
d[sys.intern(mo.group(1))] = sys.intern(val)
if extra_return_first_value and i == 0:
first_val = val
@@ -185,14 +175,14 @@ def parse_GFF_attribute_string(attrStr, extra_return_first_value=False):
else:
return d
+
_re_gff_meta_comment = re.compile("##\s*(\S+)\s+(\S*)")
class GFF_Reader(FileOrSequence):
-
"""Parse a GFF file
- Pass the constructor either a file name or an iterator of lines of a
+ Pass the constructor either a file name or an iterator of lines of a
    GFF file. If a file name is specified, it may refer to a gzip compressed
file.
@@ -208,7 +198,6 @@ class GFF_Reader(FileOrSequence):
for line in FileOrSequence.__iter__(self):
if isinstance(line, bytes):
line = line.decode()
-
if line == "\n":
continue
if line.startswith('#'):
@@ -221,10 +210,15 @@ class GFF_Reader(FileOrSequence):
strand, frame, attributeStr) = line.split("\t", 8)
(attr, name) = parse_GFF_attribute_string(attributeStr, True)
if self.end_included:
- iv = GenomicInterval(seqname, int(start) - 1, int(end), strand)
+ iv = GenomicInterval(
+ seqname,
+ int(start) - 1, int(end),
+ strand)
else:
- iv = GenomicInterval(seqname, int(
- start) - 1, int(end) - 1, strand)
+ iv = GenomicInterval(
+ seqname,
+ int(start) - 1, int(end) - 1,
+ strand)
f = GenomicFeature(name, feature, iv)
if score != ".":
score = float(score)
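
A usage sketch for the reader as restructured above; the file name is a placeholder,
and end_included=True matches the GTF convention:

    import HTSeq

    gff = HTSeq.GFF_Reader("annotation.gtf", end_included=True)
    for feature in gff:
        if feature.type == "exon":
            print(feature.name, feature.iv)
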
@@ -238,7 +232,7 @@ class GFF_Reader(FileOrSequence):
def make_feature_dict(feature_sequence):
- """A feature dict is a convenient way to organize a sequence of Feature
+ """A feature dict is a convenient way to organize a sequence of Feature
    objects (which you have got, e.g., from parse_GFF).
The function returns a dict with all the feature types as keys. Each value
@@ -248,8 +242,8 @@ def make_feature_dict(feature_sequence):
An example makes this clear. Let's say you load the C. elegans GTF file
    from Ensembl and make a feature dict:
- >>> worm_features_dict = HTSeq.make_feature_dict( HTSeq.parse_GFF(
- ... "test_data/Caenorhabditis_elegans.WS200.55.gtf.gz" ) )
+ >>> worm_features_dict = HTSeq.make_feature_dict(HTSeq.parse_GFF(
+ ... "test_data/Caenorhabditis_elegans.WS200.55.gtf.gz"))
(This command may take a few minutes to deal with the 430,000 features
in the GTF file. Note that you may need a lot of RAM if you have millions
@@ -277,8 +271,9 @@ def make_feature_dict(feature_sequence):
#########################
def read_chrom_lens(filename, delimiter="\t"):
- return dict(((chrom, int(len))
- for chrom, len in csv.reader(open(filename), delimiter=delimiter)))
+ return dict(
+ ((chrom, int(len))
+ for chrom, len in csv.reader(open(filename), delimiter=delimiter)))
#########################
@@ -300,6 +295,8 @@ class FastaReader(FileOrSequence):
def __iter__(self):
seq = None
+ name = None
+ descr = None
for line in FileOrSequence.__iter__(self):
if line.startswith(">"):
if seq:
@@ -326,6 +323,7 @@ class FastaReader(FileOrSequence):
def get_sequence_lengths(self):
seqname = None
+ length = 0
seqlengths = {}
for line in FileOrSequence.__iter__(self):
if line.startswith(">"):
@@ -354,12 +352,13 @@ class FastaReader(FileOrSequence):
def build_index(self, force=False):
self._import_pysam()
if not isinstance(self.fos, str):
- raise TypeError("This function only works with FastaReader objects " +
- "connected to a fasta file via file name")
+ raise TypeError(
+ "This function only works with FastaReader objects " +
+ "connected to a fasta file via file name")
index_filename = self.fos + ".fai"
if os.access(index_filename, os.R_OK):
- if (not force) and os.stat( self.filename_or_sequence ).st_mtime <= \
- os.stat(index_filename).st_mtime:
+ if (not force) and os.stat(self.filename_or_sequence).st_mtime <= \
+ os.stat(index_filename).st_mtime:
# index is up to date
return
pysam.faidx(self.fos)
@@ -371,11 +370,13 @@ class FastaReader(FileOrSequence):
if not isinstance(iv, GenomicInterval):
raise TypeError("GenomicInterval expected as key.")
if not isinstance(self.fos, str):
- raise TypeError("This function only works with FastaReader objects " +
- "connected to a fasta file via file name")
+ raise TypeError(
+ "This function only works with FastaReader objects " +
+ "connected to a fasta file via file name")
self._import_pysam()
- fasta = pysam.faidx(self.fos, "%s:%d-%d" %
- (iv.chrom, iv.start, iv.end - 1))
+ fasta = pysam.faidx(
+ self.fos,
+ "%s:%d-%d" % (iv.chrom, iv.start, iv.end - 1))
ans = list(FastaReader(fasta))
assert len(ans) == 1
ans[0].name = str(iv)
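
The two methods reformatted above work together: build_index creates or refreshes a
pysam ".fai" index, after which GenomicInterval objects can be used as keys. A sketch
with placeholder names:

    import HTSeq

    fa = HTSeq.FastaReader("genome.fa")
    fa.build_index()                    # writes genome.fa.fai if missing or stale
    iv = HTSeq.GenomicInterval("chr1", 1000, 1050, ".")
    print(fa[iv].seq)                   # 50 bp from 0-based position 1000
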
@@ -408,28 +409,35 @@ class FastqReader(FileOrSequence):
qual = next(fin)
if qual == "":
if id1 != "":
- warnings.warn("Number of lines in FASTQ file is not "
- "a multiple of 4. Discarding the last, "
- "incomplete record")
+ warnings.warn(
+ "Number of lines in FASTQ file is not "
+ "a multiple of 4. Discarding the last, "
+ "incomplete record")
break
if not qual.endswith("\n"):
qual += "\n"
if not id1.startswith("@"):
- raise ValueError("Primary ID line in FASTQ file does"
- "not start with '@'. Either this is not FASTQ data or the parser got out of sync.")
+ raise ValueError(
+ "Primary ID line in FASTQ file does "
+ "not start with '@'. Either this is not FASTQ data or the "
+ "parser got out of sync.")
if not id2.startswith("+"):
- raise ValueError("Secondary ID line in FASTQ file does"
- "not start with '+'. Maybe got out of sync.")
+ raise ValueError(
+ "Secondary ID line in FASTQ file does"
+                " not start with '+'. Maybe the parser got out of sync.")
if len(id2) > 2 and id1[1:] != id2[1:]:
- raise ValueError("Primary and secondary ID line in FASTQ"
- "disagree.")
+ raise ValueError(
+                "Primary and secondary ID lines in FASTQ "
+                "file disagree.")
if self.raw_iterator:
s = (seq[:-1], id1[1:-1], qual[:-1], self.qual_scale)
else:
- s = SequenceWithQualities(seq[:-1].encode(), id1[1:-1],
- qual[:-1].encode(), self.qual_scale)
+ s = SequenceWithQualities(
+ seq[:-1].encode(), id1[1:-1],
+ qual[:-1].encode(),
+ self.qual_scale)
yield s
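
A sketch of the iterator above, with a placeholder path; the qual attribute of each
yielded read is a numpy array of Phred scores:

    import HTSeq

    for read in HTSeq.FastqReader("reads.fastq.gz"):
        if read.qual.min() >= 20:
            print(read.name)
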
@@ -445,8 +453,10 @@ class BowtieReader(FileOrSequence):
except ValueError:
if line.startswith("Reported "):
continue
- warnings.warn("BowtieReader: Ignoring the following line, which could not be parsed:\n%s\n" % line,
- RuntimeWarning)
+ warnings.warn(
+ "BowtieReader: Ignoring the following line, which could "
+ "not be parsed:\n%s\n" % line,
+ RuntimeWarning)
yield algnt
@@ -472,14 +482,14 @@ def bundle_multiple_alignments(sequence_of_alignments):
class SolexaExportAlignment(Alignment):
    """Iterating over SolexaExportReader objects will yield SolexaExportAlignment
    objects. These have five fields:
- read - a SequenceWithQualities object
+ read - a SequenceWithQualities object
aligned - a boolean, indicating whether the object was aligned
iv - a GenomicInterval giving the alignment (or None, if not aligned)
passed_filter - a boolean, indicating whether the object passed the filter
nomatch_code - a code indicating why no match was found (or None, if the
read was aligned)
- As long as 'aligned' is True, a SolexaExportRecord can be treated as an
+ As long as 'aligned' is True, a SolexaExportRecord can be treated as an
Alignment object.
"""
@@ -490,10 +500,10 @@ class SolexaExportAlignment(Alignment):
def __repr__(self):
if self.aligned:
return "< %s object: Read '%s', aligned to %s >" % (
- self.__class__.__name__, self.read.name, self.iv)
+ self.__class__.__name__, self.read.name, self.iv)
else:
return "< %s object: Non-aligned read '%s' >" % (
- self.__class__.__name__, self.read.name)
+ self.__class__.__name__, self.read.name)
class SolexaExportReader(FileOrSequence):
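
A sketch of iterating over an export file, using only the fields listed in the
docstring above (the path is a placeholder):

    import HTSeq

    for record in HTSeq.SolexaExportReader("run1_export.txt"):
        if record.passed_filter and record.aligned:
            print(record.read.name, record.iv)
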
@@ -514,15 +524,28 @@ class SolexaExportReader(FileOrSequence):
if line[-1] == "\n":
line = line[:-1]
res = {}
- (res['machine'], res['run_number'], res['lane'], res['tile'], res['x_coord'],
- res['y_coord'], res['index_string'], res['read_nbr'], res['read_seq'],
- res['qual_str'], res['chrom'], res[
- 'contig'], res['pos'], res['strand'],
- res['match_descr'], res['single_read_algnt_score'],
- res['paired_read_algnt_score'], res[
- 'partner_chrom'], res['partner_contig'],
- res['partner_offset'], res['partner_strand'], res['passed_filtering'] ) \
- = line.split("\t")
+ (res['machine'],
+ res['run_number'],
+ res['lane'],
+ res['tile'],
+ res['x_coord'],
+ res['y_coord'],
+ res['index_string'],
+ res['read_nbr'],
+ res['read_seq'],
+ res['qual_str'],
+ res['chrom'],
+ res['contig'],
+ res['pos'],
+ res['strand'],
+ res['match_descr'],
+ res['single_read_algnt_score'],
+ res['paired_read_algnt_score'],
+ res['partner_chrom'],
+ res['partner_contig'],
+ res['partner_offset'],
+ res['partner_strand'],
+ res['passed_filtering']) = line.split("\t")
return res
def __iter__(self):
@@ -530,20 +553,25 @@ class SolexaExportReader(FileOrSequence):
record = SolexaExportAlignment()
fields = SolexaExportReader.parse_line_bare(line)
if fields['read_nbr'] != "1":
- warnings.warn("Paired-end read encountered. PE is so far supported only for " +
- "SAM files, not yet for SolexaExport. All PE-related fields are ignored. ")
+ warnings.warn(
+ "Paired-end read encountered. PE is so far supported only "
+ "for SAM files, not yet for SolexaExport. All PE-related "
+ "fields are ignored.")
record.read = SequenceWithQualities(
fields['read_seq'],
- "%s:%s:%s:%s:%s#0" % (fields['machine'], fields['lane'], fields['tile'],
- fields['x_coord'], fields['y_coord']),
+ "%s:%s:%s:%s:%s#0" % (fields['machine'],
+ fields['lane'],
+ fields['tile'],
+ fields['x_coord'],
+ fields['y_coord']),
fields['qual_str'], self.qualscale)
if fields['passed_filtering'] == 'Y':
record.passed_filter = True
elif fields['passed_filtering'] == 'N':
record.passed_filter = False
else:
- raise ValueError("Illegal 'passed filter' value in Solexa export data: '%s'." % fields[
- 'passed_filtering'])
+ raise ValueError(
+ "Illegal 'passed filter' value in Solexa export data: '%s'." % fields['passed_filtering'])
record.index_string = fields['index_string']
if fields['pos'] == '':
record.iv = None
@@ -560,13 +588,14 @@ class SolexaExportReader(FileOrSequence):
chrom = fields['chrom']
if fields['chrom'] == "":
chrom = fields['contig']
- record.iv = GenomicInterval(chrom, start,
- start + len(fields['read_seq']), strand)
+ record.iv = GenomicInterval(
+ chrom, start,
+ start + len(fields['read_seq']), strand)
yield record
class SAM_Reader(FileOrSequence):
- """A SAM_Reader object is associated with a SAM file that
+ """A SAM_Reader object is associated with a SAM file that
contains short read alignments. It can generate an iterator of Alignment
objects."""
@@ -584,7 +613,6 @@ class SAM_Reader(FileOrSequence):
class GenomicArrayOfSets(GenomicArray):
-
    """A GenomicArrayOfSets is a specialization of GenomicArray that allows storing
sets of objects. On construction, the step vectors are initialized with empty sets.
By using the 'add_value' method, objects can be added to intervals. If an object
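
A minimal sketch of the set semantics described in the docstring, with made-up
intervals; overlapping features share the middle step:

    import HTSeq

    gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    gas[HTSeq.GenomicInterval("chr1", 100, 200, ".")] += "gene_A"
    gas[HTSeq.GenomicInterval("chr1", 150, 250, ".")] += "gene_B"
    for iv, genes in gas[HTSeq.GenomicInterval("chr1", 100, 250, ".")].steps():
        print(iv, sorted(genes))    # {gene_A}, {gene_A, gene_B}, {gene_B}
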
@@ -606,11 +634,50 @@ class GenomicArrayOfSets(GenomicArray):
# paired-end handling
###########################
-def pair_SAM_alignments(alignments, bundle=False):
+def pair_SAM_alignments(
+ alignments,
+ bundle=False,
+ primary_only=False):
+    '''Iterate over SAM alignments, name-sorted paired-end
+
+ Args:
+ alignments (iterator of SAM/BAM alignments): the alignments to wrap
+ bundle (bool): if True, bundle all alignments from one read pair into a
+ single yield. If False (default), each pair of alignments is
+ yielded separately.
+ primary_only (bool): for each read, consider only the primary line
+ (SAM flag 0x900 = 0). The SAM specification requires one and only
+ one of those for each read.
+
+ Yields:
+ 2-tuples with each pair of alignments or, if bundle==True, each bundled
+ list of alignments.
+ '''
mate_missing_count = [0]
def process_list(almnt_list):
+        '''Transform a list of alignments with the same read name into pairs
+
+ Args:
+ almnt_list (list): alignments to process
+
+ Yields:
+ each pair of alignments.
+
+ This function is needed because each line of a BAM file is not a read
+ but an alignment. For uniquely mapped and unmapped reads, those two are
+ the same. For multimapped reads, however, there can be more than one
+ alignment for each read. Also, it is normal for a mapper to uniquely
+ map one read and multimap its mate.
+
+ This function goes down the list of alignments for a given read name
+ and tries to find the first mate. So if read 1 is uniquely mapped but
+ read 2 is mapped 4 times, only (read 1, read 2 - first occurrence) will
+    be yielded; the other 3 alignments of read 2 are ignored.
+ '''
+
+
while len(almnt_list) > 0:
a1 = almnt_list.pop(0)
# Find its mate
@@ -622,14 +689,15 @@ def pair_SAM_alignments(alignments, bundle=False):
if not (a1.aligned and a2.aligned):
break
if a1.iv.chrom == a2.mate_start.chrom and a1.iv.start == a2.mate_start.pos and \
- a2.iv.chrom == a1.mate_start.chrom and a2.iv.start == a1.mate_start.pos:
+ a2.iv.chrom == a1.mate_start.chrom and a2.iv.start == a1.mate_start.pos:
break
else:
if a1.mate_aligned:
mate_missing_count[0] += 1
if mate_missing_count[0] == 1:
- warnings.warn("Read " + a1.read.name + " claims to have an aligned mate " +
- "which could not be found in an adjacent line.")
+ warnings.warn(
+ "Read " + a1.read.name + " claims to have an aligned mate " +
+ "which could not be found in an adjacent line.")
a2 = None
if a2 is not None:
almnt_list.remove(a2)
@@ -648,6 +716,9 @@ def pair_SAM_alignments(alignments, bundle=False):
if almnt.pe_which == "unknown":
raise ValueError(
"Paired-end read found with 'unknown' 'pe_which' status.")
+ # FIXME: almnt.not_primary_alignment currently means secondary
+ if primary_only and (almnt.not_primary_alignment or almnt.supplementary):
+ continue
if almnt.read.name == current_name:
almnt_list.append(almnt)
else:
@@ -668,18 +739,35 @@ def pair_SAM_alignments(alignments, bundle=False):
mate_missing_count[0])
-def pair_SAM_alignments_with_buffer(alignments, max_buffer_size=30000000):
+def pair_SAM_alignments_with_buffer(
+ alignments,
+ max_buffer_size=30000000,
+ primary_only=False):
+    '''Iterate over SAM alignments with buffer, position-sorted paired-end
+
+ Args:
+ alignments (iterator of SAM/BAM alignments): the alignments to wrap
+        max_buffer_size (int): maximal number of alignments to keep in memory.
+ primary_only (bool): for each read, consider only the primary line
+ (SAM flag 0x900 = 0). The SAM specification requires one and only
+ one of those for each read.
+
+ Yields:
+ 2-tuples with each pair of alignments.
+ '''
almnt_buffer = {}
ambiguous_pairing_counter = 0
for almnt in alignments:
-
if not almnt.paired_end:
raise ValueError(
"Sequence of paired-end alignments expected, but got single-end alignment.")
if almnt.pe_which == "unknown":
raise ValueError(
"Cannot process paired-end alignment found with 'unknown' 'pe_which' status.")
+ # FIXME: almnt.not_primary_alignment currently means secondary
+ if primary_only and (almnt.not_primary_alignment or almnt.supplementary):
+ continue
matekey = (
almnt.read.name,
@@ -720,8 +808,9 @@ def pair_SAM_alignments_with_buffer(alignments, max_buffer_size=30000000):
"Maximum alignment buffer size exceeded while pairing SAM alignments.")
if len(almnt_buffer) > 0:
- warnings.warn("Mate records missing for %d records; first such record: %s." %
- (len(almnt_buffer), str(list(almnt_buffer.values())[0][0])))
+ warnings.warn(
+ "Mate records missing for %d records; first such record: %s." %
+ (len(almnt_buffer), str(list(almnt_buffer.values())[0][0])))
for almnt_list in list(almnt_buffer.values()):
for almnt in almnt_list:
if almnt.pe_which == "first":
@@ -730,8 +819,9 @@ def pair_SAM_alignments_with_buffer(alignments, max_buffer_size=30000000):
yield (None, almnt)
if ambiguous_pairing_counter > 0:
- warnings.warn("Mate pairing was ambiguous for %d records; mate key for first such record: %s." %
- (ambiguous_pairing_counter, str(ambiguous_pairing_first_occurance)))
+ warnings.warn(
+ "Mate pairing was ambiguous for %d records; mate key for first such record: %s." %
+ (ambiguous_pairing_counter, str(ambiguous_pairing_first_occurance)))
###########################
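
How the new primary_only flag is meant to be used, sketched with a placeholder BAM
file; pair_SAM_alignments expects name-sorted input, while
pair_SAM_alignments_with_buffer takes the same flag for position-sorted input:

    import HTSeq

    bam = HTSeq.BAM_Reader("pairs.bam")   # name-sorted paired-end BAM
    for first, second in HTSeq.pair_SAM_alignments(bam, primary_only=True):
        if first is None or second is None:
            continue                      # mate record was not found
        print(first.read.name, first.iv, second.iv)
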
@@ -756,7 +846,8 @@ _vcf_typemap = {
class VariantCall(object):
- def __init__(self, chrom=None, pos=None, identifier=None, ref=None, alt=None, qual=None, filtr=None, info=None):
+ def __init__(self, chrom=None, pos=None, identifier=None, ref=None,
+ alt=None, qual=None, filtr=None, info=None):
self.chrom = chrom
self.pos = pos
self.id = identifier
@@ -785,12 +876,10 @@ class VariantCall(object):
ret = cls()
if nsamples == 0:
ret.format = None
- ret.chrom, ret.pos, ret.id, ret.ref, ret.alt, ret.qual, ret.filter, ret.info = line.rstrip(
- "\n").split("\t", 7)
+ ret.chrom, ret.pos, ret.id, ret.ref, ret.alt, ret.qual, ret.filter, ret.info = line.rstrip("\n").split("\t", 7)
else:
lsplit = line.rstrip("\n").split("\t")
- ret.chrom, ret.pos, ret.id, ret.ref, ret.alt, ret.qual, ret.filter, ret.info = lsplit[
- :8]
+ ret.chrom, ret.pos, ret.id, ret.ref, ret.alt, ret.qual, ret.filter, ret.info = lsplit[:8]
ret.format = lsplit[8].split(":")
ret.samples = {}
spos = 9
@@ -874,10 +963,11 @@ class VCF_Reader(FileOrSequence):
def make_info_dict(self):
self.infodict = dict(
- (key, _vcf_typemap[self.info[key]["Type"]]) for key in list(self.info.keys()))
+ (key,
+ _vcf_typemap[self.info[key]["Type"]]) for key in list(self.info.keys()))
def parse_meta(self, header_filename=None):
- if header_filename == None:
+ if header_filename is None:
the_iter = FileOrSequence.__iter__(self)
else:
the_iter = open(header_filename, "r")
@@ -917,7 +1007,7 @@ class VCF_Reader(FileOrSequence):
def meta_info(self, header_filename=None):
ret = []
- if header_filename == None:
+ if header_filename is None:
the_iter = FileOrSequence.__iter__(self)
else:
the_iter = open(header_filename, "r")
@@ -1010,7 +1100,6 @@ class BAM_Reader(object):
sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
self.record_no = 0
for pa in sf:
- # yield SAM_Alignment.from_pysam_AlignedRead( pa, sf )
yield SAM_Alignment.from_pysam_AlignedSegment(pa, sf)
self.record_no += 1
@@ -1056,8 +1145,8 @@ class BAM_Reader(object):
class BAM_Writer(object):
-
- def __init__(self, filename, template=None, referencenames=None, referencelengths=None, text=None, header=None):
+ def __init__(self, filename, template=None, referencenames=None,
+ referencelengths=None, text=None, header=None):
try:
import pysam
except ImportError:
@@ -1071,15 +1160,20 @@ class BAM_Writer(object):
self.referencelengths = referencelengths
self.text = text
self.header = header
- self.sf = pysam.Samfile(self.filename, mode="wb", template=self.template, referencenames=self.referencenames,
- referencelengths=self.referencelengths, text=self.text, header=self.header)
+ self.sf = pysam.Samfile(
+ self.filename,
+ mode="wb",
+ template=self.template,
+ referencenames=self.referencenames,
+ referencelengths=self.referencelengths,
+ text=self.text,
+ header=self.header)
@classmethod
def from_BAM_Reader(cls, fn, br):
return BAM_Writer(filename=fn, header=br.get_header_dict())
def write(self, alnmt):
- #self.sf.write( alnmt.to_pysam_AlignedRead( self.sf ) )
self.sf.write(alnmt.to_pysam_AlignedSegment(self.sf))
def close(self):
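
A sketch of the writer after the reformatting above, copying one BAM file into
another (both paths are placeholders); from_BAM_Reader reuses the input header:

    import HTSeq

    inbam = HTSeq.BAM_Reader("in.bam")
    outbam = HTSeq.BAM_Writer.from_BAM_Reader("out.bam", inbam)
    for almnt in inbam:
        outbam.write(almnt)
    outbam.close()
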
@@ -1100,13 +1194,21 @@ class BED_Reader(FileOrSequence):
raise ValueError("BED file line contains less than 3 fields")
if len(fields) > 9:
raise ValueError("BED file line contains more than 9 fields")
- iv = GenomicInterval(fields[0], int(fields[1]), int(
- fields[2]), fields[5] if len(fields) > 5 else ".")
- f = GenomicFeature(fields[3] if len(
- fields) > 3 else "unnamed", "BED line", iv)
+ iv = GenomicInterval(
+ fields[0],
+ int(fields[1]),
+ int(fields[2]),
+ fields[5] if len(fields) > 5 else ".")
+ f = GenomicFeature(
+ fields[3] if len(fields) > 3 else "unnamed",
+ "BED line",
+ iv)
f.score = float(fields[4]) if len(fields) > 4 else None
- f.thick = GenomicInterval(iv.chrom, int(fields[6]), int(
- fields[7]), iv.strand) if len(fields) > 7 else None
+ f.thick = GenomicInterval(
+ iv.chrom,
+ int(fields[6]),
+ int(fields[7]),
+ iv.strand) if len(fields) > 7 else None
f.itemRgb = [int(a) for a in fields[8].split(",")
] if len(fields) > 8 else None
yield(f)
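
A sketch of the reader as restructured above; the path is a placeholder, and lines
with 3 to 9 columns are accepted:

    import HTSeq

    for feature in HTSeq.BED_Reader("peaks.bed"):
        print(feature.name, feature.iv, feature.score)
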
=====================================
python3/HTSeq/_version.py
=====================================
@@ -1 +1 @@
-__version__ = "0.10.0"
\ No newline at end of file
+__version__ = "0.11.2"
\ No newline at end of file
=====================================
python3/HTSeq/scripts/count.py
=====================================
@@ -41,10 +41,10 @@ def count_reads_in_features(sam_filenames, gff_filename,
r = (r,)
for read in r:
if read is not None:
- samoutfile.write(read.original_sam_line.rstrip() +
- "\tXF:Z:" + assignment + "\n")
+ read.optional_fields.append(('XF', assignment))
+ samoutfile.write(read.get_sam_line() + "\n")
- if samouts != "":
+ if samouts != []:
if len(samouts) != len(sam_filenames):
raise ValueError('Select the same number of SAM input and output files')
# Try to open samout files early in case any of them has issues
@@ -116,22 +116,28 @@ def count_reads_in_features(sam_filenames, gff_filename,
lowqual_all = []
nonunique_all = []
for isam, (sam_filename) in enumerate(sam_filenames):
- if samouts != '':
+ if samouts != []:
samoutfile = open(samouts[isam], 'w')
else:
samoutfile = None
try:
- if sam_filename != "-":
- read_seq_file = SAM_or_BAM_Reader(sam_filename)
- read_seq = read_seq_file
- first_read = next(iter(read_seq))
- else:
+ if sam_filename == "-":
read_seq_file = SAM_or_BAM_Reader(sys.stdin)
- read_seq_iter = iter(read_seq_file)
+ else:
+ read_seq_file = SAM_or_BAM_Reader(sam_filename)
+ read_seq_iter = iter(read_seq_file)
+ # Catch empty BAM files
+ try:
first_read = next(read_seq_iter)
+ pe_mode = first_read.paired_end
+ except:
+ first_read = None
+ pe_mode = False
+ if first_read is not None:
read_seq = itertools.chain([first_read], read_seq_iter)
- pe_mode = first_read.paired_end
+ else:
+ read_seq = []
except:
        sys.stderr.write(
            "Error occurred when reading beginning of SAM/BAM file.\n")
@@ -139,12 +145,20 @@ def count_reads_in_features(sam_filenames, gff_filename,
try:
if pe_mode:
+ if ((supplementary_alignment_mode == 'ignore') and
+ (secondary_alignment_mode == 'ignore')):
+ primary_only = True
+ else:
+ primary_only = False
if order == "name":
- read_seq = HTSeq.pair_SAM_alignments(read_seq)
+ read_seq = HTSeq.pair_SAM_alignments(
+ read_seq,
+ primary_only=primary_only)
elif order == "pos":
read_seq = HTSeq.pair_SAM_alignments_with_buffer(
read_seq,
- max_buffer_size=max_buffer_size)
+ max_buffer_size=max_buffer_size,
+ primary_only=primary_only)
else:
raise ValueError("Illegal order specified.")
empty = 0
@@ -396,9 +410,11 @@ def main():
"suitable for Ensembl GTF files: gene_id)")
pa.add_argument(
- "--additional-attr", type=str, nargs='+',
- default=(), help="Additional feature attributes (default: none, " +
- "suitable for Ensembl GTF files: gene_name)")
+ "--additional-attr", type=str,
+ action='append',
+ default=[], help="Additional feature attributes (default: none, " +
+        "suitable for Ensembl GTF files: gene_name). Use the option " +
+        "multiple times, once for each additional attribute")
pa.add_argument(
"-m", "--mode", dest="mode",
@@ -414,19 +430,20 @@ def main():
pa.add_argument(
"--secondary-alignments", dest="secondary_alignments", type=str,
- choices=("score", "ignore"), default="score",
+ choices=("score", "ignore"), default="ignore",
help="Whether to score secondary alignments (0x100 flag)")
pa.add_argument(
"--supplementary-alignments", dest="supplementary_alignments", type=str,
- choices=("score", "ignore"), default="score",
+ choices=("score", "ignore"), default="ignore",
help="Whether to score supplementary alignments (0x800 flag)")
pa.add_argument(
- "-o", "--samout", type=str, dest="samouts", nargs='+',
- default="", help="write out all SAM alignment records into an output " +
- "SAM file called SAMOUT, annotating each line with its feature assignment " +
- "(as an optional field with tag 'XF')")
+ "-o", "--samout", type=str, dest="samouts",
+ action='append',
+ default=[], help="write out all SAM alignment records into " +
+        "SAM files (one output file per input file), annotating each line " +
+ "with its feature assignment (as an optional field with tag 'XF')")
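
Taken together, the option changes above mean that --additional-attr and -o/--samout
are now given once per value, and that secondary (0x100) and supplementary (0x800)
alignments are skipped by default. A command-line sketch with placeholder file names:

    htseq-count \
        --additional-attr gene_name --additional-attr gene_biotype \
        -o sample1.annotated.sam -o sample2.annotated.sam \
        sample1.sam sample2.sam annotation.gtf

Passing --secondary-alignments score and/or --supplementary-alignments score restores
the previous scoring behaviour.
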
pa.add_argument(
"-q", "--quiet", action="store_true", dest="quiet",
=====================================
python3/HTSeq/scripts/qa.py
=====================================
@@ -5,18 +5,21 @@
# (c) Simon Anders, European Molecular Biology Laboratory, 2010
# released under GNU General Public License
-import sys, time, os.path, optparse
+import sys
+import os.path
+import optparse
from itertools import *
import numpy
import HTSeq
+
def main():
try:
import matplotlib
except ImportError:
sys.stderr.write("This script needs the 'matplotlib' library, which ")
- sys.stderr.write("was not found. Please install it." )
+ sys.stderr.write("was not found. Please install it.")
matplotlib.use('PDF')
from matplotlib import pyplot
@@ -26,10 +29,10 @@ def main():
except ImportError:
from matplotlib.pyplot import normalize as Normalize
-
# **** Parse command line ****
- optParser = optparse.OptionParser( usage = "%prog [options] read_file",
+ optParser = optparse.OptionParser(
+ usage="%prog [options] read_file",
        description=
        "This script takes a file with high-throughput sequencing reads " +
"(supported formats: SAM, Solexa _export.txt, FASTQ, Solexa " +
@@ -40,62 +43,68 @@ def main():
epilog =
"Written by Simon Anders (sanders at fs.tum.de), European Molecular Biology " +
" Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " +
- " Public License v3. Part of the 'HTSeq' framework, version %s." % HTSeq.__version__ )
- optParser.add_option( "-t", "--type", type="choice", dest="type",
- choices = ("sam", "bam", "solexa-export", "fastq", "solexa-fastq"),
- default = "sam", help="type of read_file (one of: sam [default], bam, " +
- "solexa-export, fastq, solexa-fastq)" )
- optParser.add_option( "-o", "--outfile", type="string", dest="outfile",
- help="output filename (default is <read_file>.pdf)" )
- optParser.add_option( "-r", "--readlength", type="int", dest="readlen",
- help="the maximum read length (when not specified, the script guesses from the file" )
- optParser.add_option( "-g", "--gamma", type="float", dest="gamma",
- default = 0.3,
- help="the gamma factor for the contrast adjustment of the quality score plot" )
- optParser.add_option( "-n", "--nosplit", action="store_true", dest="nosplit",
- help="do not split reads in unaligned and aligned ones" )
- optParser.add_option( "-m", "--maxqual", type="int", dest="maxqual", default=41,
- help="the maximum quality score that appears in the data (default: 41)" )
-
- if len( sys.argv ) == 1:
+ " Public License v3. Part of the 'HTSeq' framework, version %s." % HTSeq.__version__)
+ optParser.add_option(
+ "-t", "--type", type="choice", dest="type",
+ choices=("sam", "bam", "solexa-export", "fastq", "solexa-fastq"),
+ default="sam", help="type of read_file (one of: sam [default], bam, " +
+ "solexa-export, fastq, solexa-fastq)")
+ optParser.add_option(
+ "-o", "--outfile", type="string", dest="outfile",
+ help="output filename (default is <read_file>.pdf)")
+ optParser.add_option(
+ "-r", "--readlength", type="int", dest="readlen",
+        help="the maximum read length (when not specified, the script guesses from the file)")
+ optParser.add_option(
+ "-g", "--gamma", type="float", dest="gamma",
+ default=0.3,
+ help="the gamma factor for the contrast adjustment of the quality score plot")
+ optParser.add_option(
+ "-n", "--nosplit", action="store_true", dest="nosplit",
+        help="do not split reads into unaligned and aligned ones")
+ optParser.add_option(
+ "-m", "--maxqual", type="int", dest="maxqual", default=41,
+ help="the maximum quality score that appears in the data (default: 41)")
+
+ if len(sys.argv) == 1:
optParser.print_help()
sys.exit(1)
(opts, args) = optParser.parse_args()
- if len( args ) != 1:
- sys.stderr.write( sys.argv[0] + ": Error: Please provide one argument (the read_file).\n" )
- sys.stderr.write( " Call with '-h' to get usage information.\n" )
- sys.exit( 1 )
+ if len(args) != 1:
+ sys.stderr.write(
+ sys.argv[0] + ": Error: Please provide one argument (the read_file).\n")
+ sys.stderr.write(" Call with '-h' to get usage information.\n")
+ sys.exit(1)
readfilename = args[0]
if opts.type == "sam":
- readfile = HTSeq.SAM_Reader( readfilename )
+ readfile = HTSeq.SAM_Reader(readfilename)
isAlnmntFile = True
elif opts.type == "bam":
- readfile = HTSeq.BAM_Reader( readfilename )
+ readfile = HTSeq.BAM_Reader(readfilename)
isAlnmntFile = True
elif opts.type == "solexa-export":
- readfile = HTSeq.SolexaExportReader( readfilename )
+ readfile = HTSeq.SolexaExportReader(readfilename)
isAlnmntFile = True
elif opts.type == "fastq":
- readfile = HTSeq.FastqReader( readfilename )
+ readfile = HTSeq.FastqReader(readfilename)
isAlnmntFile = False
elif opts.type == "solexa-fastq":
- readfile = HTSeq.FastqReader( readfilename, "solexa" )
+ readfile = HTSeq.FastqReader(readfilename, "solexa")
isAlnmntFile = False
else:
- sys.error( "Oops." )
+        sys.exit("Oops.")
twoColumns = isAlnmntFile and not opts.nosplit
if opts.outfile is None:
- outfilename = os.path.basename( readfilename ) + ".pdf"
+ outfilename = os.path.basename(readfilename) + ".pdf"
else:
outfilename = opts.outfile
-
# **** Get read length ****
if opts.readlen is not None:
@@ -103,25 +112,23 @@ def main():
else:
readlen = 0
if isAlnmntFile:
- reads = ( a.read for a in readfile )
+ reads = (a.read for a in readfile)
else:
reads = readfile
- for r in islice( reads, 10000 ):
- if len( r ) > readlen:
- readlen = len( r )
+ for r in islice(reads, 10000):
+ if len(r) > readlen:
+ readlen = len(r)
max_qual = opts.maxqual
gamma = opts.gamma
-
# **** Initialize count arrays ****
- base_arr_U = numpy.zeros( ( readlen, 5 ), numpy.int )
- qual_arr_U = numpy.zeros( ( readlen, max_qual+1 ), numpy.int )
+ base_arr_U = numpy.zeros((readlen, 5), numpy.int)
+ qual_arr_U = numpy.zeros((readlen, max_qual+1), numpy.int)
if twoColumns:
- base_arr_A = numpy.zeros( ( readlen, 5 ), numpy.int )
- qual_arr_A = numpy.zeros( ( readlen, max_qual+1 ), numpy.int )
-
+ base_arr_A = numpy.zeros((readlen, 5), numpy.int)
+ qual_arr_A = numpy.zeros((readlen, max_qual+1), numpy.int)
# **** Main counting loop ****
@@ -130,110 +137,107 @@ def main():
for a in readfile:
if isAlnmntFile:
r = a.read
- else:
- r = a
- if twoColumns and (isAlnmntFile and a.aligned):
- r.add_bases_to_count_array( base_arr_A )
- r.add_qual_to_count_array( qual_arr_A )
- else:
- r.add_bases_to_count_array( base_arr_U )
- r.add_qual_to_count_array( qual_arr_U )
- i += 1
- if (i % 200000) == 0:
- print(i, "reads processed")
+ else:
+ r = a
+ if twoColumns and (isAlnmntFile and a.aligned):
+ r.add_bases_to_count_array(base_arr_A)
+ r.add_qual_to_count_array(qual_arr_A)
+ else:
+ r.add_bases_to_count_array(base_arr_U)
+ r.add_qual_to_count_array(qual_arr_U)
+ i += 1
+ if (i % 200000) == 0:
+ print(i, "reads processed")
except:
- sys.stderr.write( "Error occured in: %s\n" %
- readfile.get_line_number_string() )
+        sys.stderr.write("Error occurred in: %s\n" %
+ readfile.get_line_number_string())
raise
print(i, "reads processed")
-
# **** Normalize result ****
- def norm_by_pos( arr ):
- arr = numpy.array( arr, numpy.float )
- arr_n = ( arr.T / arr.sum( 1 ) ).T
- arr_n[ arr == 0 ] = 0
+ def norm_by_pos(arr):
+ arr = numpy.array(arr, numpy.float)
+ arr_n = (arr.T / arr.sum(1)).T
+ arr_n[arr == 0] = 0
return arr_n
- def norm_by_start( arr ):
- arr = numpy.array( arr, numpy.float )
- arr_n = ( arr.T / arr.sum( 1 )[ 0 ] ).T
- arr_n[ arr == 0 ] = 0
+ def norm_by_start(arr):
+ arr = numpy.array(arr, numpy.float)
+ arr_n = (arr.T / arr.sum(1)[0]).T
+ arr_n[arr == 0] = 0
return arr_n
-
- base_arr_U_n = norm_by_pos( base_arr_U )
- qual_arr_U_n = norm_by_start( qual_arr_U )
- nreads_U = base_arr_U[0,:].sum()
+ base_arr_U_n = norm_by_pos(base_arr_U)
+ qual_arr_U_n = norm_by_start(qual_arr_U)
+ nreads_U = base_arr_U[0, :].sum()
if twoColumns:
- base_arr_A_n = norm_by_pos( base_arr_A )
- qual_arr_A_n = norm_by_start( qual_arr_A )
- nreads_A = base_arr_A[0,:].sum()
-
+ base_arr_A_n = norm_by_pos(base_arr_A)
+ qual_arr_A_n = norm_by_start(qual_arr_A)
+ nreads_A = base_arr_A[0, :].sum()
# **** Make plot ****
- def plot_bases( arr ):
- xg = numpy.arange( readlen )
- pyplot.plot( xg, arr[ : , 0 ], marker='.', color='red')
- pyplot.plot( xg, arr[ : , 1 ], marker='.', color='darkgreen')
- pyplot.plot( xg, arr[ : , 2 ], marker='.',color='lightgreen')
- pyplot.plot( xg, arr[ : , 3 ], marker='.',color='orange')
- pyplot.plot( xg, arr[ : , 4 ], marker='.',color='grey')
- pyplot.axis( (0, readlen-1, 0, 1 ) )
- pyplot.text( readlen*.70, .9, "A", color="red" )
- pyplot.text( readlen*.75, .9, "C", color="darkgreen" )
- pyplot.text( readlen*.80, .9, "G", color="lightgreen" )
- pyplot.text( readlen*.85, .9, "T", color="orange" )
- pyplot.text( readlen*.90, .9, "N", color="grey" )
+ def plot_bases(arr):
+ xg = numpy.arange(readlen)
+ pyplot.plot(xg, arr[:, 0], marker='.', color='red')
+ pyplot.plot(xg, arr[:, 1], marker='.', color='darkgreen')
+ pyplot.plot(xg, arr[:, 2], marker='.', color='lightgreen')
+ pyplot.plot(xg, arr[:, 3], marker='.', color='orange')
+ pyplot.plot(xg, arr[:, 4], marker='.', color='grey')
+ pyplot.axis((0, readlen-1, 0, 1))
+ pyplot.text(readlen*.70, .9, "A", color="red")
+ pyplot.text(readlen*.75, .9, "C", color="darkgreen")
+ pyplot.text(readlen*.80, .9, "G", color="lightgreen")
+ pyplot.text(readlen*.85, .9, "T", color="orange")
+ pyplot.text(readlen*.90, .9, "N", color="grey")
pyplot.figure()
- pyplot.subplots_adjust( top=.85 )
- pyplot.suptitle( os.path.basename(readfilename), fontweight='bold' )
+ pyplot.subplots_adjust(top=.85)
+ pyplot.suptitle(os.path.basename(readfilename), fontweight='bold')
if twoColumns:
- pyplot.subplot( 221 )
- plot_bases( base_arr_U_n )
- pyplot.ylabel( "proportion of base" )
- pyplot.title( "non-aligned reads\n%.0f%% (%.3f million)" %
- ( 100. * nreads_U / (nreads_U+nreads_A), nreads_U / 1e6 ) )
-
- pyplot.subplot( 222 )
- plot_bases( base_arr_A_n )
- pyplot.title( "aligned reads\n%.0f%% (%.3f million)" %
- ( 100. * nreads_A / (nreads_U+nreads_A), nreads_A / 1e6 ) )
-
- pyplot.subplot( 223 )
- pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
- norm=Normalize( 0, 1 ) )
- pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
- pyplot.xlabel( "position in read" )
- pyplot.ylabel( "base-call quality score" )
-
- pyplot.subplot( 224 )
- pyplot.pcolor( qual_arr_A_n.T ** gamma, cmap=pyplot.cm.Greens,
- norm=Normalize( 0, 1 ) )
- pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
- pyplot.xlabel( "position in read" )
+ pyplot.subplot(221)
+ plot_bases(base_arr_U_n)
+ pyplot.ylabel("proportion of base")
+        pyplot.title("non-aligned reads\n%.0f%% (%.3f million)" %
+ (100. * nreads_U / (nreads_U+nreads_A), nreads_U / 1e6))
+
+ pyplot.subplot(222)
+ plot_bases(base_arr_A_n)
+        pyplot.title("aligned reads\n%.0f%% (%.3f million)" %
+ (100. * nreads_A / (nreads_U+nreads_A), nreads_A / 1e6))
+
+ pyplot.subplot(223)
+ pyplot.pcolor(qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
+ norm=Normalize(0, 1))
+ pyplot.axis((0, readlen-1, 0, max_qual+1))
+ pyplot.xlabel("position in read")
+ pyplot.ylabel("base-call quality score")
+
+ pyplot.subplot(224)
+ pyplot.pcolor(qual_arr_A_n.T ** gamma, cmap=pyplot.cm.Greens,
+ norm=Normalize(0, 1))
+ pyplot.axis((0, readlen-1, 0, max_qual+1))
+ pyplot.xlabel("position in read")
else:
- pyplot.subplot( 211 )
- plot_bases( base_arr_U_n )
- pyplot.ylabel( "proportion of base" )
- pyplot.title( "%.3f million reads" % ( nreads_U / 1e6 ) )
+ pyplot.subplot(211)
+ plot_bases(base_arr_U_n)
+ pyplot.ylabel("proportion of base")
+ pyplot.title("%.3f million reads" % (nreads_U / 1e6))
- pyplot.subplot( 212 )
- pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
- norm=Normalize( 0, 1 ) )
- pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
- pyplot.xlabel( "position in read" )
- pyplot.ylabel( "base-call quality score" )
+ pyplot.subplot(212)
+ pyplot.pcolor(qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
+ norm=Normalize(0, 1))
+ pyplot.axis((0, readlen-1, 0, max_qual+1))
+ pyplot.xlabel("position in read")
+ pyplot.ylabel("base-call quality score")
-    pyplot.savefig( outfilename )
+    pyplot.savefig(outfilename)
if __name__ == "__main__":
main()
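
A command-line sketch for the reworked script, with a placeholder input file; the
output name defaults to the input's base name plus ".pdf":

    htseq-qa -t bam -m 41 reads.bam    # writes reads.bam.pdf
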
=====================================
python3/src/HTSeq/_HTSeq.pxd
=====================================
@@ -7,47 +7,47 @@ cdef class GenomicInterval:
cdef public long end
cdef str _strand
- cpdef is_contained_in( self, GenomicInterval iv )
- cpdef contains( self, GenomicInterval iv )
- cpdef overlaps( self, GenomicInterval iv )
- cpdef extend_to_include( GenomicInterval self, GenomicInterval iv )
+ cpdef is_contained_in(self, GenomicInterval iv)
+ cpdef contains(self, GenomicInterval iv)
+ cpdef overlaps(self, GenomicInterval iv)
+ cpdef extend_to_include(GenomicInterval self, GenomicInterval iv)
cdef class GenomicPosition( GenomicInterval ):
pass
-cdef class Sequence( object ):
+cdef class Sequence(object):
cdef public bytes seq
cdef public str name
cdef public str descr
- cpdef Sequence get_reverse_complement( self )
- cpdef object add_bases_to_count_array( Sequence self, numpy.ndarray count_array_ )
- cpdef Sequence trim_left_end( Sequence self, Sequence pattern, float mismatch_prop = ? )
- cpdef Sequence trim_right_end( Sequence self, Sequence pattern, float mismatch_prop = ? )
+ cpdef Sequence get_reverse_complement(self, bint rename=?)
+ cpdef object add_bases_to_count_array(Sequence self, numpy.ndarray count_array_)
+ cpdef Sequence trim_left_end(Sequence self, Sequence pattern, float mismatch_prop=?)
+ cpdef Sequence trim_right_end(Sequence self, Sequence pattern, float mismatch_prop=?)
-cdef class SequenceWithQualities( Sequence ):
+cdef class SequenceWithQualities(Sequence):
cdef readonly bytes _qualstr
cdef readonly bytes _qualstr_phred
cdef readonly str _qualscale
cdef readonly object _qualarr
- cdef _fill_qual_arr( SequenceWithQualities self )
- cpdef object add_qual_to_count_array( SequenceWithQualities self, numpy.ndarray count_array_ )
- cpdef SequenceWithQualities trim_left_end_with_quals( SequenceWithQualities self,
- Sequence pattern, int max_mm_qual = ? )
- cpdef SequenceWithQualities trim_right_end_with_quals( SequenceWithQualities self,
- Sequence pattern, int max_mm_qual = ? )
+ cdef _fill_qual_arr(SequenceWithQualities self)
+ cpdef object add_qual_to_count_array(SequenceWithQualities self, numpy.ndarray count_array_)
+ cpdef SequenceWithQualities trim_left_end_with_quals(SequenceWithQualities self,
+ Sequence pattern, int max_mm_qual=?)
+ cpdef SequenceWithQualities trim_right_end_with_quals(SequenceWithQualities self,
+ Sequence pattern, int max_mm_qual=?)
-cdef class Alignment( object ):
+cdef class Alignment(object):
cdef public SequenceWithQualities _read
cdef public GenomicInterval iv
-cdef class AlignmentWithSequenceReversal( Alignment ):
+cdef class AlignmentWithSequenceReversal(Alignment):
cdef public SequenceWithQualities read_as_aligned
cdef public SequenceWithQualities _read_as_sequenced
-cdef class SAM_Alignment( AlignmentWithSequenceReversal ):
+cdef class SAM_Alignment(AlignmentWithSequenceReversal):
cdef public list cigar
cdef public int aQual
cdef public GenomicPosition mate_start
=====================================
python3/src/HTSeq/_HTSeq.pyx
=====================================
@@ -645,10 +645,15 @@ cdef class Sequence(object):
self.name = name
self.descr = None
- cpdef Sequence get_reverse_complement(self):
- return Sequence(
- reverse_complement(self.seq),
- "revcomp_of_" + self.name)
+ cpdef Sequence get_reverse_complement(self, bint rename=True):
+ if rename:
+ return Sequence(
+ reverse_complement(self.seq),
+ "revcomp_of_" + self.name)
+ else:
+ return Sequence(
+ reverse_complement(self.seq),
+ self.name)
def __str__(self):
return self.seq.decode()
@@ -898,13 +903,20 @@ cdef class SequenceWithQualities(Sequence):
self.write_to_fastq_file(sio, convert_to_phred)
return sio.getvalue()
- cpdef SequenceWithQualities get_reverse_complement(self):
+ cpdef SequenceWithQualities get_reverse_complement(self, bint rename=True):
cdef SequenceWithQualities res
- res = SequenceWithQualities(
- reverse_complement(self.seq),
- "revcomp_of_" + self.name,
- self._qualstr[::-1],
- self._qualscale)
+ if rename:
+ res = SequenceWithQualities(
+ reverse_complement(self.seq),
+ "revcomp_of_" + self.name,
+ self._qualstr[::-1],
+ self._qualscale)
+ else:
+ res = SequenceWithQualities(
+ reverse_complement(self.seq),
+ self.name,
+ self._qualstr[::-1],
+ self._qualscale)
if self._qualarr is not None:
res._qualarr = self._qualarr[::-1]
return res
View it on GitLab: https://salsa.debian.org/med-team/htseq/commit/237b3f1303e6f7f84c205db94eb8919765a26a34