[med-svn] [python-cutadapt] 01/06: New upstream version 1.12
Kevin Murray
daube-guest at moszumanska.debian.org
Fri Dec 2 03:06:47 UTC 2016
This is an automated email from the git hooks/post-receive script.
daube-guest pushed a commit to branch master
in repository python-cutadapt.
commit 89cb1e6f65ac33f1c62a68913e54a1e438fe0420
Author: Kevin Murray <kdmfoss at gmail.com>
Date: Fri Dec 2 13:56:23 2016 +1100
New upstream version 1.12
---
CHANGES.rst | 11 +++
PKG-INFO | 45 ++++++++-
README.rst | 2 +-
bin/_preamble.py | 21 -----
bin/cutadapt | 10 --
cutadapt.egg-info/PKG-INFO | 45 ++++++++-
cutadapt.egg-info/SOURCES.txt | 8 +-
cutadapt.egg-info/entry_points.txt | 3 +
cutadapt.egg-info/requires.txt | 1 +
cutadapt/_align.c | 36 +++++---
cutadapt/_seqio.c | 2 +-
cutadapt/_seqio.pyx | 2 +-
cutadapt/_version.py | 4 +-
cutadapt/adapters.py | 12 +--
cutadapt/filters.py | 2 +-
cutadapt/modifiers.py | 9 ++
cutadapt/scripts/cutadapt.py | 115 ++++++++++++-----------
cutadapt/seqio.py | 4 +-
cutadapt/xopen.py | 182 -------------------------------------
doc/colorspace.rst | 26 +++++-
doc/guide.rst | 157 +++++++++++++++++++++-----------
doc/recipes.rst | 48 +++++++---
setup.py | 7 +-
tests/cut/shortened.fastq | 12 +++
tests/cut/small-no-trim.fasta | 6 ++
tests/testmodifiers.py | 19 +++-
tests/tests.py | 8 ++
tests/testxopen.py | 101 --------------------
28 files changed, 422 insertions(+), 476 deletions(-)
diff --git a/CHANGES.rst b/CHANGES.rst
index d2328b9..e8b0a39 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -2,6 +2,17 @@
Changes
=======
+v1.12 (2016-11-28)
+------------------
+
+* Add read modification option ``--length`` (short: ``-l``), which will
+ shorten each read to the given length.
+* Cutadapt will no longer complain that it has nothing to do when you do not
+ give it any adapters. For example, you can use this to convert file formats:
+ ``cutadapt -o output.fasta input.fastq.gz`` converts FASTQ to FASTA.
+* The ``xopen`` module for opening compressed files was moved to a `separate
+ package on PyPI <https://pypi.python.org/pypi/xopen>`_.
+
v1.11 (2016-08-16)
------------------
diff --git a/PKG-INFO b/PKG-INFO
index 9c0d080..2004e95 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,12 +1,53 @@
Metadata-Version: 1.1
Name: cutadapt
-Version: 1.11
+Version: 1.12
Summary: trim adapters from high-throughput sequencing reads
Home-page: https://cutadapt.readthedocs.io/
Author: Marcel Martin
Author-email: marcel.martin at scilifelab.se
License: MIT
-Description: UNKNOWN
+Description: .. image:: https://travis-ci.org/marcelm/cutadapt.svg?branch=master
+ :target: https://travis-ci.org/marcelm/cutadapt
+
+ .. image:: https://img.shields.io/pypi/v/cutadapt.svg?branch=master
+ :target: https://pypi.python.org/pypi/cutadapt
+
+ ========
+ cutadapt
+ ========
+
+ Cutadapt finds and removes adapter sequences, primers, poly-A tails and other
+ types of unwanted sequence from your high-throughput sequencing reads.
+
+ Cleaning your data in this way is often required: Reads from small-RNA
+ sequencing contain the 3’ sequencing adapter because the read is longer than
+ the molecule that is sequenced. Amplicon reads start with a primer sequence.
+ Poly-A tails are useful for pulling out RNA from your sample, but often you
+ don’t want them to be in your reads.
+
+ Cutadapt helps with these trimming tasks by finding the adapter or primer
+ sequences in an error-tolerant way. It can also modify and filter reads in
+ various ways. Adapter sequences can contain IUPAC wildcard characters. Also,
+ paired-end reads and even colorspace data is supported. If you want, you can
+ also just demultiplex your input data, without removing adapter sequences at all.
+
+ Cutadapt comes with an extensive suite of automated tests and is available under
+ the terms of the MIT license.
+
+ If you use cutadapt, please cite
+ `DOI:10.14806/ej.17.1.200 <http://dx.doi.org/10.14806/ej.17.1.200>`_ .
+
+
+ Links
+ -----
+
+ * `Documentation <https://cutadapt.readthedocs.io/>`_
+ * `Source code <https://github.com/marcelm/cutadapt/>`_
+ * `Report an issue <https://github.com/marcelm/cutadapt/issues>`_
+ * `Project page on PyPI (Python package index) <https://pypi.python.org/pypi/cutadapt/>`_
+ * `Follow @marcelm_ on Twitter <https://twitter.com/marcelm_>`_
+ * `Wrapper for the Galaxy platform <https://bitbucket.org/lance_parsons/cutadapt_galaxy_wrapper>`_
+
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Console
diff --git a/README.rst b/README.rst
index fcae283..a1c052e 100644
--- a/README.rst
+++ b/README.rst
@@ -33,7 +33,7 @@ If you use cutadapt, please cite
Links
-----
-* `Documentation <https://cutadapt.readthedocs.org/>`_
+* `Documentation <https://cutadapt.readthedocs.io/>`_
* `Source code <https://github.com/marcelm/cutadapt/>`_
* `Report an issue <https://github.com/marcelm/cutadapt/issues>`_
* `Project page on PyPI (Python package index) <https://pypi.python.org/pypi/cutadapt/>`_
diff --git a/bin/_preamble.py b/bin/_preamble.py
deleted file mode 100644
index 55f392a..0000000
--- a/bin/_preamble.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Twisted Matrix Laboratories.
-#
-# Copied from Twisted (http://twistedmatrix.com/), see
-# http://twistedmatrix.com/trac/browser/trunk/LICENSE for the license.
-#
-# This makes sure that users don't have to set up their environment
-# specially in order to run these programs from bin/.
-
-# This helper is shared by many different actual scripts. It is not intended to
-# be packaged or installed, it is only a developer convenience. By the time
-# the package is actually installed somewhere, the environment should already be set
-# up properly without the help of this tool.
-
-import sys, os
-
-path = os.path.abspath(sys.argv[0])
-while os.path.dirname(path) != path:
- if os.path.exists(os.path.join(path, 'cutadapt', '__init__.py')):
- sys.path.insert(0, path)
- break
- path = os.path.dirname(path)
diff --git a/bin/cutadapt b/bin/cutadapt
deleted file mode 100755
index 02c4c8d..0000000
--- a/bin/cutadapt
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/env python
-import sys
-
-try:
- import _preamble
-except ImportError:
- pass
-
-from cutadapt.scripts import cutadapt
-cutadapt.main()
diff --git a/cutadapt.egg-info/PKG-INFO b/cutadapt.egg-info/PKG-INFO
index 9c0d080..2004e95 100644
--- a/cutadapt.egg-info/PKG-INFO
+++ b/cutadapt.egg-info/PKG-INFO
@@ -1,12 +1,53 @@
Metadata-Version: 1.1
Name: cutadapt
-Version: 1.11
+Version: 1.12
Summary: trim adapters from high-throughput sequencing reads
Home-page: https://cutadapt.readthedocs.io/
Author: Marcel Martin
Author-email: marcel.martin at scilifelab.se
License: MIT
-Description: UNKNOWN
+Description: .. image:: https://travis-ci.org/marcelm/cutadapt.svg?branch=master
+ :target: https://travis-ci.org/marcelm/cutadapt
+
+ .. image:: https://img.shields.io/pypi/v/cutadapt.svg?branch=master
+ :target: https://pypi.python.org/pypi/cutadapt
+
+ ========
+ cutadapt
+ ========
+
+ Cutadapt finds and removes adapter sequences, primers, poly-A tails and other
+ types of unwanted sequence from your high-throughput sequencing reads.
+
+ Cleaning your data in this way is often required: Reads from small-RNA
+ sequencing contain the 3’ sequencing adapter because the read is longer than
+ the molecule that is sequenced. Amplicon reads start with a primer sequence.
+ Poly-A tails are useful for pulling out RNA from your sample, but often you
+ don’t want them to be in your reads.
+
+ Cutadapt helps with these trimming tasks by finding the adapter or primer
+ sequences in an error-tolerant way. It can also modify and filter reads in
+ various ways. Adapter sequences can contain IUPAC wildcard characters. Also,
+ paired-end reads and even colorspace data is supported. If you want, you can
+ also just demultiplex your input data, without removing adapter sequences at all.
+
+ Cutadapt comes with an extensive suite of automated tests and is available under
+ the terms of the MIT license.
+
+ If you use cutadapt, please cite
+ `DOI:10.14806/ej.17.1.200 <http://dx.doi.org/10.14806/ej.17.1.200>`_ .
+
+
+ Links
+ -----
+
+ * `Documentation <https://cutadapt.readthedocs.io/>`_
+ * `Source code <https://github.com/marcelm/cutadapt/>`_
+ * `Report an issue <https://github.com/marcelm/cutadapt/issues>`_
+ * `Project page on PyPI (Python package index) <https://pypi.python.org/pypi/cutadapt/>`_
+ * `Follow @marcelm_ on Twitter <https://twitter.com/marcelm_>`_
+ * `Wrapper for the Galaxy platform <https://bitbucket.org/lance_parsons/cutadapt_galaxy_wrapper>`_
+
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Console
diff --git a/cutadapt.egg-info/SOURCES.txt b/cutadapt.egg-info/SOURCES.txt
index 0a37ece..17c943b 100644
--- a/cutadapt.egg-info/SOURCES.txt
+++ b/cutadapt.egg-info/SOURCES.txt
@@ -6,8 +6,6 @@ README.rst
setup.cfg
setup.py
versioneer.py
-bin/_preamble.py
-bin/cutadapt
cutadapt/__init__.py
cutadapt/_align.c
cutadapt/_align.pyx
@@ -25,10 +23,11 @@ cutadapt/modifiers.py
cutadapt/qualtrim.py
cutadapt/report.py
cutadapt/seqio.py
-cutadapt/xopen.py
cutadapt.egg-info/PKG-INFO
cutadapt.egg-info/SOURCES.txt
cutadapt.egg-info/dependency_links.txt
+cutadapt.egg-info/entry_points.txt
+cutadapt.egg-info/requires.txt
cutadapt.egg-info/top_level.txt
cutadapt/scripts/__init__.py
cutadapt/scripts/cutadapt.py
@@ -51,7 +50,6 @@ tests/testqualtrim.py
tests/tests.py
tests/testseqio.py
tests/testtrim.py
-tests/testxopen.py
tests/utils.py
tests/cut/454.fa
tests/cut/SRR2040271_1.fastq
@@ -116,6 +114,8 @@ tests/cut/polya.fasta
tests/cut/rest.fa
tests/cut/restfront.fa
tests/cut/s_1_sequence.txt
+tests/cut/shortened.fastq
+tests/cut/small-no-trim.fasta
tests/cut/small.fasta
tests/cut/small.fastq
tests/cut/small.trimmed.fastq
diff --git a/cutadapt.egg-info/entry_points.txt b/cutadapt.egg-info/entry_points.txt
new file mode 100644
index 0000000..3e411b5
--- /dev/null
+++ b/cutadapt.egg-info/entry_points.txt
@@ -0,0 +1,3 @@
+[console_scripts]
+cutadapt = cutadapt.scripts.cutadapt:main
+
diff --git a/cutadapt.egg-info/requires.txt b/cutadapt.egg-info/requires.txt
new file mode 100644
index 0000000..855e83a
--- /dev/null
+++ b/cutadapt.egg-info/requires.txt
@@ -0,0 +1 @@
+xopen>=0.1.0
diff --git a/cutadapt/_align.c b/cutadapt/_align.c
index aec12bb..51ea974 100644
--- a/cutadapt/_align.c
+++ b/cutadapt/_align.c
@@ -1,10 +1,11 @@
-/* Generated by Cython 0.24 */
+/* Generated by Cython 0.24.1 */
/* BEGIN: Cython Metadata
{
"distutils": {
"depends": []
- }
+ },
+ "module_name": "cutadapt._align"
}
END: Cython Metadata */
@@ -15,7 +16,7 @@ END: Cython Metadata */
#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03020000)
#error Cython requires Python 2.6+ or Python 3.2+.
#else
-#define CYTHON_ABI "0_24"
+#define CYTHON_ABI "0_24_1"
#include <stddef.h>
#ifndef offsetof
#define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
@@ -118,6 +119,9 @@ END: Cython Metadata */
#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains)
#define PyUnicode_Contains(u, s) PySequence_Contains(u, s)
#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check)
+ #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type)
+#endif
#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format)
#define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt)
#endif
@@ -243,6 +247,11 @@ static CYTHON_INLINE float __PYX_NAN() {
return value;
}
#endif
+#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL)
+#define __Pyx_truncl trunc
+#else
+#define __Pyx_truncl truncl
+#endif
#define __PYX_ERR(f_index, lineno, Ln_error) \
@@ -1009,11 +1018,13 @@ typedef struct {
PyObject *yieldfrom;
PyObject *gi_name;
PyObject *gi_qualname;
+ PyObject *gi_modulename;
int resume_label;
char is_running;
} __pyx_CoroutineObject;
-static __pyx_CoroutineObject *__Pyx__Coroutine_New(PyTypeObject *type, __pyx_coroutine_body_t body,
- PyObject *closure, PyObject *name, PyObject *qualname);
+static __pyx_CoroutineObject *__Pyx__Coroutine_New(
+ PyTypeObject *type, __pyx_coroutine_body_t body, PyObject *closure,
+ PyObject *name, PyObject *qualname, PyObject *module_name);
static int __Pyx_Coroutine_clear(PyObject *self);
#if 1 || PY_VERSION_HEX < 0x030300B0
static int __Pyx_PyGen_FetchStopIterationValue(PyObject **pvalue);
@@ -1031,8 +1042,8 @@ static int __Pyx_patch_abc(void);
#define __Pyx_Generator_USED
static PyTypeObject *__pyx_GeneratorType = 0;
#define __Pyx_Generator_CheckExact(obj) (Py_TYPE(obj) == __pyx_GeneratorType)
-#define __Pyx_Generator_New(body, closure, name, qualname)\
- __Pyx__Coroutine_New(__pyx_GeneratorType, body, closure, name, qualname)
+#define __Pyx_Generator_New(body, closure, name, qualname, module_name)\
+ __Pyx__Coroutine_New(__pyx_GeneratorType, body, closure, name, qualname, module_name)
static PyObject *__Pyx_Generator_Next(PyObject *self);
static int __pyx_Generator_init(void);
@@ -2159,7 +2170,7 @@ static PyObject *__pyx_pf_8cutadapt_6_align_8DPMatrix_7__str___genexpr(PyObject
__Pyx_INCREF(((PyObject *)__pyx_cur_scope->__pyx_outer_scope));
__Pyx_GIVEREF(__pyx_cur_scope->__pyx_outer_scope);
{
- __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_8cutadapt_6_align_8DPMatrix_7__str___2generator, (PyObject *) __pyx_cur_scope, __pyx_n_s_genexpr, __pyx_n_s_DPMatrix___str___locals_genexpr); if (unlikely(!gen)) __PYX_ERR(0, 112, __pyx_L1_error)
+ __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_8cutadapt_6_align_8DPMatrix_7__str___2generator, (PyObject *) __pyx_cur_scope, __pyx_n_s_genexpr, __pyx_n_s_DPMatrix___str___locals_genexpr, __pyx_n_s_cutadapt__align); if (unlikely(!gen)) __PYX_ERR(0, 112, __pyx_L1_error)
__Pyx_DECREF(__pyx_cur_scope);
__Pyx_RefNannyFinishContext();
return (PyObject *) gen;
@@ -2301,7 +2312,7 @@ static PyObject *__pyx_pf_8cutadapt_6_align_8DPMatrix_7__str___3genexpr(PyObject
__Pyx_INCREF(((PyObject *)__pyx_cur_scope->__pyx_outer_scope));
__Pyx_GIVEREF(__pyx_cur_scope->__pyx_outer_scope);
{
- __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_8cutadapt_6_align_8DPMatrix_7__str___5generator1, (PyObject *) __pyx_cur_scope, __pyx_n_s_genexpr, __pyx_n_s_DPMatrix___str___locals_genexpr); if (unlikely(!gen)) __PYX_ERR(0, 114, __pyx_L1_error)
+ __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_8cutadapt_6_align_8DPMatrix_7__str___5generator1, (PyObject *) __pyx_cur_scope, __pyx_n_s_genexpr, __pyx_n_s_DPMatrix___str___locals_genexpr, __pyx_n_s_cutadapt__align); if (unlikely(!gen)) __PYX_ERR(0, 114, __pyx_L1_error)
__Pyx_DECREF(__pyx_cur_scope);
__Pyx_RefNannyFinishContext();
return (PyObject *) gen;
@@ -8789,8 +8800,9 @@ __Pyx_Coroutine_set_qualname(__pyx_CoroutineObject *self, PyObject *value)
Py_XDECREF(tmp);
return 0;
}
-static __pyx_CoroutineObject *__Pyx__Coroutine_New(PyTypeObject* type, __pyx_coroutine_body_t body,
- PyObject *closure, PyObject *name, PyObject *qualname) {
+static __pyx_CoroutineObject *__Pyx__Coroutine_New(
+ PyTypeObject* type, __pyx_coroutine_body_t body, PyObject *closure,
+ PyObject *name, PyObject *qualname, PyObject *module_name) {
__pyx_CoroutineObject *gen = PyObject_GC_New(__pyx_CoroutineObject, type);
if (gen == NULL)
return NULL;
@@ -8809,6 +8821,8 @@ static __pyx_CoroutineObject *__Pyx__Coroutine_New(PyTypeObject* type, __pyx_cor
gen->gi_qualname = qualname;
Py_XINCREF(name);
gen->gi_name = name;
+ Py_XINCREF(module_name);
+ gen->gi_modulename = module_name;
PyObject_GC_Track(gen);
return gen;
}
diff --git a/cutadapt/_seqio.c b/cutadapt/_seqio.c
index 5f491dc..04eacc2 100644
--- a/cutadapt/_seqio.c
+++ b/cutadapt/_seqio.c
@@ -3975,7 +3975,7 @@ PyMODINIT_FUNC PyInit__seqio(void)
__Pyx_INCREF(__pyx_n_s_xopen);
__Pyx_GIVEREF(__pyx_n_s_xopen);
PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_xopen);
- __pyx_t_2 = __Pyx_Import(__pyx_n_s_xopen, __pyx_t_1, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 4, __pyx_L1_error)
+ __pyx_t_2 = __Pyx_Import(__pyx_n_s_xopen, __pyx_t_1, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 4, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_2);
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
__pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_xopen); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error)
diff --git a/cutadapt/_seqio.pyx b/cutadapt/_seqio.pyx
index e94f82e..d038485 100644
--- a/cutadapt/_seqio.pyx
+++ b/cutadapt/_seqio.pyx
@@ -1,7 +1,7 @@
# kate: syntax Python;
# cython: profile=False, emit_code_comments=False
from __future__ import print_function, division, absolute_import
-from .xopen import xopen
+from xopen import xopen
from .seqio import _shorten, FormatError, SequenceReader
diff --git a/cutadapt/_version.py b/cutadapt/_version.py
index 9b4fb9e..cd06da1 100644
--- a/cutadapt/_version.py
+++ b/cutadapt/_version.py
@@ -11,8 +11,8 @@ version_json = '''
{
"dirty": false,
"error": null,
- "full-revisionid": "46bab59782eb0930854a0a73640b011b4cf4b6cf",
- "version": "1.11"
+ "full-revisionid": "98f0e2f2f658646a5315f9f54ecc6cc64a5fd660",
+ "version": "1.12"
}
''' # END VERSION_JSON
diff --git a/cutadapt/adapters.py b/cutadapt/adapters.py
index f629c8f..8a8fb36 100644
--- a/cutadapt/adapters.py
+++ b/cutadapt/adapters.py
@@ -70,7 +70,7 @@ class AdapterParser(object):
self.constructor_args = kwargs
self.adapter_class = ColorspaceAdapter if colorspace else Adapter
- def parse(self, spec, name=None, cmdline_type='back'):
+ def _parse_no_file(self, spec, name=None, cmdline_type='back'):
"""
Parse an adapter specification not using ``file:`` notation and return
an object of an appropriate Adapter class. The notation for anchored
@@ -112,10 +112,10 @@ class AdapterParser(object):
return self.adapter_class(sequence, where, name=name, **self.constructor_args)
- def parse_with_file(self, spec, cmdline_type='back'):
+ def parse(self, spec, cmdline_type='back'):
"""
Parse an adapter specification and yield appropriate Adapter classes.
- This works like the parse() function above, but also supports the
+ This works like the _parse_no_file() function above, but also supports the
``file:`` notation for reading adapters from an external FASTA
file. Since a file can contain multiple adapters, this
function is a generator.
@@ -125,10 +125,10 @@ class AdapterParser(object):
with FastaReader(spec[5:]) as fasta:
for record in fasta:
name = record.name.split(None, 1)[0]
- yield self.parse(record.sequence, name, cmdline_type)
+ yield self._parse_no_file(record.sequence, name, cmdline_type)
else:
name, spec = self._extract_name(spec)
- yield self.parse(spec, name, cmdline_type)
+ yield self._parse_no_file(spec, name, cmdline_type)
def _extract_name(self, spec):
"""
@@ -154,7 +154,7 @@ class AdapterParser(object):
adapters = []
for specs, cmdline_type in (back, 'back'), (anywhere, 'anywhere'), (front, 'front'):
for spec in specs:
- adapters.extend(self.parse_with_file(spec, cmdline_type))
+ adapters.extend(self.parse(spec, cmdline_type))
return adapters
diff --git a/cutadapt/filters.py b/cutadapt/filters.py
index 3ab1e04..6bf79c3 100644
--- a/cutadapt/filters.py
+++ b/cutadapt/filters.py
@@ -15,7 +15,7 @@ The read is then assumed to have been "consumed", that is, either written
somewhere or filtered (should be discarded).
"""
from __future__ import print_function, division, absolute_import
-from .xopen import xopen
+from xopen import xopen
from . import seqio
# Constants used when returning from a Filter’s __call__ method to improve
diff --git a/cutadapt/modifiers.py b/cutadapt/modifiers.py
index af4944d..f2454ae 100644
--- a/cutadapt/modifiers.py
+++ b/cutadapt/modifiers.py
@@ -260,6 +260,15 @@ class QualityTrimmer(object):
return read[start:stop]
+class Shortener(object):
+ """Unconditionally shorten a read to the given length"""
+ def __init__(self, length):
+ self.length = length
+
+ def __call__(self, read):
+ return read[:self.length]
+
+
class NEndTrimmer(object):
"""Trims Ns from the 3' and 5' end of reads"""
def __init__(self):
diff --git a/cutadapt/scripts/cutadapt.py b/cutadapt/scripts/cutadapt.py
index bc3fdbd..bb230d8 100755
--- a/cutadapt/scripts/cutadapt.py
+++ b/cutadapt/scripts/cutadapt.py
@@ -69,13 +69,13 @@ import functools
import logging
import platform
import textwrap
+from xopen import xopen
from cutadapt import seqio, __version__
-from cutadapt.xopen import xopen
from cutadapt.adapters import AdapterParser
from cutadapt.modifiers import (LengthTagModifier, SuffixRemover, PrefixSuffixAdder,
DoubleEncoder, ZeroCapper, PrimerTrimmer, QualityTrimmer, UnconditionalCutter,
- NEndTrimmer, AdapterCutter, NextseqQualityTrimmer)
+ NEndTrimmer, AdapterCutter, NextseqQualityTrimmer, Shortener)
from cutadapt.filters import (NoFilter, PairedNoFilter, Redirector, PairedRedirector,
LegacyPairedRedirector, TooShortReadFilter, TooLongReadFilter,
Demultiplexer, NContentFilter, DiscardUntrimmedFilter, DiscardTrimmedFilter)
@@ -99,50 +99,57 @@ class RestFileWriter(object):
print(rest, match.read.name, file=self.file)
-def process_single_reads(reader, modifiers, filters):
+class SingleEndPipeline:
"""
- Loop over reads, find adapters, trim reads, apply modifiers and
- output modified reads.
-
- Return a Statistics object.
+ Processing pipeline that loops over reads and applies modifiers and filters
"""
- n = 0 # no. of processed reads
- total_bp = 0
- for read in reader:
- n += 1
- total_bp += len(read.sequence)
- for modifier in modifiers:
- read = modifier(read)
- for filter in filters:
- if filter(read):
- break
-
- return Statistics(n=n, total_bp1=total_bp, total_bp2=None)
-
-
-def process_paired_reads(paired_reader, modifiers1, modifiers2, filters):
+ def __init__(self, reader, modifiers, filters):
+ self.reader = reader
+ self.modifiers = modifiers
+ self.filters = filters
+
+ def run(self):
+ """Run the pipeline. Return a Statistics object"""
+ n = 0 # no. of processed reads
+ total_bp = 0
+ for read in self.reader:
+ n += 1
+ total_bp += len(read.sequence)
+ for modifier in self.modifiers:
+ read = modifier(read)
+ for filter in self.filters:
+ if filter(read):
+ break
+ return Statistics(n=n, total_bp1=total_bp, total_bp2=None)
+
+
+class PairedEndPipeline:
"""
- Loop over reads, find adapters, trim reads, apply modifiers and
- output modified reads.
-
- Return a Statistics object.
+ Processing pipeline for paired-end reads.
"""
- n = 0 # no. of processed reads
- total1_bp = 0
- total2_bp = 0
- for read1, read2 in paired_reader:
- n += 1
- total1_bp += len(read1.sequence)
- total2_bp += len(read2.sequence)
- for modifier in modifiers1:
- read1 = modifier(read1)
- for modifier in modifiers2:
- read2 = modifier(read2)
- for filter in filters:
- # Stop writing as soon as one of the filters was successful.
- if filter(read1, read2):
- break
- return Statistics(n=n, total_bp1=total1_bp, total_bp2=total2_bp)
+ def __init__(self, paired_reader, modifiers1, modifiers2, filters):
+ self.paired_reader = paired_reader
+ self.modifiers1 = modifiers1
+ self.modifiers2 = modifiers2
+ self.filters = filters
+
+ def run(self):
+ n = 0 # no. of processed reads
+ total1_bp = 0
+ total2_bp = 0
+ for read1, read2 in self.paired_reader:
+ n += 1
+ total1_bp += len(read1.sequence)
+ total2_bp += len(read2.sequence)
+ for modifier in self.modifiers1:
+ read1 = modifier(read1)
+ for modifier in self.modifiers2:
+ read2 = modifier(read2)
+ for filter in self.filters:
+ # Stop writing as soon as one of the filters was successful.
+ if filter(read1, read2):
+ break
+ return Statistics(n=n, total_bp1=total1_bp, total_bp2=total2_bp)
def setup_logging(stdout=False, quiet=False):
@@ -238,6 +245,9 @@ def get_option_parser():
help="Assume that quality values in FASTQ are encoded as ascii(quality "
"+ QUALITY_BASE). This needs to be set to 64 for some old Illumina "
"FASTQ files. Default: %default")
+ group.add_option("--length", "-l", type=int, default=None, metavar="LENGTH",
+ help="Shorten reads to LENGTH. This and the following modifications"
+ "are applied after adapter trimming.")
group.add_option("--trim-n", action='store_true', default=False,
help="Trim N's on ends of reads.")
group.add_option("--length-tag", metavar="TAG",
@@ -605,15 +615,6 @@ def main(cmdlineargs=None, default_outfile=sys.stdout):
for adapter in adapters + adapters2:
adapter.enable_debug()
- if not adapters and not adapters2 and not cutoffs and \
- options.nextseq_trim is None and \
- options.cut == [] and options.cut2 == [] and \
- options.minimum_length == 0 and \
- options.maximum_length == sys.maxsize and \
- quality_filename is None and \
- options.max_n == -1 and not options.trim_n:
- parser.error("You need to provide at least one adapter sequence.")
-
# Create the single-end processing pipeline (a list of "modifiers")
modifiers = []
if options.cut:
@@ -638,6 +639,8 @@ def main(cmdlineargs=None, default_outfile=sys.stdout):
# Modifiers that apply to both reads of paired-end reads unless in legacy mode
modifiers_both = []
+ if options.length is not None:
+ modifiers_both.append(Shortener(options.length))
if options.trim_n:
modifiers_both.append(NEndTrimmer())
if options.length_tag:
@@ -680,6 +683,11 @@ def main(cmdlineargs=None, default_outfile=sys.stdout):
adapter_cutter2 = None
modifiers2.extend(modifiers_both)
+ if paired:
+ pipeline = PairedEndPipeline(reader, modifiers, modifiers2, filters)
+ else:
+ pipeline = SingleEndPipeline(reader, modifiers, filters)
+
logger.info("This is cutadapt %s with Python %s", __version__, platform.python_version())
logger.info("Command line parameters: %s", " ".join(cmdlineargs))
logger.info("Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
@@ -696,10 +704,7 @@ def main(cmdlineargs=None, default_outfile=sys.stdout):
start_time = time.clock()
try:
- if paired:
- stats = process_paired_reads(reader, modifiers, modifiers2, filters)
- else:
- stats = process_single_reads(reader, modifiers, filters)
+ stats = pipeline.run()
except KeyboardInterrupt as e:
print("Interrupted", file=sys.stderr)
sys.exit(130)
diff --git a/cutadapt/seqio.py b/cutadapt/seqio.py
index 91518b8..fbb9aef 100644
--- a/cutadapt/seqio.py
+++ b/cutadapt/seqio.py
@@ -8,9 +8,11 @@ TODO
before the first space)
"""
from __future__ import print_function, division, absolute_import
+
import sys
from os.path import splitext
-from .xopen import xopen
+from xopen import xopen
+
from .compat import zip, basestring
__author__ = "Marcel Martin"
diff --git a/cutadapt/xopen.py b/cutadapt/xopen.py
deleted file mode 100644
index c1b8c90..0000000
--- a/cutadapt/xopen.py
+++ /dev/null
@@ -1,182 +0,0 @@
-"""
-Open compressed files transparently.
-"""
-from __future__ import print_function, division, absolute_import
-__author__ = 'Marcel Martin'
-
-import gzip
-import sys
-import io
-import os
-from subprocess import Popen, PIPE
-from .compat import PY3, basestring
-
-try:
- import bz2
-except ImportError:
- bz2 = None
-
-try:
- import lzma
-except ImportError:
- lzma = None
-
-if sys.version_info < (2, 7):
- buffered_reader = lambda x: x
- buffered_writer = lambda x: x
-else:
- buffered_reader = io.BufferedReader
- buffered_writer = io.BufferedWriter
-
-
-class GzipWriter:
- def __init__(self, path, mode='w'):
- self.outfile = open(path, mode)
- self.devnull = open(os.devnull, 'w')
- try:
- # Setting close_fds to True is necessary due to
- # http://bugs.python.org/issue12786
- self.process = Popen(['gzip'], stdin=PIPE, stdout=self.outfile,
- stderr=self.devnull, close_fds=True)
- except IOError as e:
- self.outfile.close()
- self.devnull.close()
- raise
-
- def write(self, arg):
- self.process.stdin.write(arg)
-
- def close(self):
- self.process.stdin.close()
- retcode = self.process.wait()
- self.outfile.close()
- self.devnull.close()
- if retcode != 0:
- raise IOError("Output gzip process terminated with exit code {0}".format(retcode))
-
- def __enter__(self):
- return self
-
- def __exit__(self, *exc_info):
- self.close()
-
-
-class GzipReader:
- def __init__(self, path):
- self.process = Popen(['gzip', '-cd', path], stdout=PIPE)
-
- def close(self):
- retcode = self.process.poll()
- if retcode is None:
- # still running
- self.process.terminate()
- self._raise_if_error()
-
- def __iter__(self):
- for line in self.process.stdout:
- yield line
- self.process.wait()
- self._raise_if_error()
-
- def _raise_if_error(self):
- """
- Raise EOFError if process is not running anymore and the
- exit code is nonzero.
- """
- retcode = self.process.poll()
- if retcode is not None and retcode != 0:
- raise EOFError("gzip process returned non-zero exit code {0}. Is the "
- "input file truncated or corrupt?".format(retcode))
-
- def read(self, *args):
- data = self.process.stdout.read(*args)
- if len(args) == 0 or args[0] <= 0:
- # wait for process to terminate until we check the exit code
- self.process.wait()
- self._raise_if_error()
-
- def __enter__(self):
- return self
-
- def __exit__(self, *exc_info):
- self.close()
-
-
-def xopen(filename, mode='r'):
- """
- Replacement for the "open" function that can also open files that have
- been compressed with gzip, bzip2 or xz. If the filename is '-', standard
- output (mode 'w') or input (mode 'r') is returned. If the filename ends
- with .gz, the file is opened with a pipe to the gzip program. If that
- does not work, then gzip.open() is used (the gzip module is slower than
- the pipe to the gzip program). If the filename ends with .bz2, it's
- opened as a bz2.BZ2File. Otherwise, the regular open() is used.
-
- mode can be: 'rt', 'rb', 'a', 'wt', or 'wb'
- Instead of 'rt' and 'wt', 'r' and 'w' can be used as abbreviations.
-
- In Python 2, the 't' and 'b' characters are ignored.
-
- Append mode ('a') is unavailable with BZ2 compression and will raise an error.
- """
- if mode == 'r':
- mode = 'rt'
- elif mode == 'w':
- mode = 'wt'
- if mode not in ('rt', 'rb', 'wt', 'wb', 'a'):
- raise ValueError("mode '{0}' not supported".format(mode))
- if not PY3:
- mode = mode[0]
- if not isinstance(filename, basestring):
- raise ValueError("the filename must be a string")
-
- # standard input and standard output handling
- if filename == '-':
- if not PY3:
- return sys.stdin if 'r' in mode else sys.stdout
- return dict(
- rt=sys.stdin,
- wt=sys.stdout,
- rb=sys.stdin.buffer,
- wb=sys.stdout.buffer)[mode]
-
- if filename.endswith('.bz2'):
- if bz2 is None:
- raise ImportError("Cannot open bz2 files: The bz2 module is not available")
- if PY3:
- if 't' in mode:
- return io.TextIOWrapper(bz2.BZ2File(filename, mode[0]))
- else:
- return bz2.BZ2File(filename, mode)
- else:
- return bz2.BZ2File(filename, mode)
- elif filename.endswith('.xz'):
- if lzma is None:
- raise ImportError("Cannot open xz files: The lzma module is not available "
- "(use Python 3.3 or newer)")
- return lzma.open(filename, mode)
- elif filename.endswith('.gz'):
- if PY3:
- if 't' in mode:
- # gzip.open in Python 3.2 does not support modes 'rt' and 'wt''
- return io.TextIOWrapper(gzip.open(filename, mode[0]))
- else:
- if 'r' in mode:
- return io.BufferedReader(gzip.open(filename, mode))
- else:
- return io.BufferedWriter(gzip.open(filename, mode))
- else:
- # rb/rt are equivalent in Py2
- if 'r' in mode:
- try:
- return GzipReader(filename)
- except IOError:
- # gzip not installed
- return buffered_reader(gzip.open(filename, mode))
- else:
- try:
- return GzipWriter(filename, mode)
- except IOError:
- return buffered_writer(gzip.open(filename, mode))
- else:
- return open(filename, mode)
diff --git a/doc/colorspace.rst b/doc/colorspace.rst
index fc9c599..c67e435 100644
--- a/doc/colorspace.rst
+++ b/doc/colorspace.rst
@@ -1,3 +1,5 @@
+.. _colorspace:
+
Colorspace reads
================
@@ -5,15 +7,29 @@ Cutadapt was designed to work with colorspace reads from the ABi SOLiD
sequencer. Colorspace trimming is activated by the ``--colorspace``
option (or use ``-c`` for short). The input reads can be given either:
-- in a FASTA file
+- in a FASTA file (typically extensions ``.csfasta`` or ``.csfa``)
- in a FASTQ file
- in a ``.csfasta`` and a ``.qual`` file (this is the native SOLiD
- format).
+ format). That is, cutadapt expects *two* file names in this case.
In all cases, the colors must be represented by the characters 0, 1, 2,
-3. Example input files are in the cutadapt distribution at
+3. Here is an example input file in ``.fastq`` format that is accepted::
+
+ @1_13_85_F3
+ T110020300.0113010210002110102330021
+ +
+ 7&9<&77)& <7))%4'657-1+9;9,.<8);.;8
+ @1_13_573_F3
+ T312311200.3021301101113203302010003
+ +
+ 6)3%)&&&& .1&(6:<'67..*,:75)'77&&&5
+
+Further example input files can be found in the cutadapt distribution at
``tests/data/solid.*``. The ``.csfasta``/``.qual`` file format is
-automatically assumed if two input files are given to cutadapt.
+automatically assumed if two input files are given to cutadapt, and when no
+paired-end trimming options are used.
+
+Cutadapt always converts input data given as a pair of FASTA/QUAL files to FASTQ.
In colorspace mode, the adapter sequences given to the ``-a``, ``-b``
and ``-g`` options can be given both as colors or as nucleotides. If
@@ -24,7 +40,7 @@ colorspace. For example, to trim an adapter from ``solid.csfasta`` and
cutadapt -c -a CGCCTTGGCCGTACAGCAG solid.csfasta solid.qual > output.fastq
In case you know the colorspace adapter sequence, you can also write
-``330201030313112312`` instead of ``CGCCTTGGCCGTACAGCAG`` and the result
+``330201030313112312`` instead of ``CGCCTTGGCCGTACAGCAG``, and the result
is the same.
Ambiguity in colorspace
diff --git a/doc/guide.rst b/doc/guide.rst
index 2c8092c..676faf8 100644
--- a/doc/guide.rst
+++ b/doc/guide.rst
@@ -5,14 +5,13 @@ User guide
Basic usage
===========
-If you just want to trim a 3' adapter, the basic command-line for cutadapt is::
+To trim a 3' adapter, the basic command-line for cutadapt is::
cutadapt -a AACCGGTT -o output.fastq input.fastq
-The sequence of the adapter is given with the ``-a`` option. Of course, you
-need to replace ``AACCGGTT`` with your actual adapter sequence. Reads are read
-from the input file ``input.fastq`` and written to the output file
-``output.fastq``.
+The sequence of the adapter is given with the ``-a`` option. You need to replace
+``AACCGGTT`` with your actual adapter sequence. Reads are read from the input
+file ``input.fastq`` and written to the output file ``output.fastq``.
Cutadapt searches for the adapter in all reads and removes it when it finds it.
All reads that were present in the input file will also be present in the output
@@ -20,29 +19,26 @@ file, some of them trimmed, some of them not. Even reads that were trimmed
entirely (because the adapter was found in the very beginning) are output. All
of this can be changed with command-line options, explained further down.
-A report is printed after cutadapt has finished processing the reads.
-
Input and output file formats
-----------------------------
Input files for cutadapt need to be in one of these formats:
-* FASTA (file name extensions: ``.fasta``, ``.fa``, ``.fna``, ``.csfasta``, ``.csfa``)
+* FASTA (file name extensions: ``.fasta``, ``.fa``, ``.fna``)
* FASTQ (extensions: ``.fastq``, ``.fq``)
-* A pair of a FASTA file and a ``.(cs)qual`` file
+* Any of the above, but compressed as ``.gz`` (even ``.bz2`` and ``.xz`` are supported).
+
+:ref:`Cutadapt’s support for processing of colorspace data is described
+elsewhere <colorspace>`.
-The latter format is (or was) used for colorspace data from the SOLiD
-instruments.
+Input and output file formats are recognized from the file name extension. You
+can override the input format with the ``--format`` option.
-The input file format is recognized from the file name extension (given in
-parentheses in the list above). You can also explicitly specify which format
-the input has by using the ``--format`` option.
+You can even use this -- without any adapter trimming -- to convert from
+FASTQ to FASTA::
-The output format is the same as the input format, except for the FASTA/QUAL
-pairs -- those will always be converted to FASTQ. Also, cutadapt does not check
-the output file name: If you input FASTQ data, but use ``-o output.fasta``, then
-the output file will actually be in FASTQ format.
+ cutadapt -o output.fasta input.fastq.gz
Compressed files
@@ -116,7 +112,7 @@ way:
order in which they are applied is the order in which they are listed in the
help shown by ``cutadapt --help`` under the “Additional read modifications”
heading. Adapter trimming itself does not appear in that list and is
- done after quality trimming and before ``N``-end trimming (``--trim-N``).
+ done after quality trimming and before length trimming (``--length``/``-l``).
2. :ref:`Filtering options <filtering>` are applied, such as removal of too
short or untrimmed reads. Some of the filters also allow to redirect a read
@@ -154,8 +150,8 @@ and depending on the adapter type:
|
By default, all adapters :ref:`are searched error-tolerantly <error-tolerance>`.
-Adapter sequences :ref:`may also contain the "N" wildcard
-character <wildcards>`.
+Adapter sequences :ref:`may also contain any IUPAC wildcard
+character <wildcards>` (such as ``N``).
In addition, it is possible to :ref:`remove a fixed number of
bases <cut-bases>` from the beginning or end of each read, and to :ref:`remove
@@ -262,7 +258,7 @@ high, even this read will be trimmed::
BADAPTERSOMETHING
-The ``B`` in the beginnig is seen as an insertion. If you also want to prevent
+The ``B`` in the beginning is seen as an insertion. If you also want to prevent
this from happening, use the option ``--no-indels`` to disallow insertions and
deletions entirely.
@@ -295,21 +291,52 @@ Using ``-a ADAPTER$`` will result in::
MYSEQUENCE
MYSEQUENCEADAPTERSOMETHINGELSE
-Only the middle read is trimmed at all.
+That is, only the middle read is trimmed at all.
.. _linked-adapters:
-Linked adapters
----------------
+Linked adapters (combined 5' and 3' adapter)
+--------------------------------------------
+
+If your sequence of interest is “framed” by a 5' and a 3' adapter, and you want
+to remove both adapters, then you may want to use a linked adapter, which
+combines an anchored 5' adapter and a 3' adapter.
+
+Use ``-a ADAPTER1...ADAPTER2`` to search for a linked adapter. ADAPTER1 is
+interpreted as an anchored 5' adapter and ADAPTER2 as a regular 3' adapter.
+
+For a read to be trimmed at all, the 5' adapter must occur, but the 3'
+adapter is optional. In the statistics printed by the program, a read is counted
+as “trimmed” no matter whether the 5' adapter or both the 5' and 3' adapter
+occur.
-This is a combination of a 5' and a 3' adapter. Use ``-a ADAPTER1...ADAPTER2``
-to search for a linked adapter. ADAPTER1 is interpreted as an anchored 5'
-adapter, which is searched for first. Only if ADAPTER1 is found will then
-ADAPTER2 be searched for, which is a regular 3' adapter.
+As an example, assume the 5' adapter is *FIRST* and the 3' adapter is *SECOND*
+and you have these input reads::
-This feature is experimental and will probably break when used in combination
-with some other options, such as ``--info-file``, ``--mask-adapter``.
+ FIRSTMYSEQUENCESECONDEXTRABASES
+ FIRSTMYSEQUENCESEC
+ FIRSTMYSEQUE
+ ANOTHERREADSECOND
+
+Trimming with ::
+
+ cutadapt -a FIRST...SECOND input.fastq > output.fastq
+
+will result in ::
+
+ MYSEQUENCE
+ MYSEQUENCE
+ MYSEQUE
+ ANOTHERREADSECOND
+
+The 3' adapter in the last read is not trimmed because the read does not contain
+the 5' adapter. Note that ``FIRST`` is always an anchored 5' adapter (:ref:`see
+the previous section <anchored-5adapters>`) although there is no ``^`` character
+in the beginning.
+
+This feature does not work when used in combination with some other options,
+such as ``--info-file``, ``--mask-adapter``.
.. _anywhere-adapters:
@@ -487,8 +514,8 @@ Wildcards do not work in colorspace.
Repeated bases in the adapter sequence
--------------------------------------
-If you have many repeated bases in the adapter sequence, such as many ``N``s or
-many ``A``s, you do not have to spell them out. For example, instead of writing
+If you have many repeated bases in the adapter sequence, such as many ``N`` s or
+many ``A`` s, you do not have to spell them out. For example, instead of writing
ten ``A`` in a row (``AAAAAAAAAA``), write ``A{10}`` instead. The number within
the curly braces specifies how often the character that precedes it will be
repeated. This works also for IUPAC wildcard characters, as in ``N{5}``.
@@ -526,7 +553,7 @@ To remove the last seven bases of each read::
cutadapt -u -7 -o trimmed.fastq reads.fastq
The ``-u``/``--cut`` option can be combined with the other options, but
-the desired bases are removed *before* any adapter trimming.
+the ``--cut`` is applied *before* any adapter trimming.
.. _quality-trimming:
@@ -587,6 +614,21 @@ the trimming position. Therefore, the read is trimmed to the first four bases,
which have quality values 42, 40, 26, 27.
+Shortening reads to a fixed length
+----------------------------------
+
+To shorten each read down to a certain length, use the ``--length`` option or
+the short version ``-l``::
+
+ cutadapt -l 10 input.fastq > output.fastq
+
+This shortens all reads from ``input.fastq`` down to 10 bases. The removed bases
+are those on the 3' end.
+
+If you want to remove a fixed number of bases from each read, use
+:ref:`the --cut option instead <cut-bases>`.
+
+
Modifying read names
--------------------
@@ -629,13 +671,14 @@ each read. Steps not requested on the command-line are skipped.
1. Unconditional base removal with ``--cut``
2. Quality trimming (``-q``)
3. Adapter trimming (``-a``, ``-b``, ``-g`` and uppercase versions)
-4. N-end trimming (``--trim-n``)
-5. Length tag modification (``--length-tag``)
-6. Read name suffixe removal (``--strip-suffix``)
-7. Addition of prefix and suffix to read name (``-x``/``--prefix`` and ``-y``/``--suffix``)
-8. Double-encode the sequence (only colorspace)
-9. Replace negative quality values with zero (zero capping, only colorspace)
-10. Trim primer base (only colorspace)
+4. Read shortening (``--length``)
+5. N-end trimming (``--trim-n``)
+6. Length tag modification (``--length-tag``)
+7. Read name suffix removal (``--strip-suffix``)
+8. Addition of prefix and suffix to read name (``-x``/``--prefix`` and ``-y``/``--suffix``)
+9. Double-encode the sequence (only colorspace)
+10. Replace negative quality values with zero (zero capping, only colorspace)
+11. Trim primer base (only colorspace)
The last three steps are colorspace-specific.
@@ -760,6 +803,7 @@ The following command-line options are applied to *both* reads:
* ``--no-trim``
* ``--trim-n``
* ``--mask``
+* ``--length``
* ``--length-tag``
* ``--prefix``, ``--suffix``
* ``--strip-f3``
@@ -945,6 +989,10 @@ If your adapter sequences are all similar and differ only by a variable barcode
sequence, you should use a single adapter sequence instead that
:ref:`contains wildcard characters <wildcards>`.
+If you want to search for a combination of a 5' and a 3' adapter, you may want
+to provide them as a single so-called :ref:`"linked adapter" <linked-adapters>`
+instead.
+
.. _named-adapters:
@@ -1029,14 +1077,21 @@ once. So your sequence could look like this::
ADAPTERADAPTERADAPTERMYSEQUENCE
-To be on the safe side, you assume that there are at most 5 copies of the
+To be on the safe side, you assume that there are at most five copies of the
adapter sequence. This command can be used to trim the reads correctly::
- cutadapt -g ^ADAPTER -n 5 -o output.fastq input.fastq
+ cutadapt -g ^ADAPTER -n 5 -o output.fastq.gz input.fastq.gz
-This feature can also be used to search for *5'/3' linked adapters*. For example,
-when the 5' adapter is *FIRST* and the 3' adapter is *SECOND*, then the read
-could look like this::
+To search for a combination of a 5' and a 3' adapter, have a look
+at the :ref:`support for "linked adapters" <linked-adapters>` instead, which
+works better for that particular case because it allows you to require that
+the 3' adapter is trimmed only when the 5' adapter also occurs, and it cannot
+happen that the same adapter is trimmed twice.
+
+Before cutadapt supported linked adapters, the ``--times`` option was the
+recommended way to search for 5'/3' linked adapters. For completeness, we
+describe how it was done. For example, when the 5' adapter is *FIRST* and the
+3' adapter is *SECOND*, then the read could look like this::
FIRSTMYSEQUENCESECOND
@@ -1045,12 +1100,6 @@ following command can be used to trim such a read::
cutadapt -g ^FIRST -a SECOND -n 2 ...
-Support for linked adapters is currently incomplete. For example, it is not
-possible to specify that SECOND should only be trimmed when FIRST also occurs.
-`See also this feature
-request <https://code.google.com/p/cutadapt/issues/detail?id=34>`_, and
-comment on it if you would like to see this implemented.
-
.. _truseq:
@@ -1156,7 +1205,7 @@ Bisulfite sequencing (RRBS)
===========================
When trimming reads that come from a library prepared with the RRBS (reduced
-representation bisulfit sequencing) protocol, the last two 3' bases must be
+representation bisulfite sequencing) protocol, the last two 3' bases must be
removed in addition to the adapter itself. This can be achieved by using not
the adapter sequence itself, but by adding two wildcard characters to its
beginning. If the adapter sequence is ``ADAPTER``, the command for trimming
@@ -1212,7 +1261,7 @@ The next piece of information is this::
No. of allowed errors:
0-9 bp: 0; 10-19 bp: 1; 20 bp: 2
-The adapter has, as was shown above, has a length of 20
+The adapter, as was shown above, has a length of 20
characters. We are using the default error rate of 0.1. What this
implies is shown above: Matches up to a length of 9 bp are allowed to
have no errors. Matches of lengths 10-19 bp are allowed to have 1 error
diff --git a/doc/recipes.rst b/doc/recipes.rst
index e04ae06..7a85ea0 100644
--- a/doc/recipes.rst
+++ b/doc/recipes.rst
@@ -1,16 +1,13 @@
-=======
-Recipes
-=======
+=============
+Recipes (FAQ)
+=============
-For some trimming applications, the pre-defined adapter types behave differently
-from what you would like to have. In this section, we show some ways in which
-cutadapt can be made to behave in the desired way.
+This section gives answers to frequently asked questions. It shows you how to
+get cutadapt to do what you want it to do!
-.. note:: This section is still being written.
-
-Avoiding internal adapter matches
----------------------------------
+Avoid internal adapter matches
+------------------------------
To force matches to be at the end of the read and thus avoid internal
adapter matches, append a few ``X`` characters to the adapter sequence, like
@@ -20,8 +17,8 @@ the length of the adapter times the error rate. This is not the same as an
anchored 3' adapter since partial matches are still allowed.
-Removing more than one adapter
-------------------------------
+Remove more than one adapter
+----------------------------
If you want to remove a 5' and 3' adapter at the same time, :ref:`use the
support for linked adapters <linked-adapters>`.
@@ -47,8 +44,8 @@ as in this example::
cutadapt -g ^TTAAGGCC -g ^AAGCTTA input.fastq | cutadapt -a TACGGACT - > output.fastq
-Trimming poly-A tails
----------------------
+Trim poly-A tails
+-----------------
If you want to trim a poly-A tail from the 3' end of your reads, use the 3'
adapter type (``-a``) with an adapter sequence of many repeated ``A``
@@ -80,6 +77,29 @@ alternative, forcing the match to be located as much to the left as possible,
while still allowing for non-``A`` bases towards the end of the read.
+Trim a fixed number of bases after adapter trimming
+---------------------------------------------------
+
+If the adapters you want to remove are preceded by some unknown sequence (such
+as a random tag/molecular identifier), you can specify this as part of the
+adapter sequence in order to remove both in one go.
+
+For example, assume you want to trim Illumina adapters preceded by 10 bases
+that you want to trim as well. Instead of this command::
+
+ cutadapt -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC ...
+
+Use this command::
+
+ cutadapt -O 13 -a N{10}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC ...
+
+The ``-O 13`` is the minimum overlap for an adapter match, where the 13 is
+computed as 3 plus 10 (where 3 is the default minimum overlap and 10 is the
+length of the unknown section). If you do not specify it, the adapter sequence
+would match the end of every read (because ``N`` matches anything), and ten
+bases would then be removed from every read.
+
+
Other things (unfinished)
-------------------------
diff --git a/setup.py b/setup.py
index fa09b1c..dbb2001 100644
--- a/setup.py
+++ b/setup.py
@@ -116,6 +116,9 @@ class sdist(versioneer_sdist):
cmdclass['build_ext'] = build_ext
cmdclass['sdist'] = sdist
+with open('README.rst') as f:
+ long_description = f.read()
+
setup(
name = 'cutadapt',
version = versioneer.get_version(),
@@ -123,11 +126,13 @@ setup(
author_email = 'marcel.martin at scilifelab.se',
url = 'https://cutadapt.readthedocs.io/',
description = 'trim adapters from high-throughput sequencing reads',
+ long_description = long_description,
license = 'MIT',
cmdclass = cmdclass,
ext_modules = extensions,
packages = ['cutadapt', 'cutadapt.scripts'],
- scripts = ['bin/cutadapt'],
+ install_requires = ['xopen>=0.1.0'],
+ entry_points = {'console_scripts': ['cutadapt = cutadapt.scripts.cutadapt:main']},
classifiers = [
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
diff --git a/tests/cut/shortened.fastq b/tests/cut/shortened.fastq
new file mode 100644
index 0000000..e3fea10
--- /dev/null
+++ b/tests/cut/shortened.fastq
@@ -0,0 +1,12 @@
+ at prefix:1_13_573/1
+CGTCC
++
+)3%)&
+ at prefix:1_13_1259/1
+AGCCG
++
+;<:&:
+ at prefix:1_13_1440/1
+CAAGA
++
+<=A:A
diff --git a/tests/cut/small-no-trim.fasta b/tests/cut/small-no-trim.fasta
new file mode 100644
index 0000000..14003a6
--- /dev/null
+++ b/tests/cut/small-no-trim.fasta
@@ -0,0 +1,6 @@
+>prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGATTAGACAAAT
+>prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT
+>prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
diff --git a/tests/testmodifiers.py b/tests/testmodifiers.py
index 5101755..93e009e 100644
--- a/tests/testmodifiers.py
+++ b/tests/testmodifiers.py
@@ -2,7 +2,8 @@
from __future__ import print_function, division, absolute_import
from cutadapt.seqio import Sequence
-from cutadapt.modifiers import UnconditionalCutter, NEndTrimmer, QualityTrimmer
+from cutadapt.modifiers import (UnconditionalCutter, NEndTrimmer, QualityTrimmer,
+ Shortener)
def test_unconditional_cutter():
uc = UnconditionalCutter(length=5)
@@ -34,3 +35,19 @@ def test_quality_trimmer():
qt = QualityTrimmer(10, 0, 33)
assert qt(read) == Sequence('read1', 'GTTTACGTA', '456789###')
+
+
+def test_shortener():
+ read = Sequence('read1', 'ACGTTTACGTA', '##456789###')
+
+ shortener = Shortener(0)
+ assert shortener(read) == Sequence('read1', '', '')
+
+ shortener = Shortener(1)
+ assert shortener(read) == Sequence('read1', 'A', '#')
+
+ shortener = Shortener(5)
+ assert shortener(read) == Sequence('read1', 'ACGTT', '##456')
+
+ shortener = Shortener(100)
+ assert shortener(read) == read
diff --git a/tests/tests.py b/tests/tests.py
index 12bc2ae..21f70e1 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -383,6 +383,14 @@ def test_fasta():
run('-a TTAGACATATCTCCGTCG', 'small.fasta', 'small.fastq')
+def test_fasta_no_trim():
+ run([], 'small-no-trim.fasta', 'small.fastq')
+
+
def test_issue_202():
"""Ensure --length-tag= also modifies the second header line"""
run('-a GGCTTC --length-tag=length=', 'SRR2040271_1.fastq', 'SRR2040271_1.fastq')
+
+
+def test_length():
+ run('--length 5', 'shortened.fastq', 'small.fastq')
diff --git a/tests/testxopen.py b/tests/testxopen.py
deleted file mode 100644
index 2d714c4..0000000
--- a/tests/testxopen.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# coding: utf-8
-from __future__ import print_function, division, absolute_import
-import gzip
-import os
-import random
-import sys
-from nose.tools import raises
-from cutadapt.xopen import xopen, lzma
-from .utils import temporary_path
-
-base = "tests/data/small.fastq"
-files = [ base + ext for ext in ['', '.gz', '.bz2' ] ]
-if lzma is not None:
- files.append(base + '.xz')
-
-def test_context_manager():
- major, minor = sys.version_info[0:2]
- for name in files:
- if major == 2 and minor == 6:
- continue # Py26 compression libraries do not support context manager protocol.
- with xopen(name, 'rt') as f:
- lines = list(f)
- assert len(lines) == 12
- assert lines[5] == 'AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT\n', name
- f.close()
-
-def test_append():
- for ext in ["", ".gz"]: # BZ2 does NOT support append
- text = "AB"
- if ext != "":
- text = text.encode("utf-8") # On Py3, need to send BYTES, not unicode
- reference = text + text
- print("Trying ext=%s" % ext)
- with temporary_path('truncated.fastq' + ext) as path:
- try:
- os.unlink(path)
- except OSError:
- pass
- with xopen(path, 'a') as f:
- f.write(text)
- with xopen(path, 'a') as f:
- f.write(text)
- with xopen(path, 'r') as f:
- for appended in f:
- pass
- try:
- reference = reference.decode("utf-8")
- except AttributeError:
- pass
- print(appended)
- print(reference)
- assert appended == reference
-
-def test_xopen_text():
- for name in files:
- f = xopen(name, 'rt')
- lines = list(f)
- assert len(lines) == 12
- assert lines[5] == 'AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT\n', name
- f.close()
-
-
-def test_xopen_binary():
- for name in files:
- f = xopen(name, 'rb')
- lines = list(f)
- assert len(lines) == 12
- assert lines[5] == b'AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT\n', name
- f.close()
-
-
-def create_truncated_file(path):
- # Random text
- text = ''.join(random.choice('ABCDEFGHIJKLMNOPQRSTUVWXYZ') for _ in range(200))
- f = xopen(path, 'w')
- f.write(text)
- f.close()
- f = open(path, 'a')
- f.truncate(os.stat(path).st_size - 10)
- f.close()
-
-
-# Disable these tests in Python 3.2 and 3.3
-if not ((3, 2) <= sys.version_info[:2] <= (3, 3)):
- @raises(EOFError)
- def test_truncated_gz():
- with temporary_path('truncated.gz') as path:
- create_truncated_file(path)
- f = xopen(path, 'r')
- f.read()
- f.close()
-
-
- @raises(EOFError)
- def test_truncated_gz_iter():
- with temporary_path('truncated.gz') as path:
- create_truncated_file(path)
- f = xopen(path, 'r')
- for line in f:
- pass
- f.close()
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-cutadapt.git
More information about the debian-med-commit
mailing list