[med-svn] [cutadapt] 01/02: Imported Upstream version 1.10
Andreas Tille
tille at debian.org
Mon Jun 20 13:12:14 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository cutadapt.
commit 3967441bfe20edf8ef01b759dde762a87416909d
Author: Andreas Tille <tille at debian.org>
Date: Mon Jun 20 15:11:12 2016 +0200
Imported Upstream version 1.10
---
.gitignore | 16 +
.travis.yml | 20 +
CHANGES.rst | 355 +++++++
CITATION | 16 +
Dockerfile | 17 +
LICENSE | 19 +
MANIFEST.in | 17 +
README.rst | 41 +
bin/_preamble.py | 21 +
bin/cutadapt | 10 +
cutadapt/__init__.py | 23 +
cutadapt/_align.pyx | 533 +++++++++++
cutadapt/_qualtrim.pyx | 84 ++
cutadapt/_seqio.pyx | 138 +++
cutadapt/adapters.py | 569 +++++++++++
cutadapt/align.py | 35 +
cutadapt/colorspace.py | 83 ++
cutadapt/compat.py | 45 +
cutadapt/filters.py | 256 +++++
cutadapt/modifiers.py | 275 ++++++
cutadapt/qualtrim.py | 70 ++
cutadapt/report.py | 296 ++++++
cutadapt/scripts/__init__.py | 0
cutadapt/scripts/cutadapt.py | 726 ++++++++++++++
cutadapt/seqio.py | 756 +++++++++++++++
cutadapt/xopen.py | 182 ++++
doc/Makefile | 179 ++++
doc/_static/adapters.svg | 259 +++++
doc/_static/logo.svg | 94 ++
doc/changes.rst | 1 +
doc/colorspace.rst | 128 +++
doc/conf.py | 270 ++++++
doc/guide.rst | 1373 +++++++++++++++++++++++++++
doc/ideas.rst | 103 ++
doc/index.rst | 25 +
doc/installation.rst | 127 +++
doc/recipes.rst | 83 ++
setup.py | 148 +++
tests/.gitignore | 3 +
tests/__init__.py | 0
tests/cut/454.fa | 118 +++
tests/cut/anchored-back.fasta | 8 +
tests/cut/anchored.fasta | 8 +
tests/cut/anchored_no_indels.fasta | 12 +
tests/cut/anchored_no_indels_wildcard.fasta | 12 +
tests/cut/anywhere_repeat.fastq | 28 +
tests/cut/discard-untrimmed.fastq | 4 +
tests/cut/discard.fastq | 4 +
tests/cut/dos.fastq | 12 +
tests/cut/empty.fastq | 0
tests/cut/example.fa | 18 +
tests/cut/examplefront.fa | 18 +
tests/cut/illumina.fastq | 400 ++++++++
tests/cut/illumina.info.txt | 100 ++
tests/cut/illumina5.fastq | 20 +
tests/cut/illumina5.info.txt | 8 +
tests/cut/illumina64.fastq | 80 ++
tests/cut/interleaved.fastq | 16 +
tests/cut/issue46.fasta | 2 +
tests/cut/linked.fasta | 10 +
tests/cut/lowercase.fastq | 12 +
tests/cut/lowqual.fastq | 8 +
tests/cut/maxlen.fa | 14 +
tests/cut/maxn0.2.fasta | 6 +
tests/cut/maxn0.4.fasta | 8 +
tests/cut/maxn0.fasta | 4 +
tests/cut/maxn1.fasta | 8 +
tests/cut/maxn2.fasta | 10 +
tests/cut/minlen.fa | 16 +
tests/cut/minlen.noprimer.fa | 14 +
tests/cut/nextseq.fastq | 8 +
tests/cut/no-trim.fastq | 4 +
tests/cut/no_indels.fasta | 18 +
tests/cut/overlapa.fa | 40 +
tests/cut/overlapb.fa | 38 +
tests/cut/paired-filterboth.1.fastq | 16 +
tests/cut/paired-filterboth.2.fastq | 16 +
tests/cut/paired-m27.1.fastq | 16 +
tests/cut/paired-m27.2.fastq | 16 +
tests/cut/paired-onlyA.1.fastq | 16 +
tests/cut/paired-onlyA.2.fastq | 16 +
tests/cut/paired-separate.1.fastq | 16 +
tests/cut/paired-separate.2.fastq | 16 +
tests/cut/paired-too-short.1.fastq | 4 +
tests/cut/paired-too-short.2.fastq | 4 +
tests/cut/paired-trimmed.1.fastq | 12 +
tests/cut/paired-trimmed.2.fastq | 12 +
tests/cut/paired-untrimmed.1.fastq | 4 +
tests/cut/paired-untrimmed.2.fastq | 4 +
tests/cut/paired.1.fastq | 12 +
tests/cut/paired.2.fastq | 12 +
tests/cut/paired.m14.1.fastq | 12 +
tests/cut/paired.m14.2.fastq | 12 +
tests/cut/pairedq.1.fastq | 8 +
tests/cut/pairedq.2.fastq | 8 +
tests/cut/pairedu.1.fastq | 16 +
tests/cut/pairedu.2.fastq | 16 +
tests/cut/plus.fastq | 8 +
tests/cut/polya.fasta | 2 +
tests/cut/rest.fa | 18 +
tests/cut/restfront.fa | 18 +
tests/cut/s_1_sequence.txt | 8 +
tests/cut/small.fasta | 6 +
tests/cut/small.fastq | 12 +
tests/cut/small.trimmed.fastq | 8 +
tests/cut/small.untrimmed.fastq | 4 +
tests/cut/solid-no-zerocap.fastq | 120 +++
tests/cut/solid.fasta | 4 +
tests/cut/solid.fastq | 120 +++
tests/cut/solid5p-anchored.fasta | 32 +
tests/cut/solid5p-anchored.fastq | 64 ++
tests/cut/solid5p-anchored.notrim.fasta | 32 +
tests/cut/solid5p-anchored.notrim.fastq | 64 ++
tests/cut/solid5p.fasta | 32 +
tests/cut/solid5p.fastq | 64 ++
tests/cut/solidbfast.fastq | 120 +++
tests/cut/solidmaq.fastq | 120 +++
tests/cut/solidqual.fastq | 120 +++
tests/cut/sra.fastq | 24 +
tests/cut/stripped.fasta | 4 +
tests/cut/suffix.fastq | 120 +++
tests/cut/trimN3.fasta | 2 +
tests/cut/trimN5.fasta | 2 +
tests/cut/twoadapters.fasta | 6 +
tests/cut/twoadapters.first.fasta | 2 +
tests/cut/twoadapters.second.fasta | 2 +
tests/cut/twoadapters.unknown.fasta | 2 +
tests/cut/unconditional-back.fastq | 12 +
tests/cut/unconditional-both.fastq | 12 +
tests/cut/unconditional-front.fastq | 12 +
tests/cut/wildcard.fa | 4 +
tests/cut/wildcardN.fa | 6 +
tests/cut/wildcard_adapter.fa | 8 +
tests/cut/wildcard_adapter_anywhere.fa | 8 +
tests/data/454.fa | 118 +++
tests/data/E3M.fasta | 59 ++
tests/data/E3M.qual | 59 ++
tests/data/adapter.fasta | 4 +
tests/data/anchored-back.fasta | 8 +
tests/data/anchored.fasta | 8 +
tests/data/anchored_no_indels.fasta | 12 +
tests/data/anywhere_repeat.fastq | 28 +
tests/data/dos.fastq | 12 +
tests/data/empty.fastq | 0
tests/data/example.fa | 18 +
tests/data/illumina.fastq.gz | Bin 0 -> 7161 bytes
tests/data/illumina5.fastq | 20 +
tests/data/illumina64.fastq | 80 ++
tests/data/interleaved.fastq | 32 +
tests/data/issue46.fasta | 2 +
tests/data/lengths.fa | 28 +
tests/data/linked.fasta | 10 +
tests/data/lowqual.fastq | 8 +
tests/data/maxn.fasta | 12 +
tests/data/multiblock.fastq.gz | Bin 0 -> 262 bytes
tests/data/nextseq.fastq | 8 +
tests/data/no_indels.fasta | 20 +
tests/data/overlapa.fa | 40 +
tests/data/overlapb.fa | 38 +
tests/data/paired.1.fastq | 16 +
tests/data/paired.2.fastq | 16 +
tests/data/plus.fastq | 8 +
tests/data/polya.fasta | 6 +
tests/data/prefix-adapter.fasta | 2 +
tests/data/rest.fa | 18 +
tests/data/rest.txt | 5 +
tests/data/restfront.txt | 6 +
tests/data/s_1_sequence.txt.gz | Bin 0 -> 97 bytes
tests/data/simple.fasta | 7 +
tests/data/simple.fastq | 8 +
tests/data/small.fastq | 12 +
tests/data/small.fastq.bz2 | Bin 0 -> 222 bytes
tests/data/small.fastq.gz | Bin 0 -> 218 bytes
tests/data/small.fastq.xz | Bin 0 -> 260 bytes
tests/data/small.myownextension | 12 +
tests/data/solid.csfasta | 63 ++
tests/data/solid.fasta | 4 +
tests/data/solid.fastq | 120 +++
tests/data/solid.qual | 63 ++
tests/data/solid5p.fasta | 34 +
tests/data/solid5p.fastq | 64 ++
tests/data/sra.fastq | 24 +
tests/data/suffix-adapter.fasta | 2 +
tests/data/toolong.fa | 14 +
tests/data/tooshort.fa | 12 +
tests/data/tooshort.noprimer.fa | 14 +
tests/data/trimN3.fasta | 2 +
tests/data/trimN5.fasta | 2 +
tests/data/twoadapters.fasta | 6 +
tests/data/wildcard.fa | 4 +
tests/data/wildcardN.fa | 6 +
tests/data/wildcard_adapter.fa | 8 +
tests/data/withplus.fastq | 8 +
tests/testadapters.py | 125 +++
tests/testalign.py | 123 +++
tests/testcolorspace.py | 140 +++
tests/testfilters.py | 42 +
tests/testmodifiers.py | 36 +
tests/testpaired.py | 273 ++++++
tests/testqualtrim.py | 14 +
tests/tests.py | 383 ++++++++
tests/testseqio.py | 352 +++++++
tests/testtrim.py | 27 +
tests/testxopen.py | 101 ++
tests/utils.py | 50 +
tox.ini | 6 +
206 files changed, 12810 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3b6890e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+*.pyc
+MANIFEST
+build/
+dist/
+.coverage
+*~
+.tox
+galaxy/package/
+.pydevproject
+.project
+.settings
+cutadapt/_*.c
+cutadapt/*.so
+doc/_build
+*.pyo
+.idea/
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..4fb5846
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,20 @@
+sudo: false
+language: python
+cache:
+ directories:
+ - $HOME/.cache/pip
+python:
+ - "2.6"
+ - "2.7"
+ - "3.3"
+ - "3.4"
+ - "3.5"
+
+install:
+ - pip install --upgrade pip wheel
+ - pip install Cython
+ - pip install .
+
+script:
+ - nosetests -P tests
+
diff --git a/CHANGES.rst b/CHANGES.rst
new file mode 100644
index 0000000..c3c97b2
--- /dev/null
+++ b/CHANGES.rst
@@ -0,0 +1,355 @@
+=======
+Changes
+=======
+
+v1.10
+-----
+
+* Added a new “linked adapter” type, which can be used to search for a 5' and a
+ 3' adapter at the same time. Use ``-a ADAPTER1...ADAPTER2`` to search
+ for a linked adapter. ADAPTER1 is interpreted as an anchored 5' adapter, which
+ is searched for first. Only if ADAPTER1 is found will ADAPTER2 be searched
+ for, which is a regular 3' adapter.
+* Added experimental ``--nextseq-trim`` option for quality trimming of NextSeq
+ data. This is necessary because that machine cannot distinguish between G and
+ reaching the end of the fragment (it encodes G as 'black').
+* Even when trimming FASTQ files, output can now be FASTA (quality values are
+ simply dropped). Use the ``-o``/``-p`` options with a file name that ends in
+ ``.fasta`` or ``.fa`` to enable this.
+* Cutadapt does not bundle pre-compiled C extension modules (``.so`` files)
+ anymore. This affects only users that run cutadapt directly from an unpacked
+ tarball. Install through ``pip`` or ``conda`` instead.
+* Fix issue #167: Option ``--quiet`` was not entirely quiet.
+* Fix issue #199: Be less strict when checking for properly-paired reads.
+* This is the last version of cutadapt to support Python 2.6. Future versions
+ will require at least Python 2.7.
+
+v1.9.1
+------
+
+* Added ``--pair-filter`` option, which :ref:`modifies how filtering criteria
+ apply to paired-end reads <filtering-paired>`
+* Add ``--too-short-paired-output`` and ``--too-long-paired-output`` options.
+* Fix incorrect number of trimmed bases reported if ``--times`` option was used.
+
+v1.9
+----
+
+* Indels in the alignment can now be disabled for all adapter types (use
+ ``--no-indels``).
+* Quality values are now printed in the info file (``--info-file``)
+ when trimming FASTQ files. Fixes issue #144.
+* Options ``--prefix`` and ``--suffix``, which modify read names, now accept the
+ placeholder ``{name}`` and will replace it with the name of the found adapter.
+ Fixes issue #104.
+* Interleaved FASTQ files: With the ``--interleaved`` switch, paired-end reads
+ will be read from and written to interleaved FASTQ files. Fixes issue #113.
+* Anchored 5' adapters can now be specified by writing ``-a SEQUENCE...`` (note
+ the three dots).
+* Fix ``--discard-untrimmed`` and ``--discard-trimmed`` not working as expected
+ in paired-end mode (issue #146).
+* The minimum overlap is now automatically reduced to the adapter length if it
+ is too large. Fixes part of issue #153.
+* Thanks to Wolfgang Gerlach, there is now a Dockerfile.
+* The new ``--debug`` switch makes cutadapt print out the alignment matrix.
+
+v1.8.3
+------
+
+* Fix issue #95: Untrimmed reads were not listed in the info file.
+* Fix issue #138: pip install cutadapt did not work with new setuptools versions.
+* Fix issue #137: Avoid a hang when writing to two or more gzip-compressed
+ output files in Python 2.6.
+
+v1.8.1
+------
+
+* Fix #110: Counts for 'too short' and 'too long' reads were swapped in statistics.
+* Fix #115: Make ``--trim-n`` work also on second read for paired-end data.
+
+v1.8
+----
+
+* Support single-pass paired-end trimming with the new ``-A``/``-G``/``-B``/``-U``
+ parameters. These work just like their -a/-g/-b/-u counterparts, but they
+ specify sequences that are removed from the *second read* in a pair.
+
+ Also, if you start using one of those options, the read modification options
+ such as ``-q`` (quality trimming) are applied to *both* reads. For backwards
+ compatibility, read modifications are applied to the first read only if
+ neither of ``-A``/``-G``/``-B``/``-U`` is used. See `the
+ documentation <http://cutadapt.readthedocs.org/en/latest/guide.html#paired-end>`_
+ for details.
+
+ This feature has not been extensively tested, so please give feedback if
+ something does not work.
+* The report output has been re-worked in order to accommodate the new paired-end
+ trimming mode. This also changes the way the report looks in single-end
+ mode. It is hopefully now more accessible.
+* Chris Mitchell contributed a patch adding two new options: ``--trim-n``
+ removes any ``N`` bases from the read ends, and the ``--max-n`` option can be
+ used to filter out reads with too many ``N``.
+* Support notation for repeated bases in the adapter sequence: Write ``A{10}``
+ instead of ``AAAAAAAAAA``. Useful for poly-A trimming: Use ``-a A{100}`` to
+ get the longest possible tail.
+* Quality trimming at the 5' end of reads is now supported. Use ``-q 15,10`` to
+ trim the 5' end with a cutoff of 15 and the 3' end with a cutoff of 10.
+* Fix incorrectly reported statistics (> 100% trimmed bases) when ``--times``
+ set to a value greater than one.
+* Support .xz-compressed files (if running in Python 3.3 or later).
+* Started to use the GitHub issue tracker instead of Google Code. All old issues
+ have been moved.
+
+v1.7
+----
+* IUPAC characters are now supported. For example, use ``-a YACGT`` for an
+ adapter that matches both ``CACGT`` and ``TACGT`` with zero errors. Disable
+ with ``-N``. By default, IUPAC characters in the read are not interpreted in
+ order to avoid matches in reads that consist of many (low-quality) ``N``
+ bases. Use ``--match-read-wildcards`` to enable them also in the read.
+* Support for demultiplexing was added. This means that reads can be written to
+ different files depending on which adapter was found. See `the section in the
+ documentation <http://cutadapt.readthedocs.org/en/latest/guide.html#demultiplexing>`_
+ for how to use it. This is currently only supported for single-end reads.
+* Add support for anchored 3' adapters. Append ``$`` to the adapter sequence to
+ force the adapter to appear in the end of the read (as a suffix). Closes
+ issue #81.
+* Option ``--cut`` (``-u``) can now be specified twice, once for each end of the
+ read. Thanks to Rasmus Borup Hansen for the patch!
+* Options ``--minimum-length``/``--maximum-length`` (``-m``/``-M``) can be used
+ standalone. That is, cutadapt can be used to filter reads by length without
+ trimming adapters.
+* Fix bug: Adapters read from a FASTA file can now be anchored.
+
+v1.6
+----
+* Fix bug: Ensure ``--format=...`` can be used even with paired-end input.
+* Fix bug: Sometimes output files would be incomplete because they were not
+ closed correctly.
+* Alignment algorithm is a tiny bit faster.
+* Extensive work on the documentation. It's now available at
+ https://cutadapt.readthedocs.org/ .
+* For 3' adapters, statistics about the bases preceding the trimmed adapter
+ are collected and printed. If one of the bases is overrepresented, a warning
+ is shown since this points to an incomplete adapter sequence. This happens,
+ for example, when a TruSeq adapter is used but the A overhang is not taken
+ into account when running cutadapt.
+* Due to code cleanup, there is a change in behavior: If you use
+ ``--discard-trimmed`` or ``--discard-untrimmed`` in combination with
+ ``--too-short-output`` or ``--too-long-output``, then cutadapt now writes also
+ the discarded reads to the output files given by the ``--too-short`` or
+ ``--too-long`` options. If anyone complains, I will consider reverting this.
+* Galaxy support files are now in `a separate
+ repository <https://bitbucket.org/lance_parsons/cutadapt_galaxy_wrapper>`_.
+
+v1.5
+----
+* Adapter sequences can now be read from a FASTA file. For example, write
+ ``-a file:adapters.fasta`` to read 3' adapters from ``adapters.fasta``. This works
+ also for ``-b`` and ``-g``.
+* Add the option ``--mask-adapter``, which can be used to not remove adapters,
+ but to instead mask them with ``N`` characters. Thanks to Vittorio Zamboni
+ for contributing this feature!
+* U characters in the adapter sequence are automatically converted to T.
+* Do not run Cython at installation time unless the --cython option is provided.
+* Add the option -u/--cut, which can be used to unconditionally remove a number
+ of bases from the beginning or end of each read.
+* Make ``--zero-cap`` the default for colorspace reads.
+* When the new option ``--quiet`` is used, no report is printed after all reads
+ have been processed.
+* When processing paired-end reads, cutadapt now checks whether the reads are
+ properly paired.
+* To properly handle paired-end reads, an option --untrimmed-paired-output was
+ added.
+
+v1.4
+----
+* This release of cutadapt reduces the overhead of reading and writing files.
+ On my test data set, a typical run of cutadapt (with a single adapter) takes
+ 40% less time due to the following two changes.
+* Reading and writing of FASTQ files is faster (thanks to Cython).
+* Reading and writing of gzipped files is faster (up to 2x) on systems
+ where the ``gzip`` program is available.
+* The quality trimming function is four times faster (also due to Cython).
+* Fix the statistics output for 3' colorspace adapters: The reported lengths were one
+ too short. Thanks to Frank Wessely for reporting this.
+* Support the ``--no-indels`` option. This disallows insertions and deletions while
+ aligning the adapter. Currently, the option is only available for anchored 5' adapters.
+ This fixes issue 69.
+* As a side effect of implementing the --no-indels option: For colorspace, the
+ length of a read (for ``--minimum-length`` and ``--maximum-length``) is now computed after
+ primer base removal (when ``--trim-primer`` is specified).
+* Added one column to the info file that contains the name of the found adapter.
+* Add an explanation about colorspace ambiguity to the README
+
+v1.3
+----
+* Preliminary paired-end support with the ``--paired-output`` option (contributed by
+ James Casbon). See the README section on how to use it.
+* Improved statistics.
+* Fix incorrectly reported amount of quality-trimmed Mbp (issue 57, fix by Chris Penkett)
+* Add the ``--too-long-output`` option.
+* Add the ``--no-trim`` option, contributed by Dave Lawrence.
+* Port handwritten C alignment module to Cython.
+* Fix the ``--rest-file`` option (issue 56)
+* Slightly speed up alignment of 5' adapters.
+* Support bzip2-compressed files.
+
+v1.2
+----
+* At least 25% faster processing of .csfasta/.qual files due to faster parser.
+* Between 10% and 30% faster writing of gzip-compressed output files.
+* Support 5' adapters in colorspace, even when no primer trimming is requested.
+* Add the ``--info-file`` option, which has a line for each found adapter.
+* Named adapters are possible. Usage: ``-a My_Adapter=ACCGTA`` assigns the name "My_adapter".
+* Improve alignment algorithm for better poly-A trimming when there are sequencing errors.
+ Previously, not the longest possible poly-A tail would be trimmed.
+* James Casbon contributed the ``--discard-untrimmed`` option.
+
+v1.1
+----
+* Allow to "anchor" 5' adapters (``-g``), forcing them to be a prefix of the read.
+ To use this, add the special character ``^`` to the beginning of the adapter sequence.
+* Add the "-N" option, which allows 'N' characters within adapters to match literally.
+* Speedup of approx. 25% when reading from .gz files and using Python 2.7.
+* Allow to only trim qualities when no adapter is given on the command-line.
+* Add a patch by James Casbon: include read names (ids) in rest file
+* Use nosetest for testing. To run, install nose and run "nosetests".
+* When using cutadapt without installing it, you now need to run ``bin/cutadapt`` due to
+ a new directory layout.
+* Allow to give a colorspace adapter in basespace (gets automatically converted).
+* Allow to search for 5' adapters (those specified with ``-g``) in colorspace.
+* Speed up the alignment by a factor of at least 3 by using Ukkonen's algorithm.
+ The total runtime decreases by about 30% in the tested cases.
+* allow to deal with colorspace FASTQ files from the SRA that contain a fake
+ additional quality in the beginning (use ``--format sra-fastq``)
+
+v1.0
+----
+* ASCII-encoded quality values were assumed to be encoded as ascii(quality+33).
+ With the new parameter ``--quality-base``, this can be changed to ascii(quality+64),
+ as used in some versions of the Illumina pipeline. (Fixes issue 7.)
+* Allow to specify that adapters were ligated to the 5' end of reads. This change
+ is based on a patch contributed by James Casbon.
+* Due to cutadapt being published in EMBnet.journal, I found it appropriate
+ to call this release version 1.0. Please see
+ http://journal.embnet.org/index.php/embnetjournal/article/view/200 for the
+ article and I would be glad if you cite it.
+* Add Galaxy support, contributed by Lance Parsons.
+* Patch by James Casbon: Allow N wildcards in read or adapter or both.
+ Wildcard matching of 'N's in the adapter is always done. If 'N's within reads
+ should also match without counting as error, this needs to be explicitly
+ requested via ``--match-read-wildcards``.
+
+v0.9.5
+------
+* Fix issue 20: Make the report go to standard output when ``-o``/``--output`` is
+ specified.
+* Recognize `.fq` as an extension for FASTQ files
+* many more unit tests
+* The alignment algorithm has changed. It will now find some adapters that
+ previously were missed. Note that this will produce different output than
+ older cutadapt versions!
+
+ Before this change, finding an adapter would work as follows:
+
+ - Find an alignment between adapter and read -- longer alignments are
+ better.
+ - If the number of errors in the alignment (divided by length) is above the
+ maximum error rate, report the adapter as not being found.
+
+ Sometimes, the long alignment that is found had too many errors, but a
+ shorter alignment would not. The adapter was then incorrectly seen as "not
+ found". The new alignment algorithm checks the error rate while aligning and only
+ reports alignments that do not have too many errors.
+
+v0.9.4
+------
+* now compatible with Python 3
+* Add the ``--zero-cap`` option, which changes negative quality values to zero.
+ This is a workaround to avoid segmentation faults in BWA. The option is now
+ enabled by default when ``--bwa``/``--maq`` is used.
+* Lots of unit tests added. Run them with ``cd tests && ./tests.sh``.
+* Fix issue 16: ``--discard-trimmed`` did not work.
+* Allow to override auto-detection of input file format with the new ``-f``/``--format``
+ parameter. This mostly fixes issue 12.
+* Don't break when input file is empty.
+
+v0.9.2
+------
+* Install a single ``cutadapt`` Python package instead of multiple Python
+ modules. This avoids cluttering the global namespace and should lead to fewer
+ problems with other Python modules. Thanks to Steve Lianoglou for
+ pointing this out to me!
+* ignore case (ACGT vs acgt) when comparing the adapter with the read sequence
+* .FASTA/.QUAL files (not necessarily colorspace) can now be read (some
+ 454 software uses this format)
+* Move some functions into their own modules
+* lots of refactoring: replace the fasta module with a much nicer seqio module.
+* allow to input FASTA/FASTQ on standard input (also FASTA/FASTQ is
+ autodetected)
+
+v0.9
+----
+* add ``--too-short-output`` and ``--untrimmed-output``, based on patch by Paul Ryvkin (thanks!)
+* add ``--maximum-length`` parameter: discard reads longer than a specified length
+* group options by category in ``--help`` output
+* add ``--length-tag`` option. allows to fix read length in FASTA/Q comment lines
+ (e.g., ``length=123`` becomes ``length=58`` after trimming) (requested by Paul Ryvkin)
+* add ``-q``/``--quality-cutoff`` option for trimming low-quality ends (uses the same algorithm
+ as BWA)
+* some refactoring
+* the filename ``-`` is now interpreted as standard in or standard output
+
+v0.8
+----
+* Change default behavior of searching for an adapter: The adapter is now assumed to
+ be an adapter that has been ligated to the 3' end. This should be the correct behavior
+ for at least the SOLiD small RNA protocol (SREK) and also for the Illumina protocol.
+ To get the old behavior, which uses a heuristic to determine whether the adapter was
+ ligated to the 5' or 3' end and then trimmed the read accordingly, use the new
+ ``-b`` (``--anywhere``) option.
+* Clear up how the statistics after processing all reads are printed.
+* Fix incorrect statistics. Adapters starting at pos. 0 were correctly trimmed,
+ but not counted.
+* Modify scoring scheme: Improves trimming (some reads that should have been
+ trimmed were not). Increases no. of trimmed reads in one of our SOLiD data sets
+ from 36.5 to 37.6%.
+* Speed improvements (20% less runtime on my test data set).
+
+v0.7
+----
+* Useful exit codes
+* Better error reporting when malformed files are encountered
+* Add ``--minimum-length`` parameter for discarding reads that are shorter than
+ a specified length after trimming.
+* Generalize the alignment function a bit. This is preparation for
+ supporting adapters that are specific to either the 5' or 3' end.
+* pure Python fallback for alignment function for when the C module cannot
+ be used.
+
+v0.6
+----
+* Support gzipped input and output.
+* Print timing information in statistics.
+
+v0.5
+----
+* add ``--discard`` option which makes cutadapt discard reads in which an adapter occurs
+
+v0.4
+----
+* (more) correctly deal with multiple adapters: If a long adapter matches with lots of
+ errors, then this could lead to a shorter adapter matching with few errors getting ignored.
+
+v0.3
+----
+* fix huge memory usage (entire input file was unintentionally read into memory)
+
+v0.2
+----
+* allow FASTQ input
+
+v0.1
+----
+* initial release
diff --git a/CITATION b/CITATION
new file mode 100644
index 0000000..a1e62e2
--- /dev/null
+++ b/CITATION
@@ -0,0 +1,16 @@
+Marcel Martin. Cutadapt removes adapter sequences from high-throughput sequencing reads.
+EMBnet.journal, 17(1):10-12, May 2011.
+DOI: http://dx.doi.org/10.14806/ej.17.1.200
+
+@ARTICLE{Martin2011Cutadapt,
+ author = {Marcel Martin},
+ title = {Cutadapt removes adapter sequences from high-throughput sequencing reads},
+ journal = {EMBnet.journal},
+ year = 2011,
+ month = may,
+ volume = 17,
+ pages = {10--12},
+ number = 1,
+ doi = {http://dx.doi.org/10.14806/ej.17.1.200},
+ url = {http://journal.embnet.org/index.php/embnetjournal/article/view/200}
+}
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..e0d8145
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,17 @@
+FROM debian:jessie
+
+RUN apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+ python2.7-dev \
+ cython
+
+ADD . /cutadapt/
+
+RUN cd /cutadapt/ && python setup.py install && python setup.py build_ext -i
+
+ENTRYPOINT ["/cutadapt/bin/cutadapt"]
+CMD ["--help"]
+
+# git clone https://github.com/marcelm/cutadapt.git
+# cd cutadapt
+# docker build -t marcelm/cutadapt:latest .
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..df04e21
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2010-2016 Marcel Martin <marcel.martin at scilifelab.se>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..7d8f3b2
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,17 @@
+# documentation
+include README.rst
+include CHANGES.rst
+include CITATION
+include LICENSE
+include doc/*.rst
+include doc/conf.py
+include doc/Makefile
+include cutadapt/*.pyx
+include cutadapt/_align.c
+include cutadapt/_qualtrim.c
+include cutadapt/_seqio.c
+include bin/_preamble.py
+include tests/test*.py
+include tests/utils.py
+graft tests/data
+graft tests/cut
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..fcae283
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,41 @@
+.. image:: https://travis-ci.org/marcelm/cutadapt.svg?branch=master
+ :target: https://travis-ci.org/marcelm/cutadapt
+
+.. image:: https://img.shields.io/pypi/v/cutadapt.svg?branch=master
+ :target: https://pypi.python.org/pypi/cutadapt
+
+========
+cutadapt
+========
+
+Cutadapt finds and removes adapter sequences, primers, poly-A tails and other
+types of unwanted sequence from your high-throughput sequencing reads.
+
+Cleaning your data in this way is often required: Reads from small-RNA
+sequencing contain the 3’ sequencing adapter because the read is longer than
+the molecule that is sequenced. Amplicon reads start with a primer sequence.
+Poly-A tails are useful for pulling out RNA from your sample, but often you
+don’t want them to be in your reads.
+
+Cutadapt helps with these trimming tasks by finding the adapter or primer
+sequences in an error-tolerant way. It can also modify and filter reads in
+various ways. Adapter sequences can contain IUPAC wildcard characters. Also,
+paired-end reads and even colorspace data are supported. If you want, you can
+also just demultiplex your input data, without removing adapter sequences at all.
+
+Cutadapt comes with an extensive suite of automated tests and is available under
+the terms of the MIT license.
+
+If you use cutadapt, please cite
+`DOI:10.14806/ej.17.1.200 <http://dx.doi.org/10.14806/ej.17.1.200>`_ .
+
+
+Links
+-----
+
+* `Documentation <https://cutadapt.readthedocs.org/>`_
+* `Source code <https://github.com/marcelm/cutadapt/>`_
+* `Report an issue <https://github.com/marcelm/cutadapt/issues>`_
+* `Project page on PyPI (Python package index) <https://pypi.python.org/pypi/cutadapt/>`_
+* `Follow @marcelm_ on Twitter <https://twitter.com/marcelm_>`_
+* `Wrapper for the Galaxy platform <https://bitbucket.org/lance_parsons/cutadapt_galaxy_wrapper>`_
diff --git a/bin/_preamble.py b/bin/_preamble.py
new file mode 100644
index 0000000..55f392a
--- /dev/null
+++ b/bin/_preamble.py
@@ -0,0 +1,21 @@
+# Copyright (c) Twisted Matrix Laboratories.
+#
+# Copied from Twisted (http://twistedmatrix.com/), see
+# http://twistedmatrix.com/trac/browser/trunk/LICENSE for the license.
+#
+# This makes sure that users don't have to set up their environment
+# specially in order to run these programs from bin/.
+
+# This helper is shared by many different actual scripts. It is not intended to
+# be packaged or installed, it is only a developer convenience. By the time
+# the package is actually installed somewhere, the environment should already be set
+# up properly without the help of this tool.
+
+import sys, os
+
+path = os.path.abspath(sys.argv[0])
+while os.path.dirname(path) != path:
+ if os.path.exists(os.path.join(path, 'cutadapt', '__init__.py')):
+ sys.path.insert(0, path)
+ break
+ path = os.path.dirname(path)
diff --git a/bin/cutadapt b/bin/cutadapt
new file mode 100755
index 0000000..02c4c8d
--- /dev/null
+++ b/bin/cutadapt
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+import sys
+
+try:
+ import _preamble
+except ImportError:
+ pass
+
+from cutadapt.scripts import cutadapt
+cutadapt.main()
diff --git a/cutadapt/__init__.py b/cutadapt/__init__.py
new file mode 100644
index 0000000..e3422c7
--- /dev/null
+++ b/cutadapt/__init__.py
@@ -0,0 +1,23 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+import sys
+
+__version__ = '1.10'
+
+def check_importability(): # pragma: no cover
+ try:
+ import cutadapt._align
+ except ImportError as e:
+ if 'undefined symbol' in str(e):
+ print("""
+ERROR: A required extension module could not be imported because it is
+incompatible with your system. A quick fix is to recompile the extension
+modules with the following command:
+
+ {0} setup.py build_ext -i
+
+See the documentation for alternative ways of installing the program.
+
+The original error message follows.
+""".format(sys.executable))
+ raise
diff --git a/cutadapt/_align.pyx b/cutadapt/_align.pyx
new file mode 100644
index 0000000..57bc0f8
--- /dev/null
+++ b/cutadapt/_align.pyx
@@ -0,0 +1,533 @@
+# cython: profile=False, emit_code_comments=False
+from cpython.mem cimport PyMem_Malloc, PyMem_Free, PyMem_Realloc
+
+DEF START_WITHIN_SEQ1 = 1
+DEF START_WITHIN_SEQ2 = 2
+DEF STOP_WITHIN_SEQ1 = 4
+DEF STOP_WITHIN_SEQ2 = 8
+DEF SEMIGLOBAL = 15
+
+# structure for a DP matrix entry
+ctypedef struct _Entry:
+ int cost
+ int matches # no. of matches in this alignment
+ int origin # where the alignment originated: negative for positions within seq1, positive for pos. within seq2
+
+
+ctypedef struct _Match:
+ int origin
+ int cost
+ int matches
+ int ref_stop
+ int query_stop
+
+
+def _acgt_table():
+ """
+ Return a translation table that maps A, C, G, T characters to the lower
+ four bits of a byte. Other characters (including possibly IUPAC characters)
+ are mapped to zero.
+
+ Lowercase versions are also translated, and U is treated the same as T.
+ """
+ d = dict(A=1, C=2, G=4, T=8, U=8)
+ t = bytearray(b'\0') * 256
+ for c, v in d.items():
+ t[ord(c)] = v
+ t[ord(c.lower())] = v
+ return bytes(t)
+
+
+def _iupac_table():
+ """
+ Return a translation table for IUPAC characters.
+
+ The table maps ASCII-encoded IUPAC nucleotide characters to bytes in which
+ the four least significant bits are used to represent one nucleotide each.
+
+ Whether two characters x and y match can then be checked with the
+ expression "x & y != 0".
+ """
+ A = 1
+ C = 2
+ G = 4
+ T = 8
+ d = dict(
+ X=0,
+ A=A,
+ C=C,
+ G=G,
+ T=T,
+ U=T,
+ R=A|G,
+ Y=C|T,
+ S=G|C,
+ W=A|T,
+ K=G|T,
+ M=A|C,
+ B=C|G|T,
+ D=A|G|T,
+ H=A|C|T,
+ V=A|C|G,
+ N=A|C|G|T
+ )
+ t = bytearray(b'\0') * 256
+ for c, v in d.items():
+ t[ord(c)] = v
+ t[ord(c.lower())] = v
+ return bytes(t)
+
+
+cdef bytes ACGT_TABLE = _acgt_table()
+cdef bytes IUPAC_TABLE = _iupac_table()
+
+
+class DPMatrix:
+ """
+ Representation of the dynamic-programming matrix.
+
+ This is used only when debugging is enabled in the Aligner class since the
+ matrix is normally not stored in full.
+
+ Entries in the matrix may be None, in which case that value was not
+ computed.
+ """
+ def __init__(self, reference, query):
+ m = len(reference)
+ n = len(query)
+ self._rows = [ [None] * (n+1) for _ in range(m + 1) ]
+ self.reference = reference
+ self.query = query
+
+ def set_entry(self, int i, int j, cost):
+ """
+ Set an entry in the dynamic programming matrix.
+ """
+ self._rows[i][j] = cost
+
+ def __str__(self):
+ """
+ Return a representation of the matrix as a string.
+ """
+ rows = [' ' + ' '.join(c.rjust(2) for c in self.query)]
+ for c, row in zip(' ' + self.reference, self._rows):
+ r = c + ' ' + ' '.join(' ' if v is None else '{0:2d}'.format(v) for v in row)
+ rows.append(r)
+ return '\n'.join(rows)
+
+
+cdef class Aligner:
+ """
+ TODO documentation still uses s1 (reference) and s2 (query).
+
+ Locate one string within another by computing an optimal semiglobal
+ alignment between string1 and string2.
+
+ The alignment uses unit costs, which means that mismatches, insertions and deletions are
+ counted as one error.
+
+ flags is a bitwise 'or' of the allowed flags.
+ To allow skipping of a prefix of string1 at no cost, set the
+ START_WITHIN_SEQ1 flag.
+ To allow skipping of a prefix of string2 at no cost, set the
+ START_WITHIN_SEQ2 flag.
+ If both are set, a prefix of string1 or of string2 is skipped,
+ never both.
+ Similarly, set STOP_WITHIN_SEQ1 and STOP_WITHIN_SEQ2 to
+ allow skipping of suffixes of string1 or string2. Again, when both
+ flags are set, suffixes are never skipped in both strings.
+ If all flags are set, this results in standard semiglobal alignment.
+
+ The skipped parts are described with two intervals (start1, stop1),
+ (start2, stop2).
+
+ For example, an optimal semiglobal alignment of SISSI and MISSISSIPPI looks like this:
+
+ ---SISSI---
+ MISSISSIPPI
+
+ start1, stop1 = 0, 5
+ start2, stop2 = 3, 8
+ (with zero errors)
+
+ The aligned parts are string1[start1:stop1] and string2[start2:stop2].
+
+ The error rate is: errors / length where length is (stop1 - start1).
+
+ An optimal alignment fulfills all of these criteria:
+
+ - its error_rate is at most max_error_rate
+ - Among those alignments with error_rate <= max_error_rate, the alignment contains
+ a maximal number of matches (there is no alignment with more matches).
+ - If there are multiple alignments with the same no. of matches, then one that
+ has minimal no. of errors is chosen.
+ - If there are still multiple candidates, choose the alignment that starts at the
+ leftmost position within the read.
+
+ The alignment itself is not returned, only the tuple
+ (start1, stop1, start2, stop2, matches, errors), where the first four fields have the
+ meaning as described, matches is the number of matches and errors is the number of
+ errors in the alignment.
+
+ It is always the case that at least one of start1 and start2 is zero.
+
+ IUPAC wildcard characters can be allowed in the reference and the query
+ by setting the appropriate flags.
+
+ If neither flag is set, the full ASCII alphabet is used for comparison.
+ If any of the flags is set, all non-IUPAC characters in the sequences
+ compare as 'not equal'.
+ """
+ cdef int m
+ cdef _Entry* column # one column of the DP matrix
+ cdef double max_error_rate
+ cdef int flags
+ cdef int _insertion_cost
+ cdef int _deletion_cost
+ cdef int _min_overlap
+ cdef bint wildcard_ref
+ cdef bint wildcard_query
+ cdef bint debug
+ cdef object _dpmatrix
+ cdef bytes _reference # TODO rename to translated_reference or so
+ cdef str str_reference
+
+ def __cinit__(self, str reference, double max_error_rate, int flags=SEMIGLOBAL, bint wildcard_ref=False, bint wildcard_query=False):
+ self.max_error_rate = max_error_rate
+ self.flags = flags
+ self.wildcard_ref = wildcard_ref
+ self.wildcard_query = wildcard_query
+ self.str_reference = reference
+ self.reference = reference
+ self._min_overlap = 1
+ self.debug = False
+ self._dpmatrix = None
+ self._insertion_cost = 1
+ self._deletion_cost = 1
+
+ property min_overlap:
+ def __get__(self):
+ return self._min_overlap
+
+ def __set__(self, int value):
+ if value < 1:
+ raise ValueError('Minimum overlap must be at least 1')
+ self._min_overlap = value
+
+ property indel_cost:
+ """
+ Matches cost 0, mismatches cost 1. Only insertion/deletion costs can be
+ changed.
+ """
+ def __set__(self, value):
+ if value < 1:
+ raise ValueError('Insertion/deletion cost must be at leat 1')
+ self._insertion_cost = value
+ self._deletion_cost = value
+
+ property reference:
+ def __get__(self):
+ return self._reference
+
+ def __set__(self, str reference):
+ mem = <_Entry*> PyMem_Realloc(self.column, (len(reference) + 1) * sizeof(_Entry))
+ if not mem:
+ raise MemoryError()
+ self.column = mem
+ self._reference = reference.encode('ascii')
+ self.m = len(reference)
+ if self.wildcard_ref:
+ self._reference = self._reference.translate(IUPAC_TABLE)
+ elif self.wildcard_query:
+ self._reference = self._reference.translate(ACGT_TABLE)
+ self.str_reference = reference
+
+ property dpmatrix:
+ """
+ The dynamic programming matrix as a DPMatrix object. This attribute is
+ usually None, unless debugging has been enabled with enable_debug().
+ """
+ def __get__(self):
+ return self._dpmatrix
+
+ def enable_debug(self):
+ """
+ Store the dynamic programming matrix while running the locate() method
+ and make it available in the .dpmatrix attribute.
+ """
+ self.debug = True
+
+ def locate(self, str query):
+ """
+ locate(query) -> (refstart, refstop, querystart, querystop, matches, errors)
+
+ Find the query within the reference associated with this aligner. The
+ intervals (querystart, querystop) and (refstart, refstop) give the
+ location of the match.
+
+ That is, the substrings query[querystart:querystop] and
+ self.reference[refstart:refstop] were found to align best to each other,
+ with the given number of matches and the given number of errors.
+
+ The alignment itself is not returned.
+ """
+ cdef char* s1 = self._reference
+ cdef bytes query_bytes = query.encode('ascii')
+ cdef char* s2 = query_bytes
+ cdef int m = self.m
+ cdef int n = len(query)
+ cdef _Entry* column = self.column
+ cdef double max_error_rate = self.max_error_rate
+ cdef bint start_in_ref = self.flags & START_WITHIN_SEQ1
+ cdef bint start_in_query = self.flags & START_WITHIN_SEQ2
+ cdef bint stop_in_ref = self.flags & STOP_WITHIN_SEQ1
+ cdef bint stop_in_query = self.flags & STOP_WITHIN_SEQ2
+
+ if self.wildcard_query:
+ query_bytes = query_bytes.translate(IUPAC_TABLE)
+ s2 = query_bytes
+ elif self.wildcard_ref:
+ query_bytes = query_bytes.translate(ACGT_TABLE)
+ s2 = query_bytes
+ cdef bint compare_ascii = not (self.wildcard_query or self.wildcard_ref)
+ """
+ DP Matrix:
+ query (j)
+ ----------> n
+ |
+ ref (i) |
+ |
+ V
+ m
+ """
+ cdef int i, j
+
+ # maximum no. of errors
+ cdef int k = <int> (max_error_rate * m)
+
+ # Determine largest and smallest column we need to compute
+ cdef int max_n = n
+ cdef int min_n = 0
+ if not start_in_query:
+ # costs can only get worse after column m
+ max_n = min(n, m + k)
+ if not stop_in_query:
+ min_n = max(0, n - m - k)
+
+ # Fill column min_n.
+ #
+ # Four cases:
+ # not startin1, not startin2: c(i,j) = max(i,j); origin(i, j) = 0
+ # startin1, not startin2: c(i,j) = j ; origin(i, j) = min(0, j - i)
+ # not startin1, startin2: c(i,j) = i ; origin(i, j) =
+ # startin1, startin2: c(i,j) = min(i,j)
+
+ # TODO (later)
+ # fill out columns only until 'last'
+ if not start_in_ref and not start_in_query:
+ for i in range(m + 1):
+ column[i].matches = 0
+ column[i].cost = max(i, min_n) * self._insertion_cost
+ column[i].origin = 0
+ elif start_in_ref and not start_in_query:
+ for i in range(m + 1):
+ column[i].matches = 0
+ column[i].cost = min_n * self._insertion_cost
+ column[i].origin = min(0, min_n - i)
+ elif not start_in_ref and start_in_query:
+ for i in range(m + 1):
+ column[i].matches = 0
+ column[i].cost = i * self._insertion_cost
+ column[i].origin = max(0, min_n - i)
+ else:
+ for i in range(m + 1):
+ column[i].matches = 0
+ column[i].cost = min(i, min_n) * self._insertion_cost
+ column[i].origin = min_n - i
+
+ if self.debug:
+ self._dpmatrix = DPMatrix(self.str_reference, query)
+ for i in range(m + 1):
+ self._dpmatrix.set_entry(i, min_n, column[i].cost)
+ cdef _Match best
+ best.ref_stop = m
+ best.query_stop = n
+ best.cost = m + n
+ best.origin = 0
+ best.matches = 0
+
+ # Ukkonen's trick: index of the last cell that is less than k.
+ cdef int last = min(m, k + 1)
+ if start_in_ref:
+ last = m
+
+ cdef int cost_diag
+ cdef int cost_deletion
+ cdef int cost_insertion
+ cdef int origin, cost, matches
+ cdef int length
+ cdef bint characters_equal
+ cdef _Entry tmp_entry
+
+ with nogil:
+ # iterate over columns
+ for j in range(min_n + 1, max_n + 1):
+ # remember first entry
+ tmp_entry = column[0]
+
+ # fill in first entry in this column
+ if start_in_query:
+ column[0].origin = j
+ else:
+ column[0].cost = j * self._insertion_cost
+ for i in range(1, last + 1):
+ if compare_ascii:
+ characters_equal = (s1[i-1] == s2[j-1])
+ else:
+ characters_equal = (s1[i-1] & s2[j-1]) != 0
+ if characters_equal:
+ # Characters match: This cannot be an indel.
+ cost = tmp_entry.cost
+ origin = tmp_entry.origin
+ matches = tmp_entry.matches + 1
+ else:
+ # Characters do not match.
+ cost_diag = tmp_entry.cost + 1
+ cost_deletion = column[i].cost + self._deletion_cost
+ cost_insertion = column[i-1].cost + self._insertion_cost
+
+ if cost_diag <= cost_deletion and cost_diag <= cost_insertion:
+ # MISMATCH
+ cost = cost_diag
+ origin = tmp_entry.origin
+ matches = tmp_entry.matches
+ elif cost_insertion <= cost_deletion:
+ # INSERTION
+ cost = cost_insertion
+ origin = column[i-1].origin
+ matches = column[i-1].matches
+ else:
+ # DELETION
+ cost = cost_deletion
+ origin = column[i].origin
+ matches = column[i].matches
+
+ # remember current cell for next iteration
+ tmp_entry = column[i]
+
+ column[i].cost = cost
+ column[i].origin = origin
+ column[i].matches = matches
+ if self.debug:
+ with gil:
+ for i in range(last + 1):
+ self._dpmatrix.set_entry(i, j, column[i].cost)
+ while last >= 0 and column[last].cost > k:
+ last -= 1
+ # last can be -1 here, but will be incremented next.
+ # TODO if last is -1, can we stop searching?
+ if last < m:
+ last += 1
+ elif stop_in_query:
+ # Found a match. If requested, find best match in last row.
+ # length of the aligned part of the reference
+ length = m + min(column[m].origin, 0)
+ cost = column[m].cost
+ matches = column[m].matches
+ if length >= self._min_overlap and cost <= length * max_error_rate and (matches > best.matches or (matches == best.matches and cost < best.cost)):
+ # update
+ best.matches = matches
+ best.cost = cost
+ best.origin = column[m].origin
+ best.ref_stop = m
+ best.query_stop = j
+ if cost == 0 and matches == m:
+ # exact match, stop early
+ break
+ # column finished
+
+ if max_n == n:
+ first_i = 0 if stop_in_ref else m
+ # search in last column # TODO last?
+ for i in range(first_i, m+1):
+ length = i + min(column[i].origin, 0)
+ cost = column[i].cost
+ matches = column[i].matches
+ if length >= self._min_overlap and cost <= length * max_error_rate and (matches > best.matches or (matches == best.matches and cost < best.cost)):
+ # update best
+ best.matches = matches
+ best.cost = cost
+ best.origin = column[i].origin
+ best.ref_stop = i
+ best.query_stop = n
+ if best.cost == m + n:
+ # best.cost was initialized with this value.
+ # If it is unchanged, no alignment was found that has
+ # an error rate within the allowed range.
+ return None
+
+ cdef int start1, start2
+ if best.origin >= 0:
+ start1 = 0
+ start2 = best.origin
+ else:
+ start1 = -best.origin
+ start2 = 0
+
+ assert best.ref_stop - start1 > 0 # Do not return empty alignments.
+ return (start1, best.ref_stop, start2, best.query_stop, best.matches, best.cost)
+
+ def __dealloc__(self):
+ PyMem_Free(self.column)
+
+
+def locate(str reference, str query, double max_error_rate, int flags=SEMIGLOBAL, bint wildcard_ref=False, bint wildcard_query=False, int min_overlap=1):
+ aligner = Aligner(reference, max_error_rate, flags, wildcard_ref, wildcard_query)
+ aligner.min_overlap = min_overlap
+ return aligner.locate(query)
+
+
+def compare_prefixes(str ref, str query, bint wildcard_ref=False, bint wildcard_query=False):
+ """
+ Find out whether one string is the prefix of the other one, allowing
+ IUPAC wildcards in ref and/or query if the appropriate flag is set.
+
+ This is used to find an anchored 5' adapter (type 'FRONT') in the 'no indels' mode.
+ This is very simple as only the number of errors needs to be counted.
+
+ This function returns a tuple compatible with what Aligner.locate outputs.
+ """
+ cdef int m = len(ref)
+ cdef int n = len(query)
+ cdef bytes query_bytes = query.encode('ascii')
+ cdef bytes ref_bytes = ref.encode('ascii')
+ cdef char* r_ptr
+ cdef char* q_ptr
+ cdef int length = min(m, n)
+ cdef int i, matches = 0
+ cdef bint compare_ascii = False
+
+ if wildcard_ref:
+ ref_bytes = ref_bytes.translate(IUPAC_TABLE)
+ elif wildcard_query:
+ ref_bytes = ref_bytes.translate(ACGT_TABLE)
+ else:
+ compare_ascii = True
+ if wildcard_query:
+ query_bytes = query_bytes.translate(IUPAC_TABLE)
+ elif wildcard_ref:
+ query_bytes = query_bytes.translate(ACGT_TABLE)
+
+ if compare_ascii:
+ for i in range(length):
+ if ref[i] == query[i]:
+ matches += 1
+ else:
+ r_ptr = ref_bytes
+ q_ptr = query_bytes
+ for i in range(length):
+ if (r_ptr[i] & q_ptr[i]) != 0:
+ matches += 1
+
+ # length - matches = no. of errors
+ return (0, length, 0, length, matches, length - matches)
diff --git a/cutadapt/_qualtrim.pyx b/cutadapt/_qualtrim.pyx
new file mode 100644
index 0000000..3bd88c7
--- /dev/null
+++ b/cutadapt/_qualtrim.pyx
@@ -0,0 +1,84 @@
+# kate: syntax Python;
+# cython: profile=False, emit_code_comments=False
+"""
+Quality trimming.
+"""
+
+def quality_trim_index(str qualities, int cutoff_front, int cutoff_back, int base=33):
+ """
+ Find the positions at which to trim low-quality ends from a nucleotide sequence.
+ Return tuple (start, stop) that indicates the good-quality segment.
+
+ Qualities are assumed to be ASCII-encoded as chr(qual + base).
+
+ The algorithm is the same as the one used by BWA within the function
+ 'bwa_trim_read':
+ - Subtract the cutoff value from all qualities.
+ - Compute partial sums from all indices to the end of the sequence.
+ - Trim sequence at the index at which the sum is minimal.
+ """
+ cdef int s
+ cdef int max_qual
+ cdef int stop = len(qualities)
+ cdef int start = 0
+ cdef int i
+
+ # find trim position for 5' end
+ s = 0
+ max_qual = 0
+ for i in range(len(qualities)):
+ s += cutoff_front - (ord(qualities[i]) - base)
+ if s < 0:
+ break
+ if s > max_qual:
+ max_qual = s
+ start = i + 1
+
+ # same for 3' end
+ max_qual = 0
+ s = 0
+ for i in reversed(xrange(len(qualities))):
+ s += cutoff_back - (ord(qualities[i]) - base)
+ if s < 0:
+ break
+ if s > max_qual:
+ max_qual = s
+ stop = i
+ if start >= stop:
+ start, stop = 0, 0
+ return (start, stop)
+
+
+def nextseq_trim_index(sequence, int cutoff, int base=33):
+ """
+ Variant of the above quality trimming routine that works on NextSeq data.
+ With Illumina NextSeq, bases are encoded with two colors. 'No color' (a
+ dark cycle) usually means that a 'G' was sequenced, but that also occurs
+ when sequencing falls off the end of the fragment. The read then contains
+ a run of high-quality G bases in the end.
+
+ This routine works as the one above, but counts qualities belonging to 'G'
+ bases as being equal to cutoff - 1.
+ """
+ bases = sequence.sequence
+ qualities = sequence.qualities
+ cdef:
+ int s = 0
+ int max_qual = 0
+ int max_i = len(qualities)
+ int i, q
+
+ s = 0
+ max_qual = 0
+ max_i = len(qualities)
+ for i in reversed(xrange(max_i)):
+ q = ord(qualities[i]) - base
+ if bases[i] == 'G':
+ q = cutoff - 1
+ s += cutoff - q
+ if s < 0:
+ break
+ if s > max_qual:
+ max_qual = s
+ max_i = i
+ return max_i
diff --git a/cutadapt/_seqio.pyx b/cutadapt/_seqio.pyx
new file mode 100644
index 0000000..b687c0b
--- /dev/null
+++ b/cutadapt/_seqio.pyx
@@ -0,0 +1,138 @@
+# kate: syntax Python;
+# cython: profile=False, emit_code_comments=False
+from __future__ import print_function, division, absolute_import
+from .xopen import xopen
+from .seqio import _shorten, FormatError, SequenceReader
+
+
+cdef class Sequence(object):
+ """
+ A record in a FASTQ file. Also used for FASTA (then the qualities attribute
+ is None). qualities is a string and it contains the qualities encoded as
+ ascii(qual+33).
+
+ If an adapter has been matched to the sequence, the 'match' attribute is
+ set to the corresponding Match instance.
+ """
+ cdef:
+ public str name
+ public str sequence
+ public str qualities
+ public str name2
+ public object match
+ public object match_info
+
+ def __init__(self, str name, str sequence, str qualities=None, str name2='', match=None,
+ match_info=None):
+ """Set qualities to None if there are no quality values"""
+ self.name = name
+ self.sequence = sequence
+ self.qualities = qualities
+ self.name2 = name2
+ self.match = match
+ self.match_info = match_info
+ if qualities is not None and len(qualities) != len(sequence):
+ rname = _shorten(name)
+ raise FormatError("In read named {0!r}: length of quality sequence ({1}) and length "
+ "of read ({2}) do not match".format(
+ rname, len(qualities), len(sequence)))
+
+ def __getitem__(self, key):
+ """slicing"""
+ return self.__class__(
+ self.name,
+ self.sequence[key],
+ self.qualities[key] if self.qualities is not None else None,
+ self.name2,
+ self.match,
+ self.match_info)
+
+ def __repr__(self):
+ qstr = ''
+ if self.qualities is not None:
+ qstr = ', qualities={0!r}'.format(_shorten(self.qualities))
+ return '<Sequence(name={0!r}, sequence={1!r}{2})>'.format(_shorten(self.name), _shorten(self.sequence), qstr)
+
+ def __len__(self):
+ return len(self.sequence)
+
+ def __richcmp__(self, other, int op):
+ if 2 <= op <= 3:
+ eq = self.name == other.name and \
+ self.sequence == other.sequence and \
+ self.qualities == other.qualities
+ if op == 2:
+ return eq
+ else:
+ return not eq
+ else:
+ raise NotImplementedError()
+
+ def __reduce__(self):
+ return (Sequence, (self.name, self.sequence, self.qualities, self.name2))
+
+
+class FastqReader(SequenceReader):
+ """
+ Reader for FASTQ files. Does not support multi-line FASTQ files.
+ """
+ def __init__(self, file, sequence_class=Sequence):
+ """
+ file is a filename or a file-like object.
+ If file is a filename, then .gz files are supported.
+ """
+ super(FastqReader, self).__init__(file)
+ self.sequence_class = sequence_class
+ self.delivers_qualities = True
+
+ def __iter__(self):
+ """
+ Yield Sequence objects
+ """
+ cdef int i = 0
+ cdef int strip
+ cdef str line, name, qualities, sequence, name2
+ sequence_class = self.sequence_class
+
+ it = iter(self._file)
+ line = next(it)
+ if not (line and line[0] == '@'):
+ raise FormatError("Line {0} in FASTQ file is expected to start with '@', but found {1!r}".format(i+1, line[:10]))
+ strip = -2 if line.endswith('\r\n') else -1
+ name = line[1:strip]
+
+ i = 1
+ for line in it:
+ if i == 0:
+ if not (line and line[0] == '@'):
+ raise FormatError("Line {0} in FASTQ file is expected to start with '@', but found {1!r}".format(i+1, line[:10]))
+ name = line[1:strip]
+ elif i == 1:
+ sequence = line[:strip]
+ elif i == 2:
+ if line == '+\n': # check most common case first
+ name2 = ''
+ else:
+ line = line[:strip]
+ if not (line and line[0] == '+'):
+ raise FormatError("Line {0} in FASTQ file is expected to start with '+', but found {1!r}".format(i+1, line[:10]))
+ if len(line) > 1:
+ if not line[1:] == name:
+ raise FormatError(
+ "At line {0}: Sequence descriptions in the FASTQ file don't match "
+ "({1!r} != {2!r}).\n"
+ "The second sequence description must be either empty "
+ "or equal to the first description.".format(i+1,
+ name, line[1:]))
+ name2 = name
+ else:
+ name2 = ''
+ elif i == 3:
+ if len(line) == len(sequence) - strip:
+ qualities = line[:strip]
+ else:
+ qualities = line.rstrip('\r\n')
+ yield sequence_class(name, sequence, qualities, name2=name2)
+ i = (i + 1) % 4
+ if i != 0:
+ raise FormatError("FASTQ file ended prematurely")
diff --git a/cutadapt/adapters.py b/cutadapt/adapters.py
new file mode 100644
index 0000000..f629c8f
--- /dev/null
+++ b/cutadapt/adapters.py
@@ -0,0 +1,569 @@
+# coding: utf-8
+"""
+Adapters
+"""
+from __future__ import print_function, division, absolute_import
+import sys
+import re
+from collections import defaultdict
+from cutadapt import align, colorspace
+from cutadapt.seqio import ColorspaceSequence, FastaReader
+
+# Constants for the find_best_alignment function.
+# The function is called with SEQ1 as the adapter, SEQ2 as the read.
+# TODO get rid of those constants, use strings instead
+BACK = align.START_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ1
+FRONT = align.START_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ2 | align.START_WITHIN_SEQ1
+PREFIX = align.STOP_WITHIN_SEQ2
+SUFFIX = align.START_WITHIN_SEQ2
+ANYWHERE = align.SEMIGLOBAL
+LINKED = 'linked'
+
+
+def parse_braces(sequence):
+    """
+    Replace all occurrences of ``x{n}`` (where x is any character) with n
+    occurrences of x. Raise ValueError if the expression cannot be parsed.
+
+    >>> parse_braces('TGA{5}CT')
+    'TGAAAAACT'
+    """
+    # Simple DFA; prev is None (start / after '}'), a text chunk, '{', or the int count
+    result = ''
+    prev = None
+    for s in re.split(r'(\{|\})', sequence):  # capture group keeps the braces as tokens
+        if s == '':
+            continue
+        if prev is None:
+            if s == '{':
+                raise ValueError('"{" must be used after a character')
+            if s == '}':
+                raise ValueError('"}" cannot be used here')
+            prev = s
+            result += s
+        elif prev == '{':
+            prev = int(s)  # a non-numeric count raises ValueError as well
+            if not 0 <= prev <= 10000:
+                raise ValueError('Value {0} invalid'.format(prev))
+        elif isinstance(prev, int):
+            if s != '}':
+                raise ValueError('"}" expected')
+            result = result[:-1] + result[-1] * prev  # repeat the last emitted character
+            prev = None
+        else:
+            if s != '{':
+                raise ValueError('Expected "{"')
+            prev = '{'
+    # Check if we are in a non-terminating state
+    if isinstance(prev, int) or prev == '{':
+        raise ValueError("Unterminated expression")
+    return result
+
+
+class AdapterParser(object):
+    """
+    Factory for Adapter objects that all use the same parameters (error rate,
+    indels etc.). The given **kwargs will be passed to the Adapter constructors.
+    """
+    def __init__(self, colorspace=False, **kwargs):
+        self.colorspace = colorspace
+        self.constructor_args = kwargs
+        self.adapter_class = ColorspaceAdapter if colorspace else Adapter
+
+    def parse(self, spec, name=None, cmdline_type='back'):
+        """
+        Parse an adapter specification not using ``file:`` notation and return
+        an object of an appropriate Adapter class. The notation for anchored
+        5' and 3' adapters is supported. If the name parameter is None, the
+        name is extracted from the specification ('name=ADAPTER' gives 'name').
+        Raise ValueError if cmdline_type is unknown or the sequence is empty.
+
+        cmdline_type -- describes which commandline parameter was used (``-a``
+            is 'back', ``-b`` is 'anywhere', and ``-g`` is 'front').
+        """
+        if name is None:
+            name, spec = self._extract_name(spec)
+        sequence = spec
+        types = dict(back=BACK, front=FRONT, anywhere=ANYWHERE)
+        if cmdline_type not in types:
+            raise ValueError('cmdline_type cannot be {0!r}'.format(cmdline_type))
+        where = types[cmdline_type]
+        if where == FRONT and spec.startswith('^'):  # -g ^ADAPTER
+            sequence, where = spec[1:], PREFIX
+        elif where == BACK:
+            sequence1, middle, sequence2 = spec.partition('...')
+            if middle == '...':
+                if not sequence1:  # -a ...ADAPTER (sequence1 is '' here, so slice spec)
+                    sequence = spec[3:]
+                elif not sequence2:  # -a ADAPTER...
+                    sequence, where = spec[:-3], PREFIX
+                else:  # -a ADAPTER1...ADAPTER2
+                    if self.colorspace:
+                        raise NotImplementedError('Using linked adapters in colorspace is not supported')
+                    if sequence1.startswith('^') or sequence2.endswith('$'):
+                        raise NotImplementedError('Using "$" or "^" when '
+                            'specifying a linked adapter is not supported')
+                    return LinkedAdapter(sequence1, sequence2, name=name,
+                        **self.constructor_args)
+            elif spec.endswith('$'):  # -a ADAPTER$
+                sequence, where = spec[:-1], SUFFIX
+        if not sequence:
+            raise ValueError("The adapter sequence is empty.")
+
+        return self.adapter_class(sequence, where, name=name, **self.constructor_args)
+
+    def parse_with_file(self, spec, cmdline_type='back'):
+        """
+        Parse an adapter specification and yield the resulting adapter objects.
+        This works like the parse() function above, but also supports the
+        ``file:`` notation for reading adapters from an external FASTA
+        file. Since a file can contain multiple adapters, this
+        function is a generator.
+        """
+        if spec.startswith('file:'):
+            # read adapter sequences from a file
+            with FastaReader(spec[5:]) as fasta:
+                for record in fasta:
+                    name = record.name.split(None, 1)[0]  # first word of the FASTA header
+                    yield self.parse(record.sequence, name, cmdline_type)
+        else:
+            name, spec = self._extract_name(spec)
+            yield self.parse(spec, name, cmdline_type)
+
+    def _extract_name(self, spec):
+        """
+        Parse an adapter specification given as 'name=adapt' into 'name' and 'adapt'.
+        """
+        fields = spec.split('=', 1)
+        if len(fields) > 1:
+            name, spec = fields
+            name = name.strip()
+        else:
+            name = None  # no 'name=' prefix present
+        spec = spec.strip()
+        return name, spec
+
+    def parse_multi(self, back, anywhere, front):
+        """
+        Parse all three types of commandline options that can be used to
+        specify adapters. back, anywhere and front are lists of strings,
+        corresponding to the respective commandline types (-a, -b, -g).
+
+        Return a list of the resulting adapter objects.
+        """
+        adapters = []
+        for specs, cmdline_type in (back, 'back'), (anywhere, 'anywhere'), (front, 'front'):
+            for spec in specs:
+                adapters.extend(self.parse_with_file(spec, cmdline_type))
+        return adapters
+
+
+class Match(object):
+ """
+ TODO creating instances of this class is relatively slow and responsible for quite some runtime.
+ """
+ __slots__ = ['astart', 'astop', 'rstart', 'rstop', 'matches', 'errors', 'front', 'adapter', 'read', 'length']
+ def __init__(self, astart, astop, rstart, rstop, matches, errors, front, adapter, read):
+ self.astart = astart
+ self.astop = astop
+ self.rstart = rstart
+ self.rstop = rstop
+ self.matches = matches
+ self.errors = errors
+ self.front = self._guess_is_front() if front is None else front
+ self.adapter = adapter
+ self.read = read
+ # Number of aligned characters in the adapter. If there are
+ # indels, this may be different from the number of characters
+ # in the read.
+ self.length = self.astop - self.astart
+ assert self.length > 0
+ assert self.errors / self.length <= self.adapter.max_error_rate
+ assert self.length - self.errors > 0
+
+ def __str__(self):
+ return 'Match(astart={0}, astop={1}, rstart={2}, rstop={3}, matches={4}, errors={5})'.format(
+ self.astart, self.astop, self.rstart, self.rstop, self.matches, self.errors)
+
+ def _guess_is_front(self):
+ """
+ Return whether this is guessed to be a front adapter.
+
+ The match is assumed to be a front adapter when the first base of
+ the read is involved in the alignment to the adapter.
+ """
+ return self.rstart == 0
+
+ def wildcards(self, wildcard_char='N'):
+ """
+ Return a string that contains, for each wildcard character,
+ the character that it matches. For example, if the adapter
+ ATNGNA matches ATCGTA, then the string 'CT' is returned.
+
+ If there are indels, this is not reliable as the full alignment
+ is not available.
+ """
+ wildcards = [ self.read.sequence[self.rstart + i:self.rstart + i + 1] for i in range(self.length)
+ if self.adapter.sequence[self.astart + i] == wildcard_char and self.rstart + i < len(self.read.sequence) ]
+ return ''.join(wildcards)
+
+ def rest(self):
+ """
+ Return the part of the read before this match if this is a
+ 'front' (5') adapter,
+ return the part after the match if this is not a 'front' adapter (3').
+ This can be an empty string.
+ """
+ if self.front:
+ return self.read.sequence[:self.rstart]
+ else:
+ return self.read.sequence[self.rstop:]
+
+ def get_info_record(self):
+ seq = self.read.sequence
+ qualities = self.read.qualities
+ info = (
+ self.read.name,
+ self.errors,
+ self.rstart,
+ self.rstop,
+ seq[0:self.rstart],
+ seq[self.rstart:self.rstop],
+ seq[self.rstop:],
+ self.adapter.name
+ )
+ if qualities:
+ info += (
+ qualities[0:self.rstart],
+ qualities[self.rstart:self.rstop],
+ qualities[self.rstop:]
+ )
+ else:
+ info += ('','','')
+
+ return info
+
+def _generate_adapter_name(_start=[1]):
+ name = str(_start[0])
+ _start[0] += 1
+ return name
+
+
+class Adapter(object):
+ """
+ An adapter knows how to match itself to a read.
+ In particular, it knows where it should be within the read and how to interpret
+ wildcard characters.
+
+ where -- One of the BACK, FRONT, PREFIX, SUFFIX or ANYWHERE constants.
+ This influences where the adapter is allowed to appear within the
+ read and also which part of the read is removed.
+
+ sequence -- The adapter sequence as string. Will be converted to uppercase.
+ Also, Us will be converted to Ts.
+
+ max_error_rate -- Maximum allowed error rate. The error rate is
+ the number of errors in the alignment divided by the length
+ of the part of the alignment that matches the adapter.
+
+ minimum_overlap -- Minimum length of the part of the alignment
+ that matches the adapter.
+
+ read_wildcards -- Whether IUPAC wildcards in the read are allowed.
+
+ adapter_wildcards -- Whether IUPAC wildcards in the adapter are
+ allowed.
+
+ name -- optional name of the adapter. If not provided, the name is set to a
+ unique number.
+ """
+ def __init__(self, sequence, where, max_error_rate=0.1, min_overlap=3,
+ read_wildcards=False, adapter_wildcards=True, name=None, indels=True):
+ self.debug = False
+ self.name = _generate_adapter_name() if name is None else name
+ self.sequence = parse_braces(sequence.upper().replace('U', 'T'))
+ assert len(self.sequence) > 0
+ self.where = where
+ self.max_error_rate = max_error_rate
+ self.min_overlap = min(min_overlap, len(self.sequence))
+ self.indels = indels
+ self.adapter_wildcards = adapter_wildcards and not set(self.sequence) <= set('ACGT')
+ self.read_wildcards = read_wildcards
+ # redirect trimmed() to appropriate function depending on adapter type
+ trimmers = {
+ FRONT: self._trimmed_front,
+ PREFIX: self._trimmed_front,
+ BACK: self._trimmed_back,
+ SUFFIX: self._trimmed_back,
+ ANYWHERE: self._trimmed_anywhere
+ }
+ self.trimmed = trimmers[where]
+ if where == ANYWHERE:
+ self._front_flag = None # means: guess
+ else:
+ self._front_flag = where not in (BACK, SUFFIX)
+ # statistics about length of removed sequences
+ self.lengths_front = defaultdict(int)
+ self.lengths_back = defaultdict(int)
+ self.errors_front = defaultdict(lambda: defaultdict(int))
+ self.errors_back = defaultdict(lambda: defaultdict(int))
+ self.adjacent_bases = { 'A': 0, 'C': 0, 'G': 0, 'T': 0, '': 0 }
+
+ self.aligner = align.Aligner(self.sequence, self.max_error_rate,
+ flags=self.where, wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
+ self.aligner.min_overlap = self.min_overlap
+ if not self.indels:
+ # TODO
+ # When indels are disallowed, an entirely different algorithm
+ # should be used.
+ self.aligner.indel_cost = 100000
+
+ def __repr__(self):
+ return '<Adapter(name="{name}", sequence="{sequence}", where={where}, '\
+ 'max_error_rate={max_error_rate}, min_overlap={min_overlap}, '\
+ 'read_wildcards={read_wildcards}, '\
+ 'adapter_wildcards={adapter_wildcards}, '\
+ 'indels={indels})>'.format(**vars(self))
+
+ def enable_debug(self):
+ """
+ Print out the dynamic programming matrix after matching a read to an
+ adapter.
+ """
+ self.debug = True
+ self.aligner.enable_debug()
+
+ def match_to(self, read):
+ """
+ Attempt to match this adapter to the given read.
+
+ Return an Match instance if a match was found;
+ return None if no match was found given the matching criteria (minimum
+ overlap length, maximum error rate).
+ """
+ read_seq = read.sequence.upper()
+ pos = -1
+ # try to find an exact match first unless wildcards are allowed
+ if not self.adapter_wildcards:
+ if self.where == PREFIX:
+ pos = 0 if read_seq.startswith(self.sequence) else -1
+ elif self.where == SUFFIX:
+ pos = (len(read_seq) - len(self.sequence)) if read_seq.endswith(self.sequence) else -1
+ else:
+ pos = read_seq.find(self.sequence)
+ if pos >= 0:
+ match = Match(
+ 0, len(self.sequence), pos, pos + len(self.sequence),
+ len(self.sequence), 0, self._front_flag, self, read)
+ else:
+ # try approximate matching
+ if not self.indels and self.where in (PREFIX, SUFFIX):
+ if self.where == PREFIX:
+ alignment = align.compare_prefixes(self.sequence, read_seq,
+ wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
+ else:
+ alignment = align.compare_suffixes(self.sequence, read_seq,
+ wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
+ astart, astop, rstart, rstop, matches, errors = alignment
+ if astop - astart >= self.min_overlap and errors / (astop - astart) <= self.max_error_rate:
+ match = Match(*(alignment + (self._front_flag, self, read)))
+ else:
+ match = None
+ else:
+ alignment = self.aligner.locate(read_seq)
+ if self.debug:
+ print(self.aligner.dpmatrix) # pragma: no cover
+ if alignment is None:
+ match = None
+ else:
+ astart, astop, rstart, rstop, matches, errors = alignment
+ match = Match(astart, astop, rstart, rstop, matches, errors, self._front_flag, self, read)
+
+ if match is None:
+ return None
+ assert match.length > 0 and match.errors / match.length <= self.max_error_rate, match
+ assert match.length >= self.min_overlap
+ return match
+
+ def _trimmed_anywhere(self, match):
+ """Return a trimmed read"""
+ if match.front:
+ return self._trimmed_front(match)
+ else:
+ return self._trimmed_back(match)
+
+ def _trimmed_front(self, match):
+ """Return a trimmed read"""
+ # TODO move away
+ self.lengths_front[match.rstop] += 1
+ self.errors_front[match.rstop][match.errors] += 1
+ return match.read[match.rstop:]
+
+ def _trimmed_back(self, match):
+ """Return a trimmed read without the 3' (back) adapter"""
+ # TODO move away
+ self.lengths_back[len(match.read) - match.rstart] += 1
+ self.errors_back[len(match.read) - match.rstart][match.errors] += 1
+ adjacent_base = match.read.sequence[match.rstart-1:match.rstart]
+ if adjacent_base not in 'ACGT':
+ adjacent_base = ''
+ self.adjacent_bases[adjacent_base] += 1
+ return match.read[:match.rstart]
+
+ def __len__(self):
+ return len(self.sequence)
+
+
class ColorspaceAdapter(Adapter):
    """
    An adapter for colorspace reads.

    If the adapter sequence is given in nucleotide space (only A, C, G, T),
    it is kept in the nucleotide_sequence attribute and converted to colors.
    5' adapters (PREFIX, FRONT) must be given in nucleotide space.
    """
    def __init__(self, *args, **kwargs):
        super(ColorspaceAdapter, self).__init__(*args, **kwargs)
        given_in_basespace = set(self.sequence) <= set('ACGT')
        if given_in_basespace:
            # adapter was given in basespace
            self.nucleotide_sequence = self.sequence
            self.sequence = colorspace.encode(self.sequence)[1:]
        if not given_in_basespace and self.where in (PREFIX, FRONT):
            raise ValueError("A 5' colorspace adapter needs to be given in nucleotide space")
        self.aligner.reference = self.sequence

    def match_to(self, read):
        """Return Match instance, or None if the adapter does not match the read"""
        if self.where != PREFIX:
            return super(ColorspaceAdapter, self).match_to(read)
        # create artificial adapter that includes a first color that encodes the
        # transition from primer base into adapter
        first_color = colorspace.ENCODE[read.primer + self.nucleotide_sequence[0:1]]
        asequence = first_color + self.sequence

        if read.sequence.startswith(asequence):
            # exact match at the start of the read
            n = len(asequence)
            match = Match(0, n, 0, n, n, 0, self._front_flag, self, read)
        else:
            # try approximate matching
            self.aligner.reference = asequence
            alignment = self.aligner.locate(read.sequence)
            if self.debug:
                print(self.aligner.dpmatrix)  # pragma: no cover
            if alignment is None:
                return None
            match = Match(*(alignment + (self._front_flag, self, read)))

        assert match.length > 0 and match.errors / match.length <= self.max_error_rate
        assert match.length >= self.min_overlap
        return match

    def _trimmed_front(self, match):
        """Return a trimmed read without the 5' adapter and update statistics"""
        read = match.read
        stop = match.rstop
        self.lengths_front[stop] += 1
        self.errors_front[stop][match.errors] += 1
        # to remove a front adapter, we need to re-encode the first color following the adapter match
        color_after_adapter = read.sequence[stop:stop + 1]
        if not color_after_adapter:
            # the read is empty
            return read[stop:]
        last_adapter_base = self.nucleotide_sequence[-1:]
        base_after_adapter = colorspace.DECODE[last_adapter_base + color_after_adapter]
        new_first_color = colorspace.ENCODE[read.primer + base_after_adapter]
        new_read = read[:]
        new_read.sequence = new_first_color + read.sequence[(stop + 1):]
        if read.qualities:
            new_read.qualities = read.qualities[stop:]
        else:
            new_read.qualities = None
        return new_read

    def _trimmed_back(self, match):
        """Return a trimmed read without the 3' adapter and update statistics"""
        # trim one more color if long enough
        adjusted_rstart = max(match.rstart - 1, 0)
        removed = len(match.read) - adjusted_rstart
        self.lengths_back[removed] += 1
        self.errors_back[removed][match.errors] += 1
        return match.read[:adjusted_rstart]

    def __repr__(self):
        return '<ColorspaceAdapter(sequence={0!r}, where={1})>'.format(self.sequence, self.where)
+
+
class LinkedMatch(object):
    """
    Represent a match of a LinkedAdapter: the match of the front adapter
    (required), the match of the back adapter (may be None) and the
    LinkedAdapter itself.

    TODO
    It should not be necessary to have both a Match and a LinkedMatch class.
    """
    def __init__(self, front_match, back_match, adapter):
        self.front_match, self.back_match = front_match, back_match
        self.adapter = adapter
        assert front_match is not None
+
+
class LinkedAdapter(object):
    """
    A pair of adapters that are searched for together: a front (5') adapter
    and a back (3') adapter. The back adapter is only searched for in the
    part of the read that follows the front adapter match.
    """
    def __init__(self, front_sequence, back_sequence,
            front_anchored=True, back_anchored=False, name=None, **kwargs):
        """
        kwargs are passed on to individual Adapter constructors
        """
        assert front_anchored and not back_anchored
        self.front_anchored = front_anchored
        self.back_anchored = back_anchored
        front_where = PREFIX if front_anchored else FRONT
        back_where = SUFFIX if back_anchored else BACK

        # The following attributes are needed for the report
        self.where = LINKED
        self.name = _generate_adapter_name() if name is None else name
        self.front_adapter = Adapter(front_sequence, where=front_where, name=None, **kwargs)
        self.back_adapter = Adapter(back_sequence, where=back_where, name=None, **kwargs)

    def enable_debug(self):
        """Enable debugging output on both underlying adapters"""
        for adapter in (self.front_adapter, self.back_adapter):
            adapter.enable_debug()

    def match_to(self, read):
        """
        Match the linked adapters against the given read. If the 'front' adapter
        is not found, the 'back' adapter is not searched for.

        Return a LinkedMatch, or None if the front adapter was not found.
        """
        front_match = self.front_adapter.match_to(read)
        if front_match is None:
            return None
        # TODO use match.trimmed() instead as soon as that does not update
        # statistics anymore
        remainder = read[front_match.rstop:]
        return LinkedMatch(front_match, self.back_adapter.match_to(remainder), self)

    def trimmed(self, match):
        """
        Return the read with the front adapter removed and, if the back
        adapter was also found, with the back adapter removed as well.
        """
        # Always called, even if the result is not returned
        without_front = self.front_adapter.trimmed(match.front_match)
        if not match.back_match:
            return without_front
        return self.back_adapter.trimmed(match.back_match)

    # Lots of forwarders (needed for the report). I am sure this can be done
    # in a better way.

    @property
    def lengths_front(self):
        return self.front_adapter.lengths_front

    @property
    def lengths_back(self):
        return self.back_adapter.lengths_back

    @property
    def errors_front(self):
        return self.front_adapter.errors_front

    @property
    def errors_back(self):
        return self.back_adapter.errors_back

    @property
    def adjacent_bases(self):
        return self.back_adapter.adjacent_bases
diff --git a/cutadapt/align.py b/cutadapt/align.py
new file mode 100644
index 0000000..aabd208
--- /dev/null
+++ b/cutadapt/align.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+"""
+Alignment module.
+"""
+from __future__ import print_function, division, absolute_import
+
+from cutadapt._align import Aligner, compare_prefixes, locate
+
# Flags for global alignment. Each flag is a single bit so that flags can be
# combined with the | operator.
#
# The interpretation of the first flag is:
# An initial portion of seq1 may be skipped at no cost.
# This is equivalent to saying that in the alignment,
# gaps in the beginning of seq2 are free.
#
# The other flags have an equivalent meaning.
START_WITHIN_SEQ1 = 1 << 0
START_WITHIN_SEQ2 = 1 << 1
STOP_WITHIN_SEQ1 = 1 << 2
STOP_WITHIN_SEQ2 = 1 << 3

# Use this to get regular semiglobal alignment
# (all gaps in the beginning or end are free)
SEMIGLOBAL = (
    START_WITHIN_SEQ1 | START_WITHIN_SEQ2 |
    STOP_WITHIN_SEQ1 | STOP_WITHIN_SEQ2
)
+
+
def compare_suffixes(s1, s2, wildcard_ref=False, wildcard_query=False):
    """
    Find out whether one string is the suffix of the other one, allowing
    mismatches. Used to find an anchored 3' adapter when no indels are allowed.

    Both strings are reversed and handed to compare_prefixes; the resulting
    length is then converted back into (start, stop) coordinates measured
    from the ends of s1 and s2. The return value is the tuple
    (start1, stop1, start2, stop2, matches, errors).
    """
    reversed1, reversed2 = s1[::-1], s2[::-1]
    _, length, _, _, matches, errors = compare_prefixes(
        reversed1, reversed2, wildcard_ref, wildcard_query)
    stop1, stop2 = len(reversed1), len(reversed2)
    return (stop1 - length, stop1, stop2 - length, stop2, matches, errors)
diff --git a/cutadapt/colorspace.py b/cutadapt/colorspace.py
new file mode 100644
index 0000000..4512941
--- /dev/null
+++ b/cutadapt/colorspace.py
@@ -0,0 +1,83 @@
+# coding: utf-8
+"""
+Colorspace conversion routines.
+
+Inspired by agapython/util/Dibase.py from Corona lite,
+but reimplemented to avoid licensing issues.
+
+Encoding Table
+
+ A C G T
+A 0 1 2 3
+C 1 0 3 2
+G 2 3 0 1
+T 3 2 1 0
+"""
+from __future__ import print_function, division, absolute_import
+
+__author__ = 'Marcel Martin'
+
+
+def _initialize_dicts():
+ """
+ Create the colorspace encoding and decoding dictionaries.
+ """
+ enc = {}
+ for i, c1 in enumerate("ACGT"):
+ enc['N' + c1] = '4'
+ enc[c1 + 'N'] = '4'
+ enc['.' + c1] = '4'
+ enc[c1 + '.'] = '4'
+ for j, c2 in enumerate("ACGT"):
+ # XOR of nucleotides gives color
+ enc[c1 + c2] = chr(ord('0') + (i ^ j))
+ enc.update({ 'NN': '4', 'N.': '4', '.N': '4', '..': '4'})
+
+ dec = {}
+ for i, c1 in enumerate("ACGT"):
+ dec['.' + str(i)] = 'N'
+ dec['N' + str(i)] = 'N'
+ dec[c1 + '4'] = 'N'
+ dec[c1 + '.'] = 'N'
+ for j, c2 in enumerate("ACGT"):
+ # XOR of nucleotides gives color
+ dec[c1 + chr(ord('0') + (i ^ j))] = c2
+ dec['N4'] = 'N'
+
+ return (enc, dec)
+
+
def encode(s):
    """
    Given a sequence of nucleotides, convert them to
    colorspace. Only uppercase characters are allowed.

    The first character of the result is the first nucleotide; each
    following character is the color of two adjacent nucleotides.
    An empty input is returned unchanged.
    >>> encode("ACGGTC")
    'A13012'
    """
    if not s:
        return s
    colors = [ENCODE[s[i:i+2]] for i in range(len(s) - 1)]
    return s[0:1] + ''.join(colors)
+
+
def decode(s):
    """
    Decode a sequence of colors to nucleotide space.
    The first character in s must be a nucleotide.
    Only uppercase characters are allowed.

    Inputs shorter than two characters are returned unchanged.
    >>> decode("A13012")
    'ACGGTC'
    """
    if len(s) < 2:
        return s
    current = s[0]
    bases = [current]
    for color in s[1:]:
        # the previous base together with the color determines the next base
        current = DECODE[current + color]
        bases.append(current)
    return ''.join(bases)
+
+
# Lookup tables used by encode() and decode(), built once at import time.
ENCODE, DECODE = _initialize_dicts()
diff --git a/cutadapt/compat.py b/cutadapt/compat.py
new file mode 100644
index 0000000..2289948
--- /dev/null
+++ b/cutadapt/compat.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+Minimal Py2/Py3 compatibility library.
+"""
+from __future__ import print_function, division, absolute_import
+import sys
# Compare the numeric major version. Comparing the sys.version string with
# '3' is a lexicographic comparison and gives the wrong answer for a major
# version with more than one digit.
PY3 = sys.version_info[0] >= 3


if PY3:
    from io import StringIO

    # On Python 3, these names are simply aliases for the builtins
    maketrans = str.maketrans
    basestring = str
    zip = zip
    next = next

    def bytes_to_str(s):
        """Decode an ASCII bytes object to str"""
        return s.decode('ascii')

    def str_to_bytes(s):
        """Encode a str to an ASCII bytes object"""
        return s.encode('ascii')

    def force_str(s):
        """Return s as str, decoding it from ASCII if it is a bytes object"""
        if isinstance(s, bytes):
            return s.decode('ascii')
        else:
            return s

else:
    from itertools import izip as zip
    from string import maketrans
    from StringIO import StringIO

    basestring = basestring

    # On Python 2, str and bytes are the same type: nothing to convert

    def bytes_to_str(s):
        return s

    def str_to_bytes(s):
        return s

    def force_str(s):
        return s

    def next(it):
        """Return the next item of the iterator it"""
        return it.next()
diff --git a/cutadapt/filters.py b/cutadapt/filters.py
new file mode 100644
index 0000000..3ab1e04
--- /dev/null
+++ b/cutadapt/filters.py
@@ -0,0 +1,256 @@
+# coding: utf-8
+"""
+Classes for writing and filtering of processed reads.
+
+A Filter is a callable that has the read as its only argument. If it is called,
+it returns True if the read should be filtered (discarded), and False if not.
+
+To be used, a filter needs to be wrapped in one of the redirector classes.
+They are called so because they can redirect filtered reads to a file if so
+desired. They also keep statistics.
+
+To determine what happens to a read, a list of redirectors with different
+filters is created and each redirector is called in turn until one returns True.
+The read is then assumed to have been "consumed", that is, either written
+somewhere or filtered (should be discarded).
+"""
+from __future__ import print_function, division, absolute_import
+from .xopen import xopen
+from . import seqio
+
# Return values of a Filter's __call__ method. They are given names to
# improve readability (it is unintuitive that "return True" means
# "discard the read").
KEEP = False
DISCARD = True
+
+
class NoFilter(object):
    """
    No filtering, just send each read to the given writer.
    """
    def __init__(self, writer):
        self.filtered = 0
        self.writer = writer
        # There is no filtering criterion. The attribute is kept so that
        # code inspecting .filter on a redirector-like object still works,
        # but it must not refer to the builtin filter() function.
        self.filter = None
        self.written = 0  # no of written reads  TODO move to writer
        self.written_bp = [0, 0]

    def __call__(self, read):
        """Write the read, update the statistics and report it as consumed"""
        self.writer.write(read)
        self.written += 1
        self.written_bp[0] += len(read)
        return DISCARD
+
+
class PairedNoFilter(object):
    """
    No filtering, just send each paired-end read to the given writer.
    """
    def __init__(self, writer):
        self.writer = writer
        self.filtered = 0
        self.written = 0  # no of written reads or read pairs  TODO move to writer
        self.written_bp = [0, 0]

    def __call__(self, read1, read2):
        """Write the pair, update the statistics and report it as consumed"""
        self.writer.write(read1, read2)
        self.written += 1
        # base counts are kept separately for first and second reads
        for index, read in enumerate((read1, read2)):
            self.written_bp[index] += len(read)
        return DISCARD
+
+
class Redirector(object):
    """
    Redirect discarded reads to the given writer. This is for single-end reads.
    """
    def __init__(self, writer, filter):
        self.writer = writer
        self.filter = filter
        self.filtered = 0
        self.written = 0  # no of written reads  TODO move to writer
        self.written_bp = [0, 0]

    def __call__(self, read):
        """
        Apply the filter to the read. If it matches, count the read as
        filtered, write it to the writer (if there is one) and return
        DISCARD. Otherwise return KEEP.
        """
        if not self.filter(read):
            return KEEP
        self.filtered += 1
        writer = self.writer
        if writer is not None:
            writer.write(read)
            self.written += 1
            self.written_bp[0] += len(read)
        return DISCARD
+
+
class PairedRedirector(object):
    """
    Redirect discarded reads to the given writer. This is for paired-end reads,
    using the 'new-style' filtering where both reads are inspected. That is,
    the entire pair is discarded if at least 1 or 2 of the reads match the
    filtering criteria.
    """
    def __init__(self, writer, filter, min_affected=1):
        """
        min_affected -- values 1 and 2 are allowed.
            1 means: the pair is discarded if any read matches
            2 means: the pair is discarded if both reads match

        A ValueError is raised for any other value.
        """
        if min_affected not in (1, 2):
            raise ValueError("min_affected must be 1 or 2")
        self.writer = writer
        self.filter = filter
        self._min_affected = min_affected
        self.filtered = 0
        self.written = 0  # no of written reads or read pairs  TODO move to writer
        self.written_bp = [0, 0]

    def __call__(self, read1, read2):
        """
        Apply the filter to both reads. If enough of them match, count the
        pair as filtered, write it to the writer (if there is one) and return
        DISCARD. Otherwise return KEEP.
        """
        # the filter returns a boolean; summing counts the matching reads
        affected = self.filter(read1) + self.filter(read2)
        if not affected >= self._min_affected:
            return KEEP
        # discard read
        self.filtered += 1
        if self.writer is not None:
            self.writer.write(read1, read2)
            self.written += 1
            self.written_bp[0] += len(read1)
            self.written_bp[1] += len(read2)
        return DISCARD
+
+
class LegacyPairedRedirector(object):
    """
    Redirect discarded reads to the given writer. This is for paired-end reads,
    using the 'legacy' filtering mode (backwards compatibility). That is, if
    the first read matches the filtering criteria, the pair is discarded. The
    second read is not inspected.
    """
    def __init__(self, writer, filter):
        self.writer = writer
        self.filter = filter
        self.filtered = 0
        self.written = 0  # no of written reads or read pairs  TODO move to writer
        self.written_bp = [0, 0]

    def __call__(self, read1, read2):
        """
        Apply the filter to the first read only. If it matches, count the
        pair as filtered, write it to the writer (if there is one) and return
        DISCARD. Otherwise return KEEP.
        """
        if not self.filter(read1):
            return KEEP
        # discard read
        self.filtered += 1
        if self.writer is not None:
            self.writer.write(read1, read2)
            self.written += 1
            self.written_bp[0] += len(read1)
            self.written_bp[1] += len(read2)
        return DISCARD
+
+
class TooShortReadFilter(object):
    """
    Return True (discard) for reads that are shorter than minimum_length.
    """
    # TODO paired_outfile is left at its default value None (read2 is silently discarded)
    def __init__(self, minimum_length):
        self.minimum_length = minimum_length

    def __call__(self, read):
        length = len(read)
        return length < self.minimum_length
+
+
class TooLongReadFilter(object):
    """
    Return True (discard) for reads that are longer than maximum_length.
    """
    def __init__(self, maximum_length):
        self.maximum_length = maximum_length

    def __call__(self, read):
        length = len(read)
        return length > self.maximum_length
+
+
class NContentFilter(object):
    """
    Discard a read that has a number of 'N's over a given threshold. Both raw
    counts of Ns and proportions are handled. Note that for raw counts, it is
    a greater-than comparison, so a cutoff of '1' will keep reads with a
    single N in it.
    """
    def __init__(self, count):
        """
        count -- if it is below 1.0, it is considered a proportion; if it is
            1 or above, reads with a number of Ns greater than this cutoff
            are discarded. Must not be negative.
        """
        assert count >= 0
        self.cutoff = count
        self.is_proportion = count < 1.0

    def __call__(self, read):
        """Return True when the read should be discarded"""
        # both 'N' and 'n' are counted
        n_count = read.sequence.lower().count('n')
        if not self.is_proportion:
            return n_count > self.cutoff
        length = len(read)
        if length == 0:
            # an empty read has no Ns and is never discarded
            return False
        return n_count / length > self.cutoff
+
+
class DiscardUntrimmedFilter(object):
    """
    Return True if read is untrimmed, that is, if its match attribute is None.
    """
    def __call__(self, read):
        match = read.match
        return match is None
+
+
class DiscardTrimmedFilter(object):
    """
    Return True if read is trimmed, that is, if its match attribute is not None.
    """
    def __call__(self, read):
        match = read.match
        return match is not None
+
+
class Demultiplexer(object):
    """
    Demultiplex trimmed reads. Reads are written to different output files
    depending on which adapter matches. Files are created when the first read
    is written to them.
    """
    def __init__(self, path_template, untrimmed_path, colorspace, qualities):
        """
        path_template must contain the string '{name}', which will be replaced
        with the name of the adapter to form the final output path.
        Reads without an adapter match are written to the file named by
        untrimmed_path.
        """
        assert '{name}' in path_template
        self.template = path_template
        self.untrimmed_path = untrimmed_path
        self.colorspace = colorspace
        self.qualities = qualities
        # writers are opened lazily
        self.untrimmed_writer = None
        self.writers = dict()
        self.written = 0
        self.written_bp = [0, 0]

    def _open_writer(self, path):
        """Open an output file with the settings of this demultiplexer"""
        return seqio.open(path,
            mode='w', colorspace=self.colorspace, qualities=self.qualities)

    def _count_and_write(self, writer, read):
        """Update the statistics and write the read to the given writer"""
        self.written += 1
        self.written_bp[0] += len(read)
        writer.write(read)

    def __call__(self, read1, read2=None):
        """
        Write a single-end read to the file belonging to its adapter, or to
        the untrimmed file (if any) when no adapter matched, and return
        DISCARD. Paired-end reads are not supported.
        """
        if read2 is None:
            # single-end read
            match = read1.match
            if match is None:
                if self.untrimmed_writer is None and self.untrimmed_path is not None:
                    self.untrimmed_writer = self._open_writer(self.untrimmed_path)
                if self.untrimmed_writer is not None:
                    self._count_and_write(self.untrimmed_writer, read1)
            else:
                name = match.adapter.name
                if name not in self.writers:
                    self.writers[name] = self._open_writer(
                        self.template.replace('{name}', name))
                self._count_and_write(self.writers[name], read1)
            return DISCARD
        else:
            assert False, "Not supported"  # pragma: no cover

    def close(self):
        """Close all output files that have been opened so far"""
        for writer in self.writers.values():
            writer.close()
        if self.untrimmed_writer is not None:
            self.untrimmed_writer.close()
diff --git a/cutadapt/modifiers.py b/cutadapt/modifiers.py
new file mode 100644
index 0000000..af4944d
--- /dev/null
+++ b/cutadapt/modifiers.py
@@ -0,0 +1,275 @@
+# coding: utf-8
+"""
+This module implements all the read modifications that cutadapt supports.
+A modifier must be callable. It is implemented as a function if no parameters
+need to be stored, and as a class with a __call__ method if there are parameters
+(or statistics).
+"""
+from __future__ import print_function, division, absolute_import
+import re
+from cutadapt.qualtrim import quality_trim_index, nextseq_trim_index
+from cutadapt.compat import maketrans
+
+
class AdapterCutter(object):
    """
    Repeatedly find one of multiple adapters in reads.
    The number of times the search is repeated is specified by the
    times parameter.
    """

    def __init__(self, adapters, times=1, wildcard_file=None, info_file=None,
            rest_writer=None, action='trim'):
        """
        adapters -- list of Adapter objects

        times -- maximum number of adapters to remove from each read

        wildcard_file, info_file -- file-like objects to which wildcard and
            match information is printed (None to disable)

        rest_writer -- object whose write() method is called with each match
            (None to disable)

        action -- What to do with a found adapter: None, 'trim', or 'mask'
        """
        self.adapters = adapters
        self.times = times
        self.wildcard_file = wildcard_file
        self.info_file = info_file
        self.rest_writer = rest_writer
        self.action = action
        self.with_adapters = 0
        # match details only need to be collected if they are written out
        self.keep_match_info = self.info_file is not None

    def _best_match(self, read):
        """
        Find the best matching adapter in the given read.

        Return either a Match instance or None if there are no matches.
        """
        best = None
        for adapter in self.adapters:
            match = adapter.match_to(read)
            if match is None:
                continue

            # the no. of matches determines which adapter fits best
            if best is None or match.matches > best.matches:
                best = match
        return best

    def _write_info(self, read):
        """
        Write to the info, wildcard and rest files.
        # TODO
        # This design with a read having a .match attribute and
        # a match having a .read attribute is really confusing.
        """
        match = read.match
        if self.rest_writer and match:
            self.rest_writer.write(match)

        if self.wildcard_file and match:
            print(match.wildcards(), read.name, file=self.wildcard_file)

        if self.info_file:
            if read.match_info:
                for m in read.match_info:
                    print(*m, sep='\t', file=self.info_file)
            else:
                seq = read.sequence
                qualities = read.qualities if read.qualities is not None else ''
                print(read.name, -1, seq, qualities, sep='\t', file=self.info_file)

    def __call__(self, read):
        """
        Cut found adapters from a single read. Return modified read.

        The best matching adapter is searched for and removed up to
        self.times times. The last match is stored in the match attribute of
        the returned read (None if no adapter was found).

        The read is converted to uppercase before it is compared to the adapter
        sequences.
        """
        matches = []
        # The read as it looked after applying each match. Kept so that
        # masking does not need to call adapter.trimmed() a second time:
        # trimmed() also updates the adapter statistics, which must happen
        # only once per match.
        trimmed_reads = []

        # try at most self.times times to remove an adapter
        trimmed_read = read
        for _ in range(self.times):
            match = self._best_match(trimmed_read)
            if match is None:
                # nothing found
                break
            matches.append(match)
            trimmed_read = match.adapter.trimmed(match)
            trimmed_reads.append(trimmed_read)

        if not matches:
            trimmed_read.match = None
            trimmed_read.match_info = None
            self._write_info(trimmed_read)
            return trimmed_read

        if __debug__:
            assert len(trimmed_read) < len(read), "Trimmed read isn't shorter than original"

        if self.action == 'trim':
            # read is already trimmed, nothing to do
            pass
        elif self.action == 'mask':
            # add N from last modification
            masked_sequence = trimmed_read.sequence
            steps = sorted(zip(matches, trimmed_reads), reverse=True,
                key=lambda step: step[0].astart)
            for match, after in steps:
                # as many Ns as were removed by this match
                ns = 'N' * (len(match.read.sequence) - len(after.sequence))
                # add N depending on match position
                if match.front:
                    masked_sequence = ns + masked_sequence
                else:
                    masked_sequence += ns
            # set masked sequence as sequence with original quality
            trimmed_read.sequence = masked_sequence
            trimmed_read.qualities = matches[0].read.qualities

            assert len(trimmed_read.sequence) == len(read)
        elif self.action is None:
            trimmed_read = read

        trimmed_read.match = matches[-1]
        if self.keep_match_info:
            trimmed_read.match_info = [match.get_info_record() for match in matches]
        self._write_info(trimmed_read)

        self.with_adapters += 1
        return trimmed_read
+
+
class UnconditionalCutter(object):
    """
    A modifier that unconditionally removes the first n or the last n bases from a read.

    If the length is positive, the bases are removed from the beginning of the read.
    If the length is negative, the bases are removed from the end of the read.
    If the length is zero, the read is returned unchanged.
    """
    def __init__(self, length):
        self.length = length

    def __call__(self, read):
        if self.length > 0:
            return read[self.length:]
        if self.length < 0:
            return read[:self.length]
        # Nothing to remove. Without this, None would be returned instead of
        # the read.
        return read
+
+
class LengthTagModifier(object):
    """
    Replace "length=..." strings in read names.

    length_tag is the literal text (such as "length=") that precedes the
    number. The number following it is replaced with the current length of
    the read sequence.
    """
    def __init__(self, length_tag):
        # The tag is literal text, not a regular expression: escape it so
        # that characters such as '(' or '.' are matched as themselves.
        self.regex = re.compile(r"\b" + re.escape(length_tag) + r"[0-9]*\b")
        self.length_tag = length_tag

    def __call__(self, read):
        read = read[:]
        if self.length_tag in read.name:
            replacement = self.length_tag + str(len(read.sequence))
            # A function is used as replacement so that backslashes in the
            # tag are not interpreted as escape sequences.
            read.name = self.regex.sub(lambda m: replacement, read.name)
        return read
+
+
class SuffixRemover(object):
    """
    Remove a given suffix from read names.

    Names that do not end with the suffix are left unchanged, and so are all
    names if the suffix is empty.
    """
    def __init__(self, suffix):
        self.suffix = suffix

    def __call__(self, read):
        read = read[:]
        # An empty suffix must be skipped explicitly: every name "ends with"
        # the empty string, and name[:-0] is name[:0], which would erase the
        # entire name.
        if self.suffix and read.name.endswith(self.suffix):
            read.name = read.name[:-len(self.suffix)]
        return read
+
+
class PrefixSuffixAdder(object):
    """
    Add a suffix and a prefix to read names.

    Within prefix and suffix, the string '{name}' is replaced with the name
    of the adapter that matched the read, or with 'no_adapter' if the read
    has no match.
    """
    def __init__(self, prefix, suffix):
        self.prefix = prefix
        self.suffix = suffix

    def __call__(self, read):
        read = read[:]
        match = read.match
        if match is None:
            adapter_name = 'no_adapter'
        else:
            adapter_name = match.adapter.name
        prefix = self.prefix.replace('{name}', adapter_name)
        suffix = self.suffix.replace('{name}', adapter_name)
        read.name = prefix + read.name + suffix
        return read
+
+
class DoubleEncoder(object):
    """
    Double-encode colorspace reads, using characters ACGTN to represent colors.

    The colors 0, 1, 2, 3 and '.' are replaced with A, C, G, T and N.
    """
    def __init__(self):
        colors, letters = '0123.', 'ACGTN'
        self.double_encode_trans = maketrans(colors, letters)

    def __call__(self, read):
        encoded = read[:]
        encoded.sequence = encoded.sequence.translate(self.double_encode_trans)
        return encoded
+
+
class ZeroCapper(object):
    """
    Change negative quality values of a read to zero.

    Qualities are encoded as characters; every character with a code below
    quality_base (a negative quality) is replaced with chr(quality_base)
    (quality zero). Reads without qualities are left as they are.
    """
    def __init__(self, quality_base=33):
        qb = quality_base
        # maps each of the qb characters below the base to the base character
        self.zero_cap_trans = maketrans(''.join(map(chr, range(qb))), chr(qb) * qb)

    def __call__(self, read):
        read = read[:]
        # qualities is None for reads that come without quality values
        if read.qualities is not None:
            read.qualities = read.qualities.translate(self.zero_cap_trans)
        return read
+
+
def PrimerTrimmer(read):
    """
    Trim primer base from colorspace reads: drop the first position of the
    read and set the primer attribute of the result to the empty string.
    """
    trimmed = read[1:]
    trimmed.primer = ''
    return trimmed
+
+
class NextseqQualityTrimmer(object):
    """
    Trim the end of a read at the index computed by nextseq_trim_index and
    count the number of removed bases in trimmed_bases.
    """
    def __init__(self, cutoff, base):
        self.cutoff = cutoff
        self.base = base
        self.trimmed_bases = 0

    def __call__(self, read):
        end = nextseq_trim_index(read, self.cutoff, self.base)
        removed = len(read) - end
        self.trimmed_bases += removed
        return read[:end]
+
+
class QualityTrimmer(object):
    """
    Trim both ends of a read at the indices computed by quality_trim_index
    and count the number of removed bases in trimmed_bases.
    """
    def __init__(self, cutoff_front, cutoff_back, base):
        self.cutoff_front = cutoff_front
        self.cutoff_back = cutoff_back
        self.base = base
        self.trimmed_bases = 0

    def __call__(self, read):
        begin, end = quality_trim_index(
            read.qualities, self.cutoff_front, self.cutoff_back, self.base)
        kept = end - begin
        self.trimmed_bases += len(read) - kept
        return read[begin:end]
+
+
class NEndTrimmer(object):
    """Trims Ns from the 3' and 5' end of reads"""
    def __init__(self):
        self.start_trim = re.compile(r'^N+')
        self.end_trim = re.compile(r'N+$')

    def __call__(self, read):
        sequence = read.sequence
        leading = self.start_trim.match(sequence)
        trailing = self.end_trim.search(sequence)
        # without a run of Ns at an end, nothing is removed from that end
        begin = 0 if not leading else leading.end()
        end = len(read) if not trailing else trailing.start()
        return read[begin:end]
diff --git a/cutadapt/qualtrim.py b/cutadapt/qualtrim.py
new file mode 100644
index 0000000..ea79132
--- /dev/null
+++ b/cutadapt/qualtrim.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+"""
+Quality trimming.
+"""
+from __future__ import print_function, division, absolute_import
+
+import sys
+
# Python 3 has no xrange; its range is already lazy. The numeric major
# version is checked because comparing the sys.version string with '3' is a
# lexicographic comparison.
if sys.version_info[0] >= 3:
    xrange = range
+
+
def quality_trim_index(qualities, cutoff, base=33):
    """
    Find the position at which to trim a low-quality end from a nucleotide sequence.

    Qualities are assumed to be ASCII-encoded as chr(qual + base).

    The algorithm is the same as the one used by BWA within the function
    'bwa_trim_read':
    - Subtract the cutoff value from all qualities.
    - Compute partial sums from all indices to the end of the sequence.
    - Trim sequence at the index at which the sum is minimal.

    Return the index at which to cut; len(qualities) means no trimming.

    NOTE(review): QualityTrimmer in modifiers.py calls quality_trim_index
    with (qualities, cutoff_front, cutoff_back, base) and unpacks two return
    values, which this pure-Python version does not support. Presumably the
    compiled version from cutadapt._qualtrim, which replaces this function
    when it can be imported, is the one that is used in practice -- confirm.
    """
    best_index = len(qualities)
    best_sum = 0
    running_sum = 0
    # walk from the last quality value towards the first one
    index = best_index - 1
    while index >= 0:
        running_sum += cutoff - (ord(qualities[index]) - base)
        if running_sum < 0:
            break
        if running_sum > best_sum:
            best_sum = running_sum
            best_index = index
        index -= 1
    return best_index
+
+
def nextseq_trim_index(sequence, cutoff, base=33):
    """
    Variant of the above quality trimming routine that works on NextSeq data.
    With Illumina NextSeq, bases are encoded with two colors. 'No color' (a
    dark cycle) usually means that a 'G' was sequenced, but that also occurs
    when sequencing falls off the end of the fragment. The read then contains
    a run of high-quality G bases in the end.

    This routine works as the one above, but counts qualities belonging to 'G'
    bases as being equal to cutoff - 1.

    sequence is a read object with the attributes sequence (the bases) and
    qualities. Return the index at which to cut the read.
    """
    bases = sequence.sequence
    qualities = sequence.qualities
    best_index = len(qualities)
    best_sum = 0
    running_sum = 0
    # walk from the last position towards the first one
    index = best_index - 1
    while index >= 0:
        quality = ord(qualities[index]) - base
        if bases[index] == 'G':
            quality = cutoff - 1
        running_sum += cutoff - quality
        if running_sum < 0:
            break
        if running_sum > best_sum:
            best_sum = running_sum
            best_index = index
        index -= 1
    return best_index
+
# Replace the pure-Python functions above with the versions from the compiled
# extension module if it is available. Only a failed import is ignored; any
# other error is not hidden.
try:
    from cutadapt._qualtrim import quality_trim_index, nextseq_trim_index
except ImportError:
    pass
diff --git a/cutadapt/report.py b/cutadapt/report.py
new file mode 100644
index 0000000..35b3641
--- /dev/null
+++ b/cutadapt/report.py
@@ -0,0 +1,296 @@
+# coding: utf-8
+"""
+Routines for printing a report.
+"""
+from __future__ import print_function, division, absolute_import
+
+import sys
+from collections import namedtuple
+from contextlib import contextmanager
+import textwrap
+from .adapters import BACK, FRONT, PREFIX, SUFFIX, ANYWHERE, LINKED
+from .modifiers import QualityTrimmer, AdapterCutter
+from .filters import (NoFilter, PairedNoFilter, TooShortReadFilter, TooLongReadFilter,
+ DiscardTrimmedFilter, DiscardUntrimmedFilter, Demultiplexer, NContentFilter)
+
+
+class Statistics:
+ def __init__(self, n, total_bp1, total_bp2):
+ """
+ n -- total number of reads
+ total_bp1 -- number of bases in first reads
+ total_bp2 -- number of bases in second reads (set to None for single-end data)
+ """
+ self.n = n
+ self.total_bp = total_bp1
+ self.total_bp1 = total_bp1
+ if total_bp2 is None:
+ self.paired = False
+ else:
+ self.paired = True
+ self.total_bp2 = total_bp2
+ self.total_bp += total_bp2
+
+ def collect(self, adapters_pair, time, modifiers, modifiers2, writers):
+ self.time = max(time, 0.01)
+ self.too_short = None
+ self.too_long = None
+ self.written = 0
+ self.written_bp = [0, 0]
+ self.too_many_n = None
+ # Collect statistics from writers/filters
+ for w in writers:
+ if isinstance(w, (NoFilter, PairedNoFilter, Demultiplexer)) or isinstance(w.filter, (DiscardTrimmedFilter, DiscardUntrimmedFilter)):
+ self.written += w.written
+ if self.n > 0:
+ self.written_fraction = self.written / self.n
+ self.written_bp = self.written_bp[0] + w.written_bp[0], self.written_bp[1] + w.written_bp[1]
+ elif isinstance(w.filter, TooShortReadFilter):
+ self.too_short = w.filtered
+ elif isinstance(w.filter, TooLongReadFilter):
+ self.too_long = w.filtered
+ elif isinstance(w.filter, NContentFilter):
+ self.too_many_n = w.filtered
+ assert self.written is not None
+
+ # Collect statistics from modifiers
+ self.with_adapters = [0, 0]
+ self.quality_trimmed_bp = [0, 0]
+ self.did_quality_trimming = False
+ for i, modifiers_list in [(0, modifiers), (1, modifiers2)]:
+ for modifier in modifiers_list:
+ if isinstance(modifier, QualityTrimmer):
+ self.quality_trimmed_bp[i] = modifier.trimmed_bases
+ self.did_quality_trimming = True
+ elif isinstance(modifier, AdapterCutter):
+ self.with_adapters[i] += modifier.with_adapters
+ self.with_adapters_fraction = [ (v / self.n if self.n > 0 else 0) for v in self.with_adapters ]
+ self.quality_trimmed = sum(self.quality_trimmed_bp)
+ self.quality_trimmed_fraction = self.quality_trimmed / self.total_bp if self.total_bp > 0 else 0.0
+
+ self.total_written_bp = sum(self.written_bp)
+ self.total_written_bp_fraction = self.total_written_bp / self.total_bp if self.total_bp > 0 else 0.0
+
+ if self.n > 0:
+ if self.too_short is not None:
+ self.too_short_fraction = self.too_short / self.n
+ if self.too_long is not None:
+ self.too_long_fraction = self.too_long / self.n
+ if self.too_many_n is not None:
+ self.too_many_n_fraction = self.too_many_n / self.n
+
+
+# Description of each adapter type constant as it is printed by print_report()
+# in the "Type:" field of the per-adapter section.
+ADAPTER_TYPES = {
+    BACK: "regular 3'",
+    FRONT: "regular 5'",
+    PREFIX: "anchored 5'",
+    SUFFIX: "anchored 3'",
+    ANYWHERE: "variable 5'/3'",
+    LINKED: "linked",
+}
+
+
+def print_error_ranges(adapter_length, error_rate):
+ print("No. of allowed errors:")
+ prev = 0
+ for errors in range(1, int(error_rate * adapter_length) + 1):
+ r = int(errors / error_rate)
+ print("{0}-{1} bp: {2};".format(prev, r - 1, errors - 1), end=' ')
+ prev = r
+ if prev == adapter_length:
+ print("{0} bp: {1}".format(adapter_length, int(error_rate * adapter_length)))
+ else:
+ print("{0}-{1} bp: {2}".format(prev, adapter_length, int(error_rate * adapter_length)))
+ print()
+
+
+def print_histogram(d, adapter_length, n, error_rate, errors):
+ """
+ Print a histogram. Also, print the no. of reads expected to be
+ trimmed by chance (assuming a uniform distribution of nucleotides in the reads).
+ d -- a dictionary mapping lengths of trimmed sequences to their respective frequency
+ adapter_length -- adapter length
+ n -- total no. of reads.
+ """
+ h = []
+ for length in sorted(d):
+ # when length surpasses adapter_length, the
+ # probability does not increase anymore
+ estimated = n * 0.25 ** min(length, adapter_length)
+ h.append( (length, d[length], estimated) )
+
+ print("length", "count", "expect", "max.err", "error counts", sep="\t")
+ for length, count, estimate in h:
+ max_errors = max(errors[length].keys())
+ errs = ' '.join(str(errors[length][e]) for e in range(max_errors+1))
+ print(length, count, "{0:.1F}".format(estimate), int(error_rate*min(length, adapter_length)), errs, sep="\t")
+ print()
+
+
+def print_adjacent_bases(bases, sequence):
+ """
+ Print a summary of the bases preceding removed adapter sequences.
+ Print a warning if one of the bases is overrepresented and there are
+ at least 20 preceding bases available.
+
+ Return whether a warning was printed.
+ """
+ total = sum(bases.values())
+ if total == 0:
+ return False
+ print('Bases preceding removed adapters:')
+ warnbase = None
+ for base in ['A', 'C', 'G', 'T', '']:
+ b = base if base != '' else 'none/other'
+ fraction = 1.0 * bases[base] / total
+ print(' {0}: {1:.1%}'.format(b, fraction))
+ if fraction > 0.8 and base != '':
+ warnbase = b
+ if total >= 20 and warnbase is not None:
+ print('WARNING:')
+ print(' The adapter is preceded by "{0}" extremely often.'.format(warnbase))
+ print(' The provided adapter sequence may be incomplete.')
+ print(' To fix the problem, add "{0}" to the beginning of the adapter sequence.'.format(warnbase))
+ print()
+ return True
+ print()
+ return False
+
+
+ at contextmanager
+def redirect_standard_output(file):
+ if file is None:
+ yield
+ return
+ old_stdout = sys.stdout
+ sys.stdout = file
+ yield
+ sys.stdout = old_stdout
+
+
+def print_report(stats, adapters_pair):
+ """Print report to standard output."""
+ if stats.n == 0:
+ print("No reads processed! Either your input file is empty or you used the wrong -f/--format parameter.")
+ return
+ print("Finished in {0:.2F} s ({1:.0F} us/read; {2:.2F} M reads/minute).".format(
+ stats.time, 1E6 * stats.time / stats.n, stats.n / stats.time * 60 / 1E6))
+
+ report = "\n=== Summary ===\n\n"
+ if stats.paired:
+ report += textwrap.dedent("""\
+ Total read pairs processed: {n:13,d}
+ Read 1 with adapter: {with_adapters[0]:13,d} ({with_adapters_fraction[0]:.1%})
+ Read 2 with adapter: {with_adapters[1]:13,d} ({with_adapters_fraction[1]:.1%})
+ """)
+ else:
+ report += textwrap.dedent("""\
+ Total reads processed: {n:13,d}
+ Reads with adapters: {with_adapters[0]:13,d} ({with_adapters_fraction[0]:.1%})
+ """)
+ if stats.too_short is not None:
+ report += "{pairs_or_reads} that were too short: {too_short:13,d} ({too_short_fraction:.1%})\n"
+ if stats.too_long is not None:
+ report += "{pairs_or_reads} that were too long: {too_long:13,d} ({too_long_fraction:.1%})\n"
+ if stats.too_many_n is not None:
+ report += "{pairs_or_reads} with too many N: {too_many_n:13,d} ({too_many_n_fraction:.1%})\n"
+
+ report += textwrap.dedent("""\
+ {pairs_or_reads} written (passing filters): {written:13,d} ({written_fraction:.1%})
+
+ Total basepairs processed: {total_bp:13,d} bp
+ """)
+ if stats.paired:
+ report += " Read 1: {total_bp1:13,d} bp\n"
+ report += " Read 2: {total_bp2:13,d} bp\n"
+
+ if stats.did_quality_trimming:
+ report += "Quality-trimmed: {quality_trimmed:13,d} bp ({quality_trimmed_fraction:.1%})\n"
+ if stats.paired:
+ report += " Read 1: {quality_trimmed_bp[0]:13,d} bp\n"
+ report += " Read 2: {quality_trimmed_bp[1]:13,d} bp\n"
+
+ report += "Total written (filtered): {total_written_bp:13,d} bp ({total_written_bp_fraction:.1%})\n"
+ if stats.paired:
+ report += " Read 1: {written_bp[0]:13,d} bp\n"
+ report += " Read 2: {written_bp[1]:13,d} bp\n"
+ v = vars(stats)
+ v['pairs_or_reads'] = "Pairs" if stats.paired else "Reads"
+ try:
+ report = report.format(**v)
+ except ValueError:
+ # Python 2.6 does not support the comma format specifier (PEP 378)
+ report = report.replace(",d}", "d}").format(**v)
+ print(report)
+
+ warning = False
+ for which_in_pair in (0, 1):
+ for adapter in adapters_pair[which_in_pair]:
+ total_front = sum(adapter.lengths_front.values())
+ total_back = sum(adapter.lengths_back.values())
+ total = total_front + total_back
+ where = adapter.where
+ assert where in (ANYWHERE, LINKED) or (where in (BACK, SUFFIX) and total_front == 0) or (where in (FRONT, PREFIX) and total_back == 0)
+
+ if stats.paired:
+ extra = 'First read: ' if which_in_pair == 0 else 'Second read: '
+ else:
+ extra = ''
+
+ print("=" * 3, extra + "Adapter", adapter.name, "=" * 3)
+ print()
+ if where == LINKED:
+ print("Sequence: {0}...{1}; Type: linked; Length: {2}+{3}; Trimmed: {4} times; Half matches: {5}".
+ format(adapter.front_adapter.sequence,
+ adapter.back_adapter.sequence,
+ len(adapter.front_adapter.sequence),
+ len(adapter.back_adapter.sequence),
+ total_front, total_back))
+ else:
+ print("Sequence: {0}; Type: {1}; Length: {2}; Trimmed: {3} times.".
+ format(adapter.sequence, ADAPTER_TYPES[adapter.where],
+ len(adapter.sequence), total))
+ if total == 0:
+ print()
+ continue
+ if where == ANYWHERE:
+ print(total_front, "times, it overlapped the 5' end of a read")
+ print(total_back, "times, it overlapped the 3' end or was within the read")
+ print()
+ print_error_ranges(len(adapter), adapter.max_error_rate)
+ print("Overview of removed sequences (5')")
+ print_histogram(adapter.lengths_front, len(adapter), stats.n, adapter.max_error_rate, adapter.errors_front)
+ print()
+ print("Overview of removed sequences (3' or within)")
+ print_histogram(adapter.lengths_back, len(adapter), stats.n, adapter.max_error_rate, adapter.errors_back)
+ elif where == LINKED:
+ print()
+ print_error_ranges(len(adapter.front_adapter), adapter.front_adapter.max_error_rate)
+ print_error_ranges(len(adapter.back_adapter), adapter.back_adapter.max_error_rate)
+ print("Overview of removed sequences at 5' end")
+ print_histogram(adapter.front_adapter.lengths_front,
+ len(adapter.front_adapter), stats.n,
+ adapter.front_adapter.max_error_rate,
+ adapter.front_adapter.errors_front)
+ print()
+ print("Overview of removed sequences at 3' end")
+ print_histogram(adapter.back_adapter.lengths_back,
+ len(adapter.back_adapter), stats.n,
+ adapter.back_adapter.max_error_rate, adapter.back_adapter.errors_back)
+ elif where in (FRONT, PREFIX):
+ print()
+ print_error_ranges(len(adapter), adapter.max_error_rate)
+ print("Overview of removed sequences")
+ print_histogram(adapter.lengths_front, len(adapter), stats.n, adapter.max_error_rate, adapter.errors_front)
+ else:
+ assert where in (BACK, SUFFIX)
+ print()
+ print_error_ranges(len(adapter), adapter.max_error_rate)
+ warning = warning or print_adjacent_bases(adapter.adjacent_bases, adapter.sequence)
+ print("Overview of removed sequences")
+ print_histogram(adapter.lengths_back, len(adapter), stats.n, adapter.max_error_rate, adapter.errors_back)
+
+ if warning:
+ print('WARNING:')
+ print(' One or more of your adapter sequences may be incomplete.')
+ print(' Please see the detailed output above.')
diff --git a/cutadapt/scripts/__init__.py b/cutadapt/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cutadapt/scripts/cutadapt.py b/cutadapt/scripts/cutadapt.py
new file mode 100755
index 0000000..7a7b0af
--- /dev/null
+++ b/cutadapt/scripts/cutadapt.py
@@ -0,0 +1,726 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# kate: word-wrap off; remove-trailing-spaces all;
+#
+# Copyright (c) 2010-2016 Marcel Martin <marcel.martin at scilifelab.se>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+cutadapt version %version
+Copyright (C) 2010-2016 Marcel Martin <marcel.martin at scilifelab.se>
+
+cutadapt removes adapter sequences from high-throughput sequencing reads.
+
+Usage:
+ cutadapt -a ADAPTER [options] [-o output.fastq] input.fastq
+
+For paired-end reads:
+ cutadapt -a ADAPT1 -A ADAPT2 [options] -o out1.fastq -p out2.fastq in1.fastq in2.fastq
+
+Replace "ADAPTER" with the actual sequence of your 3' adapter. IUPAC wildcard
+characters are supported. The reverse complement is *not* automatically
+searched. All reads from input.fastq will be written to output.fastq with the
+adapter sequence removed. Adapter matching is error-tolerant. Multiple adapter
+sequences can be given (use further -a options), but only the best-matching
+adapter will be removed.
+
+Input may also be in FASTA format. Compressed input and output is supported and
+auto-detected from the file name (.gz, .xz, .bz2). Use the file name '-' for
+standard input/output. Without the -o option, output is sent to standard output.
+
+Citation:
+
+Marcel Martin. Cutadapt removes adapter sequences from high-throughput
+sequencing reads. EMBnet.Journal, 17(1):10-12, May 2011.
+http://dx.doi.org/10.14806/ej.17.1.200
+
+Use "cutadapt --help" to see all command-line options.
+See http://cutadapt.readthedocs.org/ for full documentation.
+"""
+
+from __future__ import print_function, division, absolute_import
+
+# Print a helpful error message if the extension modules cannot be imported.
+from cutadapt import check_importability
+check_importability()
+
+import sys
+import time
+import errno
+from optparse import OptionParser, OptionGroup, SUPPRESS_HELP
+import functools
+import logging
+import platform
+import textwrap
+
+from cutadapt import seqio, __version__
+from cutadapt.xopen import xopen
+from cutadapt.adapters import AdapterParser
+from cutadapt.modifiers import (LengthTagModifier, SuffixRemover, PrefixSuffixAdder,
+ DoubleEncoder, ZeroCapper, PrimerTrimmer, QualityTrimmer, UnconditionalCutter,
+ NEndTrimmer, AdapterCutter, NextseqQualityTrimmer)
+from cutadapt.filters import (NoFilter, PairedNoFilter, Redirector, PairedRedirector,
+ LegacyPairedRedirector, TooShortReadFilter, TooLongReadFilter,
+ Demultiplexer, NContentFilter, DiscardUntrimmedFilter, DiscardTrimmedFilter)
+from cutadapt.report import Statistics, print_report, redirect_standard_output
+from cutadapt.compat import next
+
+# The root logger. setup_logging() attaches the output handler to it.
+logger = logging.getLogger()
+
+class CutadaptOptionParser(OptionParser):
+ def get_usage(self):
+ return self.usage.lstrip().replace('%version', __version__)
+
+
+class RestFileWriter(object):
+ def __init__(self, file):
+ self.file = file
+
+ def write(self, match):
+ rest = match.rest()
+ if len(rest) > 0:
+ print(rest, match.read.name, file=self.file)
+
+
+def process_single_reads(reader, modifiers, filters):
+ """
+ Loop over reads, find adapters, trim reads, apply modifiers and
+ output modified reads.
+
+ Return a Statistics object.
+ """
+ n = 0 # no. of processed reads
+ total_bp = 0
+ for read in reader:
+ n += 1
+ total_bp += len(read.sequence)
+ for modifier in modifiers:
+ read = modifier(read)
+ for filter in filters:
+ if filter(read):
+ break
+
+ return Statistics(n=n, total_bp1=total_bp, total_bp2=None)
+
+
+def process_paired_reads(paired_reader, modifiers1, modifiers2, filters):
+ """
+ Loop over reads, find adapters, trim reads, apply modifiers and
+ output modified reads.
+
+ Return a Statistics object.
+ """
+ n = 0 # no. of processed reads
+ total1_bp = 0
+ total2_bp = 0
+ for read1, read2 in paired_reader:
+ n += 1
+ total1_bp += len(read1.sequence)
+ total2_bp += len(read2.sequence)
+ for modifier in modifiers1:
+ read1 = modifier(read1)
+ for modifier in modifiers2:
+ read2 = modifier(read2)
+ for filter in filters:
+ # Stop writing as soon as one of the filters was successful.
+ if filter(read1, read2):
+ break
+ return Statistics(n=n, total_bp1=total1_bp, total_bp2=total2_bp)
+
+
+def setup_logging(stdout=False, quiet=False):
+ """
+ Attach handler to the global logger object
+ """
+ # Due to backwards compatibility, logging output is sent to standard output
+ # instead of standard error if the -o option is used.
+ stream_handler = logging.StreamHandler(sys.stdout if stdout else sys.stderr)
+ stream_handler.setFormatter(logging.Formatter('%(message)s'))
+ stream_handler.setLevel(logging.ERROR if quiet else logging.INFO)
+ logger.setLevel(logging.INFO)
+ logger.addHandler(stream_handler)
+
+
+def get_option_parser():
+ parser = CutadaptOptionParser(usage=__doc__, version=__version__)
+
+ parser.add_option("--debug", action='store_true', default=False,
+ help="Print debugging information.")
+ parser.add_option("-f", "--format",
+ help="Input file format; can be either 'fasta', 'fastq' or 'sra-fastq'. "
+ "Ignored when reading csfasta/qual files. Default: auto-detect "
+ "from file name extension.")
+
+ group = OptionGroup(parser, "Finding adapters:",
+ description="Parameters -a, -g, -b specify adapters to be removed from "
+ "each read (or from the first read in a pair if data is paired). "
+ "If specified multiple times, only the best matching adapter is "
+ "trimmed (but see the --times option). When the special notation "
+ "'file:FILE' is used, adapter sequences are read from the given "
+ "FASTA file.")
+ group.add_option("-a", "--adapter", action="append", default=[], metavar="ADAPTER",
+ dest="adapters",
+ help="Sequence of an adapter ligated to the 3' end (paired data: of the "
+ "first read). The adapter and subsequent bases are trimmed. If a "
+ "'$' character is appended ('anchoring'), the adapter is only "
+ "found if it is a suffix of the read.")
+ group.add_option("-g", "--front", action="append", default=[], metavar="ADAPTER",
+ help="Sequence of an adapter ligated to the 5' end (paired data: of the "
+ "first read). The adapter and any preceding bases are trimmed. "
+ "Partial matches at the 5' end are allowed. If a '^' character is "
+ "prepended ('anchoring'), the adapter is only found if it is a "
+ "prefix of the read.")
+ group.add_option("-b", "--anywhere", action="append", default=[], metavar="ADAPTER",
+ help="Sequence of an adapter that may be ligated to the 5' or 3' end "
+ "(paired data: of the first read). Both types of matches as "
+ "described under -a und -g are allowed. If the first base of the "
+ "read is part of the match, the behavior is as with -g, otherwise "
+ "as with -a. This option is mostly for rescuing failed library "
+ "preparations - do not use if you know which end your adapter was "
+ "ligated to!")
+ group.add_option("-e", "--error-rate", type=float, default=0.1,
+ help="Maximum allowed error rate (no. of errors divided by the length "
+ "of the matching region). Default: %default")
+ group.add_option("--no-indels", action='store_false', dest='indels', default=True,
+ help="Allow only mismatches in alignments. "
+ "Default: allow both mismatches and indels")
+ group.add_option("-n", "--times", type=int, metavar="COUNT", default=1,
+ help="Remove up to COUNT adapters from each read. Default: %default")
+ group.add_option("-O", "--overlap", type=int, metavar="MINLENGTH", default=3,
+ help="If the overlap between the read and the adapter is shorter than "
+ "MINLENGTH, the read is not modified. Reduces the no. of bases "
+ "trimmed due to random adapter matches. Default: %default")
+ group.add_option("--match-read-wildcards", action="store_true", default=False,
+ help="Interpret IUPAC wildcards in reads. Default: %default")
+ group.add_option("-N", "--no-match-adapter-wildcards", action="store_false",
+ default=True, dest='match_adapter_wildcards',
+ help="Do not interpret IUPAC wildcards in adapters.")
+ group.add_option("--no-trim", dest='action', action='store_const', const=None,
+ help="Match and redirect reads to output/untrimmed-output as usual, "
+ "but do not remove adapters.")
+ group.add_option("--mask-adapter", dest='action', action='store_const', const='mask',
+ help="Mask adapters with 'N' characters instead of trimming them.")
+ parser.add_option_group(group)
+
+ group = OptionGroup(parser, "Additional read modifications")
+ group.add_option("-u", "--cut", action='append', default=[], type=int, metavar="LENGTH",
+ help="Remove bases from each read (first read only if paired). "
+ "If LENGTH is positive, remove bases from the beginning. "
+ "If LENGTH is negative, remove bases from the end. "
+ "Can be used twice if LENGTHs have different signs.")
+ group.add_option("-q", "--quality-cutoff", default=None, metavar="[5'CUTOFF,]3'CUTOFF",
+ help="Trim low-quality bases from 5' and/or 3' ends of each read before "
+ "adapter removal. Applied to both reads if data is paired. If one "
+ "value is given, only the 3' end is trimmed. If two "
+ "comma-separated cutoffs are given, the 5' end is trimmed with "
+ "the first cutoff, the 3' end with the second.")
+ group.add_option("--nextseq-trim", type=int, default=None, metavar="3'CUTOFF",
+ help="NextSeq-specific quality trimming (each read). Trims also dark "
+ "cycles appearing as high-quality G bases (EXPERIMENTAL).")
+ group.add_option("--quality-base", type=int, default=33,
+ help="Assume that quality values in FASTQ are encoded as ascii(quality "
+ "+ QUALITY_BASE). This needs to be set to 64 for some old Illumina "
+ "FASTQ files. Default: %default")
+ group.add_option("--trim-n", action='store_true', default=False,
+ help="Trim N's on ends of reads.")
+ group.add_option("-x", "--prefix", default='',
+ help="Add this prefix to read names. Use {name} to insert the name of the matching adapter.")
+ group.add_option("-y", "--suffix", default='',
+ help="Add this suffix to read names; can also include {name}")
+ group.add_option("--strip-suffix", action='append', default=[],
+ help="Remove this suffix from read names if present. Can be given multiple times.")
+ group.add_option("--length-tag", metavar="TAG",
+ help="Search for TAG followed by a decimal number in the description "
+ "field of the read. Replace the decimal number with the correct "
+ "length of the trimmed read. For example, use --length-tag 'length=' "
+ "to correct fields like 'length=123'.")
+ parser.add_option_group(group)
+
+ group = OptionGroup(parser, "Filtering of processed reads")
+ group.add_option("--discard-trimmed", "--discard", action='store_true', default=False,
+ help="Discard reads that contain an adapter. Also use -O to avoid "
+ "discarding too many randomly matching reads!")
+ group.add_option("--discard-untrimmed", "--trimmed-only", action='store_true', default=False,
+ help="Discard reads that do not contain the adapter.")
+ group.add_option("-m", "--minimum-length", type=int, default=0, metavar="LENGTH",
+ help="Discard trimmed reads that are shorter than LENGTH. Reads that "
+ "are too short even before adapter removal are also discarded. In "
+ "colorspace, an initial primer is not counted. Default: 0")
+ group.add_option("-M", "--maximum-length", type=int, default=sys.maxsize, metavar="LENGTH",
+ help="Discard trimmed reads that are longer than LENGTH. "
+ "Reads that are too long even before adapter removal "
+ "are also discarded. In colorspace, an initial primer "
+ "is not counted. Default: no limit")
+ group.add_option("--max-n", type=float, default=-1.0, metavar="COUNT",
+ help="Discard reads with too many N bases. If COUNT is an integer, it "
+ "is treated as the absolute number of N bases. If it is between 0 "
+ "and 1, it is treated as the proportion of N's allowed in a read.")
+ parser.add_option_group(group)
+
+ group = OptionGroup(parser, "Output")
+ group.add_option("--quiet", default=False, action='store_true',
+ help="Print only error messages.")
+ group.add_option("-o", "--output", metavar="FILE",
+ help="Write trimmed reads to FILE. FASTQ or FASTA format is chosen "
+ "depending on input. The summary report is sent to standard output. "
+ "Use '{name}' in FILE to demultiplex reads into multiple "
+ "files. Default: write to standard output")
+ group.add_option("--info-file", metavar="FILE",
+ help="Write information about each read and its adapter matches into FILE. "
+ "See the documentation for the file format.")
+ group.add_option("-r", "--rest-file", metavar="FILE",
+ help="When the adapter matches in the middle of a read, write the "
+ "rest (after the adapter) into FILE.")
+ group.add_option("--wildcard-file", metavar="FILE",
+ help="When the adapter has N bases (wildcards), write adapter bases "
+ "matching wildcard positions to FILE. When there are indels in the "
+ "alignment, this will often not be accurate.")
+ group.add_option("--too-short-output", metavar="FILE",
+ help="Write reads that are too short (according to length specified by "
+ "-m) to FILE. Default: discard reads")
+ group.add_option("--too-long-output", metavar="FILE",
+ help="Write reads that are too long (according to length specified by "
+ "-M) to FILE. Default: discard reads")
+ group.add_option("--untrimmed-output", default=None, metavar="FILE",
+ help="Write reads that do not contain the adapter to FILE. Default: "
+ "output to same file as trimmed reads")
+ parser.add_option_group(group)
+
+ group = OptionGroup(parser, "Colorspace options")
+ group.add_option("-c", "--colorspace", action='store_true', default=False,
+ help="Enable colorspace mode: Also trim the color that is adjacent to the found adapter.")
+ group.add_option("-d", "--double-encode", action='store_true', default=False,
+ help="Double-encode colors (map 0,1,2,3,4 to A,C,G,T,N).")
+ group.add_option("-t", "--trim-primer", action='store_true', default=False,
+ help="Trim primer base and the first color (which is the transition "
+ "to the first nucleotide)")
+ group.add_option("--strip-f3", action='store_true', default=False,
+ help="Strip the _F3 suffix of read names")
+ group.add_option("--maq", "--bwa", action='store_true', default=False,
+ help="MAQ- and BWA-compatible colorspace output. This enables -c, -d, "
+ "-t, --strip-f3 and -y '/1'.")
+ group.add_option("--no-zero-cap", dest='zero_cap', action='store_false',
+ help="Do not change negative quality values to zero in colorspace "
+ "data. By default, they are since many tools have problems with "
+ "negative qualities.")
+ group.add_option("--zero-cap", "-z", action='store_true',
+ help="Change negative quality values to zero. This is enabled "
+ "by default when -c/--colorspace is also enabled. Use the above option "
+ "to disable it.")
+ parser.set_defaults(zero_cap=None, action='trim')
+ parser.add_option_group(group)
+
+ group = OptionGroup(parser, "Paired-end options", description="The "
+ "-A/-G/-B/-U options work like their -a/-b/-g/-u counterparts, but "
+ "are applied to the second read in each pair.")
+ group.add_option("-A", dest='adapters2', action='append', default=[], metavar='ADAPTER',
+ help="3' adapter to be removed from second read in a pair.")
+ group.add_option("-G", dest='front2', action='append', default=[], metavar='ADAPTER',
+ help="5' adapter to be removed from second read in a pair.")
+ group.add_option("-B", dest='anywhere2', action='append', default=[], metavar='ADAPTER',
+ help="5'/3 adapter to be removed from second read in a pair.")
+ group.add_option("-U", dest='cut2', action='append', default=[], type=int, metavar="LENGTH",
+ help="Remove LENGTH bases from second read in a pair (see --cut).")
+ group.add_option("-p", "--paired-output", metavar="FILE",
+ help="Write second read in a pair to FILE.")
+ # Setting the default for pair_filter to None allows us to find out whether
+ # the option was used at all.
+ group.add_option("--pair-filter", metavar='(any|both)', default=None,
+ choices=("any", "both"),
+ help="Which of the reads in a paired-end read have to match the "
+ "filtering criterion in order for it to be filtered. "
+ "Default: any")
+ group.add_option("--interleaved", action='store_true', default=False,
+ help="Read and write interleaved paired-end reads.")
+ group.add_option("--untrimmed-paired-output", metavar="FILE",
+ help="Write second read in a pair to this FILE when no adapter "
+ "was found in the first read. Use this option together with "
+ "--untrimmed-output when trimming paired-end reads. Default: output "
+ "to same file as trimmed reads")
+ group.add_option("--too-short-paired-output", metavar="FILE", default=None,
+ help="Write second read in a pair to this file if pair is too short. "
+ "Use together with --too-short-output.")
+ group.add_option("--too-long-paired-output", metavar="FILE", default=None,
+ help="Write second read in a pair to this file if pair is too long. "
+ "Use together with --too-long-output.")
+ parser.add_option_group(group)
+
+ return parser
+
+
+def main(cmdlineargs=None, default_outfile=sys.stdout):
+ """
+ Main function that evaluates command-line parameters and iterates
+ over all reads.
+
+ default_outfile is the file to which trimmed reads are sent if the ``-o``
+ parameter is not used.
+ """
+ parser = get_option_parser()
+ if cmdlineargs is None:
+ cmdlineargs = sys.argv[1:]
+ options, args = parser.parse_args(args=cmdlineargs)
+ # Setup logging only if there are not already any handlers (can happen when
+ # this function is being called externally such as from unit tests)
+ if not logging.root.handlers:
+ setup_logging(stdout=bool(options.output), quiet=options.quiet)
+
+ if len(args) == 0:
+ parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
+ elif len(args) > 2:
+ parser.error("Too many parameters.")
+ input_filename = args[0]
+ if input_filename.endswith('.qual'):
+ parser.error("If a .qual file is given, it must be the second argument.")
+
+ # Find out which 'mode' we need to use.
+ # Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
+ paired = False
+ if options.paired_output:
+ # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
+ # This exists for backwards compatibility ('legacy mode').
+ paired = 'first'
+ # Any of these options switch off legacy mode
+ if (options.adapters2 or options.front2 or options.anywhere2 or
+ options.cut2 or options.interleaved or options.pair_filter or
+ options.too_short_paired_output or options.too_long_paired_output):
+ # Full paired-end trimming when both -p and -A/-G/-B/-U given
+ # Read modifications (such as quality trimming) are applied also to second read.
+ paired = 'both'
+
+ if paired and len(args) == 1 and not options.interleaved:
+ parser.error("When paired-end trimming is enabled via -A/-G/-B/-U or -p, "
+ "two input files are required.")
+ if options.interleaved and len(args) != 1:
+ parser.error("When reading interleaved files, only one input file may "
+ "be given.")
+ if not paired:
+ if options.untrimmed_paired_output:
+ parser.error("Option --untrimmed-paired-output can only be used when "
+ "trimming paired-end reads (with option -p).")
+
+ # Assign input_paired_filename and quality_filename
+ input_paired_filename = None
+ quality_filename = None
+ if paired:
+ if not options.interleaved:
+ input_paired_filename = args[1]
+ if not options.paired_output:
+ parser.error("When paired-end trimming is enabled via -A/-G/-B/-U, "
+ "a second output file needs to be specified via -p (--paired-output).")
+ if not options.output:
+ parser.error("When you use -p or --paired-output, you must also "
+ "use the -o option.")
+ if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
+ parser.error("When trimming paired-end reads, you must use either none "
+ "or both of the --untrimmed-output/--untrimmed-paired-output options.")
+ if options.too_short_output and not options.too_short_paired_output:
+ parser.error("When using --too-short-output with paired-end "
+ "reads, you also need to use --too-short-paired-output")
+ if options.too_long_output and not options.too_long_paired_output:
+ parser.error("When using --too-long-output with paired-end "
+ "reads, you also need to use --too-long-paired-output")
+ elif len(args) == 2:
+ quality_filename = args[1]
+ if options.format is not None:
+ parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")
+
+ if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
+ parser.error("The input file format must be either 'fasta', 'fastq' or "
+ "'sra-fastq' (not '{0}').".format(options.format))
+
+ # Open input file(s)
+ try:
+ reader = seqio.open(input_filename, file2=input_paired_filename,
+ qualfile=quality_filename, colorspace=options.colorspace,
+ fileformat=options.format, interleaved=options.interleaved)
+ except (seqio.UnknownFileType, IOError) as e:
+ parser.error(e)
+
+ if options.quality_cutoff is not None:
+ cutoffs = options.quality_cutoff.split(',')
+ if len(cutoffs) == 1:
+ try:
+ cutoffs = [0, int(cutoffs[0])]
+ except ValueError as e:
+ parser.error("Quality cutoff value not recognized: {0}".format(e))
+ elif len(cutoffs) == 2:
+ try:
+ cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
+ except ValueError as e:
+ parser.error("Quality cutoff value not recognized: {0}".format(e))
+ else:
+ parser.error("Expected one value or two values separated by comma for the quality cutoff")
+ else:
+ cutoffs = None
+
+ open_writer = functools.partial(seqio.open, mode='w',
+ qualities=reader.delivers_qualities, colorspace=options.colorspace,
+ interleaved=options.interleaved)
+
+ if options.pair_filter is None:
+ options.pair_filter = 'any'
+ min_affected = 2 if options.pair_filter == 'both' else 1
+ if not paired:
+ filter_wrapper = Redirector
+ elif paired == 'first':
+ filter_wrapper = LegacyPairedRedirector
+ elif paired == 'both':
+ filter_wrapper = functools.partial(PairedRedirector, min_affected=min_affected)
+ filters = []
+ # TODO open_files = []
+ too_short_writer = None # too short reads go here
+ # TODO pass file name to TooShortReadFilter, add a .close() method?
+ if options.minimum_length > 0:
+ if options.too_short_output:
+ too_short_writer = open_writer(options.too_short_output, options.too_short_paired_output)
+ filters.append(filter_wrapper(too_short_writer, TooShortReadFilter(options.minimum_length)))
+ too_long_writer = None # too long reads go here
+ if options.maximum_length < sys.maxsize:
+ if options.too_long_output is not None:
+ too_long_writer = open_writer(options.too_long_output, options.too_long_paired_output)
+ filters.append(filter_wrapper(too_long_writer, TooLongReadFilter(options.maximum_length)))
+
+ if options.max_n != -1:
+ filters.append(filter_wrapper(None, NContentFilter(options.max_n)))
+
+ if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(options.untrimmed_output is not None) > 1:
+ parser.error("Only one of the --discard-trimmed, --discard-untrimmed "
+ "and --untrimmed-output options can be used at the same time.")
+ demultiplexer = None
+ untrimmed_writer = None
+ writer = None
+ if options.output is not None and '{name}' in options.output:
+ if options.discard_trimmed:
+ parser.error("Do not use --discard-trimmed when demultiplexing.")
+ if paired:
+ parser.error("Demultiplexing not supported for paired-end files, yet.")
+ untrimmed = options.output.replace('{name}', 'unknown')
+ if options.untrimmed_output:
+ untrimmed = options.untrimmed_output
+ if options.discard_untrimmed:
+ untrimmed = None
+ demultiplexer = Demultiplexer(options.output, untrimmed,
+ qualities=reader.delivers_qualities, colorspace=options.colorspace)
+ filters.append(demultiplexer)
+ else:
+ # Set up the remaining filters to deal with --discard-trimmed,
+ # --discard-untrimmed and --untrimmed-output. These options
+ # are mutually exclusive in order to avoid brain damage.
+ if options.discard_trimmed:
+ filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
+ elif options.discard_untrimmed:
+ filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
+ elif options.untrimmed_output:
+ untrimmed_writer = open_writer(options.untrimmed_output,
+ options.untrimmed_paired_output)
+ filters.append(filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))
+
+ # Finally, figure out where the reads that passed all the previous
+ # filters should go.
+ if options.output is not None:
+ writer = open_writer(options.output, options.paired_output)
+ else:
+ writer = open_writer(default_outfile)
+ if not paired:
+ filters.append(NoFilter(writer))
+ else:
+ filters.append(PairedNoFilter(writer))
+
+ if options.maq:
+ options.colorspace = True
+ options.double_encode = True
+ options.trim_primer = True
+ options.strip_suffix.append('_F3')
+ options.suffix = "/1"
+ if options.zero_cap is None:
+ options.zero_cap = options.colorspace
+ if options.trim_primer and not options.colorspace:
+ parser.error("Trimming the primer makes only sense in colorspace.")
+ if options.double_encode and not options.colorspace:
+ parser.error("Double-encoding makes only sense in colorspace.")
+ if options.anywhere and options.colorspace:
+ parser.error("Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
+ if not (0 <= options.error_rate <= 1.):
+ parser.error("The maximum error rate must be between 0 and 1.")
+ if options.overlap < 1:
+ parser.error("The overlap must be at least 1.")
+
+ if options.rest_file is not None:
+ options.rest_file = xopen(options.rest_file, 'w')
+ rest_writer = RestFileWriter(options.rest_file)
+ else:
+ rest_writer = None
+ if options.info_file is not None:
+ options.info_file = xopen(options.info_file, 'w')
+ if options.wildcard_file is not None:
+ options.wildcard_file = xopen(options.wildcard_file, 'w')
+
+ if options.colorspace:
+ if options.match_read_wildcards:
+ parser.error('IUPAC wildcards not supported in colorspace')
+ options.match_adapter_wildcards = False
+
+ adapter_parser = AdapterParser(
+ colorspace=options.colorspace,
+ max_error_rate=options.error_rate,
+ min_overlap=options.overlap,
+ read_wildcards=options.match_read_wildcards,
+ adapter_wildcards=options.match_adapter_wildcards,
+ indels=options.indels)
+
+ try:
+ adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front)
+ adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
+ except IOError as e:
+ if e.errno == errno.ENOENT:
+ parser.error(e)
+ raise
+ except ValueError as e:
+ parser.error(e)
+ if options.debug:
+ for adapter in adapters + adapters2:
+ adapter.enable_debug()
+
+ if not adapters and not adapters2 and not cutoffs and \
+ options.nextseq_trim is None and \
+ options.cut == [] and options.cut2 == [] and \
+ options.minimum_length == 0 and \
+ options.maximum_length == sys.maxsize and \
+ quality_filename is None and \
+ options.max_n == -1 and not options.trim_n:
+ parser.error("You need to provide at least one adapter sequence.")
+
+ # Create the single-end processing pipeline (a list of "modifiers")
+ modifiers = []
+ if options.cut:
+ if len(options.cut) > 2:
+ parser.error("You cannot remove bases from more than two ends.")
+ if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
+ parser.error("You cannot remove bases from the same end twice.")
+ for cut in options.cut:
+ if cut != 0:
+ modifiers.append(UnconditionalCutter(cut))
+
+ if options.nextseq_trim is not None:
+ modifiers.append(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))
+
+ if cutoffs:
+ modifiers.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
+ if adapters:
+ adapter_cutter = AdapterCutter(adapters, options.times,
+ options.wildcard_file, options.info_file,
+ rest_writer, options.action)
+ modifiers.append(adapter_cutter)
+
+ # Modifiers that apply to both reads of paired-end reads unless in legacy mode
+ modifiers_both = []
+ if options.trim_n:
+ modifiers_both.append(NEndTrimmer())
+ if options.length_tag:
+ modifiers_both.append(LengthTagModifier(options.length_tag))
+ if options.strip_f3:
+ options.strip_suffix.append('_F3')
+ for suffix in options.strip_suffix:
+ modifiers_both.append(SuffixRemover(suffix))
+ if options.prefix or options.suffix:
+ modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
+ if options.double_encode:
+ modifiers_both.append(DoubleEncoder())
+ if options.zero_cap and reader.delivers_qualities:
+ modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
+ if options.trim_primer:
+ modifiers_both.append(PrimerTrimmer)
+ modifiers.extend(modifiers_both)
+
+ # For paired-end data, create a second processing pipeline.
+ # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
+ # be backwards compatible and *no modifications* are done to the second read.
+ modifiers2 = []
+ if paired == 'both':
+ if options.cut2:
+ if len(options.cut2) > 2:
+ parser.error("You cannot remove bases from more than two ends.")
+ if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
+ parser.error("You cannot remove bases from the same end twice.")
+ for cut in options.cut2:
+ if cut != 0:
+ modifiers2.append(UnconditionalCutter(cut))
+
+ if cutoffs:
+ modifiers2.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
+ if adapters2:
+ adapter_cutter2 = AdapterCutter(adapters2, options.times,
+ None, None, None, options.action)
+ modifiers2.append(adapter_cutter2)
+ else:
+ adapter_cutter2 = None
+ modifiers2.extend(modifiers_both)
+
+ logger.info("This is cutadapt %s with Python %s", __version__, platform.python_version())
+ logger.info("Command line parameters: %s", " ".join(cmdlineargs))
+ logger.info("Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
+ len(adapters) + len(adapters2), 's' if len(adapters) + len(adapters2) != 1 else '',
+ options.error_rate * 100,
+ { False: 'single-end', 'first': 'paired-end legacy', 'both': 'paired-end' }[paired])
+
+ if paired == 'first' and (modifiers_both or cutoffs):
+ logger.warning('\n'.join(textwrap.wrap('WARNING: Requested read '
+ 'modifications are applied only to the first '
+ 'read since backwards compatibility mode is enabled. '
+ 'To modify both reads, also use any of the -A/-B/-G/-U options. '
+ 'Use a dummy adapter sequence when necessary: -A XXX')))
+
+ start_time = time.clock()
+ try:
+ if paired:
+ stats = process_paired_reads(reader, modifiers, modifiers2, filters)
+ else:
+ stats = process_single_reads(reader, modifiers, filters)
+ except KeyboardInterrupt as e:
+ print("Interrupted", file=sys.stderr)
+ sys.exit(130)
+ except IOError as e:
+ if e.errno == errno.EPIPE:
+ sys.exit(1)
+ raise
+ except (seqio.FormatError, EOFError) as e:
+ sys.exit("cutadapt: error: {0}".format(e))
+
+ # close open files
+ for f in [writer, untrimmed_writer,
+ options.rest_file, options.wildcard_file,
+ options.info_file, too_short_writer, too_long_writer,
+ options.info_file, demultiplexer]:
+ if f is not None and f is not sys.stdin and f is not sys.stdout:
+ f.close()
+
+ elapsed_time = time.clock() - start_time
+ if not options.quiet:
+ stats.collect((adapters, adapters2), elapsed_time,
+ modifiers, modifiers2, filters)
+ # send statistics to stderr if result was sent to stdout
+ stat_file = sys.stderr if options.output is None else None
+ with redirect_standard_output(stat_file):
+ print_report(stats, (adapters, adapters2))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/cutadapt/seqio.py b/cutadapt/seqio.py
new file mode 100644
index 0000000..28d6722
--- /dev/null
+++ b/cutadapt/seqio.py
@@ -0,0 +1,756 @@
+# coding: utf-8
+"""
+Sequence I/O classes: Reading and writing of FASTA and FASTQ files.
+
+TODO
+
+- Sequence.name should be Sequence.description or so (reserve .name for the part
+ before the first space)
+"""
+from __future__ import print_function, division, absolute_import
+import sys
+from os.path import splitext
+from .xopen import xopen
+from .compat import zip, basestring
+
+__author__ = "Marcel Martin"
+
+
class FormatError(Exception):
    """
    Signals that a FASTA or FASTQ input file does not have the expected
    format.
    """
+
+
+def _shorten(s, n=100):
+ """Shorten string s to at most n characters, appending "..." if necessary."""
+ if s is None:
+ return None
+ if len(s) > n:
+ s = s[:n-3] + '...'
+ return s
+
+
class Sequence(object):
    """
    A single read, consisting of a name, a sequence and optional qualities.

    qualities is a string and it contains the qualities encoded as
    ascii(qual+33).
    """

    def __init__(self, name, sequence, qualities=None, name2='', match=None, match_info=None):
        """
        Set qualities to None if there are no quality values.

        Raises FormatError if qualities are given, but their length differs
        from the length of the sequence.
        """
        self.name = name
        self.sequence = sequence
        self.qualities = qualities
        self.name2 = name2
        self.match = match
        self.match_info = match_info
        # Remember the length at construction time; slicing creates new
        # objects, each of which records its own length here.
        self.original_length = len(sequence)
        if qualities is not None and len(qualities) != len(sequence):
            rname = _shorten(name)
            raise FormatError("In read named {0!r}: Length of quality sequence ({1}) and "
                "length of read ({2}) do not match".format(rname, len(qualities), len(sequence)))

    def __getitem__(self, key):
        """Slicing: return a new object with sequence and qualities sliced by key."""
        return self.__class__(
            self.name,
            self.sequence[key],
            self.qualities[key] if self.qualities is not None else None,
            self.name2,
            self.match,
            self.match_info)

    def __repr__(self):
        qstr = ''
        if self.qualities is not None:
            qstr = ', qualities={0!r}'.format(_shorten(self.qualities))
        return '<Sequence(name={0!r}, sequence={1!r}{2})>'.format(
            _shorten(self.name), _shorten(self.sequence), qstr)

    def __len__(self):
        return len(self.sequence)

    def __eq__(self, other):
        """
        Two reads are equal if name, sequence and qualities are equal.

        Any object offering these three attributes can be compared. For all
        other objects (such as None), NotImplemented is returned so that the
        interpreter falls back to its default comparison instead of failing
        with an AttributeError.
        """
        try:
            return self.name == other.name and \
                self.sequence == other.sequence and \
                self.qualities == other.qualities
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            # Must not be negated: "not NotImplemented" would be False
            return result
        return not result
+
+
class SequenceReader(object):
    """Base class for readers of (possibly compressed) sequence files."""
    # Becomes True on the instance only when the reader opened the file itself
    _close_on_exit = False

    def __init__(self, file):
        """
        file -- a path or a file-like object. In both cases, the file may
            be compressed (.gz, .bz2, .xz). A path is opened with xopen and
            the resulting file is closed again by close().
        """
        given_as_path = isinstance(file, basestring)
        if given_as_path:
            file = xopen(file)
            self._close_on_exit = True
        self._file = file

    def close(self):
        """Close the underlying file, but only if it was opened by this reader."""
        if not self._close_on_exit or self._file is None:
            return
        self._file.close()
        self._file = None

    def __enter__(self):
        if self._file is not None:
            return self
        raise ValueError("I/O operation on closed SequenceReader")

    def __exit__(self, *args):
        self.close()
+
+
+try:
+ from ._seqio import Sequence
+except ImportError:
+ pass
+
+
class ColorspaceSequence(Sequence):
    """
    A colorspace read. The primer base is kept separately in the primer
    attribute and is not part of the sequence attribute.
    """
    def __init__(self, name, sequence, qualities, primer=None, name2='', match=None, match_info=None):
        """
        If primer is None, the first character of sequence is split off and
        used as the primer.

        Raises FormatError if the length of the qualities does not match the
        length of the sequence (without primer) or if the primer is not one
        of A, C, G, T.
        """
        # In colorspace, the first character is the last nucleotide of the primer base
        # and the second character encodes the transition from the primer base to the
        # first real base of the read.
        if primer is not None:
            self.primer = primer
        else:
            self.primer = sequence[0:1]
            sequence = sequence[1:]
        lengths_differ = qualities is not None and len(sequence) != len(qualities)
        if lengths_differ:
            rname = _shorten(name)
            raise FormatError("In read named {0!r}: length of colorspace quality "
                "sequence ({1}) and length of read ({2}) do not match (primer "
                "is: {3!r})".format(rname, len(qualities), len(sequence), self.primer))
        super(ColorspaceSequence, self).__init__(name, sequence, qualities, name2, match, match_info)
        if self.primer not in ('A', 'C', 'G', 'T'):
            raise FormatError("Primer base is {0!r} in read {1!r}, but it "
                "should be one of A, C, G, T.".format(
                    self.primer, _shorten(name)))

    def __repr__(self):
        if self.qualities is None:
            qstr = ''
        else:
            qstr = ', qualities={0!r}'.format(_shorten(self.qualities))
        return '<ColorspaceSequence(name={0!r}, primer={1!r}, sequence={2!r}{3})>'.format(
            _shorten(self.name), self.primer, _shorten(self.sequence), qstr)

    def __getitem__(self, key):
        """Slicing: the primer is carried over unchanged to the new object."""
        sliced_qualities = None
        if self.qualities is not None:
            sliced_qualities = self.qualities[key]
        return self.__class__(
            self.name,
            self.sequence[key],
            sliced_qualities,
            self.primer,
            self.name2,
            self.match,
            self.match_info)
+
+
def sra_colorspace_sequence(name, sequence, qualities, name2):
    """
    Create a ColorspaceSequence from an SRA colorspace record. Such records
    have one quality value too many, so the first one is dropped.
    """
    qualities_without_first = qualities[1:]
    return ColorspaceSequence(name, sequence, qualities_without_first, name2=name2)
+
+
class FileWithPrependedLine(object):
    """
    Wrap an already opened file such that iterating over it first gives a
    provided line and only then the actual content of the file.

    This is needed when autodetecting the format of input that comes from a
    stream: The file type is known as soon as the first line has been read,
    but then that line has been consumed and would be missing when the
    content is processed.
    """
    def __init__(self, file, line):
        """
        file -- an already opened file-like object
        line -- a single string (a newline is appended if it does not end
            with one)
        """
        self.first_line = line if line.endswith('\n') else line + '\n'
        self._file = file

    def __iter__(self):
        yield self.first_line
        for following_line in self._file:
            yield following_line

    def close(self):
        """Close the wrapped file."""
        self._file.close()
+
+
class FastaReader(SequenceReader):
    """
    Reader for FASTA files.
    """
    def __init__(self, file, keep_linebreaks=False, sequence_class=Sequence):
        """
        file -- a path or a file-like object. In both cases, the file may
            be compressed (.gz, .bz2, .xz).

        keep_linebreaks -- whether to keep newline characters in the sequence

        sequence_class -- called as sequence_class(name, sequence, None) for
            each record
        """
        super(FastaReader, self).__init__(file)
        self.sequence_class = sequence_class
        self.delivers_qualities = False
        if keep_linebreaks:
            self._delimiter = '\n'
        else:
            self._delimiter = ''

    def __iter__(self):
        """
        Yield one sequence_class object per FASTA record.

        Empty lines and lines starting with '#' are skipped. Raises
        FormatError if sequence data is found before the first '>' line.
        """
        name = None
        chunks = []
        for lineno, line in enumerate(self._file):
            # strip() also removes DOS line breaks
            line = line.strip()
            marker = line[:1]
            if marker == '' or marker == '#':
                continue
            if marker == '>':
                # A new record starts: emit the one collected so far
                if name is not None:
                    yield self.sequence_class(name, self._delimiter.join(chunks), None)
                name = line[1:]
                chunks = []
                continue
            if name is None:
                raise FormatError("At line {0}: Expected '>' at beginning of "
                    "FASTA record, but got {1!r}.".format(lineno+1, _shorten(line)))
            chunks.append(line)

        if name is not None:
            yield self.sequence_class(name, self._delimiter.join(chunks), None)
+
+
class ColorspaceFastaReader(FastaReader):
    """FASTA reader that creates ColorspaceSequence objects."""
    def __init__(self, file, keep_linebreaks=False):
        super(ColorspaceFastaReader, self).__init__(
            file, keep_linebreaks=keep_linebreaks, sequence_class=ColorspaceSequence)
+
+
class FastqReader(SequenceReader):
    """
    Reader for FASTQ files. Does not support multi-line FASTQ files.
    """
    def __init__(self, file, sequence_class=Sequence):  # TODO could be a class attribute
        """
        file -- a path or a file-like object. Compressed files are supported.

        sequence_class -- should be a class such as Sequence or
            ColorspaceSequence. It is called as
            sequence_class(name, sequence, qualities, name2=name2).
        """
        super(FastqReader, self).__init__(file)
        self.sequence_class = sequence_class
        self.delivers_qualities = True

    def __iter__(self):
        """
        Yield one sequence_class object per FASTQ record (four lines each).
        The qualities are passed on as the unmodified, encoded string.

        Raises FormatError if a record is malformed or if the file ends in
        the middle of a record.
        """
        # With this start value, an empty file passes the final completeness check
        lineno = 3
        for lineno, line in enumerate(self._file):
            position = lineno % 4
            if position == 0:
                if not line.startswith('@'):
                    raise FormatError("Line {0} in FASTQ file is expected to start with '@', "
                        "but found {1!r}".format(lineno+1, line[:10]))
                name = line.strip()[1:]
            elif position == 1:
                sequence = line.strip()
            elif position == 2:
                line = line.strip()
                if not line.startswith('+'):
                    raise FormatError("Line {0} in FASTQ file is expected to start with '+', "
                        "but found {1!r}".format(lineno+1, line[:10]))
                repeated_name = line[1:]
                if repeated_name:
                    if repeated_name != name:
                        raise FormatError(
                            "At line {0}: Sequence descriptions in the FASTQ file do not match "
                            "({1!r} != {2!r}).\n"
                            "The second sequence description must be either empty "
                            "or equal to the first description.".format(
                                lineno+1, name, repeated_name.rstrip()))
                    name2 = name
                else:
                    name2 = ''
            else:
                qualities = line.rstrip('\n\r')
                yield self.sequence_class(name, sequence, qualities, name2=name2)
        if lineno % 4 != 3:
            raise FormatError("FASTQ file ended prematurely")
+
+
+try:
+ from ._seqio import FastqReader
+except ImportError:
+ pass
+
+
class ColorspaceFastqReader(FastqReader):
    """FASTQ reader that creates ColorspaceSequence objects."""
    def __init__(self, file):
        base = super(ColorspaceFastqReader, self)
        base.__init__(file, sequence_class=ColorspaceSequence)
+
+
class SRAColorspaceFastqReader(FastqReader):
    """FASTQ reader that creates its records with sra_colorspace_sequence."""
    def __init__(self, file):
        base = super(SRAColorspaceFastqReader, self)
        base.__init__(file, sequence_class=sra_colorspace_sequence)
+
+
class FastaQualReader(object):
    """
    Reader for reads that are stored in .(CS)FASTA and .QUAL files.
    """
    delivers_qualities = True

    def __init__(self, fastafile, qualfile, sequence_class=Sequence):
        """
        fastafile and qualfile are filenames or file-like objects.
        If a filename is used, then .gz files are recognized.

        The objects returned when iterating over this file are instances of
        the given sequence_class.
        """
        self.fastareader = FastaReader(fastafile)
        # Line breaks are kept so that quality values at the end of one line
        # and at the start of the next remain separated by whitespace
        self.qualreader = FastaReader(qualfile, keep_linebreaks=True)
        self.sequence_class = sequence_class

    def __iter__(self):
        """
        Yield sequence_class objects, one per pair of FASTA and QUAL records.

        Raises FormatError if the names of two corresponding records differ
        or if a quality value is not recognized.
        """
        # conversion dictionary: maps strings to the appropriate ASCII-encoded character
        conv = dict((str(q), chr(q + 33)) for q in range(-5, 256 - 33))
        for fastaread, qualread in zip(self.fastareader, self.qualreader):
            name = fastaread.name
            if name != qualread.name:
                raise FormatError("The read names in the FASTA and QUAL file "
                    "do not match ({0!r} != {1!r})".format(name, qualread.name))
            try:
                encoded = [conv[value] for value in qualread.sequence.split()]
            except KeyError as e:
                raise FormatError("Within read named {0!r}: Found invalid quality "
                    "value {1}".format(name, e))
            yield self.sequence_class(name, fastaread.sequence, ''.join(encoded))

    def close(self):
        """Close both underlying readers."""
        self.fastareader.close()
        self.qualreader.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
+
+
class ColorspaceFastaQualReader(FastaQualReader):
    """Reader for .csfasta/.qual file pairs that creates ColorspaceSequence objects."""
    def __init__(self, fastafile, qualfile):
        base = super(ColorspaceFastaQualReader, self)
        base.__init__(fastafile, qualfile, sequence_class=ColorspaceSequence)
+
+
def sequence_names_match(r1, r2):
    """
    Check whether the sequences r1 and r2 have identical names, ignoring a
    suffix of '1' or '2'. Some old paired-end reads have names that end in '/1'
    and '/2'. Also, the fastq-dump tool (used for converting SRA files to FASTQ)
    appends a .1 and .2 to paired-end reads if option -I is used.

    Only the part of the name before the first whitespace is compared. A name
    that is empty or consists only of whitespace is treated as the empty
    string.

    r1, r2 -- objects with a .name attribute
    """
    fields1 = r1.name.split(None, 1)
    fields2 = r2.name.split(None, 1)
    # split() returns an empty list for empty or whitespace-only names
    name1 = fields1[0] if fields1 else ''
    name2 = fields2[0] if fields2 else ''
    # Compare against a tuple, not against the string '12': the empty string
    # is contained in every string and would count as a mate suffix.
    if name1[-1:] in ('1', '2') and name2[-1:] in ('1', '2'):
        name1 = name1[:-1]
        name2 = name2[:-1]
    return name1 == name2
+
+
class PairedSequenceReader(object):
    """
    Read paired-end reads from two files.

    Wraps two readers (one per file) and makes sure that reads are properly
    paired.
    """
    def __init__(self, file1, file2, colorspace=False, fileformat=None):
        """
        file1 and file2 are opened with this module's open() function, using
        the given colorspace and fileformat settings for both.
        """
        reader_options = dict(colorspace=colorspace, fileformat=fileformat)
        self.reader1 = open(file1, **reader_options)
        self.reader2 = open(file2, **reader_options)
        self.delivers_qualities = self.reader1.delivers_qualities

    def __iter__(self):
        """
        Iterate over the paired reads. Each item is a pair of Sequence objects.

        Raises FormatError if one file has more reads than the other or if
        the names of two corresponding reads do not match.
        """
        # Avoid usage of zip() below since it will consume one item too many.
        it1, it2 = iter(self.reader1), iter(self.reader2)
        while True:
            try:
                r1 = next(it1)
            except StopIteration:
                # End of file 1. Make sure that file 2 is also at end.
                try:
                    next(it2)
                except StopIteration:
                    break
                else:
                    raise FormatError("Reads are improperly paired. There are more reads in "
                        "file 2 than in file 1.")
            try:
                r2 = next(it2)
            except StopIteration:
                raise FormatError("Reads are improperly paired. There are more reads in "
                    "file 1 than in file 2.")
            if not sequence_names_match(r1, r2):
                raise FormatError("Reads are improperly paired. Read name '{0}' "
                    "in file 1 does not match '{1}' in file 2.".format(r1.name, r2.name))
            yield (r1, r2)

    def close(self):
        """Close both underlying readers."""
        self.reader1.close()
        self.reader2.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
+
+
class InterleavedSequenceReader(object):
    """
    Read paired-end reads from a single file in which the two reads of each
    pair follow each other.
    """
    def __init__(self, file, colorspace=False, fileformat=None):
        """file is opened with this module's open() function."""
        self.reader = open(file, colorspace=colorspace, fileformat=fileformat)
        self.delivers_qualities = self.reader.delivers_qualities

    def __iter__(self):
        """
        Yield (first, second) tuples of consecutive records.

        Raises FormatError if the last record has no partner or if the names
        of the two records of a pair do not match.
        """
        # Pull both records of a pair from one shared iterator
        records = iter(self.reader)
        for first in records:
            try:
                second = next(records)
            except StopIteration:
                raise FormatError("Interleaved input file incomplete: Last record has no partner.")
            if sequence_names_match(first, second):
                yield (first, second)
                continue
            raise FormatError("Reads are improperly paired. Name {0!r} "
                "(first) does not match {1!r} (second).".format(first.name, second.name))

    def close(self):
        """Close the underlying reader."""
        self.reader.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
+
class FileWriter(object):
    """
    Base class for writers that write to a path or to an already opened
    file-like object.
    """
    def __init__(self, file):
        """
        file -- a path or a file-like object. A path is opened for writing
            with xopen and the resulting file is closed by close(). A
            file-like object is used as it is and is not closed by close().
        """
        # basestring (not str) so that unicode paths are also recognized as
        # paths on Python 2, in the same way as SequenceReader does it.
        if isinstance(file, basestring):
            self._file = xopen(file, 'w')
            self._close_on_exit = True
        else:
            self._file = file
            self._close_on_exit = False

    def close(self):
        """Close the underlying file, but only if it was opened by this writer."""
        if self._close_on_exit:
            self._file.close()

    def __enter__(self):
        if self._file.closed:
            raise ValueError("I/O operation on closed file")
        return self

    def __exit__(self, *args):
        self.close()
+
class SingleRecordWriter(object):
    """Public interface to single-record files"""
    def write(self, record):
        """Write a single record. Needs to be implemented by subclasses."""
        raise NotImplementedError
+
class FastaWriter(FileWriter, SingleRecordWriter):
    """
    Write FASTA-formatted sequences to a file.
    """

    def __init__(self, file, line_length=None):
        """
        If line_length is not None, the lines will
        be wrapped after line_length characters.
        A line_length of 0 also disables wrapping.
        """
        FileWriter.__init__(self, file)
        self.line_length = None if line_length == 0 else line_length

    def write(self, name_or_seq, sequence=None):
        """Write an entry to the FASTA file.

        If only one parameter (name_or_seq) is given, it must have
        attributes .name and .sequence, which are then used.
        Otherwise, the first parameter must be the name and the second
        the sequence.

        The effect is that you can write this:
        writer.write("name", "ACCAT")
        or
        writer.write(Sequence("name", "ACCAT"))
        """
        if sequence is not None:
            name = name_or_seq
        else:
            name = name_or_seq.name
            sequence = name_or_seq.sequence

        header = '>{0}'.format(name)
        if self.line_length is None:
            print(header, sequence, file=self._file, sep='\n')
            return

        print(header, file=self._file)
        for start in range(0, len(sequence), self.line_length):
            print(sequence[start:start+self.line_length], file=self._file)
        if len(sequence) == 0:
            # An empty sequence still gets its own (empty) line
            print(file=self._file)
+
class ColorspaceFastaWriter(FastaWriter):
    """FASTA writer for colorspace reads: the primer is written in front of the sequence."""
    def write(self, record):
        """record must have the attributes .name, .primer and .sequence."""
        super(ColorspaceFastaWriter, self).write(
            record.name, record.primer + record.sequence)
+
class FastqWriter(FileWriter, SingleRecordWriter):
    """
    Write sequences with qualities in FASTQ format.

    FASTQ files are formatted like this:
    @read name
    SEQUENCE
    +
    QUALITIES
    """
    def write(self, record):
        """
        Write a Sequence record to the FASTQ file.

        The record must have attributes .name, .sequence, .name2 and
        .qualities. The content of .name2 is written after the '+'.
        """
        parts = ('@', record.name, '\n', record.sequence, '\n+',
            record.name2, '\n', record.qualities, '\n')
        self._file.write(''.join(parts))

    def writeseq(self, name, sequence, qualities):
        """Write a record given as separate strings. The '+' line is left empty."""
        entry = "@{0:s}\n{1:s}\n+\n{2:s}".format(name, sequence, qualities)
        print(entry, file=self._file)
+
class ColorspaceFastqWriter(FastqWriter):
    """FASTQ writer for colorspace reads: the primer is written in front of the sequence."""
    def write(self, record):
        """record must have the attributes .name, .primer, .sequence and .qualities."""
        name = record.name
        full_sequence = record.primer + record.sequence
        super(ColorspaceFastqWriter, self).writeseq(name, full_sequence, record.qualities)
+
class PairRecordWriter(object):
    """Public interface to paired-record files"""
    def write(self, read1, read2):
        """Write a pair of reads. Needs to be implemented by subclasses."""
        raise NotImplementedError

    def close(self):
        """Close the writer. Needs to be implemented by subclasses."""
        raise NotImplementedError

    def __enter__(self):
        # TODO do not allow this twice
        return self

    def __exit__(self, *args):
        self.close()
+
class PairedSequenceWriter(PairRecordWriter):
    """
    Write paired-end reads to two files: the first read of each pair goes to
    file1, the second to file2.
    """
    def __init__(self, file1, file2, colorspace=False, fileformat='fastq', qualities=None):
        """
        Both files are opened for writing with this module's open() function,
        using the same colorspace, fileformat and qualities settings.
        """
        self._writer1 = open(file1, colorspace=colorspace, fileformat=fileformat, mode='w',
            qualities=qualities)
        self._writer2 = open(file2, colorspace=colorspace, fileformat=fileformat, mode='w',
            qualities=qualities)

    def write(self, read1, read2):
        self._writer1.write(read1)
        self._writer2.write(read2)

    def close(self):
        """
        Close both writers. The second writer is closed even if closing the
        first one raises an exception, so that its file is not left open.
        """
        try:
            self._writer1.close()
        finally:
            self._writer2.close()
+
class InterleavedSequenceWriter(PairRecordWriter):
    """
    Write paired-end reads to an interleaved FASTA or FASTQ file
    """
    def __init__(self, file, colorspace=False, fileformat='fastq', qualities=None):
        """file is opened for writing with this module's open() function."""
        self._writer = open(
            file,
            colorspace=colorspace,
            fileformat=fileformat,
            mode='w',
            qualities=qualities)

    def write(self, read1, read2):
        """Write read1, directly followed by read2."""
        for read in (read1, read2):
            self._writer.write(read)

    def close(self):
        self._writer.close()
+
class UnknownFileType(Exception):
    """
    Signals that open was not able to autodetect the type of a file.
    """
+
+
def open(file1, file2=None, qualfile=None, colorspace=False, fileformat=None,
        interleaved=False, mode='r', qualities=None):
    """
    Open sequence files in FASTA or FASTQ format for reading or writing. This is
    a factory that returns an instance of one of the ...Reader or ...Writer
    classes also defined in this module.

    file1, file2, qualfile -- Paths to regular or compressed files or file-like
        objects. Use file1 if data is single-end. If also file2 is provided,
        sequences are paired. If qualfile is given, then file1 must be a FASTA
        file and sequences are single-end. One of file2 and qualfile must always
        be None (no paired-end data is supported when reading qualfiles).

    mode -- Either 'r' for reading or 'w' for writing.

    interleaved -- If True, then file1 contains interleaved paired-end data.
        file2 and qualfile must be None in this case.

    colorspace -- If True, instances of the Colorspace... classes
        are returned.

    fileformat -- If set to None, file format is autodetected from the file name
        extension. Set to 'fasta', 'fastq', or 'sra-fastq' to not auto-detect.
        Colorspace is not auto-detected and must always be requested explicitly.

    qualities -- When mode is 'w' and fileformat is None, this can be set to
        True or False to specify whether the written sequences will have quality
        values. This is used in two ways:
        * If the output format cannot be determined (unrecognized extension
          etc), no exception is raised, but fasta or fastq format is chosen
          appropriately.
        * When False (no qualities available), an exception is raised when the
          auto-detected output format is FASTQ.

    Raises ValueError if mode is invalid or if the file parameters are
    combined in an unsupported way.
    """
    if mode not in ('r', 'w'):
        raise ValueError("Mode must be 'r' or 'w'")
    if interleaved and not (file2 is None and qualfile is None):
        raise ValueError("When interleaved is set, file2 and qualfile must be None")
    if file2 is not None and qualfile is not None:
        raise ValueError("Setting both file2 and qualfile is not supported")

    reading = mode == 'r'

    # Paired-end data in two separate files
    if file2 is not None:
        if reading:
            return PairedSequenceReader(file1, file2, colorspace, fileformat)
        return PairedSequenceWriter(file1, file2, colorspace, fileformat, qualities)

    # Paired-end data in a single interleaved file
    if interleaved:
        if reading:
            return InterleavedSequenceReader(file1, colorspace, fileformat)
        return InterleavedSequenceWriter(file1, colorspace, fileformat, qualities)

    # read from .(CS)FASTA/.QUAL
    if qualfile is not None:
        if not reading:
            raise NotImplementedError('Writing to csfasta/qual not supported')
        reader_class = ColorspaceFastaQualReader if colorspace else FastaQualReader
        return reader_class(file1, qualfile)

    # All the multi-file things have been dealt with, delegate rest to the
    # single-file function.
    return _seqopen1(file1, colorspace=colorspace, fileformat=fileformat,
        mode=mode, qualities=qualities)
+
+
+def _seqopen1(file, colorspace=False, fileformat=None, mode='r', qualities=None):
+ """
+ Open a single sequence file. See description above.
+ """
+ if mode == 'r':
+ fastq_handler = ColorspaceFastqReader if colorspace else FastqReader
+ fasta_handler = ColorspaceFastaReader if colorspace else FastaReader
+ elif mode == 'w':
+ fastq_handler = ColorspaceFastqWriter if colorspace else FastqWriter
+ fasta_handler = ColorspaceFastaWriter if colorspace else FastaWriter
+ else:
+ raise ValueError("Mode must be 'r' or 'w'")
+
+ if fileformat: # Explict file format given
+ fileformat = fileformat.lower()
+ if fileformat == 'fasta':
+ return fasta_handler(file)
+ elif fileformat == 'fastq':
+ return fastq_handler(file)
+ elif fileformat == 'sra-fastq' and colorspace:
+ if mode == 'w':
+ raise NotImplementedError('Writing to sra-fastq not supported')
+ return SRAColorspaceFastqReader(file)
+ else:
+ raise UnknownFileType("File format {0!r} is unknown (expected "
+ "'sra-fastq' (only for colorspace), 'fasta' or 'fastq').".format(fileformat))
+
+ # Detect file format
+ name = None
+ if file == "-":
+ file = sys.stdin if mode == 'r' else sys.stdout
+ elif isinstance(file, basestring):
+ name = file
+ elif hasattr(file, "name"): # seems to be an open file-like object
+ name = file.name
+
+ if name:
+ for ext in ('.gz', '.xz', '.bz2'):
+ if name.endswith(ext):
+ name = name[:-len(ext)]
+ break
+ name, ext = splitext(name)
+ ext = ext.lower()
+ if ext in ['.fasta', '.fa', '.fna', '.csfasta', '.csfa']:
+ format = 'fasta'
+ elif ext in ['.fastq', '.fq'] or (ext == '.txt' and name.endswith('_sequence')):
+ format = 'fastq'
+ elif mode == 'w' and qualities is True:
+ # Format not recognized, but know we want to write reads with qualities
+ format = 'fastq'
+ elif mode == 'w' and qualities is False:
+ # Same, but we know that we want to write reads without qualities
+ format = 'fasta'
+ else:
+ raise UnknownFileType("Could not determine whether file {0!r} is FASTA "
+ "or FASTQ: file name extension {1!r} not recognized".format(file, ext))
+ if format == 'fastq' and qualities is False:
+ raise ValueError("Output format cannot be FASTQ since no quality "
+ "values are available.")
+ if format == 'fastq':
+ return fastq_handler(file)
+ else:
+ return fasta_handler(file)
+
+ if mode == 'w':
+ if qualities is True:
+ return fastq_handler(file)
+ elif qualities is False:
+ return fasta_handler(file)
+ raise UnknownFileType('Cannot determine whether to write in FASTA or '
+ 'FASTQ format')
+ # No name available. Try to autodetect type by reading from the file.
+ for line in file:
+ if line.startswith('#'):
+ # Skip comment lines (needed for csfasta)
+ continue
+ if line.startswith('>'):
+ return fasta_handler(FileWithPrependedLine(file, line))
+ if line.startswith('@'):
+ return fastq_handler(FileWithPrependedLine(file, line))
+ raise UnknownFileType("File is neither FASTQ nor FASTA.")
diff --git a/cutadapt/xopen.py b/cutadapt/xopen.py
new file mode 100644
index 0000000..c1b8c90
--- /dev/null
+++ b/cutadapt/xopen.py
@@ -0,0 +1,182 @@
+"""
+Open compressed files transparently.
+"""
+from __future__ import print_function, division, absolute_import
+__author__ = 'Marcel Martin'
+
+import gzip
+import sys
+import io
+import os
+from subprocess import Popen, PIPE
+from .compat import PY3, basestring
+
+try:
+ import bz2
+except ImportError:
+ bz2 = None
+
+try:
+ import lzma
+except ImportError:
+ lzma = None
+
+ # Pick the wrappers that xopen() puts around gzip.open() file objects.
+ # From Python 2.7 on, the buffered classes of the io module are used; on
+ # older interpreters the wrappers hand the object back unchanged.
+ if sys.version_info >= (2, 7):
+     buffered_reader = io.BufferedReader
+     buffered_writer = io.BufferedWriter
+ else:
+     def buffered_reader(x):
+         return x
+
+     def buffered_writer(x):
+         return x
+
+
+ class GzipWriter:
+     """
+     Write-only file-like object that compresses through an external
+     ``gzip`` process.
+
+     Data passed to write() is sent to the standard input of the gzip
+     process; the standard output of that process is the file opened at
+     the given path. Can be used as a context manager, in which case
+     close() is called on exit.
+     """
+
+     def __init__(self, path, mode='w'):
+         """
+         path -- name of the output file, opened with the built-in open()
+         mode -- mode passed to open() for the output file
+
+         If the gzip process cannot be started, the files opened here are
+         closed again and the original exception is re-raised.
+         """
+         self.outfile = open(path, mode)
+         self.devnull = open(os.devnull, 'w')
+         try:
+             # Setting close_fds to True is necessary due to
+             # http://bugs.python.org/issue12786
+             self.process = Popen(['gzip'], stdin=PIPE, stdout=self.outfile,
+                 stderr=self.devnull, close_fds=True)
+         except (IOError, OSError):
+             # Popen reports a missing executable as OSError, which is not
+             # an IOError on Python 2, so both must be caught to avoid
+             # leaking the two files opened above.
+             self.outfile.close()
+             self.devnull.close()
+             raise
+
+     def write(self, arg):
+         """Send arg to the standard input of the gzip process."""
+         self.process.stdin.write(arg)
+
+     def close(self):
+         """
+         Close the pipe to gzip, wait for the process to finish and close
+         the output file.
+
+         Raises IOError if gzip terminated with a nonzero exit code.
+         """
+         try:
+             self.process.stdin.close()
+             retcode = self.process.wait()
+         finally:
+             # Release the file handles even if closing the pipe failed
+             self.outfile.close()
+             self.devnull.close()
+         if retcode != 0:
+             raise IOError("Output gzip process terminated with exit code {0}".format(retcode))
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, *exc_info):
+         self.close()
+
+
+ class GzipReader:
+     """
+     Read-only file-like object that decompresses through an external
+     ``gzip -cd`` process and hands out the data from the standard output
+     of that process. Can be used as a context manager, in which case
+     close() is called on exit.
+     """
+
+     def __init__(self, path):
+         """Start ``gzip -cd path`` with its standard output connected to a pipe."""
+         self.process = Popen(['gzip', '-cd', path], stdout=PIPE)
+
+     def close(self):
+         """
+         Terminate the gzip process if it is still running.
+
+         Raises EOFError if the process has exited with a nonzero exit code.
+         """
+         retcode = self.process.poll()
+         if retcode is None:
+             # still running
+             self.process.terminate()
+         self._raise_if_error()
+
+     def __iter__(self):
+         """
+         Yield the lines of the decompressed data. When the output is
+         exhausted, wait for gzip to exit and raise EOFError if it failed.
+         """
+         for line in self.process.stdout:
+             yield line
+         self.process.wait()
+         self._raise_if_error()
+
+     def _raise_if_error(self):
+         """
+         Raise EOFError if process is not running anymore and the
+         exit code is nonzero.
+         """
+         retcode = self.process.poll()
+         if retcode is not None and retcode != 0:
+             raise EOFError("gzip process returned non-zero exit code {0}. Is the "
+                 "input file truncated or corrupt?".format(retcode))
+
+     def read(self, *args):
+         """
+         Read from the decompressed data and return what was read. The
+         arguments are passed on to the read() method of the pipe.
+
+         Raises EOFError if the gzip process has exited with a nonzero exit
+         code.
+         """
+         data = self.process.stdout.read(*args)
+         if len(args) == 0 or args[0] <= 0:
+             # wait for process to terminate until we check the exit code
+             self.process.wait()
+         self._raise_if_error()
+         # The data was previously read but never handed back to the caller
+         return data
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, *exc_info):
+         self.close()
+
+
+ def xopen(filename, mode='r'):
+     """
+     Replacement for the "open" function that can also open files that have
+     been compressed with gzip, bzip2 or xz. If the filename is '-', standard
+     output (mode 'w') or input (mode 'r') is returned.
+
+     If the filename ends with .gz, then on Python 2 the file is opened with
+     a pipe to the gzip program. If that does not work, then gzip.open() is
+     used (the gzip module is slower than the pipe to the gzip program). On
+     Python 3, gzip.open() is always used. If the filename ends with .bz2,
+     it's opened as a bz2.BZ2File. If it ends with .xz, lzma.open() is used.
+     Otherwise, the regular open() is used.
+
+     mode can be: 'rt', 'rb', 'a', 'wt', or 'wb'
+     Instead of 'rt' and 'wt', 'r' and 'w' can be used as abbreviations.
+
+     In Python 2, the 't' and 'b' characters are ignored.
+
+     Append mode ('a') is unavailable with BZ2 compression and will raise an error.
+
+     Raises ValueError if the mode is not supported or the filename is not
+     a string, and ImportError if the bz2 or lzma module is needed but not
+     available.
+     """
+     if mode == 'r':
+         mode = 'rt'
+     elif mode == 'w':
+         mode = 'wt'
+     if mode not in ('rt', 'rb', 'wt', 'wb', 'a'):
+         raise ValueError("mode '{0}' not supported".format(mode))
+     if not PY3:
+         mode = mode[0]
+     if not isinstance(filename, basestring):
+         raise ValueError("the filename must be a string")
+
+     # standard input and standard output handling
+     if filename == '-':
+         if not PY3:
+             return sys.stdin if 'r' in mode else sys.stdout
+         return dict(
+             rt=sys.stdin,
+             wt=sys.stdout,
+             rb=sys.stdin.buffer,
+             wb=sys.stdout.buffer)[mode]
+
+     if filename.endswith('.bz2'):
+         if bz2 is None:
+             raise ImportError("Cannot open bz2 files: The bz2 module is not available")
+         if PY3:
+             if 't' in mode:
+                 return io.TextIOWrapper(bz2.BZ2File(filename, mode[0]))
+             else:
+                 return bz2.BZ2File(filename, mode)
+         else:
+             return bz2.BZ2File(filename, mode)
+     elif filename.endswith('.xz'):
+         if lzma is None:
+             raise ImportError("Cannot open xz files: The lzma module is not available "
+                 "(use Python 3.3 or newer)")
+         return lzma.open(filename, mode)
+     elif filename.endswith('.gz'):
+         if PY3:
+             if 't' in mode:
+                 # gzip.open in Python 3.2 does not support modes 'rt' and 'wt'
+                 return io.TextIOWrapper(gzip.open(filename, mode[0]))
+             else:
+                 if 'r' in mode:
+                     return io.BufferedReader(gzip.open(filename, mode))
+                 else:
+                     return io.BufferedWriter(gzip.open(filename, mode))
+         else:
+             # rb/rt are equivalent in Py2
+             # Popen raises OSError, not IOError, when the gzip program
+             # cannot be executed, so both are caught for the fallback.
+             if 'r' in mode:
+                 try:
+                     return GzipReader(filename)
+                 except (IOError, OSError):
+                     # gzip not installed
+                     return buffered_reader(gzip.open(filename, mode))
+             else:
+                 try:
+                     return GzipWriter(filename, mode)
+                 except (IOError, OSError):
+                     return buffered_writer(gzip.open(filename, mode))
+     else:
+         return open(filename, mode)
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..d5b1f21
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,179 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+ # Every target below produces no file of its own name, so all of them are
+ # declared phony (including all, latexpdfja, texinfo, info, xml, pseudoxml).
+ .PHONY: all help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info gettext changes xml pseudoxml linkcheck doctest
+
+all: html
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " xml to make Docutils-native XML files"
+ @echo " pseudoxml to make pseudoxml-XML files for display purposes"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/cutadapt.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/cutadapt.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/cutadapt"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/cutadapt"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through platex and dvipdfmx..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+ # Build Texinfo sources, then run makeinfo in the output directory.
+ # $(MAKE) is used for the recursive call, as in latexpdf and latexpdfja.
+ info:
+ 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ 	@echo "Running Texinfo files through makeinfo..."
+ 	$(MAKE) -C $(BUILDDIR)/texinfo info
+ 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+ $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+ @echo
+ @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+ $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+ @echo
+ @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/doc/_static/adapters.svg b/doc/_static/adapters.svg
new file mode 100644
index 0000000..99cf4bd
--- /dev/null
+++ b/doc/_static/adapters.svg
@@ -0,0 +1,259 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ version="1.0"
+ width="500.50909"
+ height="365.63535"
+ id="svg5571">
+ <defs
+ id="defs5573" />
+ <metadata
+ id="metadata5576">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ transform="translate(-4.4323702,147.9297)"
+ id="layer1">
+ <rect
+ width="35.933102"
+ height="7.0866098"
+ x="111.386"
+ y="-52.720001"
+ id="rect6974"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#b3b3b3;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:3;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman, Bold'" />
+ <rect
+ width="106.299"
+ height="7.0866098"
+ x="5.0866399"
+ y="-52.720001"
+ id="rect3625"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:3;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman, Bold'" />
+ <rect
+ width="141.73199"
+ height="7.0866098"
+ x="5.5865898"
+ y="-52.720001"
+ id="rect5585"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times [...]
+ <rect
+ width="70.866096"
+ height="7.0866199"
+ x="83.385101"
+ y="-123.586"
+ id="rect6102"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#84b818;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman; [...]
+ <rect
+ width="71.020401"
+ height="7.0866299"
+ x="111.732"
+ y="-66.893303"
+ id="rect6104"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#84b818;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman; [...]
+ <rect
+ width="70.866096"
+ height="7.0866098"
+ x="268.57001"
+ y="136.66589"
+ id="rect6130"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Tim [...]
+ <rect
+ width="70.866096"
+ height="7.0866098"
+ x="268.57001"
+ y="172.099"
+ id="rect6972"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#84b818;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman; [...]
+ <rect
+ width="70.866096"
+ height="7.0866199"
+ x="268.57001"
+ y="207.532"
+ id="rect7032"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#b3b3b3;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:3;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman, Bold'" />
+ <text
+ x="353.60956"
+ y="214.61865"
+ id="text6978"
+ xml:space="preserve"
+ style="font-size:18px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Lato;-inkscape-font-specification:Lato"><tspan
+ x="353.60956"
+ y="214.61865"
+ id="tspan6980">Removed sequence</tspan></text>
+ <text
+ x="353.60956"
+ y="179.18559"
+ id="text6982"
+ xml:space="preserve"
+ style="font-size:18px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Lato;-inkscape-font-specification:Lato"><tspan
+ x="353.60956"
+ y="179.18559"
+ id="tspan6984">Adapter</tspan></text>
+ <text
+ x="353.60956"
+ y="143.75253"
+ id="text6986"
+ xml:space="preserve"
+ style="font-size:18px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Lato;-inkscape-font-specification:Lato"><tspan
+ x="353.60956"
+ y="143.75253"
+ id="tspan6988">Read </tspan></text>
+ <rect
+ width="70.866096"
+ height="7.0866098"
+ x="4.9323802"
+ y="193.35901"
+ id="rect5587"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#84b818;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman; [...]
+ <rect
+ width="70.866096"
+ height="7.0866098"
+ x="4.9324002"
+ y="207.532"
+ id="rect7030"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#b3b3b3;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:3;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman, Bold'" />
+ <rect
+ width="141.73199"
+ height="7.0866098"
+ x="4.9324002"
+ y="207.532"
+ id="rect6976"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times [...]
+ <rect
+ width="99.712601"
+ height="7.0866199"
+ x="82.885101"
+ y="-109.413"
+ id="rect7028"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#b3b3b3;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:3;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman, Bold'" />
+ <rect
+ width="77.952797"
+ height="7.0866199"
+ x="4.9324002"
+ y="-109.413"
+ id="rect3627"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:3;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman, Bold'" />
+ <rect
+ width="177.16499"
+ height="7.0866098"
+ x="5.4323401"
+ y="-109.413"
+ id="rect7199"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times [...]
+ <rect
+ width="70.866096"
+ height="7.0865698"
+ x="4.9323702"
+ y="24.8864"
+ id="rect6128"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#84b818;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman; [...]
+ <rect
+ width="70.866096"
+ height="7.0865698"
+ x="4.9323902"
+ y="81.5793"
+ id="rect6114"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#84b818;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman; [...]
+ <rect
+ width="70.866203"
+ height="7.0866299"
+ x="4.9323702"
+ y="39.059551"
+ id="rect7058"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#b3b3b3;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:3;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman, Bold'" />
+ <rect
+ width="70.366096"
+ height="7.0866299"
+ x="-146.16499"
+ y="39.059551"
+ transform="scale(-1,1)"
+ id="rect3629"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:3;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman, Bold'" />
+ <rect
+ width="141.73199"
+ height="7.0866098"
+ x="4.9323702"
+ y="39.059551"
+ id="rect6132"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times [...]
+ <rect
+ width="35.433102"
+ height="7.0866199"
+ x="40.365501"
+ y="95.752502"
+ id="rect7056"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#b3b3b3;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:3;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman, Bold'" />
+ <rect
+ width="70.866203"
+ height="7.0866199"
+ x="75.2985"
+ y="95.752502"
+ id="rect3631"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:3;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman, Bold'" />
+ <rect
+ width="106.299"
+ height="7.0866098"
+ x="40.365501"
+ y="95.752502"
+ id="rect6134"
+ style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker:none;visibility:visible;display:inline;overflow:visible;enable-background:accumulate;font-family:Times New Roman;-inkscape-font-specification:'Times [...]
+ <text
+ x="4.9323802"
+ y="10.713129"
+ id="text3333"
+ xml:space="preserve"
+ style="font-size:18px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Lato;-inkscape-font-specification:Lato"><tspan
+ x="4.9323802"
+ y="10.713129"
+ id="tspan3335">5' Adapter</tspan></text>
+ <text
+ x="4.9323802"
+ y="-130.6727"
+ id="text3337"
+ xml:space="preserve"
+ style="font-size:18px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Lato;-inkscape-font-specification:Lato"><tspan
+ x="4.9323802"
+ y="-130.6727"
+ id="tspan3339">3' Adapter</tspan></text>
+ <text
+ x="4.9323802"
+ y="179.18558"
+ id="text3341"
+ xml:space="preserve"
+ style="font-size:18px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Lato;-inkscape-font-specification:Lato"><tspan
+ x="4.9323802"
+ y="179.18558"
+ id="tspan3343">Anchored 5' adapter</tspan></text>
+ <text
+ x="40.865387"
+ y="-81.066414"
+ id="text3349"
+ xml:space="preserve"
+ style="font-size:13.63599968px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Lato;-inkscape-font-specification:Lato"><tspan
+ x="40.865387"
+ y="-81.066414"
+ id="tspan3351">or</tspan></text>
+ <text
+ x="40.365467"
+ y="67.405998"
+ id="text3353"
+ xml:space="preserve"
+ style="font-size:13.63599968px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Lato;-inkscape-font-specification:Lato"><tspan
+ x="40.365467"
+ y="67.405998"
+ id="tspan3355">or</tspan></text>
+ </g>
+</svg>
diff --git a/doc/_static/logo.svg b/doc/_static/logo.svg
new file mode 100644
index 0000000..24a06b4
--- /dev/null
+++ b/doc/_static/logo.svg
@@ -0,0 +1,94 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="56.122009"
+ height="51.8545"
+ id="svg3076"
+ version="1.1"
+ inkscape:version="0.48.5 r10040"
+ sodipodi:docname="New document 2">
+ <defs
+ id="defs3078" />
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="2.6502665"
+ inkscape:cx="41.639266"
+ inkscape:cy="34.486602"
+ inkscape:document-units="px"
+ inkscape:current-layer="layer1"
+ showgrid="false"
+ fit-margin-top="2"
+ fit-margin-left="2"
+ fit-margin-right="2"
+ fit-margin-bottom="2"
+ inkscape:window-width="1305"
+ inkscape:window-height="763"
+ inkscape:window-x="-4"
+ inkscape:window-y="56"
+ inkscape:window-maximized="0" />
+ <metadata
+ id="metadata3081">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer"
+ id="layer1"
+ transform="translate(-346.939,-506.43493)">
+ <g
+ transform="translate(44.935994,179.79303)"
+ style="display:inline"
+ id="g4093"
+ inkscape:export-filename="cutadapt.png"
+ inkscape:export-xdpi="276"
+ inkscape:export-ydpi="276">
+ <path
+ inkscape:connector-curvature="0"
+ id="path4068"
+ transform="translate(0,308.2677)"
+ d="m 349.625,34.1875 -7.78125,3.625 c 0.91994,1.970873 1.4375,4.181472 1.4375,6.5 0,2.318528 -0.51756,4.497877 -1.4375,6.46875 l 7.78125,3.625 c 2.98923,-6.41042 2.98923,-13.808331 0,-20.21875 z"
+ style="fill:#aad400;fill-opacity:1;display:inline" />
+ <path
+ inkscape:connector-curvature="0"
+ id="path4066"
+ transform="translate(0,308.2677)"
+ d="m 328.15625,20.375 c -6.89497,-0.05633 -13.78424,2.867991 -18.5625,8.5625 -8.49469,10.123572 -7.15482,25.192814 2.96875,33.6875 10.12357,8.494686 25.19281,7.186072 33.6875,-2.9375 l -6.40625,-5.375 c -2.85492,3.3998 -7.11936,5.5625 -11.90625,5.5625 -8.59538,0 -15.5625,-6.96712 -15.5625,-15.5625 0,-8.59538 6.96712,-15.5625 15.5625,-15.5625 4.78689,0 9.05133,2.1627 11.90625,5.5625 l 6.40625,-5.375 c -0.8954,-1.067094 -1.87041,-2.073352 -2.9375,-2.96875 -4.42906,-3.716425 -9.793 [...]
+ style="fill:#217821;fill-opacity:1;display:inline" />
+ <path
+ sodipodi:nodetypes="cccccc"
+ inkscape:connector-curvature="0"
+ id="path4072"
+ transform="translate(0,308.2677)"
+ d="m 353.4375,25.09375 -15.28125,11.5 0.0497,1.108915 1.04406,0.578585 16.875,-8.96875 c -0.78525,-1.47685 -1.68088,-2.882924 -2.6875,-4.21875 z"
+ style="fill:#217821;fill-opacity:1;display:inline" />
+ <path
+ sodipodi:nodetypes="cccccc"
+ inkscape:connector-curvature="0"
+ id="path4074"
+ transform="translate(0,308.2677)"
+ d="m 339.25,50.3125 -1.04688,0.4375 -0.0469,1.25 15.28125,11.53125 c 1.00662,-1.335826 1.90224,-2.77315 2.6875,-4.25 z"
+ style="fill:#217821;fill-opacity:1;display:inline" />
+ </g>
+ </g>
+</svg>
diff --git a/doc/changes.rst b/doc/changes.rst
new file mode 100644
index 0000000..d9e113e
--- /dev/null
+++ b/doc/changes.rst
@@ -0,0 +1 @@
+.. include:: ../CHANGES.rst
diff --git a/doc/colorspace.rst b/doc/colorspace.rst
new file mode 100644
index 0000000..fc9c599
--- /dev/null
+++ b/doc/colorspace.rst
@@ -0,0 +1,128 @@
+Colorspace reads
+================
+
+Cutadapt was designed to work with colorspace reads from the ABi SOLiD
+sequencer. Colorspace trimming is activated by the ``--colorspace``
+option (or use ``-c`` for short). The input reads can be given either:
+
+- in a FASTA file
+- in a FASTQ file
+- in a ``.csfasta`` and a ``.qual`` file (this is the native SOLiD
+ format).
+
+In all cases, the colors must be represented by the characters 0, 1, 2,
+3. Example input files are in the cutadapt distribution at
+``tests/data/solid.*``. The ``.csfasta``/``.qual`` file format is
+automatically assumed if two input files are given to cutadapt.
+
+In colorspace mode, the adapter sequences given to the ``-a``, ``-b``
+and ``-g`` options can be given both as colors or as nucleotides. If
+given as nucleotides, they will automatically be converted to
+colorspace. For example, to trim an adapter from ``solid.csfasta`` and
+``solid.qual``, use this command-line::
+
+ cutadapt -c -a CGCCTTGGCCGTACAGCAG solid.csfasta solid.qual > output.fastq
+
+In case you know the colorspace adapter sequence, you can also write
+``330201030313112312`` instead of ``CGCCTTGGCCGTACAGCAG`` and the result
+is the same.
+
+Ambiguity in colorspace
+-----------------------
+
+The ambiguity of colorspace encoding leads to some effects to be aware
+of when trimming 3' adapters from colorspace reads. For example, when
+trimming the adapter ``AACTC``, cutadapt searches for its
+colorspace-encoded version ``0122``. But also ``TTGAG``, ``CCAGA`` and
+``GGTCT`` have an encoding of ``0122``. This means that effectively four
+different adapter sequences are searched and trimmed at the same time.
+There is no way around this, unless the decoded sequence were available,
+but that is usually only the case after read mapping.
+
+The effect should usually be quite small. The number of false positives
+is multiplied by four, but with a sufficiently large overlap (3 or 4 is
+already enough), this is still only around 0.2 bases lost per read on
+average. If inspecting k-mer frequencies or using small overlaps, you
+need to be aware of the effect, however.
+
+
+Double-encoding, BWA and MAQ
+----------------------------
+
+The read mappers MAQ and BWA (and possibly others) need their colorspace
+input reads to be in a so-called "double encoding". This simply means
+that they cannot deal with the characters 0, 1, 2, 3 in the reads, but
+require that the letters A, C, G, T be used for colors. For example, the
+colorspace sequence ``0011321`` would be ``AACCTGC`` in double-encoded
+form. This is not the same as conversion to basespace! The read is still
+in colorspace, only letters are used instead of digits. If that sounds
+confusing, that is because it is.
+
+Note that MAQ is unmaintained and should not be used in new projects.
+
+BWA’s colorspace support was dropped in versions more recent than 0.5.9,
+but that version works well.
+
+When you want to trim reads that will be mapped with BWA or MAQ, you can
+use the ``--bwa`` option, which enables colorspace mode (``-c``),
+double-encoding (``-d``), primer trimming (``-t``), all of which are
+required for BWA, in addition to some other useful options.
+
+The ``--maq`` option is an alias for ``--bwa``.
+
+
+Colorspace examples
+-------------------
+
+To cut an adapter from SOLiD data given in ``solid.csfasta`` and
+``solid.qual``, to produce MAQ- and BWA-compatible output, allow the
+default of 10% errors and write the resulting FASTQ file to
+output.fastq::
+
+ cutadapt --bwa -a CGCCTTGGCCGTACAGCAG solid.csfasta solid.qual > output.fastq
+
+Instead of redirecting standard output with ``>``, the ``-o`` option can
+be used. This also shows that you can give the adapter in colorspace and
+how to use a different error rate::
+
+ cutadapt --bwa -e 0.15 -a 330201030313112312 -o output.fastq solid.csfasta solid.qual
+
+This does the same as above, but produces BFAST-compatible output,
+strips the \_F3 suffix from read names and adds the prefix "abc:" to
+them::
+
+ cutadapt -c -e 0.15 -a 330201030313112312 -x abc: --strip-f3 solid.csfasta solid.qual > output.fastq
+
+
+Bowtie
+------
+
+Quality values of colorspace reads are sometimes negative. Bowtie gets
+confused and prints this message::
+
+ Encountered a space parsing the quality string for read xyz
+
+BWA also has a problem with such data. Cutadapt therefore converts
+negative quality values to zero in colorspace data. Use the option
+``--no-zero-cap`` to turn this off.
+
+.. _sra-fastq:
+
+Sequence Read Archive
+---------------------
+
+The Sequence Read Archive provides files in a special "SRA" file format. When
+the ``fastq-dump`` program from the sra-toolkit package is used to convert
+these ``.sra`` files to FASTQ format, colorspace reads will get an extra
+quality value in the beginning of each read. You may get an error like this::
+
+ cutadapt: error: In read named 'xyz': length of colorspace quality
+ sequence (36) and length of read (35) do not match (primer is: 'T')
+
+To make cutadapt ignore the extra quality base, add ``--format=sra-fastq`` to
+your command-line, as in this example::
+
+ cutadapt -c --format=sra-fastq -a CGCCTTGGCCG sra.fastq > trimmed.fastq
+
+When you use ``--format=sra-fastq``, the spurious quality value will be removed
+from all reads in the file.
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..bca116e
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,270 @@
+# -*- coding: utf-8 -*-
+#
+# cutadapt documentation build configuration file, created by
+# sphinx-quickstart on Fri Sep 12 09:11:16 2014.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0, os.path.abspath(os.pardir))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'sphinx.ext.autodoc',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'cutadapt'
+copyright = u'2010-2016, Marcel Martin'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+
+from cutadapt import __version__
+
+#
+# The short X.Y version.
+version = __version__
+# The full version, including alpha/beta/rc tags.
+release = __version__
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+try:
+ from better import better_theme_path
+ html_theme_path = [better_theme_path]
+ html_theme = 'better'
+except ImportError:
+ pass
+
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = 'logo.png'
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'cutadaptdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+'papersize': 'a4paper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ ('index', 'cutadapt.tex', u'cutadapt Documentation',
+ u'Marcel Martin', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'cutadapt', u'cutadapt Documentation',
+ [u'Marcel Martin'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ ('index', 'cutadapt', u'cutadapt Documentation',
+ u'Marcel Martin', 'cutadapt', 'One line description of project.',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/doc/guide.rst b/doc/guide.rst
new file mode 100644
index 0000000..d5d7fe6
--- /dev/null
+++ b/doc/guide.rst
@@ -0,0 +1,1373 @@
+==========
+User guide
+==========
+
+Basic usage
+===========
+
+If you just want to trim a 3' adapter, the basic command-line for cutadapt is::
+
+ cutadapt -a AACCGGTT -o output.fastq input.fastq
+
+The sequence of the adapter is given with the ``-a`` option. Of course, you
+need to replace ``AACCGGTT`` with your actual adapter sequence. Reads are read
+from the input file ``input.fastq`` and written to the output file
+``output.fastq``.
+
+Cutadapt searches for the adapter in all reads and removes it when it finds it.
+All reads that were present in the input file will also be present in the output
+file, some of them trimmed, some of them not. Even reads that were trimmed
+entirely (because the adapter was found in the very beginning) are output. All
+of this can be changed with command-line options, explained further down.
+
+A report is printed after cutadapt has finished processing the reads.
+
+
+Input and output file formats
+-----------------------------
+
+Input files for cutadapt need to be in one of these formats:
+
+* FASTA (file name extensions: ``.fasta``, ``.fa``, ``.fna``, ``.csfasta``, ``.csfa``)
+* FASTQ (extensions: ``.fastq``, ``.fq``)
+* A pair of a FASTA file and a ``.(cs)qual`` file
+
+The latter format is (or was) used for colorspace data from the SOLiD
+instruments.
+
+The input file format is recognized from the file name extension (given in
+parentheses in the list above). You can also explicitly specify which format
+the input has by using the ``--format`` option.
+
+The output format is the same as the input format, except for the FASTA/QUAL
+pairs -- those will always be converted to FASTQ. Also, cutadapt does not check
+the output file name: If you input FASTQ data, but use ``-o output.fasta``, then
+the output file will actually be in FASTQ format.
+
+
+Compressed files
+----------------
+
+Cutadapt supports compressed input and output files. Whether an input file
+needs to be decompressed or an output file needs to be compressed is detected
+automatically by inspecting the file name: If it ends in ``.gz``, then gzip
+compression is assumed. You can therefore run cutadapt like this and it works
+as expected::
+
+ cutadapt -a AACCGGTT -o output.fastq.gz input.fastq.gz
+
+All of cutadapt's options that expect a file name support this.
+
+Files compressed with bzip2 (``.bz2``) or xz (``.xz``) are also supported, but
+only if the Python installation includes the proper modules. xz files require
+Python 3.3 or later.
+
+
+Standard input and output
+-------------------------
+
+If no output file is specified via the ``-o`` option, then the output is sent to
+the standard output stream. Instead of the example command line from above, you
+can therefore also write::
+
+ cutadapt -a AACCGGTT input.fastq > output.fastq
+
+There is one difference in behavior if you use cutadapt without ``-o``: The
+report is sent to the standard error stream instead of standard output. You
+can redirect it to a file like this::
+
+ cutadapt -a AACCGGTT input.fastq > output.fastq 2> report.txt
+
+Wherever cutadapt expects a file name, you can also write a dash (``-``) in
+order to specify that standard input or output should be used. For example::
+
+ tail -n 4 input.fastq | cutadapt -a AACCGGTT - > output.fastq
+
+The ``tail -n 4`` prints out only the last four lines of ``input.fastq``, which
+are then piped into cutadapt. Thus, cutadapt will work only on the last read in
+the input file.
+
+In most cases, you should probably use ``-`` at most once for an input file and
+at most once for an output file, in order not to get mixed output.
+
+You cannot combine ``-`` and gzip compression since cutadapt needs to know the
+file name of the output or input file. If you want to have a gzip-compressed
+output file, use ``-o`` with an explicit name.
+
+One last "trick" is to use ``/dev/null`` as an output file name. This special
+file discards everything you send into it. If you only want to see the
+statistics output, for example, and do not care about the trimmed reads at all,
+you could use something like this::
+
+ cutadapt -a AACCGGTT -o /dev/null input.fastq
+
+
+Read processing
+===============
+
+Cutadapt can do a lot more in addition to removing adapters. There are various
+command-line options that make it possible to modify and filter reads and to
+redirect them to various output files. Each read is processed in the following
+way:
+
+1. :ref:`Read modification options <modifying-reads>` are applied. This includes
+ :ref:`adapter removal <removing-adapters>`,
+ :ref:`quality trimming <quality-trimming>`, read name modifications etc.
+2. :ref:`Filtering options <filtering>` are applied, such as removal of too
+ short or untrimmed reads. Some of the filters also allow to redirect a read
+ to a separate output file.
+3. If the read has passed all the filters, it is written to the output file.
+
+
+.. _removing-adapters:
+
+Removing adapters
+=================
+
+Cutadapt supports trimming of multiple types of adapters:
+
+=================================================== ===========================
+Adapter type Command-line option
+=================================================== ===========================
+:ref:`3' adapter <three-prime-adapters>` ``-a ADAPTER``
+:ref:`5' adapter <five-prime-adapters>` ``-g ADAPTER``
+:ref:`Anchored 3' adapter <anchored-3adapters>` ``-a ADAPTER$``
+:ref:`Anchored 5' adapter <anchored-5adapters>` ``-g ^ADAPTER``
+:ref:`5' or 3' (both possible) <anywhere-adapters>` ``-b ADAPTER``
+:ref:`Linked adapter <linked-adapters>` ``-a ADAPTER1...ADAPTER2``
+=================================================== ===========================
+
+Here is an illustration of the allowed adapter locations relative to the read
+and depending on the adapter type:
+
+|
+
+.. image:: _static/adapters.svg
+
+|
+
+By default, all adapters :ref:`are searched error-tolerantly <error-tolerance>`.
+Adapter sequences :ref:`may also contain the "N" wildcard
+character <wildcards>`.
+
+In addition, it is possible to :ref:`remove a fixed number of
+bases <cut-bases>` from the beginning or end of each read, and to :ref:`remove
+low-quality bases (quality trimming) <quality-trimming>` from the 3' and 5' ends.
+
+
+.. _three-prime-adapters:
+
+3' adapters
+-----------
+
+A 3' adapter is a piece of DNA ligated to the 3' end of the DNA fragment you
+are interested in. The sequencer starts the sequencing process at the 5' end of
+the fragment and sequences into the adapter if the read is long enough.
+The read that it outputs will then have a part of the adapter at the
+end. Or, if the adapter was short and the read length quite long, then the
+adapter will be somewhere within the read (followed by other bases).
+
+For example, assume your fragment of interest is *MYSEQUENCE* and the adapter is
+*ADAPTER*. Depending on the read length, you will get reads that look like this::
+
+ MYSEQUEN
+ MYSEQUENCEADAP
+ MYSEQUENCEADAPTER
+ MYSEQUENCEADAPTERSOMETHINGELSE
+
+Use cutadapt's ``-a ADAPTER`` option to remove this type of adapter. This will
+be the result::
+
+ MYSEQUEN
+ MYSEQUENCE
+ MYSEQUENCE
+ MYSEQUENCE
+
+As can be seen, cutadapt correctly deals with partial adapter matches, and also
+with any trailing sequences after the adapter. Cutadapt deals with 3' adapters
+by removing the adapter itself and any sequence that may follow. If the sequence
+starts with an adapter, like this::
+
+ ADAPTERSOMETHING
+
+Then the sequence will be empty after trimming. By default, empty reads are kept
+and will appear in the output.
+
+
+.. _five-prime-adapters:
+
+5' adapters
+-----------
+
+.. note::
+ Unless your adapter may also occur in a degraded form, you probably
+ want to use an anchored 5' adapter, described in the next section.
+
+A 5' adapter is a piece of DNA ligated to the 5' end of the DNA fragment of
+interest. The adapter sequence is expected to appear at the start of the read,
+but may be partially degraded. The sequence may also appear somewhere within
+the read. In all cases, the adapter itself and the sequence preceding it are
+removed.
+
+Again, assume your fragment of interest is *MYSEQUENCE* and the adapter is
+*ADAPTER*. The reads may look like this::
+
+ ADAPTERMYSEQUENCE
+ DAPTERMYSEQUENCE
+ TERMYSEQUENCE
+ SOMETHINGADAPTERMYSEQUENCE
+
+All the above sequences are trimmed to ``MYSEQUENCE`` when you use ``-g ADAPTER``.
+As with 3' adapters, the resulting read may have a length of zero when the
+sequence ends with the adapter. For example, the read ::
+
+ SOMETHINGADAPTER
+
+will be empty after trimming.
+
+
+.. _anchored-5adapters:
+
+Anchored 5' adapters
+--------------------
+
+In many cases, the above behavior is not really what you want for trimming 5'
+adapters. You may know, for example, that degradation does not occur and that
+the adapter is also not expected to be within the read. Thus, you always expect
+the read to look like the first example from above::
+
+ ADAPTERSOMETHING
+
+If you want to trim only this type of adapter, use ``-g ^ADAPTER``. The ``^`` is
+supposed to indicate that the adapter is "anchored" at the beginning of the read.
+In other words: The adapter is expected to be a prefix of the read. Note that
+cases like these are also recognized::
+
+ ADAPTER
+ ADAPT
+ ADA
+
+The read will simply be empty after trimming.
+
+Be aware that cutadapt still searches for adapters error-tolerantly and, in
+particular, allows insertions. So if your maximum error rate is sufficiently
+high, even this read will be trimmed::
+
+ BADAPTERSOMETHING
+
+The ``B`` in the beginning is seen as an insertion. If you also want to prevent
+this from happening, use the option ``--no-indels`` to disallow insertions and
+deletions entirely.
+
+
+.. _anchored-3adapters:
+
+Anchored 3' adapters
+--------------------
+
+It is also possible to anchor 3' adapters to the end of the read. This is
+rarely necessary, but if you have merged, for example, overlapping paired-end
+reads, then it is useful. Add the ``$`` character to the end of an
+adapter sequence specified via ``-a`` in order to anchor the adapter to the
+end of the read, such as ``-a ADAPTER$``. The adapter will only be found if it
+is a *suffix* of the read, but errors are still allowed as for 5' adapters.
+You can disable insertions and deletions with ``--no-indels``.
+
+Anchored 3' adapters work as if you had reversed the sequence and used an
+appropriate anchored 5' adapter.
+
+As an example, assume you have these reads::
+
+ MYSEQUENCEADAP
+ MYSEQUENCEADAPTER
+ MYSEQUENCEADAPTERSOMETHINGELSE
+
+Using ``-a ADAPTER$`` will result in::
+
+ MYSEQUENCEADAP
+ MYSEQUENCE
+ MYSEQUENCEADAPTERSOMETHINGELSE
+
+Only the middle read is trimmed at all.
+
+
+.. _linked-adapters:
+
+Linked adapters
+---------------
+
+This is a combination of a 5' and a 3' adapter. Use ``-a ADAPTER1...ADAPTER2``
+to search for a linked adapter. ADAPTER1 is interpreted as an anchored 5'
+adapter, which is searched for first. ADAPTER2, a regular 3' adapter, is
+searched for only if ADAPTER1 was found.
+
+This feature is experimental and will probably break when used in combination
+with some other options, such as ``--info-file``, ``--mask-adapter``.
+
+
+.. _anywhere-adapters:
+
+5' or 3' adapters
+-----------------
+
+The last type of adapter is a combination of the 5' and 3' adapter. You can use
+it when your adapter is ligated to the 5' end for some reads and to the 3' end
+in other reads. This probably does not happen very often, and this adapter type
+was in fact originally implemented because the library preparation in an
+experiment did not work as it was supposed to.
+
+For this type of adapter, the sequence is specified with ``-b ADAPTER`` (or use
+the longer spelling ``--anywhere ADAPTER``). The adapter may appear in the
+beginning (even degraded), within the read, or at the end of the read (even
+partially). The decision which part of the read to remove is made as follows: If
+there is at least one base before the found adapter, then the adapter is
+considered to be a 3' adapter and the adapter itself and everything
+following it is removed. Otherwise, the adapter is considered to be a 5'
+adapter and it is removed from the read, but the sequence after it remains.
+
+Here are some examples.
+
+============================== =================== =====================
+Read before trimming Read after trimming Detected adapter type
+============================== =================== =====================
+``MYSEQUENCEADAPTERSOMETHING`` ``MYSEQUENCE`` 3' adapter
+``MYSEQUENCEADAPTER`` ``MYSEQUENCE`` 3' adapter
+``MYSEQUENCEADAP`` ``MYSEQUENCE`` 3' adapter
+``MADAPTER`` ``M`` 3' adapter
+``ADAPTERMYSEQUENCE`` ``MYSEQUENCE`` 5' adapter
+``PTERMYSEQUENCE`` ``MYSEQUENCE`` 5' adapter
+``TERMYSEQUENCE`` ``MYSEQUENCE`` 5' adapter
+============================== =================== =====================
+
+The ``-b`` option cannot be used with colorspace data.
+
+
+.. _error-tolerance:
+
+Error tolerance
+---------------
+
+All searches for adapter sequences are error tolerant. Allowed errors are
+mismatches, insertions and deletions. For example, if you search for the
+adapter sequence ``ADAPTER`` and the error tolerance is set appropriately
+(as explained below), then also ``ADABTER`` will be found (with 1 mismatch),
+as well as ``ADAPTR`` (with 1 deletion), and also ``ADAPPTER`` (with 1
+insertion).
+
+The level of error tolerance is adjusted by specifying a *maximum error rate*,
+which is 0.1 (=10%) by default. Use the ``-e`` option to set a different value.
+To determine the number of allowed errors, the maximum error rate is multiplied
+by the length of the match (and then rounded down).
+
+What does that mean?
+Assume you have a long adapter ``LONGADAPTER`` and it appears in full somewhere
+within the read. The length of the match is 11 characters since the full adapter
+has a length of 11, therefore 11·0.1=1.1 errors are allowed with the default
+maximum error rate of 0.1. This is rounded down to 1 allowed error. So the
+adapter will be found within this read::
+
+ SEQUENCELONGADUPTERSOMETHING
+
+If the match is a bit shorter, however, the result is different::
+
+ SEQUENCELONGADUPT
+
+Only 9 characters of the adapter match: ``LONGADAPT`` matches ``LONGADUPT``
+with one substitution. Therefore, only 9·0.1=0.9 errors are allowed. Since this
+is rounded down to zero allowed errors, the adapter will not be found.
+
+The number of errors allowed for a given adapter match length is also shown in
+the report that cutadapt prints::
+
+ Sequence: 'LONGADAPTER'; Length: 11; Trimmed: 2 times.
+
+ No. of allowed errors:
+ 0-9 bp: 0; 10-11 bp: 1
+
+This tells us what we now already know: For match lengths of 0-9 bases, zero
+errors are allowed and for matches of length 10-11 bases, one error is allowed.
+
+The reason for this behavior is to ensure that short matches are not favored
+unfairly. For example, assume the adapter has 40 bases and the maximum error
+rate is 0.1, which means that four errors are allowed for full-length matches.
+If four errors were allowed even for a short match such as one with 10 bases, this would
+mean that the error rate for such a case is 40%, which is clearly not what was
+desired.
+
+Insertions and deletions can be disallowed by using the option
+``--no-indels``.
+
+See also the :ref:`section on details of the alignment algorithm <algorithm>`.
+
+
+Multiple adapter occurrences within a single read
+-------------------------------------------------
+
+If a single read contains multiple copies of the same adapter, the basic rule is
+that the leftmost match is used for both 5' and 3' adapters. For example, when
+searching for a 3' adapter in ::
+
+ cccccADAPTERgggggADAPTERttttt
+
+the read will be trimmed to ::
+
+ ccccc
+
+When the adapter is a 5' adapter instead, the read will be trimmed to ::
+
+ gggggADAPTERttttt
+
+The above applies when both occurrences of the adapter are *exact* matches, and
+it also applies when both occurrences of the adapter are *inexact* matches (that
+is, each has at least one indel or mismatch). However, if one match is exact, but
+the other is inexact, then the exact match wins, even if it is not the leftmost
+one! The reason for this behavior is that cutadapt searches for exact matches
+first and, to improve performance, skips the error-tolerant matching step if an
+exact match was found.
+
+
+Reducing random matches
+-----------------------
+
+Since cutadapt allows partial matches between the read and the adapter sequence,
+short matches can occur by chance, leading to erroneously trimmed bases. For
+example, roughly 25% of all reads end with a base that is identical to the
+first base of the adapter. To reduce the number of falsely trimmed bases,
+the alignment algorithm requires that at least *three bases* match between
+adapter and read. The minimum overlap length can be changed with the parameter
+``--overlap`` (or its short version ``-O``). Shorter matches are simply
+ignored, and the bases are not trimmed.
+
+Requiring at least three bases to match is quite conservative. Even if no
+minimum overlap was required, we can compute that we lose only about 0.44 bases
+per read on average, see `Section 2.3.3 in my
+thesis <http://hdl.handle.net/2003/31824>`_. With the default minimum
+overlap length of 3, only about 0.07 bases are lost per read.
+
+When choosing an appropriate minimum overlap length, take into account that
+true adapter matches are also lost when the overlap length is higher than
+zero, reducing cutadapt's sensitivity.
+
+
+.. _wildcards:
+
+Wildcards
+---------
+
+All `IUPAC nucleotide codes <http://www.bioinformatics.org/sms/iupac.html>`_
+(wildcard characters) are supported. For example, use an ``N`` in the adapter
+sequence to match any nucleotide in the read, or use ``-a YACGT`` for an adapter
+that matches both ``CACGT`` and ``TACGT``. The wildcard character ``N`` is
+useful for trimming adapters with an embedded variable barcode::
+
+ cutadapt -a ACGTAANNNNTTAGC -o output.fastq input.fastq
+
+Wildcard characters in the adapter are enabled by default. Use the option ``-N``
+to disable this.
+
+Matching of wildcards in the reads is also possible, but disabled by default
+in order to avoid matches in reads that consist of many (often low-quality)
+``N`` bases. Use ``--match-read-wildcards`` to enable wildcards also in reads.
+
+If wildcards are disabled entirely (that is, you use ``-N`` and *do not* use
+``--match-read-wildcards``), then cutadapt compares characters by ASCII value.
+Thus, both the read and adapter can be arbitrary strings (such as ``SEQUENCE``
+or ``ADAPTER`` as used here in the examples).
+
+Wildcards do not work in colorspace.
+
+
+Repeated bases in the adapter sequence
+--------------------------------------
+
+If you have many repeated bases in the adapter sequence, such as many ``N``\ s or
+many ``A``\ s, you do not have to spell them out. For example, instead of writing
+ten ``A`` in a row (``AAAAAAAAAA``), write ``A{10}`` instead. The number within
+the curly braces specifies how often the character that precedes it will be
+repeated. This works also for IUPAC wildcard characters, as in ``N{5}``.
+
+It is recommended that you use quotation marks around your adapter sequence if
+you use this feature. For poly-A trimming, for example, you would write::
+
+ cutadapt -a "A{100}" -o output.fastq input.fastq
+
+
+.. _modifying-reads:
+
+Modifying reads
+===============
+
+This section describes in which ways reads can be modified other than adapter
+removal.
+
+.. _cut-bases:
+
+Removing a fixed number of bases
+--------------------------------
+
+By using the ``--cut`` option or its abbreviation ``-u``, it is possible to
+unconditionally remove bases from the beginning or end of each read. If
+the given length is positive, the bases are removed from the beginning
+of each read. If it is negative, the bases are removed from the end.
+
+For example, to remove the first five bases of each read::
+
+ cutadapt -u 5 -o trimmed.fastq reads.fastq
+
+To remove the last seven bases of each read::
+
+ cutadapt -u -7 -o trimmed.fastq reads.fastq
+
+The ``-u``/``--cut`` option can be combined with the other options, but
+the desired bases are removed *before* any adapter trimming.
+
+
+.. _quality-trimming:
+
+Quality trimming
+----------------
+
+The ``-q`` (or ``--trim-qualities``) parameter can be used to trim
+low-quality ends from reads before adapter removal. For this to work
+correctly, the quality values must be encoded as ascii(phred quality +
+33). If they are encoded as ascii(phred quality + 64), you need to add
+``--quality-base=64`` to the command line.
+
+Quality trimming can be done without adapter trimming, so this will work::
+
+ cutadapt -q 10 -o output.fastq input.fastq
+
+By default, only the 3' end of each read is quality-trimmed. If you want to
+trim the 5' end as well, use the ``-q`` option with two comma-separated cutoffs::
+
+ cutadapt -q 15,10 -o output.fastq input.fastq
+
+The 5' end will then be trimmed with a cutoff of 15, and the 3' will be trimmed
+with a cutoff of 10. If you only want to trim the 5' end, then use a cutoff of
+0 for the 3' end, as in ``-q 10,0``.
+
+
+Quality trimming algorithm
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The trimming algorithm is the same as the one used by BWA, but applied to both
+ends of the read in turn (if requested). That is: Subtract the given cutoff
+from all qualities; compute partial sums from all indices to the end of the
+sequence; cut the sequence at the index at which the sum is minimal. If both
+ends are to be trimmed, repeat this for the other end.
+
+The basic idea is to remove all bases starting from the end of the read whose
+quality is smaller than the given threshold. This is refined a bit by allowing
+some good-quality bases among the bad-quality ones. In the following example,
+we assume that the 3' end is to be quality-trimmed.
+
+Assume you use a threshold of 10 and have these quality values:
+
+42, 40, 26, 27, 8, 7, 11, 4, 2, 3
+
+Subtracting the threshold gives:
+
+32, 30, 16, 17, -2, -3, 1, -6, -8, -7
+
+Then sum up the numbers, starting from the end (partial sums). Stop early if
+the sum is greater than zero:
+
+(70), (38), 8, -8, -25, -23, -20, -21, -15, -7
+
+The numbers in parentheses are not computed (because 8 is greater than zero),
+but shown here for completeness. The position of the minimum (-25) is used as
+the trimming position. Therefore, the read is trimmed to the first four bases,
+which have quality values 42, 40, 26, 27.
+
+
+Modifying read names
+--------------------
+
+If you feel the need to modify the names of processed reads, some of the
+following options may be useful.
+
+Use ``-y`` or ``--suffix`` to append a text to read names. The given string can
+contain the placeholder ``{name}``, which will be replaced with the name of the
+adapter found in that read. For example, writing ::
+
+ cutadapt -a adapter1=ACGT -y ' we found {name}' input.fastq
+
+changes a read named ``read1`` to ``read1 we found adapter1`` if the adapter
+``ACGT`` was found. The options ``-x``/``--prefix`` work the same, but the text
+is added in front of the read name. For both options, spaces need to be
+specified explicitly, as in the above example. If no adapter was found in a
+read, the text ``no_adapter`` is inserted for ``{name}``.
+
+In order to remove a suffix of each read name, use ``--strip-suffix``.
+
+Some old 454 read files contain the length of the read in the name::
+
+ >read1 length=17
+ ACGTACGTACAAAAAAA
+
+If you want to update this to the correct length after trimming, use the option
+``--length-tag``. In this example, this would be ``--length-tag 'length='``.
+After trimming, the read would perhaps look like this::
+
+ >read1 length=10
+ ACGTACGTAC
+
+
+Read modification order
+-----------------------
+
+The read modifications described above are applied in the following order to
+each read. Steps not requested on the command-line are skipped.
+
+1. Unconditional base removal with ``--cut``
+2. Quality trimming (``-q``)
+3. Adapter trimming (``-a``, ``-b``, ``-g`` and uppercase versions)
+4. N-end trimming (``--trim-n``)
+5. Length tag modification (``--length-tag``)
+6. Read name suffix removal (``--strip-suffix``)
+7. Addition of prefix and suffix to read name (``-x``/``--prefix`` and ``-y``/``--suffix``)
+8. Double-encode the sequence (only colorspace)
+9. Replace negative quality values with zero (zero capping, only colorspace)
+10. Trim primer base (only colorspace)
+
+The last three steps are colorspace-specific.
+
+
+.. _filtering:
+
+Filtering reads
+===============
+
+By default, all processed reads, no matter whether they were trimmed or not,
+are written to the output file specified by the ``-o`` option (or to standard
+output if ``-o`` was not provided). For paired-end reads, the second read in a
+pair is always written to the file specified by the ``-p`` option.
+
+The options described here make it possible to filter reads by either discarding
+them entirely or by redirecting them to other files. When redirecting reads,
+the basic rule is that *each read is written to at most one file*. You cannot
+write reads to more than one output file.
+
+In the following, the term "processed read" refers to a read to which all
+modifications have been applied (adapter removal, quality trimming etc.). A
+processed read can be identical to the input read if no modifications were done.
+
+
+``--minimum-length N`` or ``-m N``
+ Throw away processed reads shorter than *N* bases.
+
+``--too-short-output FILE``
+ Instead of throwing away the reads that are too short according to ``-m``,
+ write them to *FILE* (in FASTA/FASTQ format).
+
+``--maximum-length N`` or ``-M N``
+ Throw away processed reads longer than *N* bases.
+
+``--too-long-output FILE``
+ Instead of throwing away the reads that are too long (according to ``-M``),
+ write them to *FILE* (in FASTA/FASTQ format).
+
+``--untrimmed-output FILE``
+ Write all reads without adapters to *FILE* (in FASTA/FASTQ format) instead
+ of writing them to the regular output file.
+
+``--discard-trimmed``
+ Throw away reads in which an adapter was found.
+
+``--discard-untrimmed``
+ Throw away reads in which *no* adapter was found. This has the same effect as
+ specifying ``--untrimmed-output /dev/null``.
+
+The options ``--too-short-output`` and ``--too-long-output`` are applied first.
+This means, for example, that a read that is too long will never end up in the
+``--untrimmed-output`` file when ``--too-long-output`` was given, no matter
+whether it was trimmed or not.
+
+The options ``--untrimmed-output``, ``--discard-trimmed`` and ``--discard-untrimmed``
+are mutually exclusive.
+
+
+.. _paired-end:
+
+Trimming paired-end reads
+=========================
+
+Cutadapt supports trimming of paired-end reads, trimming both reads in a pair
+at the same time.
+
+Assume the input is in ``reads.1.fastq`` and ``reads.2.fastq`` and that
+``ADAPTER_FWD`` should be trimmed from the forward reads (first file)
+and ``ADAPTER_REV`` from the reverse reads (second file).
+
+The basic command-line is::
+
+ cutadapt -a ADAPTER_FWD -A ADAPTER_REV -o out.1.fastq -p out.2.fastq reads.1.fastq reads.2.fastq
+
+``-p`` is the short form of ``--paired-output``. The option ``-A`` is used here
+to specify an adapter sequence that cutadapt
+should remove from the second read in each pair. There are also the options
+``-G``, ``-B``. All of them work just like their lowercase counterparts,
+except that the adapter is searched for in the second read in each paired-end
+read. There is also option ``-U``, which you can use to remove a fixed number
+of bases from the second read in a pair.
+
+While it is possible to run cutadapt on the two files separately, processing
+both files at the same time is highly recommended since the program can check
+for problems in your input files only when they are processed together.
+
+When you use ``-p``/``--paired-output``, cutadapt checks whether the files are
+properly paired. An error is raised if one of the files contains more reads than
+the other or if the read names in the two files do not match. Only the part of
+the read name before the first space is considered. If the read name ends with
+``/1`` or ``/2``, then that is also ignored. For example, two FASTQ headers that
+would be considered to denote properly paired reads are::
+
+ @my_read/1 a comment
+
+and::
+
+ @my_read/2 another comment
+
+This is an example for *improperly paired* read names::
+
+ @my_read/1;1
+
+and::
+
+ @my_read/2;1
+
+Since the ``/1`` and ``/2`` are ignored only if they occur at the end of the read
+name, and since the ``;1`` is considered to be part of the read name, these
+reads will not be considered to be properly paired.
+
+As soon as you start to use one of the filtering options that discard reads, it
+is mandatory you process both files at the same time to make sure that the
+output files are kept synchronized: If a read is removed from one of the files,
+cutadapt will ensure it is also removed from the other file.
+
+
+The following command-line options are applied to *both* reads:
+
+* ``-q`` (along with ``--quality-base``)
+* ``--times`` applies to all the adapters given
+* ``--no-trim``
+* ``--trim-n``
+* ``--mask``
+* ``--length-tag``
+* ``--prefix``, ``--suffix``
+* ``--strip-f3``
+* ``--colorspace``, ``--bwa``, ``-z``, ``--no-zero-cap``, ``--double-encode``,
+ ``--trim-primer``
+
+The following limitations still exist:
+
+* The ``--info-file``, ``--rest-file`` and ``--wildcard-file`` options write out
+ information only from the first read.
+* Demultiplexing is not yet supported with paired-end data.
+
+
+
+.. _filtering-paired:
+
+Filtering paired-end reads
+--------------------------
+
+The :ref:`filtering options listed above <filtering>` can also be used when
+trimming paired-end data. Since there are two reads, however, the filtering
+criteria are checked for both reads. The question is what to do when a criterion
+applies to only one read and not the other.
+
+By default, the filtering options discard or redirect the read pair if *any*
+of the two reads fulfills the criteria. That is, ``--max-n`` discards the pair
+if one of the two reads has too many ``N`` bases; ``--discard-untrimmed``
+discards the pair if one of the reads does not contain an adapter;
+``--minimum-length`` discards the pair if one of the reads is too short;
+and ``--maximum-length`` discards the pair if one of the reads is too long.
+Note that the ``--discard-trimmed`` filter would also apply because it is also
+the case that at least one of the reads is *trimmed*!
+
+To require that filtering criteria must apply to *both* reads in order for a
+read pair to be considered "filtered", use the option ``--pair-filter=both``.
+
+To further complicate matters, cutadapt switches to a backwards compatibility
+mode ("legacy mode") when none of the uppercase modification options
+(``-A``/``-B``/``-G``/``-U``) are given. In that mode, filtering criteria are
+checked only for the *first* read. Cutadapt will also tell you at the top of
+the report whether legacy mode is active. Check that line if you get strange
+results!
+
+These are the paired-end specific filtering and output options:
+
+``--paired-output FILE`` or ``-p FILE``
+ Write the second read of each processed pair to *FILE* (in FASTA/FASTQ
+ format).
+
+``--untrimmed-paired-output FILE``
+ Used together with ``--untrimmed-output``. The second read in a pair is
+ written to this file when the processed pair was *not* trimmed.
+
+``--pair-filter=(any|both)``
+ Which of the reads in a paired-end read have to match the filtering
+ criterion in order for it to be filtered.
+
+Note that the option names can be abbreviated as long as it is clear which
+option is meant (unique prefix). For example, instead of ``--untrimmed-output``
+and ``--untrimmed-paired-output``, you can write ``--untrimmed-o`` and
+``--untrimmed-p``.
+
+
+Interleaved paired-end reads
+----------------------------
+
+Paired-end reads can be read from a single FASTQ file in which the entries for
+the first and second read from each pair alternate. The first read in each pair
+comes before the second. Enable this file format by adding the ``--interleaved``
+option to the command-line. For example::
+
+ cutadapt --interleaved -q 20 -a ACGT -A TGCA -o trimmed.fastq reads.fastq
+
+The output FASTQ file will also be written interleaved. Cutadapt will detect if
+the input file is not properly interleaved by checking whether read names match
+and whether the file contains an even number of entries.
+
+When ``--interleaved`` is used, legacy mode is disabled (that is,
+read-modification options such as ``-q`` always apply to both reads).
+
+
+Legacy paired-end read trimming
+-------------------------------
+
+.. note::
+ This section describes the way paired-end trimming was done
+ in cutadapt before 1.8, where the ``-A``, ``-G``, ``-B`` options were not
+ available. It is less safe and more complicated, but you can still use it.
+
+If you do not use any of the filtering options that discard reads, such
+as ``--discard``, ``--minimum-length`` or ``--maximum-length``, you can run
+cutadapt on each file separately::
+
+ cutadapt -a ADAPTER_FWD -o trimmed.1.fastq reads1.fastq
+ cutadapt -a ADAPTER_REV -o trimmed.2.fastq reads2.fastq
+
+You can use the options that are listed under 'Additional modifications'
+in cutadapt's help output without problems. For example, if you want to
+quality-trim the first read in each pair with a threshold of 10, and the
+second read in each pair with a threshold of 15, then the commands could
+be::
+
+ cutadapt -q 10 -a ADAPTER_FWD -o trimmed.1.fastq reads1.fastq
+ cutadapt -q 15 -a ADAPTER_REV -o trimmed.2.fastq reads2.fastq
+
+If you use any of the filtering options, you must use cutadapt in the following
+way (with the ``-p`` option) to make sure that read pairs remain synchronized.
+
+First trim the forward read, writing output to temporary files (we also
+add some quality trimming)::
+
+ cutadapt -q 10 -a ADAPTER_FWD --minimum-length 20 -o tmp.1.fastq -p tmp.2.fastq reads.1.fastq reads.2.fastq
+
+Then trim the reverse read, using the temporary files as input::
+
+ cutadapt -q 15 -a ADAPTER_REV --minimum-length 20 -o trimmed.2.fastq -p trimmed.1.fastq tmp.2.fastq tmp.1.fastq
+
+Finally, remove the temporary files::
+
+ rm tmp.1.fastq tmp.2.fastq
+
+Please see the previous section for a much simpler way of trimming paired-end
+reads!
+
+In legacy paired-end mode, the read-modifying options such as ``-q`` only
+apply to the first file in each call to cutadapt (first ``reads.1.fastq``, then
+``tmp.2.fastq`` in this example). Reads in the second file are not affected by those
+options, but by the filtering options: If a read in the first file is
+discarded, then the matching read in the second file is also filtered
+and not written to the output given by ``--paired-output`` in order to
+keep both output files synchronized.
+
+
+.. _multiple-adapters:
+
+Multiple adapters
+=================
+
+It is possible to specify more than one adapter sequence by using the options
+``-a``, ``-b`` and ``-g`` more than once. Any combination is allowed, such as
+five ``-a`` adapters and two ``-g`` adapters. Each read will be searched for
+all given adapters, but **only the best matching adapter is removed**. (But it
+is possible to :ref:`trim more than one adapter from each
+read <more-than-one>`). This is what a command may look like to trim one of two
+possible 3' adapters::
+
+ cutadapt -a TGAGACACGCA -a AGGCACACAGGG -o output.fastq input.fastq
+
+The adapter sequences can also be read from a FASTA file. Instead of giving an
+explicit adapter sequence, you need to write ``file:`` followed by the name of
+the FASTA file::
+
+ cutadapt -a file:adapters.fasta -o output.fastq input.fastq
+
+All of the sequences in the file ``adapters.fasta`` will be used as 3'
+adapters. The other adapter options ``-b`` and ``-g`` also support this. Again,
+only the best matching adapter is trimmed from each read.
+
+When cutadapt has multiple adapter sequences to work with, either specified
+explicitly on the command line or via a FASTA file, it decides in the
+following way which adapter should be trimmed:
+
+* All given adapter sequences are matched to the read.
+* Adapter matches where the overlap length (see the ``-O`` parameter) is too
+ small or where the error rate is too high (``-e``) are removed from further
+ consideration.
+* Among the remaining matches, the one with the **greatest number of matching
+ bases** is chosen.
+* If there is a tie, the first adapter wins. The order of adapters is the order
+ in which they are given on the command line or in which they are found in the
+ FASTA file.
+
+If your adapter sequences are all similar and differ only by a variable barcode
+sequence, you should use a single adapter sequence instead that
+:ref:`contains wildcard characters <wildcards>`.
+
+
+.. _named-adapters:
+
+Named adapters
+--------------
+
+Cutadapt reports statistics for each adapter separately. To identify the
+adapters, they are numbered and the adapter sequence is also printed::
+
+ === Adapter 1 ===
+
+ Sequence: AACCGGTT; Length 8; Trimmed: 5 times.
+
+If you want this to look a bit nicer, you can give each adapter a name in this
+way::
+
+ cutadapt -a My_Adapter=AACCGGTT -o output.fastq input.fastq
+
+The actual adapter sequence in this example is ``AACCGGTT`` and the name
+assigned to it is ``My_Adapter``. The report will then contain this name in
+addition to the other information::
+
+ === Adapter 'My_Adapter' ===
+
+ Sequence: AACCGGTT; Length 8; Trimmed: 5 times.
+
+When adapters are read from a FASTA file, the sequence header is used as the
+adapter name.
+
+Adapter names are also used in column 8 of :ref:`info files <info-file>`.
+
+
+.. _demultiplexing:
+
+Demultiplexing
+--------------
+
+Cutadapt supports demultiplexing, which means that reads are written to different
+output files depending on which adapter was found in them. To use this, include
+the string ``{name}`` in the name of the output file and give each adapter a name.
+The path is then interpreted as a template and each trimmed read is written
+to the path in which ``{name}`` is replaced with the name of the adapter that
+was found in the read. Reads in which no adapter was found will be written to a
+file in which ``{name}`` is replaced with ``unknown``.
+
+.. note::
+ Demultiplexing is currently only supported for single-end reads. Paired-end
+ support is planned for one of the next versions.
+
+Example::
+
+ cutadapt -a one=TATA -a two=GCGC -o trimmed-{name}.fastq.gz input.fastq.gz
+
+This command will create the three files ``trimmed-one.fastq.gz``,
+``trimmed-two.fastq.gz`` and ``trimmed-unknown.fastq.gz``. You can :ref:`also
+provide adapter sequences in a FASTA file <multiple-adapters>`.
+
+In order to not trim the input files at all, but to only do demultiplexing, use
+option ``--no-trim``. And if you want to output the reads in which no
+adapters were found to a different file, use the ``--untrimmed-output``
+parameter with a file name. Here is an example that uses both parameters and
+reads the adapters from a FASTA file (note that ``--untrimmed-output`` can be
+abbreviated)::
+
+ cutadapt -a file:barcodes.fasta --no-trim --untrimmed-o untrimmed.fastq.gz -o trimmed-{name}.fastq.gz input.fastq.gz
+
+
+.. _more-than-one:
+
+Trimming more than one adapter from each read
+---------------------------------------------
+
+By default, at most one adapter sequence is removed from each read, even if
+multiple adapter sequences were provided. This can be changed by using the
+``--times`` option (or its abbreviated form ``-n``). Cutadapt will then search
+for all the given adapter sequences repeatedly, either until no adapter match
+was found or until the specified number of rounds was reached.
+
+As an example, assume you have a protocol in which a 5' adapter gets ligated
+to your DNA fragment, but it's possible that the adapter is ligated more than
+once. So your sequence could look like this::
+
+ ADAPTERADAPTERADAPTERMYSEQUENCE
+
+To be on the safe side, you assume that there are at most 5 copies of the
+adapter sequence. This command can be used to trim the reads correctly::
+
+ cutadapt -g ^ADAPTER -n 5 -o output.fastq input.fastq
+
+This feature can also be used to search for *5'/3' linked adapters*. For example,
+when the 5' adapter is *FIRST* and the 3' adapter is *SECOND*, then the read
+could look like this::
+
+ FIRSTMYSEQUENCESECOND
+
+That is, the sequence of interest is framed by the 5' and the 3' adapter. The
+following command can be used to trim such a read::
+
+ cutadapt -g ^FIRST -a SECOND -n 2 ...
+
+Support for linked adapters is currently incomplete. For example, it is not
+possible to specify that SECOND should only be trimmed when FIRST also occurs.
+`See also this feature
+request <https://code.google.com/p/cutadapt/issues/detail?id=34>`_, and
+comment on it if you would like to see this implemented.
+
+
+.. _truseq:
+
+Illumina TruSeq
+===============
+
+If you have reads containing Illumina TruSeq adapters, follow these
+steps.
+
+Single-end reads as well as the first reads of paired-end data need to be
+trimmed with ``A`` + the “TruSeq Indexed Adapter”. Use only the prefix of the
+adapter sequence that is common to all Indexed Adapter sequences::
+
+ cutadapt -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC -o trimmed.fastq.gz reads.fastq.gz
+
+If you have paired-end data, trim also read 2 with the reverse complement of the
+“TruSeq Universal Adapter”. The full command-line looks as follows::
+
+ cutadapt \
+ -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
+ -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \
+ -o trimmed.1.fastq.gz -p trimmed.2.fastq.gz \
+ reads.1.fastq.gz reads.2.fastq.gz
+
+See also the :ref:`section about paired-end adapter trimming above <paired-end>`.
+
+If you want to simplify this a bit, you can also use the common prefix
+``AGATCGGAAGAGC`` as the adapter sequence in both cases::
+
+ cutadapt \
+ -a AGATCGGAAGAGC -A AGATCGGAAGAGC \
+ -o trimmed.1.fastq.gz -p trimmed.2.fastq.gz \
+ reads.1.fastq.gz reads.2.fastq.gz
+
+The adapter sequences can be found in the document `Illumina TruSeq Adapters
+De-Mystified <http://tucf-genomics.tufts.edu/documents/protocols/TUCF_Understanding_Illumina_TruSeq_Adapters.pdf>`__.
+
+
+.. _warnbase:
+
+Warning about incomplete adapter sequences
+------------------------------------------
+
+Sometimes cutadapt’s report ends with these lines::
+
+ WARNING:
+ One or more of your adapter sequences may be incomplete.
+ Please see the detailed output above.
+
+Further up, you’ll see a message like this::
+
+ Bases preceding removed adapters:
+ A: 95.5%
+ C: 1.0%
+ G: 1.6%
+ T: 1.6%
+ none/other: 0.3%
+ WARNING:
+ The adapter is preceded by "A" extremely often.
+ The provided adapter sequence may be incomplete.
+ To fix the problem, add "A" to the beginning of the adapter sequence.
+
+This means that in 95.5% of the cases in which an adapter was removed from a
+read, the base coming *before* that was an ``A``. If your DNA fragments are
+not random, such as in amplicon sequencing, then this is to be expected and
+the warning can be ignored. If the DNA fragments are supposed to be random,
+then the message may be genuine: The adapter sequence may be incomplete and
+should include an additional ``A`` in the beginning.
+
+This warning exists because some documents list the Illumina TruSeq adapters
+as starting with ``GATCGGA...``. While that is technically correct, the
+library preparation actually results in an additional ``A`` before that
+sequence, which also needs to be removed. See the :ref:`previous
+section <truseq>` for the correct sequence.
+
+
+.. _dealing-with-ns:
+
+Dealing with ``N`` bases
+========================
+
+Cutadapt supports the following options to deal with ``N`` bases in your reads:
+
+``--max-n COUNT``
+ Discard reads containing more than *COUNT* ``N`` bases. A fractional *COUNT*
+ between 0 and 1 can also be given and will be treated as the proportion of
+ maximally allowed ``N`` bases in the read.
+
+``--trim-n``
+ Remove flanking ``N`` bases from each read. That is, a read such as this::
+
+ NNACGTACGTNNNN
+
+ is trimmed to just ``ACGTACGT``. This option is applied *after* adapter
+ trimming. If you want to get rid of ``N`` bases before adapter removal, use
+ quality trimming: ``N`` bases typically also have a low quality value
+ associated with them.
+
+
+.. _bisulfite:
+
+Bisulfite sequencing (RRBS)
+===========================
+
+When trimming reads that come from a library prepared with the RRBS (reduced
+representation bisulfite sequencing) protocol, the last two 3' bases must be
+removed in addition to the adapter itself. This can be achieved by using not
+the adapter sequence itself, but by adding two wildcard characters to its
+beginning. If the adapter sequence is ``ADAPTER``, the command for trimming
+should be::
+
+ cutadapt -a NNADAPTER -o output.fastq input.fastq
+
+Details can be found in `Babraham bioinformatics' "Brief guide to
+RRBS" <http://www.bioinformatics.babraham.ac.uk/projects/bismark/RRBS_Guide.pdf>`_.
+A summary follows.
+
+During RRBS library preparation, DNA is digested with the restriction enzyme
+MspI, generating a two-base overhang on the 5' end (``CG``). MspI recognizes
+the sequence ``CCGG`` and cuts
+between ``C`` and ``CGG``. A double-stranded DNA fragment is cut in this way::
+
+ 5'-NNNC|CGGNNN-3'
+ 3'-NNNGGC|CNNN-5'
+
+The fragment between two MspI restriction sites looks like this::
+
+ 5'-CGGNNN...NNNC-3'
+ 3'-CNNN...NNNGGC-5'
+
+Before sequencing (or PCR) adapters can be ligated, the missing base positions
+must be filled in with GTP and CTP::
+
+ 5'-ADAPTER-CGGNNN...NNNCcg-ADAPTER-3'
+ 3'-ADAPTER-gcCNNN...NNNGGC-ADAPTER-5'
+
+The filled-in bases, marked in lowercase above, do not contain any original
+methylation information, and must therefore not be used for methylation calling.
+By prefixing the adapter sequence with ``NN``, the bases will be automatically
+stripped during adapter trimming.
+
+
+Cutadapt's output
+=================
+
+
+How to read the report
+----------------------
+
+After every run, cutadapt prints out per-adapter statistics. The output
+starts with something like this::
+
+ Sequence: 'ACGTACGTACGTTAGCTAGC'; Length: 20; Trimmed: 2402 times.
+
+The meaning of this should be obvious.
+
+The next piece of information is this::
+
+ No. of allowed errors:
+ 0-9 bp: 0; 10-19 bp: 1; 20 bp: 2
+
+The adapter, as was shown above, has a length of 20
+characters. We are using the default error rate of 0.1. What this
+implies is shown above: Matches up to a length of 9 bp are allowed to
+have no errors. Matches of lengths 10-19 bp are allowed to have 1 error
+and matches of length 20 can have 2 errors. See also :ref:`the section about
+error-tolerant matching <error-tolerance>`.
+
+Finally, a table is output that gives more detailed information about
+the lengths of the removed sequences. The following is only an excerpt;
+some rows are left out::
+
+ Overview of removed sequences
+ length count expect max.err error counts
+ 3 140 156.2 0 140
+ 4 57 39.1 0 57
+ 5 50 9.8 0 50
+ 6 35 2.4 0 35
+ ...
+ 100 397 0.0 3 358 36 3
+
+The first row tells us the following: Three bases were removed in 140
+reads; randomly, one would expect this to occur 156.2 times; the maximum
+number of errors at that match length is 0 (this is actually redundant
+since we know already that no errors are allowed at lengths 0-9 bp).
+
+The last column shows the number of reads that had 0, 1, 2 ... errors.
+In the last row, for example, 358 reads matched the adapter with zero
+errors, 36 with 1 error, and 3 matched with 2 errors.
+
+The "expect" column gives only a rough estimate of the number of
+sequences that is expected to match randomly (it assumes a GC content of
+50%, for example), but it can help to estimate whether the matches that
+were found are true adapter matches or if they are due to chance. At
+length 6, for example, only 2.4 reads are expected, but 35 do match,
+which hints that most of these matches are due to actual adapters.
+
+Note that the "length" column refers to the length of the removed
+sequence. That is, the actual length of the match in the above row at
+length 100 is 20 since that is the adapter length. Assuming the read
+length is 100, the adapter was found in the beginning of 397 reads and
+therefore those reads were trimmed to a length of zero.
+
+The table may also be useful in case the given adapter sequence contains
+an error. In that case, it may look like this::
+
+ ...
+ length count expect max.err error counts
+ 10 53 0.0 1 51 2
+ 11 45 0.0 1 42 3
+ 12 51 0.0 1 48 3
+ 13 39 0.0 1 0 39
+ 14 40 0.0 1 0 40
+ 15 36 0.0 1 0 36
+ ...
+
+We can see that no matches longer than 12 have zero errors. In this
+case, it indicates that the 13th base of the given adapter sequence is
+incorrect.
+
+
+.. _info-file:
+
+Format of the info file
+-----------------------
+
+When the ``--info-file`` command-line parameter is given, detailed
+information about the found adapters is written to the given file. The
+output is a tab-separated text file. Each line corresponds to one read
+of the input file (unless ``--times`` is used, see below). The fields are:
+
+1. Read name
+2. Number of errors
+3. 0-based start coordinate of the adapter match
+4. 0-based end coordinate of the adapter match
+5. Sequence of the read to the left of the adapter match (can be empty)
+6. Sequence of the read that was matched to the adapter
+7. Sequence of the read to the right of the adapter match (can be empty)
+8. Name of the found adapter.
+9. Quality values corresponding to sequence left of the adapter match (can be empty)
+10. Quality values corresponding to sequence matched to the adapter (can be empty)
+11. Quality values corresponding to sequence to the right of the adapter match (can be empty)
+
+The concatenation of the fields 5-7 yields the full read sequence. Column 8 identifies
+the found adapter. :ref:`The section about named adapters <named-adapters>` describes
+how to give a name to an adapter. Adapters without a name are numbered starting
+from 1. Fields 9-11 are empty if quality values are not available.
+Concatenating them yields the full sequence of quality values.
+
+If no adapter was found, the format is as follows:
+
+1. Read name
+2. The value -1
+3. The read sequence
+4. Quality values
+
+When parsing the file, be aware that additional columns may be added in
+the future. Note also that some fields can be empty, resulting in
+consecutive tabs within a line.
+
+If the ``--times`` option is used and greater than 1, each read can appear
+more than once in the info file. There will be one line for each found adapter,
+all with identical read names. Only for the first of those lines will the
+concatenation of columns 5-7 be identical to the original read sequence (and
+accordingly for columns 9-11). For subsequent lines, the shown sequences are the
+ones that were used in subsequent rounds of adapter trimming, that is, they get
+successively shorter.
+
+Columns 9-11 have been added in cutadapt version 1.9.
+
+
+.. _algorithm:
+
+The alignment algorithm
+=======================
+
+Since the publication of the `EMBnet journal application note about
+cutadapt <http://dx.doi.org/10.14806/ej.17.1.200>`_, the alignment algorithm
+used for finding adapters has changed significantly. An overview of this new
+algorithm is given in this section. An even more detailed description is
+available in Chapter 2 of my PhD thesis `Algorithms and tools for the analysis
+of high-throughput DNA sequencing data <http://hdl.handle.net/2003/31824>`_.
+
+The algorithm is based on *semiglobal alignment*, also called *free-shift*,
+*ends-free* or *overlap* alignment. In a regular (global) alignment, the
+two sequences are compared from end to end and all differences occurring over
+that length are counted. In semiglobal alignment, the sequences are allowed to
+freely shift relative to each other and differences are only penalized in the
+overlapping region between them::
+
+ FANTASTIC
+ ELEFANT
+
+The prefix ``ELE`` and the suffix ``ASTIC`` do not have a counterpart in the
+respective other row, but this is not counted as an error. The overlap ``FANT``
+has a length of four characters.
+
+Traditionally, *alignment scores* are used to find an optimal overlap alignment:
+This means that the scoring function assigns a positive value to matches,
+while mismatches, insertions and deletions get negative values. The optimal
+alignment is then the one that has the maximal total score. Usage of scores
+has the disadvantage that they are not at all intuitive: What does a total score
+of *x* mean? Is that good or bad? How should a threshold be chosen in order to
+avoid finding alignments with too many errors?
+
+For cutadapt, the adapter alignment algorithm uses *unit costs* instead.
+This means that mismatches, insertions and deletions are counted as one error, which
+is easier to understand and allows specifying a single parameter for the
+algorithm (the maximum error rate) in order to describe how many errors are
+acceptable.
+
+There is a problem with this: When using costs instead of scores, we would like
+to minimize the total costs in order to find an optimal alignment. But then the
+best alignment would always be the one in which the two sequences do not overlap
+at all! This would be correct, but meaningless for the purpose of finding an
+adapter sequence.
+
+The optimization criteria are therefore a bit different. The basic idea is to
+consider the alignment optimal that maximizes the overlap between the two
+sequences, as long as the allowed error rate is not exceeded.
+
+Conceptually, the procedure is as follows:
+
+1. Consider all possible overlaps between the two sequences and compute an
+ alignment for each, minimizing the total number of errors in each one.
+2. Keep only those alignments that do not exceed the specified maximum error
+ rate.
+3. Then, keep only those alignments that have a maximal number of matches
+ (that is, there is no alignment with more matches).
+4. If there are multiple alignments with the same number of matches, then keep
+ only those that have the smallest error rate.
+5. If there are still multiple candidates left, choose the alignment that starts
+ at the leftmost position within the read.
+
+In Step 1, the different adapter types are taken into account: Only those
+overlaps that are actually allowed by the adapter type are considered.
diff --git a/doc/ideas.rst b/doc/ideas.rst
new file mode 100644
index 0000000..b5fa9d7
--- /dev/null
+++ b/doc/ideas.rst
@@ -0,0 +1,103 @@
+Ideas/To Do
+===========
+
+This is a rather unsorted list of features that would be nice to have, of
+things that could be improved in the source code, and of possible algorithmic
+improvements.
+
+- show average error rate
+- In colorspace and probably also for Illumina data, gapped alignment
+ is not necessary
+- ``--progress``
+- run pylint, pychecker
+- length histogram
+- check whether input is FASTQ although -f fasta is given
+- search for adapters in the order in which they are given on the
+ command line
+- more tests for the alignment algorithm
+- deprecate ``--rest-file``
+- ``--detect`` prints out best guess which of the given adapters is the correct one
+- alignment algorithm: make a 'banded' version
+- it seems the str.find optimization isn't very helpful. In any case, it should be
+ moved into the Aligner class.
+- allow to remove not the adapter itself, but the sequence before or after it
+- instead of trimming, convert adapter to lowercase
+- warn when given adapter sequence contains non-IUPAC characters
+- try multithreading again, this time use os.pipe() or 0mq
+- extensible file type detection
+- the --times setting should be an attribute of Adapter
+
+Specifying adapters
+-------------------
+
+The idea is to deprecate the ``-b``, ``-g`` and ``-u`` parameters. Only ``-a``
+is used with a special syntax for each adapter type. This makes it a bit easier
+to add new adapter types in the future.
+
+.. csv-table::
+
+ back,``-a ADAPTER``,``-a ADAPTER`` or ``-a ...ADAPTER``
+ suffix,``-a ADAPTER$``,``-a ...ADAPTER$``
+ front,``-g ADAPTER``,``-a ADAPTER...``
+ prefix,``-g ^ADAPTER``,``-a ^ADAPTER...`` (or have anchoring by default?)
+ anywhere,``-b ADAPTER``, ``-a ...ADAPTER...`` ???
+ unconditional,``-u +10``,``-a 10...`` (collides with colorspace)
+ unconditional,``-u -10``,``-a ...10$``
+ linked,(not implemented),``-a ADAPTER...ADAPTER`` or ``-a ^ADAPTER...ADAPTER``
+
+Or add only ``-a ADAPTER...`` as an alias for ``-g ^ADAPTER`` and
+``-a ...ADAPTER`` as an alias for ``-a ADAPTER``.
+
+The ``...`` would be equivalent to ``N*`` as in regular expressions.
+
+Another idea: Allow something such as ``-a ADAP$TER`` or ``-a ADAPTER$NNN``.
+This would be a way to specify less strict anchoring.
+
+Make it possible to specify that the rightmost or leftmost match should be
+picked. Default right now: Leftmost, even for -g adapters.
+
+Allow ``N{3,10}`` as in regular expressions (for a variable-length sequence).
+
+Use parentheses to specify the part of the sequence that should be kept:
+
+* ``-a (...)ADAPTER`` (default)
+* ``-a (...ADAPTER)`` (default)
+* ``-a ADAPTER(...)`` (default)
+* ``-a (ADAPTER...)`` (??)
+
+Or, specify the part that should be removed:
+
+ ``-a ...(ADAPTER...)``
+ ``-a ...ADAPTER(...)``
+ ``-a (ADAPTER)...``
+
+Model somehow all the flags that exist for semiglobal alignment. For start of the adapter:
+
+* Start of adapter can be degraded or not
+* Bases are allowed to be before adapter or not
+
+Not degraded and no bases before allowed = anchored.
+Degraded and bases before allowed = regular 5'
+
+By default, the 5' end should be anchored, the 3' end not.
+
+* ``-a ADAPTER...`` → not degraded, no bases before allowed
+* ``-a N*ADAPTER...`` → not degraded, bases before allowed
+* ``-a ADAPTER^...`` → degraded, no bases before allowed
+* ``-a N*ADAPTER^...`` → degraded, bases before allowed
+* ``-a ...ADAPTER`` → degraded, bases after allowed
+* ``-a ...ADAPTER$`` → not degraded, no bases after allowed
+
+
+
+Paired-end trimming
+-------------------
+
+* Could also use a paired-end read merger, then remove adapters with -a and -g
+
+Available/used letters for command-line options
+-----------------------------------------------
+
+* Remaining characters: All uppercase letters except A, B, G, M, N, O, U
+* Lowercase letters: i, j, k, l, s, w
+* Planned/reserved: Q (paired-end quality trimming), j (multithreading)
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..f42e58f
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,25 @@
+.. include:: ../README.rst
+
+=================
+Table of contents
+=================
+
+.. toctree::
+ :maxdepth: 2
+
+ installation
+ guide
+ colorspace
+ recipes
+ ideas
+ changes
+
+
+..
+ Indices and tables
+ ==================
+
+ * :ref:`genindex`
+ * :ref:`modindex`
+ * :ref:`search`
+
diff --git a/doc/installation.rst b/doc/installation.rst
new file mode 100644
index 0000000..305d910
--- /dev/null
+++ b/doc/installation.rst
@@ -0,0 +1,127 @@
+============
+Installation
+============
+
+Quickstart
+----------
+
+The easiest way to install cutadapt is to use ``pip`` on the command line::
+
+ pip install --user --upgrade cutadapt
+
+This will download the software from `PyPI (the Python packaging
+index) <https://pypi.python.org/pypi/cutadapt/>`_, and
+install the cutadapt binary into ``$HOME/.local/bin``. If an old version of
+cutadapt exists on your system, the ``--upgrade`` parameter is required in order
+to install a newer version. You can then run the program like this::
+
+ ~/.local/bin/cutadapt --help
+
+If you want to avoid typing the full path, add the directory
+``$HOME/.local/bin`` to your ``$PATH`` environment variable.
+
+
+Installation with conda
+-----------------------
+
+Alternatively, cutadapt is also available as a conda package from the
+`bioconda channel <https://bioconda.github.io/>`_. If you do not have conda,
+`install miniconda <http://conda.pydata.org/miniconda.html>`_ first.
+Then install cutadapt like this::
+
+ conda install -c bioconda cutadapt
+
+If neither ``pip`` nor ``conda`` installation works, keep reading.
+
+
+Dependencies
+------------
+
+Cutadapt requires this software to be installed:
+
+* One of Python 2.6, 2.7, 3.3, 3.4 or 3.5. Python 2.7 is a bit faster than the
+ other versions.
+* A C compiler.
+
+Under Ubuntu, you may need to install the packages ``build-essential`` and
+``python-dev`` (or ``python3-dev``).
+
+
+Installation
+------------
+
+If you have already downloaded and unpacked the ``.tar.gz`` file, then
+installation is done like this (replace "python" with "python3" to
+install the Python 3 version)::
+
+ python setup.py install --user
+
+If you get an error message::
+
+ error: command 'gcc' failed with exit status 1
+
+Then check the entire error message. If it says something about a missing ``Python.h``
+file, then you need to install the Python development packages. The
+appropriate package is called ``python-dev`` in Ubuntu (or ``python3-dev``
+for Python 3).
+
+
+System-wide installation (root required)
+----------------------------------------
+
+If you have root access, then you can install cutadapt system-wide by running::
+
+ sudo pip install cutadapt
+
+This installs cutadapt into ``/usr/local/bin``.
+
+If you want to upgrade from an older version, use this command instead::
+
+ sudo pip install --upgrade cutadapt
+
+
+Uninstalling
+------------
+
+Type ::
+
+ pip uninstall cutadapt
+
+and confirm with ``y`` to remove the package. Under some circumstances, multiple
+versions may be installed at the same time. Repeat the above command until you
+get an error message in order to make sure that all versions are removed.
+
+
+Shared installation (on a cluster)
+----------------------------------
+
+If you have a larger installation and want to provide cutadapt as a module
+that can be loaded and unloaded (with the Lmod system, for example), we
+recommend that you create a virtual environment and 'pip install' cutadapt into
+it. These instructions work on our SLURM cluster that uses the Lmod system
+(replace ``1.9.1`` with the actual version you want to use)::
+
+ BASE=/software/cutadapt-1.9.1
+ virtualenv $BASE/venv
+ $BASE/venv/bin/pip install --install-option="--install-scripts=$BASE/bin" cutadapt==1.9.1
+
+The ``install-option`` part is important. It ensures that a second, separate
+``bin/`` directory is created (``/software/cutadapt-1.9.1/bin/``) that *only*
+contains the ``cutadapt`` script and nothing else. To make cutadapt available to
+the users, that directory (``$BASE/bin``) needs to be added to the ``$PATH``.
+
+Make sure you *do not* add the ``bin/`` directory within the ``venv`` directory
+to the ``$PATH``! Otherwise, a user trying to run ``python`` who also has the
+cutadapt module loaded would get the python from the virtual environment,
+which leads to confusing error messages.
+
+A simple module file for the Lmod system matching the above example could look
+like this::
+
+ conflict("cutadapt")
+ whatis("adapter trimming tool")
+ prepend_path("PATH", "/software/cutadapt-1.9.1/bin")
+
+Please note that there is no need to “activate” the virtual environment:
+Activation merely adds the ``bin/`` directory to the ``$PATH``, so the
+``prepend_path`` directive is equivalent to activating the virtual environment.
diff --git a/doc/recipes.rst b/doc/recipes.rst
new file mode 100644
index 0000000..3020be4
--- /dev/null
+++ b/doc/recipes.rst
@@ -0,0 +1,83 @@
+=======
+Recipes
+=======
+
+For some trimming applications, the pre-defined adapter types behave differently
+from what you would like to have. In this section, we show some ways in which
+cutadapt can be made to behave in the desired way.
+
+.. note:: This section is still being written.
+
+
+Forcing matches to be at the end of the read
+--------------------------------------------
+
+Use ``-a TACGGCATXXX``. The ``X`` is always counted as a mismatch and will force
+the adapter match to be at the end. This is not the same as an anchored 3'
+adapter since partial matches are still allowed.
+
+
+Removing more than one adapter
+------------------------------
+
+If you want to remove more than one adapter, let's say a 5' adapter and a 3'
+adapter, you have two options.
+
+First, you can specify both adapters and also ``--times=2`` (or the short
+version ``-n 2``). For example::
+
+ cutadapt -g ^TTAAGGCC -a TACGGACT -n 2 -o output.fastq input.fastq
+
+This instructs cutadapt to run two rounds of adapter finding and removal. That
+means that, after the first round and only when an adapter was actually found,
+another round is performed. In both rounds, all given adapters (two in this
+case) are searched and removed. The problem is that it could happen that one
+adapter is found twice (so the 3' adapter, for example, could be removed twice).
+
+The second option is to not use the ``-n`` option, but to run cutadapt twice,
+first removing one adapter and then the other. It is easiest if you use a pipe
+as in this example::
+
+ cutadapt -g ^TTAAGGCC input.fastq | cutadapt -a TACGGACT - > output.fastq
+
+
+Trimming poly-A tails
+---------------------
+
+If you want to trim a poly-A tail from the 3' end of your reads, use the 3'
+adapter type (``-a``) with an adapter sequence of many repeated ``A``
+nucleotides. Starting with version 1.8 of cutadapt, you can use the
+following notation to specify a sequence that consists of 100 ``A``::
+
+ cutadapt -a "A{100}" -o output.fastq input.fastq
+
+This also works when there are sequencing errors in the poly-A tail. So this
+read ::
+
+ TACGTACGTACGTACGAAATAAAAAAAAAAA
+
+will be trimmed to::
+
+ TACGTACGTACGTACG
+
+If for some reason you would like to use a shorter sequence of ``A``, you can
+do so: The matching algorithm always picks the leftmost match that it can find,
+so cutadapt will do the right thing even when the tail has more ``A`` than you
+used in the adapter sequence. However, sequencing errors may result in shorter
+matches than desired. For example, using ``-a "A{10}"``, the read above (where
+the ``AAAT`` is followed by eleven ``A``) would be trimmed to::
+
+ TACGTACGTACGTACGAAAT
+
+Depending on your application, perhaps a variant of ``-a A{10}N{90}`` is an
+alternative, forcing the match to be located as much to the left as possible,
+while still allowing for non-``A`` bases towards the end of the read.
+
+
+Other things (unfinished)
+-------------------------
+
+* How to detect adapters
+* Use cutadapt for quality-trimming only
+* Use it for minimum/maximum length filtering
+* Use it for conversion to FASTQ
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..1a2d235
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,148 @@
+"""
+Build cutadapt.
+
+Cython is run when
+* no pre-generated C sources are found,
+* or the pre-generated C sources are out of date,
+* or when --cython is given on the command line.
+"""
+import sys
+import os.path
+
+from distutils.core import setup, Extension
+from distutils.version import LooseVersion
+from distutils.command.sdist import sdist as _sdist
+from distutils.command.build_ext import build_ext as _build_ext
+
+MIN_CYTHON_VERSION = '0.24'
+
+if sys.version_info < (2, 6):
+ sys.stdout.write("At least Python 2.6 is required.\n")
+ sys.exit(1)
+
+
+# set __version__
+with open(os.path.join(os.path.dirname(__file__), 'cutadapt', '__init__.py')) as f:
+ for line in f:
+ if line.startswith('__version__'):
+ exec(line)
+ break
+
+
+def out_of_date(extensions):
+ """
+ Check whether any pyx source is newer than the corresponding generated
+ C source or whether any C source is missing.
+ """
+ for extension in extensions:
+ for pyx in extension.sources:
+ path, ext = os.path.splitext(pyx)
+ if ext not in ('.pyx', '.py'):
+ continue
+ if extension.language == 'c++':
+ csource = path + '.cpp'
+ else:
+ csource = path + '.c'
+ # When comparing modification times, allow five seconds slack:
+ # If the installation is being run from pip, modification
+ # times are not preserved and therefore depend on the order in
+ # which files were unpacked.
+ if not os.path.exists(csource) or (
+ os.path.getmtime(pyx) > os.path.getmtime(csource) + 5):
+ return True
+ return False
+
+
+def no_cythonize(extensions, **_ignore):
+ """
+ Change file extensions from .pyx to .c or .cpp.
+
+ Copied from Cython documentation
+ """
+ for extension in extensions:
+ sources = []
+ for sfile in extension.sources:
+ path, ext = os.path.splitext(sfile)
+ if ext in ('.pyx', '.py'):
+ if extension.language == 'c++':
+ ext = '.cpp'
+ else:
+ ext = '.c'
+ sfile = path + ext
+ sources.append(sfile)
+ extension.sources[:] = sources
+
+
+def check_cython_version():
+ """Exit if Cython was not found or is too old"""
+ try:
+ from Cython import __version__ as cyversion
+ except ImportError:
+ sys.stdout.write(
+ "ERROR: Cython is not installed. Install at least Cython version " +
+ str(MIN_CYTHON_VERSION) + " to continue.\n")
+ sys.exit(1)
+ if LooseVersion(cyversion) < LooseVersion(MIN_CYTHON_VERSION):
+ sys.stdout.write(
+ "ERROR: Your Cython is at version '" + str(cyversion) +
+ "', but at least version " + str(MIN_CYTHON_VERSION) + " is required.\n")
+ sys.exit(1)
+
+
+extensions = [
+ Extension('cutadapt._align', sources=['cutadapt/_align.pyx']),
+ Extension('cutadapt._qualtrim', sources=['cutadapt/_qualtrim.pyx']),
+ Extension('cutadapt._seqio', sources=['cutadapt/_seqio.pyx']),
+]
+
+
+class build_ext(_build_ext):
+ def run(self):
+ # If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip
+ # file retrieved from PyPI that already includes the pre-cythonized
+ # extension modules, and then we do not need to run cythonize().
+ if os.path.exists('PKG-INFO'):
+ no_cythonize(extensions)
+ else:
+ # Otherwise, this is a 'developer copy' of the code, and then the
+ # only sensible thing is to require Cython to be installed.
+ check_cython_version()
+ from Cython.Build import cythonize
+ self.extensions = cythonize(self.extensions)
+ _build_ext.run(self)
+
+
+class sdist(_sdist):
+ def run(self):
+ # Make sure the compiled Cython files in the distribution are up-to-date
+ from Cython.Build import cythonize
+ check_cython_version()
+ cythonize(extensions)
+ _sdist.run(self)
+
+
+setup(
+ name = 'cutadapt',
+ version = __version__,
+ author = 'Marcel Martin',
+ author_email = 'marcel.martin at scilifelab.se',
+ url = 'https://cutadapt.readthedocs.org/',
+ description = 'trim adapters from high-throughput sequencing reads',
+ license = 'MIT',
+ cmdclass = {'sdist': sdist, 'build_ext': build_ext},
+ ext_modules = extensions,
+ packages = ['cutadapt', 'cutadapt.scripts'],
+ scripts = ['bin/cutadapt'],
+ classifiers = [
+ "Development Status :: 5 - Production/Stable",
+ "Environment :: Console",
+ "Intended Audience :: Science/Research",
+ "License :: OSI Approved :: MIT License",
+ "Natural Language :: English",
+ "Programming Language :: Cython",
+ "Programming Language :: Python :: 2.6",
+ "Programming Language :: Python :: 2.7",
+ "Programming Language :: Python :: 3",
+ "Topic :: Scientific/Engineering :: Bio-Informatics"
+ ]
+)
diff --git a/tests/.gitignore b/tests/.gitignore
new file mode 100644
index 0000000..a1b3311
--- /dev/null
+++ b/tests/.gitignore
@@ -0,0 +1,3 @@
+tmp.log
+tmp.fastaq
+tmp.fastq
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/cut/454.fa b/tests/cut/454.fa
new file mode 100644
index 0000000..7d4f345
--- /dev/null
+++ b/tests/cut/454.fa
@@ -0,0 +1,118 @@
+>000163_1255_2627 length=8 uaccno=E0R4ISW01DCIQD
+GTGTGGTG
+>000652_1085_0667 length=80 uaccno=E0R4ISW01CXJXP
+ATTGAAGAGGTTGGTAAGTTTTAAGTTGGTAGGTGGTTGGGGAGTGGTTGGAGAGGAGTTGTTGGGAGTTTGTGTCCTGC
+>000653_1285_1649 length=92 uaccno=E0R4ISW01DE4SJ
+AATTAGTCGAGCGTTGTGGTGGGTATTTGTAATTTTAGCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGGAGGTTGC
+>000902_0715_2005 length=50 uaccno=E0R4ISW01B03K3
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC
+>001146_1255_0340 length=50 uaccno=E0R4ISW01DCGYU
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC
+>001210_1147_1026 length=124 uaccno=E0R4ISW01C2Z5W
+GAGGTGGTGAGTGTTGTGTGTTTAGATTGTGTGTGGTGGTTGGGAGTGGGAGTTGTATTTTAGGGTGTGGGTTGGGAGAGTGAAAGTTGTGGGTGTTTTGGATGGTGGGTTAGGTGGTTGTGCC
+>001278_1608_2022 length=66 uaccno=E0R4ISW01D7HW4
+CACACACACTCTTCCCCATACCTACTCACACACACACACACACACACAAACATACACAAATAATTC
+>001333_1518_1176 length=100 uaccno=E0R4ISW01DZKTM
+AATTGTCGTTTGATTGTTGGAAAGTAGAGGGTCGGGTTGGGGTAGATTCGAAAGGGGAATTTTGAGAAAAGAAATGGAGGGAGGTAGGAAAATTTTTTGC
+>001398_1584_1549 length=112 uaccno=E0R4ISW01D5DPB
+TAATGAAATGGAATGGAATGGAATGGAATGAAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGAAATGGAATGGAGTATAAAGGAATGGAATTAC
+>001455_1136_2179 length=50 uaccno=E0R4ISW01C12AD
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC
+>001481_1165_0549 length=50 uaccno=E0R4ISW01C4KON
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC
+>001744_1376_3512 length=101 uaccno=E0R4ISW01DM5T2
+TAAGTAGGGAAGGTTTGAGGTTGTTGGTGTTGGTAGTAGGGGTGTTTTAGTTAGGGGTTGTAGTTTGTTAAGGGAATTTTATTTGAGTTTAGAATTGAGGC
+>001893_1084_1137 length=120 uaccno=E0R4ISW01CXG4Z
+TGTATATTTTGTTGGGTTTGTATATATTGTTAGGTGTGGTTGGTGAGTTGTATTGGTGGTGGTGTAAGGTGAGTGGAAATGGGAATGGATTGTAGATATGTTGGATTTGTGGTTTTTGGT
+>001927_0254_0706 length=139 uaccno=E0R4ISW01AWLLG
+TGGAATCATCTAAGGGACACAAATAGAATCATCATTGAATGGAATCGAATGGAATCATCTAATGTACTCGAATGGAATTATTATTGAATAGAATAGAATGGAATTATCGAATGGAATCAAATGGAATGTAATGGAATGC
+>002007_1338_1037 length=95 uaccno=E0R4ISW01DJRTR
+GGGTTGTGTATTTGGATAGTATGTGGAAAATGGTATTAAAAAGAATTTGTAGTTGGATTGTTGGTGGTTATTTAGTTTTTGGGTAATGGGTAGAT
+>002186_1130_0654 length=50 uaccno=E0R4ISW01C1H5C
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC
+>002282_1237_2702 length=92 uaccno=E0R4ISW01DAXWG
+AATTAGCCGGGCGTGATGGCGGGCGTTTGTAGTTTTAGTTATTCGGGAGGTTGAGGTAGGAGAATGGCGTGAATTCGGGAAGCGGAGTTTGC
+>002382_1259_0997 length=64 uaccno=E0R4ISW01DCT37
+TAAGGGTTGAAGCGAGGTAGGTAGTTTGTTTGTGGTTTTGTTTCGTATTTTTGTTTCGTATCCC
+>002477_0657_0655 length=131 uaccno=E0R4ISW01BVY8H
+TTTTTGGAAAGTTGGGTGGGTATAGTTTTGAGTAGTTAGAGGTATTATAATAGTATTAGGAAGTTGAATGTGAGGGTATAAGAGTTAATTTGATTTTTCGTTGATATGTTTGTTGTTTGAAGTTAGAGTGC
+>003149_1553_2333 length=128 uaccno=E0R4ISW01D2OBZ
+TATTTAGTTTTAGTTTGTTTAGGTGGTTATAGAATACGGAGTTTATGAAGTTGATTAGGAATATTATTAGTTGAATTAAGAATTGGGAAGAGAGGGGAACGGGAAGGGACGTGAGTGATTATTATTGC
+>003194_1475_2845 length=58 uaccno=E0R4ISW01DVT7J
+TATTTTGGGTTAAGTCGGGTTTAGTTGTTAGGGCGAGAAGTTAGTTGTTGACCCCTGC
+>003206_1315_0479 length=52 uaccno=E0R4ISW01DHQPD
+GGGTTGGATAATATGATGGTGTTGGGGAATATTTAGGTATGTGGTTTGTGGC
+>003271_0173_0314 length=82 uaccno=E0R4ISW01APHAK
+GTTTATTTGTTATTTATTTTTAGGTTTAGAAGAGTGTTTGGTATTTATTGAGGATTTAGTATTTGTTAGAAGGATTGGATTC
+>003443_1737_2250 length=21 uaccno=E0R4ISW01EITSS
+TGTAGGTTGTGTTGTAGGTTG
+>002633_1776_1582 length=40 uaccno=E0R4ISW01EL8JK
+CAGGGTGGATTGGGGAACACACAGTGTGGCCGCGTGATTC
+>002663_0725_3154 length=84 uaccno=E0R4ISW01B1Z2S
+GCGTTTTATATTATAATTTAATATTTTGGAGGTTGGGTGCGGTGGTTTACGTTTGTAGTTTAGTATTTGGGAGGTTAAGGTAGC
+>002761_1056_4055 length=72 uaccno=E0R4ISW01CU2V9
+AATTTTATTCGATTTATGTGATGATTTATTTATTTTATTTGAAGATGATTTTATTCGAGATTATTCGATGAT
+>002843_0289_2275 length=80 uaccno=E0R4ISW01AZPE9
+ATTGAAGAGGTTGGTAAGTTTTAAGTTGGTAGGTGGTTGGGGAGTGGTTGGAGAGGAGTTGTTGGGAGTTTGTGTCCTGC
+>002934_1762_2177 length=50 uaccno=E0R4ISW01EK0Q7
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC
+>003515_1711_1058 length=79 uaccno=E0R4ISW01EGIPG
+AATTGAATGGAATTATTATTGAATGGATTCGAATGGAATTATTATTGAATGGAATCATCGAGTGGAATCGAATGGAATC
+>003541_1276_1589 length=70 uaccno=E0R4ISW01DECAV
+TAGTTTAGGGTGGTAGTTTGGATAAGGTAGTTTTACGGTTTAGTAGTAGTAGGTTAAGTAGGAAAACTGC
+>003587_1522_1804 length=109 uaccno=E0R4ISW01DZXX6
+AATTTATGTAGTGGAAGTAGGATATAAAGAATAGGTTAATGGATTTTGAGATATTAAAAAGAGTAGGAAATTAGTTGAGAGGTTAAGTAGTAGTTTATTTTAGCCACCC
+>003592_0076_0430 length=92 uaccno=E0R4ISW01AGYTC
+AATTAGTTAGGCGTGGTGGCGGGTGTTTGTAGTTTTAGTTATTCGGGAGGTTGAGGTAGGAGAATGTTGTGAATTTAGGAGGTGGAGTTTGC
+>003957_0595_0965 length=130 uaccno=E0R4ISW01BQJIV
+TAATATTAGGTGTCAATTTGACTGGATCGAGGGATGTGTGTCGGTGAGAGTCTCACTAGAGGTTGATATTTGAGTCGTTAGACTGGGAGAGGAAGACCGAACTGTCAAGTGTATGGGCGCCATCCAATTC
+>003986_1127_2937 length=61 uaccno=E0R4ISW01C1AFF
+TAATGGAATGGAATTTTCGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATTAC
+>004012_1559_1491 length=72 uaccno=E0R4ISW01D26M9
+TAGTGGATATAAATGGAATGGATTGGAATGGAATGGATACGAATGGAATGGATTGGAGTGGAATGGATTGAC
+>004030_1508_2061 length=123 uaccno=E0R4ISW01DYPWF
+TACGTATATACGCGTACGCGTATACGTATATACGCGTATACGTATACGCGTACGTATATATACGCGTATACGTTTACGTACGTACGCGTATATACGTACGTATACACACACGCATATGCATAC
+>004038_1061_2047 length=109 uaccno=E0R4ISW01CVG5D
+AATTGATTCGAATGGAATGGATTGGAATGGAACGGATTTGAATGGAATGGATTGGAATGGAATGGATTGAATGGAATGGATTGGAGAGGATTGGATTTGAATGGAATTC
+>004105_1121_0391 length=92 uaccno=E0R4ISW01C0PH1
+AATTAGTTGGGCGTGGTGGCGAGTGTTTGTAATTTTAGTTATTTAGGAGGTTGAGGTAGGAGAATTATTTGAACCCGGTAGACGGAAGTTGC
+>004129_1618_3423 length=79 uaccno=E0R4ISW01D8ELT
+AATTGAATGGTATTGAAAGGTATTAATTTAGTGGAATGGAATGGAATGTATTGGAATGGAAAATAATGGAATGGAGTGC
+>004203_0451_0902 length=72 uaccno=E0R4ISW01BDWC4
+TAGTTGGTGTGTTGTAATCGAGACGTAGTTGGTTGGTACGGGTTAGGGTTTTGATTGGGTTGTTGTGTTTGC
+>004626_1937_0919 length=180 uaccno=E0R4ISW01E0CVD
+TAGAGTAGATAGTAGGGTTAGAGAAGGTAGGGTACGTTTAGTTTGTTAGTAAGGTTTAAGTTTTGGGTGGGAAAGGTTAGTGGCGGGAAGGGACGAAGGTGGTAATCGAGAGTAGATTTAGAGAAGTTTTTGAAGTGGGCGTTGGGAGTTTTCGAAGTATTGAGAGAGAGGAGCTTGTGC
+>004913_0641_2071 length=92 uaccno=E0R4ISW01BULRD
+AATTAGTCGAGCGTTGTGGTGGGTATTTGTAATTTTAGCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGGAGGTTGC
+>005063_0599_1983 length=84 uaccno=E0R4ISW01BQWX9
+ATGTGGTGAAGATTGGTTTTAGGTGTTTTAATGTGGATTTTCAGGGGTTTTAAAAGGGTTGGGAGAGTGAAATATATATAAGGC
+>005140_0759_3209 length=74 uaccno=E0R4ISW01B4ZKR
+TAGTATAGAGGGTTTGTGGTCGTGAGGGTGTTGATGGCGGGAGGGTTTTGATGGTAGGAGGGCCCGTGCTGTGC
+>005351_0883_3221 length=95 uaccno=E0R4ISW01CFVHJ
+TTAGGTGTTATAGTTGAGTGAGATGTTAGTGTTTAATGGTTTTATTTAGGTTGATGGGTTAATGAGGGGGTATTTGATAGTTTTGAAGATTTGAC
+>005380_1702_1187 length=160 uaccno=E0R4ISW01EFQC1
+GTTTTTCGAGTATATATTTAGTAGTACGCTCGACTTCTCTTATATAAAGGTTTTGGTTTTTATAGGTTTTTCCATTGTGTCTGCCTGGGGGAGGGCCCTTCTCCTTCAGGATACTGTAGCTTCTCTGCGTGATAAGCCAGCATTCACGGCTTTCAGGTGC
+>005568_1060_1943 length=20 uaccno=E0R4ISW01CVDWP
+ATAGCGTATTTCTCACCTGC
+>005740_1536_2697 length=116 uaccno=E0R4ISW01D06VV
+TAAAGAGGTGTTATTATTAGTTAGGAGAGGAGGTGGTTAGATAGTAGTGGGATTATAGGGGAATATAGAGTTGTTAGTTTAGGGATAAGGGATTGATCGATGGGTTAGGTCTCTGC
+>005753_1884_3877 length=53 uaccno=E0R4ISW01EVRNB
+AAACTGAGTTGTGATGTTTGCATTCAACTCACAGAGTTCAACATTCCTTTAAC
+>read_equals_adapter 1a
+
+>read_equals_start_of_adapter 1b
+
+>read_equals_end_of_adapter 1c
+
+>read_equals_middle_of_adapter 1d
+
+>read_ends_with_adapter 2a
+GCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCG
+>read_ends_with_start_of_adapter 2b
+GCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCG
+>read_contains_adapter_in_the_middle 3
+CGTAGTTGGTTGGTACG
+>read_starts_with_adapter 4a
+AAAGGTTTTGGTTTTTATAGGTTTTT
+>read_starts_with_end_of_adapter 4b
+AAAGGTTTTGGTTTTTATAGGTTTTT
diff --git a/tests/cut/anchored-back.fasta b/tests/cut/anchored-back.fasta
new file mode 100644
index 0000000..c65f89a
--- /dev/null
+++ b/tests/cut/anchored-back.fasta
@@ -0,0 +1,8 @@
+>read1
+sequence
+>read2
+sequenceBACKADAPTERblabla
+>read3
+sequenceBACKADA
+>read4
+sequence
diff --git a/tests/cut/anchored.fasta b/tests/cut/anchored.fasta
new file mode 100644
index 0000000..cca3279
--- /dev/null
+++ b/tests/cut/anchored.fasta
@@ -0,0 +1,8 @@
+>read1
+sequence
+>read2
+blablaFRONTADAPTsequence
+>read3
+NTADAPTsequence
+>read4
+sequence
diff --git a/tests/cut/anchored_no_indels.fasta b/tests/cut/anchored_no_indels.fasta
new file mode 100644
index 0000000..b189dd4
--- /dev/null
+++ b/tests/cut/anchored_no_indels.fasta
@@ -0,0 +1,12 @@
+>no_mismatch (adapter: TTAGACATAT)
+GAGGTCAG
+>one_mismatch
+GAGGTCAG
+>two_mismatches
+TAAGACGTATGAGGTCAG
+>insertion
+ATTAGACATATGAGGTCAG
+>deletion
+TAGACATATGAGGTCAG
+>mismatch_plus_wildcard
+TNAGACGTATGAGGTCAG
diff --git a/tests/cut/anchored_no_indels_wildcard.fasta b/tests/cut/anchored_no_indels_wildcard.fasta
new file mode 100644
index 0000000..245cd41
--- /dev/null
+++ b/tests/cut/anchored_no_indels_wildcard.fasta
@@ -0,0 +1,12 @@
+>no_mismatch (adapter: TTAGACATAT)
+GAGGTCAG
+>one_mismatch
+GAGGTCAG
+>two_mismatches
+TAAGACGTATGAGGTCAG
+>insertion
+ATTAGACATATGAGGTCAG
+>deletion
+TAGACATATGAGGTCAG
+>mismatch_plus_wildcard
+GAGGTCAG
diff --git a/tests/cut/anywhere_repeat.fastq b/tests/cut/anywhere_repeat.fastq
new file mode 100644
index 0000000..e5ae7f3
--- /dev/null
+++ b/tests/cut/anywhere_repeat.fastq
@@ -0,0 +1,28 @@
+@prefix:1_13_1400/1
+CGTCCGAANTAGCTACCACCCTGATTAGACAAAT
++
+)3%)&&&&!.1&(6:<'67..*,:75)'77&&&5
+@prefix:1_13_1500/1
+NNNNANNNNNNNNNNNNNNNNNNNNNNNNNNNNN
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+@prefix:1_13_1550/1
+NNNNANNNNNNNNNNNNNNNNNNNNNNNNNNNNN
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+@prefix:1_13_1600/1
+NNNNATGTCCCCTGCCACATTGCCCTAGTNNNNN
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+@prefix:1_13_1700/1
+NNNNATGTCCCCTGCCACATTGCCCTAGTTTATT
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+@prefix:1_13_1800/1
+GTTCATGTCCCCTGCCACATTGCCCTAGTTTATT
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+@prefix:1_13_1900/1
+ATGGCTGTCCCCTGCCACATTGCCCTAGTNNNNN
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/cut/discard-untrimmed.fastq b/tests/cut/discard-untrimmed.fastq
new file mode 100644
index 0000000..5caed44
--- /dev/null
+++ b/tests/cut/discard-untrimmed.fastq
@@ -0,0 +1,4 @@
+ at prefix:1_13_1440/1
+CTNCCCTGCCACATTGCCCTAGTTAAAC
++
+57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/cut/discard.fastq b/tests/cut/discard.fastq
new file mode 100644
index 0000000..d3668fd
--- /dev/null
+++ b/tests/cut/discard.fastq
@@ -0,0 +1,4 @@
+ at prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/cut/dos.fastq b/tests/cut/dos.fastq
new file mode 100644
index 0000000..a3437d1
--- /dev/null
+++ b/tests/cut/dos.fastq
@@ -0,0 +1,12 @@
+ at prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGA
++
+)3%)&&&&!.1&(6:<'67..*,:
+ at prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCC
++
+;<:&:A;A!9<<<,7:<=3=;:
+ at prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/cut/empty.fastq b/tests/cut/empty.fastq
new file mode 100644
index 0000000..e69de29
diff --git a/tests/cut/example.fa b/tests/cut/example.fa
new file mode 100644
index 0000000..50ab75e
--- /dev/null
+++ b/tests/cut/example.fa
@@ -0,0 +1,18 @@
+>read1
+MYSEQUENCE
+>read2
+MYSEQUENCE
+>read3
+MYSEQUENCE
+>read4
+MYSEQUENCEADABTER
+>read5
+MYSEQUENCEADAPTR
+>read6
+MYSEQUENCEADAPPTER
+>read7
+MYSEQUENCE
+>read8
+MYSEQUENCE
+>read9
+SOMETHING
diff --git a/tests/cut/examplefront.fa b/tests/cut/examplefront.fa
new file mode 100644
index 0000000..b60e194
--- /dev/null
+++ b/tests/cut/examplefront.fa
@@ -0,0 +1,18 @@
+>read1
+
+>read2
+MYSEQUENCEADAP
+>read3
+SOMETHINGELSE
+>read4
+MYSEQUENCEADABTER
+>read5
+MYSEQUENCEADAPTR
+>read6
+MYSEQUENCEADAPPTER
+>read7
+MYSEQUENCE
+>read8
+MYSEQUENCE
+>read9
+MYSEQUENCE
diff --git a/tests/cut/illumina.fastq b/tests/cut/illumina.fastq
new file mode 100644
index 0000000..9e74b7d
--- /dev/null
+++ b/tests/cut/illumina.fastq
@@ -0,0 +1,400 @@
+ at SEQ:1:1101:9010:3891#0/1 adapter start: 51
+ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG
++
+FFFFFEDBE at 79@@>@CBCBFDBDFDDDDD<@C>ADD at B;5:978 at CBDDF
+ at SEQ:1:1101:9240:3898#0/1
+CCAGCAAGGAAGCCAAGATGGGAAAGGTCATGCGGCATACGCTCGGCGCCAGTTTGAATATTAGACATAATTTATCCTCAAGTAAGGGGCCGAAGCCCCTG
++
+GHGHGHHHHGGGDHHGDCGFEEFHHGDFGEHHGFHHHHHGHEAFDHHGFHHEEFHGHFHHFHGEHFBHHFHHHH at GGGDGDFEEFC@=D?GBGFGF:FB6D
+ at SEQ:1:1101:9207:3899#0/1 adapter start: 64
+TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAAC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHF
+ at SEQ:1:1101:9148:3908#0/1 adapter start: 28
+ACGACGCAATGGAGAAAGACGGAGAGCG
++
+HHHHHHHHHHHHGHHHHGHHHHHHHHHH
+ at SEQ:1:1101:9044:3916#0/1 adapter start: 78
+AACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHGHHHHHHHHHHHHFHEBFHFFEFHE
+ at SEQ:1:1101:9235:3923#0/1
+TTGATGCGGTTATCCATCTGCTTATGGAAGCCAAGCATTGGGGATTGAGAAAGAGTAGAAATGCCACAAGCCTCAATAGCAGGTTTAAGAGCCTCGATACG
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHBHHFHFHHHHHFHHCHHFFHHHHEHHFDHCEEHHHFHHFHFEHHHHHHHHHEHHGFHH<FGGFABGGG?
+ at SEQ:1:1101:9086:3930#0/1 adapter start: 46
+CCATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGTAGTCGGAA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH at HHEHHHFH
+ at SEQ:1:1101:9028:3936#0/1
+CTTGATATTAATAACACTATAGACCACCGCCCCGAAGGGGACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHCHFHHFHGBEFFFEFEFHEHHHFEHHFEEC>CDCEEEFDFFHHHCFFEFE?EBFEB?3
+ at SEQ:1:1101:9185:3939#0/1
+CGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTAGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCGTCATTGCTTATTATGTTCATC
++
+HHHHHHHHHHHHHHFHHEHHHDHHFGHHHCHHHHHDHHHHFECEGBD<DCFHBHBBEEEGCCCDB?C9DECCC3CD<@DA<@>@@?A?DAFF9F<@@08?<
+ at SEQ:1:1101:9140:3961#0/1 adapter start: 66
+CAGGAGAAACATACGAAGGCGCATAACGATACCACTGACCCTCAGCAATCTTAAACTTCTTAGACG
++
+HHHHHHHGHHHHHHHHHHHGHHHHHHHHHHHHHHHHFHHHHHHFGHHHHHHHHHHHHHHHHDHHFH
+ at SEQ:1:1101:9073:3961#0/1 adapter start: 49
+GTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGC
++
+HHHHHHHHFHHHHHHGHHHHHHHHHEHHGHHGHHHHHHHHHHGEHHHHH
+ at SEQ:1:1101:9196:3971#0/1 adapter start: 18
+ACCAGAAGGCGGTTCCTG
++
+HHHHHHHHHFHHHHHHHH
+ at SEQ:1:1101:9053:3973#0/1
+TTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGGTTTATTGCTGTTTGTTTCTATGTGGCTTAAAACGTTACCA
++
+A39>A################################################################################################
+ at SEQ:1:1101:9120:3979#0/1
+GGCGTTGACAGATGTATCCATCTGAATGCAATGAAGAAAACCACCATTACCAGCATTAACCGTCAAACTATCAAAATATAACGTTGACGATGTAGCTTTAG
++
+HHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFFGFFDHBHHHFGEHHHFGHHHEHHHGH
+ at SEQ:1:1101:9045:3988#0/1 adapter start: 91
+TAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGCAGTGTTAA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHFHHHHHHHHHHHFHHHHHHDHHHHHHHFHFFHHGHEHHGHHHGHGHHFH
+ at SEQ:1:1101:9418:3756#0/1
+TAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGAGAGGAGTGGCATTAACACCATCCTTCATGAACTTAATCCACTGT
++
+HHHHHHHHHHHHHHHHFHHHGHEHHHFHHHHFFEHHFHHHHGHHFHFHHHGHHHDHFHCHFCFBCFEFDEHHHHHG at GGGGHHGHFFEG=AB at C:EDEEEH
+ at SEQ:1:1101:9394:3759#0/1
+CCCTCGCTTTCCTGCTCCTGTTGAGGTTATTGCTGCCGTCATTGCTTATTATGTTCATCTCGGCAACATTCATACGGTCTGGCTTATCCGTGCAGAGACTG
++
+#####################################################################################################
+ at SEQ:1:1101:9365:3766#0/1
+AAGCACATCACCTTGAATGCCACCGGAGGCGGCTTTTTGACCGCCTCCAAACAATTTAGACATGGCGCCACCAGCAAGAGCAGAAGCAATACCGCCAGCAA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFFHHHHFHHHHEHHFGHHHHFEHHHHFEHHFDFFAFHEFHFHDFFFFHHDH?DFABFDHADFDHHHFBF
+ at SEQ:1:1101:9436:3776#0/1
+GAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAACCACCATCATGGCGACCATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGGAGTCGGA
++
+HHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHFHGHHHHHHHGHHHHHHFDHHHHHHHHHHHHHFH?HHHHHFBHEH at GHHGD=EEEE88==%893A@@;
+ at SEQ:1:1101:9354:3801#0/1
+CCAGCAAGAGCAGAAGCAATACCGCCAGCAATAGCACCAAACATAAATCACCTCACTTAAGTGGCTGGAGACAAATAATCTCTTTAATAACCTGATTCAGC
++
+HHHHHHHHHGHHGHHEGHHEHFGFEHHGHGGHHHHHHHFHGHHFHHEFFFHEHHFHHHDHE5EDFCAC+C)4&27DDA?7HFHDHEFGFG,<@7>?>??<A
+ at SEQ:1:1101:9389:3804#0/1 adapter start: 28
+ATTAGAGCCAATACCATCAGCTTTACCG
++
+GGGGFDGGHFHHHFFFGBEFGGGGGEFE
+ at SEQ:1:1101:9477:3819#0/1 adapter start: 28
+ATAAAGGAAAGGATACTCGTGATTATCT
++
+HHHHHHHHHHHHHHHHGHHHHHHHHHHH
+ at SEQ:1:1101:9428:3823#0/1
+CGTCAGTAAGAACGTCAGTGTTTCCTGCGCGTACACGCAAGGTAAACGCGAACAATTCAGCGGCTTTAACCGGACGCTCGACGCCATTAATAATGTTTTCC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHFGHGHHHHHHHEHHHHFHHHHHFHHHFHH?FHEFFFDGFDAFDCFAFDBFGBFGFHHHHHHHHHFHFH;8
+ at SEQ:1:1101:9403:3824#0/1 adapter start: 70
+GCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTCTGTTGAACACGACCAGAAAACTGGCCTAA
++
+HHHHHHHHHHHHHHHHHHHEHHHHHHHHHHHHHHHHGDHDHHHHHHHHHGHHHHGHEHGHHHHFFHHHHH
+ at SEQ:1:1101:9362:3824#0/1
+ACCATGAAACCAACATAAACATTATTGCCCGGCGTACGGGGAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAACCACCATCATGGCGACCATC
++
+HHHHHHHGHHHHHHHHHHHHHHHGHHHHHFHHHHHHHHFHHFHHHFHHHHHHHHHFHEHHHFHBHFHHHFCEFDEHHHHGHHHHHHHHHEFFFHHFFFDAG
+ at SEQ:1:1101:9480:3842#0/1 adapter start: 54
+GTACGGATTGTTCAGTAACTTGACTCATGATTTCTTACCTATTAGTGGTTGAAC
++
+BDCCC at 5<<<@BBB7DDDDD<<<9>::@<5DDDDDCDCBEDCDDDDBDDDBAA1
+ at SEQ:1:1101:9286:3846#0/1
+TGATTAAACTCCTAAGCAGAAAACCTACCGCGCTTCGCTTGGTCAACCCCTCAGCGGCAAAAATTAAAATTTTTACCGCTTCGGCGTTATAACCTCACACT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHFHHDGCEGGHHHHFHHFHEHHFHEGHGHGF
+ at SEQ:1:1101:9403:3867#0/1 adapter start: 1
+G
++
+H
+ at SEQ:1:1101:9341:3873#0/1 adapter start: 88
+CCTAAGCAGAAAACCTACCGCGCTTCGCTTGGTCAACCCCTCAGCGGCAAAAATTAAAATTTTTACCGCTTCGGCGTTATAACCTCAC
++
+HHHHHHHGGFHGHHHHHGHHHHFGHGHHHHEHHHFHFHFHFHH?CEEEDFCEFCDFFHFEABEDF.ECDCDFEEEEEGGFADACDHHH
+ at SEQ:1:1101:9381:3881#0/1 adapter start: 41
+ACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGC
++
+HHHHHHHHHHHHGHGHDHHHHHHHHFEHHHGGGGFFBGFFF
+ at SEQ:1:1101:9360:3884#0/1
+TAATACCTTTCTTTTTGGGGTAATTATACTCATCGCGAATATCCTTAAGAGGGCGTTCAGCAGCCAGCTTGCGGCAAAACTGCGTAACCGTCTTCTCGTTC
++
+HGDEHGHDGHFGFGHFDFFF7EEEEGGFGGEGHEGHHHHFFFEHHHFHEHFBFFF>?DEEBF=?CDB:DFBGFBBGDFFHF?FAFGGABFGGFAFE6EDDC
+ at SEQ:1:1101:9323:3894#0/1 adapter start: 100
+ATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTG
++
+HHGHHHHHHHHHHHHHHHHHHHEHDHHHHHGEHHFFHHFFFHHHHHHHHFHDHHBHGHB?HHDFFF?EFEHFHBFGEGGFFFDFBHFHHHHHFHHEFFFCF
+ at SEQ:1:1101:9267:3900#0/1 adapter start: 89
+GTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHFHHHHEHHEHHHFHHHHHHHHHHFHFHECFFHABGGGIGHHHGGFFGF
+ at SEQ:1:1101:9416:3909#0/1
+TAAACGTGACGATGAGGGACATAAAAAGTAAAAATGTCTACAGTAGAGTCAATAGCAAGGCCACGACGCAATGGAGAAAGACGGAGAGCGCCAACGGCGTC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHHHHEHHGHHFEFHEFHFFDHEFHFAFFFA?GDFGFE at FFFB?B7EEFEFE?DAA##
+ at SEQ:1:1101:9360:3917#0/1 adapter start: 68
+ATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAA
++
+HHHHHHHHHHHHHHHHHHHFHHHHHHHHHHFHHHHHHHFHEFHHHEHHCFFEFEE9AFFBBDCDCAEE
+ at SEQ:1:1101:9337:3918#0/1 adapter start: 14
+CATCAGCACCAGCA
++
+FDEGGGCDBEFCDF
+ at SEQ:1:1101:9307:3927#0/1 adapter start: 15
+TCAGCGCCTTCCATG
++
+FFFFFFFFFFFFFDF
+ at SEQ:1:1101:9479:3929#0/1 adapter start: 9
+GACAAATTA
++
+HHHHHHHHH
+ at SEQ:1:1101:9277:3934#0/1 adapter start: 71
+CTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCTGCTTC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHEHFHHHHFHHHHHFHHEHFHHHFHHFDHHFHHE
+ at SEQ:1:1101:9442:3934#0/1
+AGACCCATAATGTCAATAGATGTGGTAGAAGTCGTCATTTGGCGAGAAAGCTCAGTCTCAGGAGGAAGCGGAGCAGTCCAAATGTTTTTGAGATGGCAGCA
++
+HHHHHHHHHGHHHHHFGHHBHHEHGFHHDHGDEGDHHHHHFHHHHHAHHH?FEEBEFDFBEBEEFEHFE7ECCDCG=FDFFDFFFHHHHFEEBEF;BEAEG
+ at SEQ:1:1101:9329:3935#0/1
+AGATGGATAACCGCATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCT
++
+GFGGGEEGDHHHGGEHHHHHHGGFHHEAHHAGDEGEGGEDG at GGGHHGHHFGGH6@CADDHHBEEE at 8EBGEEFGGGHFHHHHGEGFGGEFBGEDDE?E7E
+ at SEQ:1:1101:9445:3956#0/1 adapter start: 81
+TGCAACAACTGAACGGACTGGAAACACTGGTCATAATCATGGTGGCGAATAAGTACGCGTTCTTGCAAATCACCAGAAGGC
++
+HHHHHHHHHGFHHHHHHHHHHHHHHGHHHHFHHHHHHHHHHHFGHHHFGHHHHFGHHFHEHHHHHHHHHHHHGBHHHHGFG
+ at SEQ:1:1101:9357:3957#0/1
+TTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGAGAGGAGTGGCATTAACACCATCCTTCATGAACTTAATCCACTG
++
+HHHHHHGHHHHHHHHHHGHEHHHHHGHEHHHHHHHHHHHHHHGHEBGGFGFFFFFBH?HCEEED<FEEEFFHHDHHHHEHHHGFHHH:BHHHHFHEFFHFF
+ at SEQ:1:1101:9487:3957#0/1
+CAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATG
++
+HHHHHHHHHHHHHHHGEHHHGHHHHHHHEGFHGHHHGHHHHGGHHHHHGHHHHHHHHHHFHHB>EFHFHBHFHCFHHGGGHEGHEGHEF at GHHFHEDHH;H
+ at SEQ:1:1101:9309:3957#0/1 adapter start: 72
+GTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHGHFHHHFHHHHHHHHGHHHFHHHHHHHFHDHHHHHHFHCHHEAHHDG
+ at SEQ:1:1101:9425:3960#0/1
+CTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAGTGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGC
++
+8?8?C?BC at BD=ABB==BD?CADD=AD>C@@CCBBDD at B/143'3.>>@9BCBDDDC8@@;<A=<DDDDB?A:A;9:2-74,<82;9877CBCDD/B at 5;<
+ at SEQ:1:1101:9337:3969#0/1
+GAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAA
++
+DBFEFFDEEEBFFFFF8FF=D=DDDEEE=E>@???FB=DFB=>C=EEFFFFFEFFFFF:FEF at FEF<FFFFF?DFDD8DDBD=DBFEB at E6FECF@EB8E?
+ at SEQ:1:1101:9388:3971#0/1
+CTGAATCTCTTTAGTCGCAGTAGGCGGAAAACGAACAAGCGCAAGAGTAAACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGG
++
+HHHHHHFHHHHHHHHHHHHHHHHHHHHFHHGHHHFHHHHHHGHHHHHEFHHHFHHFEHHFEHHFFHHHHECFDF?HHHHGEGGHHHFHHHFEGCFFFFF=E
+ at SEQ:1:1101:9414:3978#0/1 adapter start: 99
+TTATTGGTATCAGGGTTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGAGAGGAGTGGCATTAACACCATCCTTCGC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHFFHHHHG at HFHDHGHDHHHHHHFGHHGHG
+ at SEQ:1:1101:9494:3983#0/1 adapter start: 72
+TAGCACCAAACATAAATCACCTCACTTAAGTGGCTGGAGACAAATAATCTCTTTAATAACCTGATTCAGCGA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHBF?FBHHFEHB?HEFEHBGEDEEBEDEEFACAFE>
+ at SEQ:1:1101:9363:3989#0/1 adapter start: 95
+CCTCCAAGATTTGGAGGCATGAAAACATACAATTGGGAGGGTGTCAATCCTGACGGTTATTTCCTAGACAAATTAGAGCCAATACCATCAGCTTTGCCTAA
++
+HHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHGHHHHHHHG<GFGGGGFGHHHHHHEEEEHHDEFHHFHHHFHHDHEGHHHHBHHGCGF8ECEEFFEDBA=
+ at SEQ:1:1101:9436:3998#0/1 adapter start: 67
+TAAATTGTTTGGAGGCGGTCAAAAAGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGAT
++
+HHHHHHHHHHHHHHHHGGDHHHHHHFHHFGHHHHHHDHHFFDFGEFFHDFCFFEBDFHFFFFEEDEB
+ at SEQ:1:1101:9621:3755#0/1
+AGCCAATACCATCAGCTTTACCGTCTTTCCAGAAATTGTTCCAAGTATCGGCAACAGCTTTATCAATACCATGAAAAATATCAACCACACCAGAAGCAGCA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHFHHFF?FHHFHFHHHHEHHC at FEHFHFHBGGGFHHHHHHDHHFFHGFHA
+ at SEQ:1:1101:9738:3756#0/1 adapter start: 1
+T
++
+H
+ at SEQ:1:1101:9580:3761#0/1 adapter start: 49
+TATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGAT
++
+HHHHHHHHHGGHHHHHEHHHFHEHHGEGGHFGDFGHFHFGHHFFDH?EF
+ at SEQ:1:1101:9533:3764#0/1 adapter start: 20
+TCTGTTGAACACGACCAGAA
++
+FEFFFF at FFDFFEFFDDBDD
+ at SEQ:1:1101:9636:3775#0/1
+ATAAGGCCACGTATTTTGCAAGCTATTTAACTGGCGGCGATTGCGTACCCGACGACCAAAATTAGGGTCAACGCTACCTGTAGGAAGTGTCCGCATAAAGT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHFH6HHHHHHHHHHFFHFCHFDCHFE;DAD9BDDDDGFGDGDGGB<FDCDCDGF>GEEGB;5
+ at SEQ:1:1101:9554:3781#0/1
+CACGCTCTTTTAAAATGTCAACAAGAGAATCTCTACCATGAACAAAATGTGACTCATATCTAAACCAGTCCTTGACGAACGTGCCAAGCATATTAAGCCAC
++
+HHHHHHHHHHHHHGGHHHHHHGHFHHHHHHEHHFHHHEHHHHHHHEHHGHHHHEHHHGFHHHEHHHHHHEEFFEDFEDFF>ACBAHGHHHHECEGHBCFEE
+ at SEQ:1:1101:9695:3783#0/1 adapter start: 52
+AATAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHGHHHHHHHHHHF
+ at SEQ:1:1101:9572:3788#0/1
+ACCAACACGGCGAGTACAACGGCCCAGCTCAGAAGCGAGAACCAGCTGCGCTTGGGTGGGGCGATGGTGATGGTTTGCATGTTTGGCTCCGGTCTGTAGGC
++
+FFFFFFFFF=EBEB0A at A@<BD:EEFFA at EEEDE?EDE8<E?EE=E:BBB>>A?;FED;;<7??A>>9A>?DA1ADD?D:FF:BC;@##############
+ at SEQ:1:1101:9601:3793#0/1
+GCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGCCTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGAC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHEHEGHFHHHHHHHHFHFHCHHHFHFFHHHHHH at HHHHHHGHHHFHHGFHHCFHEGGGFEGE?GCDAD6AD
+ at SEQ:1:1101:9634:3800#0/1
+TTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGG
++
+HHGHFHFHHHHCGHHFHHHHHHGEHHHHHGFBEFHHFEHDHHHGFHHEHHFF9ECD?CEEHED<HBDEEBFEDEEE<FDFDGFBEHHEHCE>F?GEEDEEG
+ at SEQ:1:1101:9501:3800#0/1 adapter start: 42
+TGACCACCTACATACCAAAGACGAGCGCCTTTACGCTTGCCT
++
+HHHHHHHHHHHHHHHHFHHHHHHHHFHHHHHHHHHHHHHHHH
+ at SEQ:1:1101:9703:3807#0/1 adapter start: 27
+TAATAACCTGATTCAGCGAAACCAATC
++
+HHHHHHHHHHHHHHHHHHHHHHGHHHG
+ at SEQ:1:1101:9728:3808#0/1 adapter start: 7
+CAGAAAA
++
+HHHFHHH
+ at SEQ:1:1101:9676:3812#0/1 adapter start: 1
+T
++
+H
+ at SEQ:1:1101:9620:3815#0/1
+TCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAGGCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGA
++
+HHHHHHHHHHGGHHHGHHGHHHHHHHHHHGFHGHHHHHHHHHFHDHHHDDHFHFHFHHHHFF9EFF>DG?FCBCDFFFEBFFE at DFEGGEEG?GF>>:;@A
+ at SEQ:1:1101:9720:3834#0/1 adapter start: 74
+TAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGT
++
+HGHHHHHHHHHHHHHHHGGHEGGFGHFGHFHHDGHGHGHHHHHHHHHHFHHHHHFHFHFFHEFHF=FFHFHHFF
+ at SEQ:1:1101:9635:3844#0/1 adapter start: 4
+GACC
++
+HHHH
+ at SEQ:1:1101:9744:3849#0/1 adapter start: 55
+AAACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTC
++
+HHHHHHHGCHHFHHFHHFFHEHFGCHHGDGHEFFHFHEHHGBBGFCDGFEEFDCF
+ at SEQ:1:1101:9725:3850#0/1
+ATAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGA
++
+FDGGGDGGGEGGGGGBGBEGFFFDFFFFGGFGGGGFBGGGGGEFDFFGEGFFEFEDGGEEF9DCF?EFBBEDBBGFGGEGGGGCFGFEB at B7C>CDEEE##
+ at SEQ:1:1101:9544:3854#0/1
+TAGCGGTAAAGTTAGACCAAACCATGAAACCAACATAAACATTATTGCCCGGCGTACGGGGAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHFFHHHHHHHHHBFHHHHHFHHHHHHHHHHHHHHFCHHHBHE
+ at SEQ:1:1101:9581:3856#0/1
+GGGCGGTGGTCTATAGTGTTATTAATATCAAGTTGGGGGAGCACATTGTAGCATTGTGCCAATTCATCCATTAACTTCTCAGTAACAGATACAAACTCATC
++
+HHHHHHEHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHFHHHHGHHHHHHHHHHHHHHHGGHHHFHHHHHGHFGHGEGHHHHHHFEHFHGDGGFFGHH at DH
+ at SEQ:1:1101:9649:3858#0/1 adapter start: 33
+CCTCCAAACAATTTAGACATGGCGCCACCAGCA
++
+B<B at A@AAB>FEEEE@@BA at 3>8<>CCDDBEE@
+ at SEQ:1:1101:9616:3862#0/1 adapter start: 91
+GAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHEHHHHHHHHHHHHHHFHHHHHHHFFFHFDHHEHHHGHHHHGDEHHGHHEGH
+ at SEQ:1:1101:9696:3866#0/1
+CAAGTTGCCATACAAAACAGGGTCGCCAGCAATATCGGTATAAGTCAAAGCACCTTTAGCGTTAAGGTACTGAATCTCTTTAGTCGCAGTAGGCGGAAAAC
++
+HHHHHHHHHHHHHHHHHHHHEHEHHHEHHHHFHHHHHHFHHHFHFHHHHHHHHFHHHHFHHFEHBHFEHHHHCEEHHFHHHHHHHHHHHHEHHHHCAFEFG
+ at SEQ:1:1101:9512:3869#0/1
+GCTCGACGCCATTAATAATGTTTTCCGTAAATTCAGCGCCTTCCATGATGAGACAGGCCGTTTGAATGTTGACGGGATGAACATAATAAGCAATGACGGCA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHFHHHDHHHEHHFFFFFFHFAFEFH?E at FFGGGFGHFHAEFGFFFCEEFF
+ at SEQ:1:1101:9723:3870#0/1 adapter start: 66
+CTTTAGCAGCAAGGTATATATCTGACTTTTTGTTAACGTATTTAGCCACATAGCAACCAACAGACA
++
+##################################################################
+ at SEQ:1:1101:9667:3874#0/1
+CTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAAAAAGAGCTTACT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHAHHHHEHHD=DAD>D6ADGE at EBE;@?BCGGE?4>ADAAC
+ at SEQ:1:1101:9565:3879#0/1 adapter start: 24
+AGCCTTATGGCCGTCAACATACAT
++
+HHHHHHHHHHHHHHHHHFHHGFFH
+ at SEQ:1:1101:9721:3885#0/1 adapter start: 51
+TTCCTCAAACGCTTGTTCGGTGGATAAGTTATGGCATTAATCGATTTATTT
++
+>BC?:A?=<>::A=528882.53)5.77;407)*9@:AA8CAA########
+ at SEQ:1:1101:9707:3894#0/1 adapter start: 40
+AACACCATCCTTCATGAACTTAATCCACTGTTCACCATAA
++
+F at F8DEE@EEBCCCCFFEFDDC=DCCFFF=ADD=D>@AA@
+ at SEQ:1:1101:9560:3900#0/1 adapter start: 6
+AGAAGT
++
+GGGGGF
+ at SEQ:1:1101:9696:3913#0/1 adapter start: 2
+CC
++
+HH
+ at SEQ:1:1101:9574:3914#0/1 adapter start: 5
+GAACA
++
+HHHHH
+ at SEQ:1:1101:9508:3931#0/1 adapter start: 91
+TAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCCCTCTTAAGGATATTCGCGATGAGTATAATTACCCCAA
++
+HGHHHHHHHHHHHHHHHHHHGHHHHHFHHHGHHHHFHHHHHHHHHD?ACFEF9FFEEBHBAEFB?E<F5CAD(DAEE;AE at C?D at BDGF?F
+ at SEQ:1:1101:9617:3935#0/1
+TAAATTTAATGTGACCGTTTATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAACGCCGAAGCGGT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHGHHHHHHHHHHHHHHHHHFHHHFFEDFEFHFHHFHFHGHHFHHHFHHEHHHFHHHHFB
+ at SEQ:1:1101:9667:3950#0/1 adapter start: 66
+CTTTAGCCATAGCACCAGAAACAAAACTAGGGACGGCCTCATCAGGGTTAGGAACATTAGAGCCTT
++
+HHHHHHHHHHHHHHHHGHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHEHHHHDGHFHHHHHHHBHH
+ at SEQ:1:1101:9705:3951#0/1 adapter start: 29
+ATTGCGTACCCGACGACCAAAATTAGGGT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at SEQ:1:1101:9527:3965#0/1
+AACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTCTGTTGAACACGACCAGAAAACTGGCCTAACGACGTTTGGTCAGTTCC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHFHHGHHFHEHHHEFEFF at HFHFGGGDGGHFGDFHFGHGHHFGHG
+ at SEQ:1:1101:9550:3969#0/1
+AGAGTAAACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTCTGTTGAACACGACCAGAAAACTGGCCTAACGACGTTTGGTC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHFHHHHHHHHHHHHFHHHFHHFEHHHHHHHHHHHHHGHHFHHHHFHHHHHHHHHHHG
+ at SEQ:1:1101:9636:3973#0/1 adapter start: 9
+CAAGCGCAA
++
+HHHHHHHHH
+ at SEQ:1:1101:9726:3981#0/1 adapter start: 66
+TTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAAT
++
+HHFEHHHHHHHHHHHHHHHHHHHHHHGHGHHGHGHHHHHHHHHHGGHHHEHHGFHHHHHEHHEHGH
+ at SEQ:1:1101:9603:3981#0/1 adapter start: 32
+TCTAAGAAGTTTAAGATTGCTGAGGGTCAGTG
++
+HHHHHHHHHHHHHHHFHHHHHHHHHHEHHHHH
+ at SEQ:1:1101:9533:3990#0/1 adapter start: 1
+G
++
+B
+ at SEQ:1:1101:9583:3992#0/1 adapter start: 20
+AAGGTACTGAATCTCTTTAG
++
+98583=>><>B at CBCD==BB
+ at SEQ:1:1101:9903:3754#0/1
+ACCAAAATTAGGGTCAACGCTACCTGTAGGAAGTGTCCGCATAAAGTGCACCGCATGGAAATGAAGACGGCCATCAGCTGTACCATACTCAGGCACACAAA
++
+GFEGGGGGBGE at EAEEGGFGGEGGFGEFFGFGFFGGEGGGGEFGCFCEFBF7FGEGEF?BFEEFDFFE??AADD+D at C@CGFCE6FDFFDFBGFDD at DAAD
+ at SEQ:1:1101:9878:3755#0/1 adapter start: 32
+AGAACGTGAAAAAGCGTCCTGCGTGTAGCGAA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at SEQ:1:1101:9833:3756#0/1 adapter start: 65
+TCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGTGTTAATGCCACTCCTC
++
+HHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHFHHHHGHHFHHHHHEHEHHHHFHEHHHEHFH
+ at SEQ:1:1101:9991:3777#0/1
+GCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTGGATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGT
++
+HHHHHHHHHHHHHHHHHHHHHHGHHHGHHHHHHHGHHHHHHGHHHHHHHHHHHHHFHHFFDFFFCFFDHCFF;BFGEFGEGFGGFFF.CFDCCEDB=CBC@
diff --git a/tests/cut/illumina.info.txt b/tests/cut/illumina.info.txt
new file mode 100644
index 0000000..f8ad1fc
--- /dev/null
+++ b/tests/cut/illumina.info.txt
@@ -0,0 +1,100 @@
+SEQ:1:1101:9010:3891#0/1 adapter start: 51 1 51 81 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG GCCTAACTTCTTAGACTGCCTTAAGGACGT AAGCCAAGATGGGAAAGGTC adapt FFFFFEDBE at 79@@>@CBCBFDBDFDDDDD<@C>ADD at B;5:978 at CBDDF FDB4B?DB21;84?DDBC9DEBAB;=@<@@ B@@@@B>CCBBDE98>>0 at 7
+SEQ:1:1101:9240:3898#0/1 -1 CCAGCAAGGAAGCCAAGATGGGAAAGGTCATGCGGCATACGCTCGGCGCCAGTTTGAATATTAGACATAATTTATCCTCAAGTAAGGGGCCGAAGCCCCTG GHGHGHHHHGGGDHHGDCGFEEFHHGDFGEHHGFHHHHHGHEAFDHHGFHHEEFHGHFHHFHGEHFBHHFHHHH at GGGDGDFEEFC@=D?GBGFGF:FB6D
+SEQ:1:1101:9207:3899#0/1 adapter start: 64 1 64 94 TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAAC GCCTAACTTCTTAGACTGCCTTAAGGACGT ATACATA adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHF HHFHFFFFFBHHGHHHFFHHFHGGHHDEBF G<FGGDG
+SEQ:1:1101:9148:3908#0/1 adapter start: 28 1 28 58 ACGACGCAATGGAGAAAGACGGAGAGCG GCCTAACTTCTTAGACTGCCTTAAGGACGT CCAACGGCGTCCATCTCGAAGGAGTCGCCAGCGATAACCGGAG adapt HHHHHHHHHHHHGHHHHGHHHHHHHHHH HHHHHHHHHHHHHHHHHGHHHHDHDHHFHH HHHFFFFFHHHEFBEGEGGFFFHHHFHHHHHHFHHEHHGHEHD
+SEQ:1:1101:9044:3916#0/1 adapter start: 78 1 78 101 AACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGA GCCTAACTTCTTAGACTGCCTTA adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHGHHHHHHHHHHHHFHEBFHFFEFHE FHHGHFHHHHGGHGHHFHGGGHG
+SEQ:1:1101:9235:3923#0/1 -1 TTGATGCGGTTATCCATCTGCTTATGGAAGCCAAGCATTGGGGATTGAGAAAGAGTAGAAATGCCACAAGCCTCAATAGCAGGTTTAAGAGCCTCGATACG HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHBHHFHFHHHHHFHHCHHFFHHHHEHHFDHCEEHHHFHHFHFEHHHHHHHHHEHHGFHH<FGGFABGGG?
+SEQ:1:1101:9086:3930#0/1 adapter start: 46 1 46 76 CCATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGTAGTCGGAA GCCTAACTTCTTAGACTGCCTTAAGGACGT CCGAAGAAGACTCAAAGCGAACCAA adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH at HHEHHHFH HHHFHHHHHFFHFFHHBHFFHFHHCFFHFH HFHHHHEEHHGHHFEHFHGGEHEFH
+SEQ:1:1101:9028:3936#0/1 -1 CTTGATATTAATAACACTATAGACCACCGCCCCGAAGGGGACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGC HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHCHFHHFHGBEFFFEFEFHEHHHFEHHFEEC>CDCEEEFDFFHHHCFFEFE?EBFEB?3
+SEQ:1:1101:9185:3939#0/1 -1 CGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTAGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCGTCATTGCTTATTATGTTCATC HHHHHHHHHHHHHHFHHEHHHDHHFGHHHCHHHHHDHHHHFECEGBD<DCFHBHBBEEEGCCCDB?C9DECCC3CD<@DA<@>@@?A?DAFF9F<@@08?<
+SEQ:1:1101:9140:3961#0/1 adapter start: 66 1 66 96 CAGGAGAAACATACGAAGGCGCATAACGATACCACTGACCCTCAGCAATCTTAAACTTCTTAGACG GCCTAACTTCTTAGACTGCCTTAAGGACGT AATCA adapt HHHHHHHGHHHHHHHHHHHGHHHHHHHHHHHHHHHHFHHHHHHFGHHHHHHHHHHHHHHHHDHHFH HHHEHHFHFHHHHHGHHHHHFHGHGHHHHH EHCFG
+SEQ:1:1101:9073:3961#0/1 adapter start: 49 1 49 79 GTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGC GCCTAACTTCTTAGACTGCCTTAAGGACGT TGCTGCATTTCCTGAGCTTAAT adapt HHHHHHHHFHHHHHHGHHHHHHHHHEHHGHHGHHHHHHHHHHGEHHHHH GFHFFGHFHHGHHCHHFDGHHHHHFHHHFC DFGHHHHHHCFGHHEGEFBGGB
+SEQ:1:1101:9196:3971#0/1 adapter start: 18 1 18 48 ACCAGAAGGCGGTTCCTG GCCTAACTTCTTAGACTGCCTTAAGGACGT AATGAATGGGAAGCCTTCAAGAAGGTGATAAGCAGGAGAAACATACGAAGGCG adapt HHHHHHHHHFHHHHHHHH HGHHHGHHHHHHHFHHHHHHHHHHHEHHHH HHHHHHHHFHHGHHHHHEHFHHHHBHEHHGEHFHFHHFHHHHFBDFHF?HHHH
+SEQ:1:1101:9053:3973#0/1 -1 TTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGGTTTATTGCTGTTTGTTTCTATGTGGCTTAAAACGTTACCA A39>A################################################################################################
+SEQ:1:1101:9120:3979#0/1 -1 GGCGTTGACAGATGTATCCATCTGAATGCAATGAAGAAAACCACCATTACCAGCATTAACCGTCAAACTATCAAAATATAACGTTGACGATGTAGCTTTAG HHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFFGFFDHBHHHFGEHHHFGHHHEHHHGH
+SEQ:1:1101:9045:3988#0/1 adapter start: 91 1 91 101 TAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGCAGTGTTAA GCCTAACTTC adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHFHHHHHHHHHHHFHHHHHHDHHHHHHHFHFFHHGHEHHGHHHGHGHHFH GHHFFFEFFE
+SEQ:1:1101:9418:3756#0/1 -1 TAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGAGAGGAGTGGCATTAACACCATCCTTCATGAACTTAATCCACTGT HHHHHHHHHHHHHHHHFHHHGHEHHHFHHHHFFEHHFHHHHGHHFHFHHHGHHHDHFHCHFCFBCFEFDEHHHHHG at GGGGHHGHFFEG=AB at C:EDEEEH
+SEQ:1:1101:9394:3759#0/1 -1 CCCTCGCTTTCCTGCTCCTGTTGAGGTTATTGCTGCCGTCATTGCTTATTATGTTCATCTCGGCAACATTCATACGGTCTGGCTTATCCGTGCAGAGACTG #####################################################################################################
+SEQ:1:1101:9365:3766#0/1 -1 AAGCACATCACCTTGAATGCCACCGGAGGCGGCTTTTTGACCGCCTCCAAACAATTTAGACATGGCGCCACCAGCAAGAGCAGAAGCAATACCGCCAGCAA HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFFHHHHFHHHHEHHFGHHHHFEHHHHFEHHFDFFAFHEFHFHDFFFFHHDH?DFABFDHADFDHHHFBF
+SEQ:1:1101:9436:3776#0/1 -1 GAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAACCACCATCATGGCGACCATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGGAGTCGGA HHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHFHGHHHHHHHGHHHHHHFDHHHHHHHHHHHHHFH?HHHHHFBHEH at GHHGD=EEEE88==%893A@@;
+SEQ:1:1101:9354:3801#0/1 -1 CCAGCAAGAGCAGAAGCAATACCGCCAGCAATAGCACCAAACATAAATCACCTCACTTAAGTGGCTGGAGACAAATAATCTCTTTAATAACCTGATTCAGC HHHHHHHHHGHHGHHEGHHEHFGFEHHGHGGHHHHHHHFHGHHFHHEFFFHEHHFHHHDHE5EDFCAC+C)4&27DDA?7HFHDHEFGFG,<@7>?>??<A
+SEQ:1:1101:9389:3804#0/1 adapter start: 28 1 28 58 ATTAGAGCCAATACCATCAGCTTTACCG GCCTAACTTCTTAGACTGCCTTAAGGACGT TCTTTCCAGAAATTGTTCCAAGTATCGGCAACAGCTTTATCAA adapt GGGGFDGGHFHHHFFFGBEFGGGGGEFE EFFGHFHHFHFDEFFEFHHHBFEFDD=BDD DFHBE>EDC at FDDDDCDFE?DEEFGF<EE?F?GGEF>CC@;@D
+SEQ:1:1101:9477:3819#0/1 adapter start: 28 1 28 58 ATAAAGGAAAGGATACTCGTGATTATCT GCCTAACTTCTTAGACTGCCTTAAGGACGT TGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGT adapt HHHHHHHHHHHHHHHHGHHHHHHHHHHH HHHHHHHHHHHHHFHHFHFHHHHHHHEHHH HHEHHHHHHEHHDHDHBHHGCEHHHHHGGEFGG=DGDGCGC68
+SEQ:1:1101:9428:3823#0/1 -1 CGTCAGTAAGAACGTCAGTGTTTCCTGCGCGTACACGCAAGGTAAACGCGAACAATTCAGCGGCTTTAACCGGACGCTCGACGCCATTAATAATGTTTTCC HHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHFGHGHHHHHHHEHHHHFHHHHHFHHHFHH?FHEFFFDGFDAFDCFAFDBFGBFGFHHHHHHHHHFHFH;8
+SEQ:1:1101:9403:3824#0/1 adapter start: 70 1 70 100 GCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTCTGTTGAACACGACCAGAAAACTGGCCTAA GCCTAACTTCTTAGACTGCCTTAAGGACGT C adapt HHHHHHHHHHHHHHHHHHHEHHHHHHHHHHHHHHHHGDHDHHHHHHHHHGHHHHGHEHGHHHHFFHHHHH EHFHFEHHFGBFFFDHCEHHHHGH=HHH=G E
+SEQ:1:1101:9362:3824#0/1 -1 ACCATGAAACCAACATAAACATTATTGCCCGGCGTACGGGGAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAACCACCATCATGGCGACCATC HHHHHHHGHHHHHHHHHHHHHHHGHHHHHFHHHHHHHHFHHFHHHFHHHHHHHHHFHEHHHFHBHFHHHFCEFDEHHHHGHHHHHHHHHEFFFHHFFFDAG
+SEQ:1:1101:9480:3842#0/1 adapter start: 54 1 54 84 GTACGGATTGTTCAGTAACTTGACTCATGATTTCTTACCTATTAGTGGTTGAAC GCCTAACTTCTTAGACTGCCTTAAGGACGT CGCATCGGACTCAGATA adapt BDCCC at 5<<<@BBB7DDDDD<<<9>::@<5DDDDDCDCBEDCDDDDBDDDBAA1 /82638?D=CD2*><6BC<CC7;=;*CBCC AC at 73C2=3<<@,CB at D
+SEQ:1:1101:9286:3846#0/1 -1 TGATTAAACTCCTAAGCAGAAAACCTACCGCGCTTCGCTTGGTCAACCCCTCAGCGGCAAAAATTAAAATTTTTACCGCTTCGGCGTTATAACCTCACACT HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHFHHDGCEGGHHHHFHHFHEHHFHEGHGHGF
+SEQ:1:1101:9403:3867#0/1 adapter start: 1 1 1 31 G GCCTAACTTCTTAGACTGCCTTAAGGACGT GAACAAAGAAACGCGGCACAGAATGTTTATAGGTCTGTTGAACACGACCAGAAAACTGGCCTAACGACGT adapt H HHHHHHHHHHHHHHHHHHHHHHHHHHHHGH HHHHHHHHHHHHHHHHHHHHHHHHHHHFHFHHHHHHHHHDFFBFHGGGFHHHHHHHHHHHHHHEBHHHFB
+SEQ:1:1101:9341:3873#0/1 adapter start: 88 1 88 101 CCTAAGCAGAAAACCTACCGCGCTTCGCTTGGTCAACCCCTCAGCGGCAAAAATTAAAATTTTTACCGCTTCGGCGTTATAACCTCAC GCCTAACTTCTTA adapt HHHHHHHGGFHGHHHHHGHHHHFGHGHHHHEHHHFHFHFHFHH?CEEEDFCEFCDFFHFEABEDF.ECDCDFEEEEEGGFADACDHHH BAFG3FF:BBE##
+SEQ:1:1101:9381:3881#0/1 adapter start: 41 1 41 71 ACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGC GCCTAACTTCTTAGACTGCCTTAAGGACGT CGCTTAAAGCTACCAGTTATATGGCTGTTG adapt HHHHHHHHHHHHGHGHDHHHHHHHHFEHHHGGGGFFBGFFF HFEHEHHEF>FGFF?E?FEFFHBBFEE3E, ;/97-0(6,?=BB at A@D9D###########
+SEQ:1:1101:9360:3884#0/1 -1 TAATACCTTTCTTTTTGGGGTAATTATACTCATCGCGAATATCCTTAAGAGGGCGTTCAGCAGCCAGCTTGCGGCAAAACTGCGTAACCGTCTTCTCGTTC HGDEHGHDGHFGFGHFDFFF7EEEEGGFGGEGHEGHHHHFFFEHHHFHEHFBFFF>?DEEBF=?CDB:DFBGFBBGDFFHF?FAFGGABFGGFAFE6EDDC
+SEQ:1:1101:9323:3894#0/1 adapter start: 100 -1 ATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTG HHGHHHHHHHHHHHHHHHHHHHEHDHHHHHGEHHFFHHFFFHHHHHHHHFHDHHBHGHB?HHDFFF?EFEHFHBFGEGGFFFDFBHFHHHHHFHHEFFFCF
+SEQ:1:1101:9267:3900#0/1 adapter start: 89 1 89 101 GTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGT GCCTAACTTCTT adapt HHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHFHHHHEHHEHHHFHHHHHHHHHHFHFHECFFHABGGGIGHHHGGFFGF FCACFECEB5<;
+SEQ:1:1101:9416:3909#0/1 -1 TAAACGTGACGATGAGGGACATAAAAAGTAAAAATGTCTACAGTAGAGTCAATAGCAAGGCCACGACGCAATGGAGAAAGACGGAGAGCGCCAACGGCGTC HHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHHHHEHHGHHFEFHEFHFFDHEFHFAFFFA?GDFGFE at FFFB?B7EEFEFE?DAA##
+SEQ:1:1101:9360:3917#0/1 adapter start: 68 1 68 98 ATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAA GCCTAACTTCTTAGACTGCCTTAAGGACGT AAA adapt HHHHHHHHHHHHHHHHHHHFHHHHHHHHHHFHHHHHHHFHEFHHHEHHCFFEFEE9AFFBBDCDCAEE EFHD??<DFEEEEHHEEBFEGBDEHCHFE? GE@
+SEQ:1:1101:9337:3918#0/1 adapter start: 14 1 14 44 CATCAGCACCAGCA GCCTAACTTCTTAGACTGCCTTAAGGACGT CGCTCCCAAGCATTAAGCTCAGGAAATGCAGCAGCAAGATAATCACGAGTATCCTTT adapt FDEGGGCDBEFCDF FBGFFGGEGEDE=GGGEGGGEFFCCFGF7E FFEGDEGCF;BFBEBFFCD5FEDCDA=95>>E4 at EC>74<-5@##############
+SEQ:1:1101:9307:3927#0/1 adapter start: 15 1 15 45 TCAGCGCCTTCCATG GCCTAACTTCTTAGACTGCCTTAAGGACGT ATGAGACAGGCCGTTTGAATGTTGACGGGATGAACATAATAAGCAATGACGGCAGC adapt FFFFFFFFFFFFFDF =EEEEDFFFFBEEEEFFFFFFFFFFFDEEB DFFFFDFFFFEF at FFFBEFFBFFEF--@@<FFBFFFF?FFEBEDEFEFFF######
+SEQ:1:1101:9479:3929#0/1 adapter start: 9 1 9 39 GACAAATTA GCCTAACTTCTTAGACTGCCTTAAGGACGT GAGCCAATACCATCAGCTTTACCGTCTTTCCAGAAATTGTTCCAAGTATCGGCAACAGCTTT adapt HHHHHHHHH HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH HHHFHFFHHFHFHHFFFHHHHHHFHHAE?EEHFFCFGGGAGGEGFFHHHHHGFHH?GHGHEG
+SEQ:1:1101:9277:3934#0/1 adapter start: 71 1 71 101 CTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCTGCTTC GCCTAACTTCTTAGACTGCCTTAAGGACGT adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHEHFHHHHFHHHHHFHHEHFHHHFHHFDHHFHHE FACFGEFFGEHDFFEHBHHDBEFEHFHHBC
+SEQ:1:1101:9442:3934#0/1 -1 AGACCCATAATGTCAATAGATGTGGTAGAAGTCGTCATTTGGCGAGAAAGCTCAGTCTCAGGAGGAAGCGGAGCAGTCCAAATGTTTTTGAGATGGCAGCA HHHHHHHHHGHHHHHFGHHBHHEHGFHHDHGDEGDHHHHHFHHHHHAHHH?FEEBEFDFBEBEEFEHFE7ECCDCG=FDFFDFFFHHHHFEEBEF;BEAEG
+SEQ:1:1101:9329:3935#0/1 -1 AGATGGATAACCGCATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCT GFGGGEEGDHHHGGEHHHHHHGGFHHEAHHAGDEGEGGEDG at GGGHHGHHFGGH6@CADDHHBEEE at 8EBGEEFGGGHFHHHHGEGFGGEFBGEDDE?E7E
+SEQ:1:1101:9445:3956#0/1 adapter start: 81 1 81 101 TGCAACAACTGAACGGACTGGAAACACTGGTCATAATCATGGTGGCGAATAAGTACGCGTTCTTGCAAATCACCAGAAGGC GCCTAACTTCTTAGACTGCC adapt HHHHHHHHHGFHHHHHHHHHHHHHHGHHHHFHHHHHHHHHHHFGHHHFGHHHHFGHHFHEHHHHHHHHHHHHGBHHHHGFG GGEGGGGFDHHHFHHGGEBE
+SEQ:1:1101:9357:3957#0/1 -1 TTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGAGAGGAGTGGCATTAACACCATCCTTCATGAACTTAATCCACTG HHHHHHGHHHHHHHHHHGHEHHHHHGHEHHHHHHHHHHHHHHGHEBGGFGFFFFFBH?HCEEED<FEEEFFHHDHHHHEHHHGFHHH:BHHHHFHEFFHFF
+SEQ:1:1101:9487:3957#0/1 -1 CAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATG HHHHHHHHHHHHHHHGEHHHGHHHHHHHEGFHGHHHGHHHHGGHHHHHGHHHHHHHHHHFHHB>EFHFHBHFHCFHHGGGHEGHEGHEF at GHHFHEDHH;H
+SEQ:1:1101:9309:3957#0/1 adapter start: 72 1 72 101 GTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTC GCCTAACTTCTTAGACTGCCTTAAGGACG adapt HHHHHHHHHHHHHHHHHHHHHHHHHGHFHHHFHHHHHHHHGHHHFHHHHHHHFHDHHHHHHFHCHHEAHHDG GHFHFHDHHHGHHEHHFFH?HHHFDGGG?
+SEQ:1:1101:9425:3960#0/1 -1 CTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAGTGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGC 8?8?C?BC at BD=ABB==BD?CADD=AD>C@@CCBBDD at B/143'3.>>@9BCBDDDC8@@;<A=<DDDDB?A:A;9:2-74,<82;9877CBCDD/B at 5;<
+SEQ:1:1101:9337:3969#0/1 -1 GAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAA DBFEFFDEEEBFFFFF8FF=D=DDDEEE=E>@???FB=DFB=>C=EEFFFFFEFFFFF:FEF at FEF<FFFFF?DFDD8DDBD=DBFEB at E6FECF@EB8E?
+SEQ:1:1101:9388:3971#0/1 -1 CTGAATCTCTTTAGTCGCAGTAGGCGGAAAACGAACAAGCGCAAGAGTAAACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGG HHHHHHFHHHHHHHHHHHHHHHHHHHHFHHGHHHFHHHHHHGHHHHHEFHHHFHHFEHHFEHHFFHHHHECFDF?HHHHGEGGHHHFHHHFEGCFFFFF=E
+SEQ:1:1101:9414:3978#0/1 adapter start: 99 -1 TTATTGGTATCAGGGTTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGAGAGGAGTGGCATTAACACCATCCTTCGC HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHFFHHHHG at HFHDHGHDHHHHHHFGHHGHG
+SEQ:1:1101:9494:3983#0/1 adapter start: 72 1 72 101 TAGCACCAAACATAAATCACCTCACTTAAGTGGCTGGAGACAAATAATCTCTTTAATAACCTGATTCAGCGA GCCTAACTTCTTAGACTGCCTTAAGGACG adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHBF?FBHHFEHB?HEFEHBGEDEEBEDEEFACAFE> EFBGGGFFGHFFHD5DGB=>>@;A>C5?A
+SEQ:1:1101:9363:3989#0/1 adapter start: 95 -1 CCTCCAAGATTTGGAGGCATGAAAACATACAATTGGGAGGGTGTCAATCCTGACGGTTATTTCCTAGACAAATTAGAGCCAATACCATCAGCTTTGCCTAA HHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHGHHHHHHHG<GFGGGGFGHHHHHHEEEEHHDEFHHFHHHFHHDHEGHHHHBHHGCGF8ECEEFFEDBA=
+SEQ:1:1101:9436:3998#0/1 adapter start: 67 1 67 97 TAAATTGTTTGGAGGCGGTCAAAAAGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGAT GCCTAACTTCTTAGACTGCCTTAAGGACGT AACA adapt HHHHHHHHHHHHHHHHGGDHHHHHHFHHFGHHHHHHDHHFFDFGEFFHDFCFFEBDFHFFFFEEDEB EFFF9FEFGGFDBGEBBFGGBFBD6DDAF< EEBE
+SEQ:1:1101:9621:3755#0/1 -1 AGCCAATACCATCAGCTTTACCGTCTTTCCAGAAATTGTTCCAAGTATCGGCAACAGCTTTATCAATACCATGAAAAATATCAACCACACCAGAAGCAGCA HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHFHHFF?FHHFHFHHHHEHHC at FEHFHFHBGGGFHHHHHHDHHFFHGFHA
+SEQ:1:1101:9738:3756#0/1 adapter start: 1 1 1 31 T GCCTAACTTCTTAGACTGCCTTAAGGACGT AAGTCAAAGCACCTTTAGCGTTAAGGTACTGAATCTCTTTAGTCGCAGTAGGCGGAAAACGAACAAGCGC adapt H HHHHHHHHHHHHHHHHHHHHHHHFHHEHHH HHHHHHHHHHHHFFGHHHBHHFHHHEHHHHHHFHHHHFHHHDDFGEFFDFFEFFEFHGBEGGDGHEGEFF
+SEQ:1:1101:9580:3761#0/1 adapter start: 49 1 49 79 TATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGAT GCCTAACTTCTTAGACTGCCTTAAGGACGT TTCGATTTTCTGACGAGTAACA adapt HHHHHHHHHGGHHHHHEHHHFHEHHGEGGHFGDFGHFHFGHHFFDH?EF HHEFEHEFGGG4ADCDE=ECEC<:=?DD>B B;FBFFEGGEGB==EGFHH<DB
+SEQ:1:1101:9533:3764#0/1 adapter start: 20 1 20 50 TCTGTTGAACACGACCAGAA GCCTAACTTCTTAGACTGCCTTAAGGACGT AACTGGCCTAACGACGTTTGGTCAGTTCCATCAACATCATAGCCAGATGCC adapt FEFFFF at FFDFFEFFDDBDD B??<@FFFEEEEEEEFEEFFFCFFFFEBFF FD at FBFBFFFE@BFFFFBF=ADD;@?@?AAFBEFFDA=FEFEFFB at -C?BE
+SEQ:1:1101:9636:3775#0/1 -1 ATAAGGCCACGTATTTTGCAAGCTATTTAACTGGCGGCGATTGCGTACCCGACGACCAAAATTAGGGTCAACGCTACCTGTAGGAAGTGTCCGCATAAAGT HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHFH6HHHHHHHHHHFFHFCHFDCHFE;DAD9BDDDDGFGDGDGGB<FDCDCDGF>GEEGB;5
+SEQ:1:1101:9554:3781#0/1 -1 CACGCTCTTTTAAAATGTCAACAAGAGAATCTCTACCATGAACAAAATGTGACTCATATCTAAACCAGTCCTTGACGAACGTGCCAAGCATATTAAGCCAC HHHHHHHHHHHHHGGHHHHHHGHFHHHHHHEHHFHHHEHHHHHHHEHHGHHHHEHHHGFHHHEHHHHHHEEFFEDFEDFF>ACBAHGHHHHECEGHBCFEE
+SEQ:1:1101:9695:3783#0/1 adapter start: 52 1 52 82 AATAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGT GCCTAACTTCTTAGACTGCCTTAAGGACGT GCCAAGAAAAGCGGCATGG adapt HHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHGHHHHHHHHHHF HHHHHHFHGEHEHHHHHGHHHHHHHHHFHH FHGGHHHHHHGGHGFHHHG
+SEQ:1:1101:9572:3788#0/1 -1 ACCAACACGGCGAGTACAACGGCCCAGCTCAGAAGCGAGAACCAGCTGCGCTTGGGTGGGGCGATGGTGATGGTTTGCATGTTTGGCTCCGGTCTGTAGGC FFFFFFFFF=EBEB0A at A@<BD:EEFFA at EEEDE?EDE8<E?EE=E:BBB>>A?;FED;;<7??A>>9A>?DA1ADD?D:FF:BC;@##############
+SEQ:1:1101:9601:3793#0/1 -1 GCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGCCTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGAC HHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHEHEGHFHHHHHHHHFHFHCHHHFHFFHHHHHH at HHHHHHGHHHFHHGFHHCFHEGGGFEGE?GCDAD6AD
+SEQ:1:1101:9634:3800#0/1 -1 TTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGG HHGHFHFHHHHCGHHFHHHHHHGEHHHHHGFBEFHHFEHDHHHGFHHEHHFF9ECD?CEEHED<HBDEEBFEDEEE<FDFDGFBEHHEHCE>F?GEEDEEG
+SEQ:1:1101:9501:3800#0/1 adapter start: 42 1 42 72 TGACCACCTACATACCAAAGACGAGCGCCTTTACGCTTGCCT GCCTAACTTCTTAGACTGCCTTAAGGACGT TTAGTACCTCGCAACGGCTGCGGACGACC adapt HHHHHHHHHHHHHHHHFHHHHHHHHFHHHHHHHHHHHHHHHH HHHHHFHHHHHHHHHHHHHHFBHAEDBEFB BEF=ADEEGGGEFCC>B1CCDCB7FGFFE
+SEQ:1:1101:9703:3807#0/1 adapter start: 27 1 27 57 TAATAACCTGATTCAGCGAAACCAATC GCCTAACTTCTTAGACTGCCTTAAGGACGT CGCGGCATTTAGTAGCGGTAAAGTTAGACCAAACCATGAAACCA adapt HHHHHHHHHHHHHHHHHHHHHHGHHHG HHHFHGFHHHHFFHHHHHDHHHHBGFEFHH HFHFHFDHFDFFFEHHGHDHHGHHEHHG at E?FDGBEBDGGFFGF
+SEQ:1:1101:9728:3808#0/1 adapter start: 7 1 7 37 CAGAAAA GCCTAACTTCTTAGACTGCCTTAAGGACGT CCTACCGCGCTTCGCTTGGTCAACCCCTCAGCGGCAAAAATTAAAATTTTTACCGCTTCGGCGT adapt HHHFHHH HHHHHHHHHHHHHHHFHHHHHHHHHHHFB8 @B9C?CC at CHCFFFHF=FEED<4:?:>@,@;@>.>6;+?&@><CEC??A><:BC?DE@=7@###
+SEQ:1:1101:9676:3812#0/1 adapter start: 1 1 1 31 T GCCTAACTTCTTAGACTGCCTTAAGGACGT TATTGCCCGGCGTACGGGGAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAACCACCATCATG adapt H HHHHHHHHHHHHHHHHHHHHHHHFHHHHHH HDHFHHHHHHHECHHEHEHHH=HHFHHFHFHHFHFHGFFEECFFHEFFGFGHFFEHHFHHFFFHF<F:D7
+SEQ:1:1101:9620:3815#0/1 -1 TCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAGGCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGA HHHHHHHHHHGGHHHGHHGHHHHHHHHHHGFHGHHHHHHHHHFHDHHHDDHFHFHFHHHHFF9EFF>DG?FCBCDFFFEBFFE at DFEGGEEG?GF>>:;@A
+SEQ:1:1101:9720:3834#0/1 adapter start: 74 1 74 101 TAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGT GCCTAACTTCTTAGACTGCCTTAAGGA adapt HGHHHHHHHHHHHHHHHGGHEGGFGHFGHFHHDGHGHGHHHHHHHHHHFHHHHHFHFHFFHEFHF=FFHFHHFF HFGAGGHHDHGHBHHHEGDGC>FEC at D
+SEQ:1:1101:9635:3844#0/1 adapter start: 4 1 4 34 GACC GCCTAACTTCTTAGACTGCCTTAAGGACGT ATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGTAGTCGGAACCGAAGAAGACTCAAAGCGAACC adapt HHHH GHHHHHHHHHGHHHHHHGHHGHHHGHGHHH HFHHH;GGCGFH?HHFHEHHFFHFHFFFHHFDHHHHHHHHHEGHHHHGHGHEHHHHC@?GFEGBGHH
+SEQ:1:1101:9744:3849#0/1 adapter start: 55 1 55 85 AAACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTC GCCTAACTTCTTAGACTGCCTTAAGGACGT TGTTGAACACGACCAG adapt HHHHHHHGCHHFHHFHHFFHEHFGCHHGDGHEFFHFHEHHGBBGFCDGFEEFDCF FGEEEHEHFHHHCFF?EEFDEFD6FHGEHH HEHHHBBE?:CCDA7G
+SEQ:1:1101:9725:3850#0/1 -1 ATAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGA FDGGGDGGGEGGGGGBGBEGFFFDFFFFGGFGGGGFBGGGGGEFDFFGEGFFEFEDGGEEF9DCF?EFBBEDBBGFGGEGGGGCFGFEB at B7C>CDEEE##
+SEQ:1:1101:9544:3854#0/1 -1 TAGCGGTAAAGTTAGACCAAACCATGAAACCAACATAAACATTATTGCCCGGCGTACGGGGAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAA HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHFFHHHHHHHHHBFHHHHHFHHHHHHHHHHHHHHFCHHHBHE
+SEQ:1:1101:9581:3856#0/1 -1 GGGCGGTGGTCTATAGTGTTATTAATATCAAGTTGGGGGAGCACATTGTAGCATTGTGCCAATTCATCCATTAACTTCTCAGTAACAGATACAAACTCATC HHHHHHEHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHFHHHHGHHHHHHHHHHHHHHHGGHHHFHHHHHGHFGHGEGHHHHHHFEHFHGDGGFFGHH at DH
+SEQ:1:1101:9649:3858#0/1 adapter start: 33 1 33 63 CCTCCAAACAATTTAGACATGGCGCCACCAGCA GCCTAACTTCTTAGACTGCCTTAAGGACGT AGAGCAGAAGCAATACCGCCAGCAATAGCAACAAACAT adapt B<B at A@AAB>FEEEE@@BA at 3>8<>CCDDBEE@ DEFFDDFE=EEB at EDEEFDFDECEEBEB:C -@<698<@BBA at DCBDDFCEBFCCD;DC=D at C######
+SEQ:1:1101:9616:3862#0/1 adapter start: 91 1 91 101 GAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGC GCCTAACTTC adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHEHHHHHHHHHHHHHHFHHHHHHHFFFHFDHHEHHHGHHHHGDEHHGHHEGH GCHHHHEHFG
+SEQ:1:1101:9696:3866#0/1 -1 CAAGTTGCCATACAAAACAGGGTCGCCAGCAATATCGGTATAAGTCAAAGCACCTTTAGCGTTAAGGTACTGAATCTCTTTAGTCGCAGTAGGCGGAAAAC HHHHHHHHHHHHHHHHHHHHEHEHHHEHHHHFHHHHHHFHHHFHFHHHHHHHHFHHHHFHHFEHBHFEHHHHCEEHHFHHHHHHHHHHHHEHHHHCAFEFG
+SEQ:1:1101:9512:3869#0/1 -1 GCTCGACGCCATTAATAATGTTTTCCGTAAATTCAGCGCCTTCCATGATGAGACAGGCCGTTTGAATGTTGACGGGATGAACATAATAAGCAATGACGGCA HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHFHHHDHHHEHHFFFFFFHFAFEFH?E at FFGGGFGHFHAEFGFFFCEEFF
+SEQ:1:1101:9723:3870#0/1 adapter start: 66 1 66 96 CTTTAGCAGCAAGGTATATATCTGACTTTTTGTTAACGTATTTAGCCACATAGCAACCAACAGACA GCCTAACTTCTTAGACTGCCTTAAGGACGT TATAA adapt ################################################################## ############################## #####
+SEQ:1:1101:9667:3874#0/1 -1 CTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAAAAAGAGCTTACT HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHAHHHHEHHD=DAD>D6ADGE at EBE;@?BCGGE?4>ADAAC
+SEQ:1:1101:9565:3879#0/1 adapter start: 24 1 24 54 AGCCTTATGGCCGTCAACATACAT GCCTAACTTCTTAGACTGCCTTAAGGACGT ATCACCATTATCGAACTCAACGCCCTGCATACGAAAAGACAGAATCT adapt HHHHHHHHHHHHHHHHHFHHGFFH HHHHHHHHHDGHHFHFHHHHHFECHFFHHH HHEHFCFFFFHEHDEFHHCHHEG?GFEGGEGHHHHHH?HH?EFFFFF
+SEQ:1:1101:9721:3885#0/1 adapter start: 51 1 51 81 TTCCTCAAACGCTTGTTCGGTGGATAAGTTATGGCATTAATCGATTTATTT GCCTAACTTCTTAGACTGCCTTAAGGACGT ATCTCGCGGAAGAAAAACAC adapt >BC?:A?=<>::A=528882.53)5.77;407)*9@:AA8CAA######## ############################## ####################
+SEQ:1:1101:9707:3894#0/1 adapter start: 40 1 40 70 AACACCATCCTTCATGAACTTAATCCACTGTTCACCATAA GCCTAACTTCTTAGACTGCCTTAAGGACGT ACGTGACGATGAGGGACATAAAAAGTAAAAA adapt F at F8DEE@EEBCCCCFFEFDDC=DCCFFF=ADD=D>@AA@ FFFDE99>,>>@=856>;6C<@1:39@>6@ =??:B<B at B@F at FFE<@;B@###########
+SEQ:1:1101:9560:3900#0/1 adapter start: 6 1 6 36 AGAAGT GCCTAACTTCTTAGACTGCCTTAAGGACGT GCCAGCCTGCAACGTACCTTCAAGAAGTCCTTTACCAGCTTTAGCCATAGCACCAGAAACAAAAC adapt GGGGGF GGGGBGGGGFGGGFBEEDEEGFGACDDADF EFFEDFGGEFECFFDFGFBDBGBFD?@.DCC5:;GFF>AEEEBEDFBF69:<8<B.DAC@;B@@E
+SEQ:1:1101:9696:3913#0/1 adapter start: 2 1 2 32 CC GCCTAACTTCTTAGACTGCCTTAAGGACGT ATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTCTGTTGAACACGACCAGAAAACTGGCC adapt HH HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH GHHHHGHHHHHHHHGHHFHHHHHHHHHHFHHHHHHHHHHHHFBGGGGFHHHHHHHHHEHHHHHHHHHEH
+SEQ:1:1101:9574:3914#0/1 adapter start: 5 1 5 35 GAACA GCCTAACTTCTTAGACTGCCTTAAGGACGT AGCGCAAGAGTAAACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTC adapt HHHHH HHHHHHHHHHFHHHHHHHHFHFHHHEHGHH HHHHHHHHHHHHHEHHHHHHFGHFHEEFEGEHFCDEFFEFHFHGEHHHHHHHHHHHHFGHDHHFHD
+SEQ:1:1101:9508:3931#0/1 adapter start: 91 1 91 101 TAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCCCTCTTAAGGATATTCGCGATGAGTATAATTACCCCAA GCCTAACTTC adapt HGHHHHHHHHHHHHHHHHHHGHHHHHFHHHGHHHHFHHHHHHHHHD?ACFEF9FFEEBHBAEFB?E<F5CAD(DAEE;AE at C?D at BDGF?F FFG;?DGDD:
+SEQ:1:1101:9617:3935#0/1 -1 TAAATTTAATGTGACCGTTTATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAACGCCGAAGCGGT HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHGHHHHHHHHHHHHHHHHHFHHHFFEDFEFHFHHFHFHGHHFHHHFHHEHHHFHHHHFB
+SEQ:1:1101:9667:3950#0/1 adapter start: 66 1 66 96 CTTTAGCCATAGCACCAGAAACAAAACTAGGGACGGCCTCATCAGGGTTAGGAACATTAGAGCCTT GCCTAACTTCTTAGACTGCCTTAAGGACGT GAATG adapt HHHHHHHHHHHHHHHHGHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHEHHHHDGHFHHHHHHHBHH HHHHHHHHHEEGDCGGBBFCFFE;GFBFFH BDEH=
+SEQ:1:1101:9705:3951#0/1 adapter start: 29 1 29 59 ATTGCGTACCCGACGACCAAAATTAGGGT GCCTAACTTCTTAGACTGCCTTAAGGACGT CAACGCTACCTGTAGGAAGTGTCCGCATAAAGTGCACCGCAT adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHH HHHHHHHFHHHHFHHHHFHEHHHHHHHFGH FHEHHFHFHHHFHBFHHHHHHHHHHHFHEBFHFFFFCFCEF@
+SEQ:1:1101:9527:3965#0/1 -1 AACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTCTGTTGAACACGACCAGAAAACTGGCCTAACGACGTTTGGTCAGTTCC HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHFHHGHHFHEHHHEFEFF at HFHFGGGDGGHFGDFHFGHGHHFGHG
+SEQ:1:1101:9550:3969#0/1 -1 AGAGTAAACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTCTGTTGAACACGACCAGAAAACTGGCCTAACGACGTTTGGTC HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHFHHHHHHHHHHHHFHHHFHHFEHHHHHHHHHHHHHGHHFHHHHFHHHHHHHHHHHG
+SEQ:1:1101:9636:3973#0/1 adapter start: 9 1 9 39 CAAGCGCAA GCCTAACTTCTTAGACTGCCTTAAGGACGT GAGTAAACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTCTGT adapt HHHHHHHHH HHHHHHHHHHHHHHHHHHHHHHGHHHHHHH HHHHGHHHHHHFHHHFHHHHHHHBHHHFHHGFHHFEGFHGGHHHHHHHHEHHHFFHHHEEHE
+SEQ:1:1101:9726:3981#0/1 adapter start: 66 1 66 96 TTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAAT GCCTAACTTCTTAGACTGCCTTAAGGACGT GACGA adapt HHFEHHHHHHHHHHHHHHHHHHHHHHGHGHHGHGHHHHHHHHHHGGHHHEHHGFHHHHHEHHEHGH GHGHHEHHBGGGG?GDFGGEGD=GEEGBGE GFBEA
+SEQ:1:1101:9603:3981#0/1 adapter start: 32 1 32 62 TCTAAGAAGTTTAAGATTGCTGAGGGTCAGTG GCCTAACTTCTTAGACTGCCTTAAGGACGT GTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCA adapt HHHHHHHHHHHHHHHFHHHHHHHHHHEHHHHH HEHHHHHHHFHHGHGHHHHHFHHEHHFGHH HHEFHFHHHHHHFGHGHFHHFHHHEHEGFBDGGGB at F;G
+SEQ:1:1101:9533:3990#0/1 adapter start: 1 1 1 31 G GCCTAACTTCTTAGACTGCCTTAAGGACGT GGGAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAACCACCATCATGGCGACCATCCAAAGGA adapt B EFFEFF=FFEFDFFDFBF at D@DDDBBDD at B CDDD::@=?BDCCAE@;BEEEE6>B5D>@DEDEEF?F<EFBBFFD8BCDDDCBCEECEEEE2??######
+SEQ:1:1101:9583:3992#0/1 adapter start: 20 1 20 50 AAGGTACTGAATCTCTTTAG GCCTAACTTCTTAGACTGCCTTAAGGACGT TCGCAGTAGGCGGAAAACGAACAAGCGCAAGAGTAAACATAGTGCCATGCT adapt 98583=>><>B at CBCD==BB DCDCCDD=8A>@<3A499:1@@@@CDC@@= @=<C7:163><6@<@:=<?A0;333+01-97=<><?C@@@<99>>189<16
+SEQ:1:1101:9903:3754#0/1 -1 ACCAAAATTAGGGTCAACGCTACCTGTAGGAAGTGTCCGCATAAAGTGCACCGCATGGAAATGAAGACGGCCATCAGCTGTACCATACTCAGGCACACAAA GFEGGGGGBGE at EAEEGGFGGEGGFGEFFGFGFFGGEGGGGEFGCFCEFBF7FGEGEF?BFEEFDFFE??AADD+D at C@CGFCE6FDFFDFBGFDD at DAAD
+SEQ:1:1101:9878:3755#0/1 adapter start: 32 1 32 62 AGAACGTGAAAAAGCGTCCTGCGTGTAGCGAA GCCTAACTTCTTAGACTGCCTTAAGGACGT CTGCGATGGGCATACTGTAACCATAAGGCCACGTATTTT adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH HFHHHBHHHHHHHHHFHFHEHHHHHHHHHH HHHEFHFHHHDFHHHHFGHHHHHFCEHECHHF?D5D7 at D
+SEQ:1:1101:9833:3756#0/1 adapter start: 65 1 65 95 TCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGTGTTAATGCCACTCCTC GCCTAACTTCTTAGACTGCCTTAAGGACGT TCCCGA adapt HHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHFHHHHGHHFHHHHHEHEHHHHFHEHHHEHFH HHFHHFEFFB=;,01:99;;HHHHHHEFGE EFFBFB
+SEQ:1:1101:9991:3777#0/1 -1 GCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTGGATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGT HHHHHHHHHHHHHHHHHHHHHHGHHHGHHHHHHHGHHHHHHGHHHHHHHHHHHHHFHHFFDFFFCFFDHCFF;BFGEFGEGFGGFFF.CFDCCEDB=CBC@
diff --git a/tests/cut/illumina5.fastq b/tests/cut/illumina5.fastq
new file mode 100644
index 0000000..1b85887
--- /dev/null
+++ b/tests/cut/illumina5.fastq
@@ -0,0 +1,20 @@
+ at SEQ:1:1101:9010:3891#0/1 adapter start: 51
+ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG
++
+FFFFFEDBE at 79@@>@CBCBFDBDFDDDDD<@C>ADD at B;5:978 at CBDDF
+ at SEQ:1:1101:9240:3898#0/1
+CCAGCAAGGAAGCCAAGATGGGAAAGGTCATGCGGCATACGCTCGGCGCCAGTTTGAATATTAGACATAATTTATCCTCAAGTAAGGGGCCGAAGCCCCTG
++
+GHGHGHHHHGGGDHHGDCGFEEFHHGDFGEHHGFHHHHHGHEAFDHHGFHHEEFHGHFHHFHGEHFBHHFHHHH at GGGDGDFEEFC@=D?GBGFGF:FB6D
+ at SEQ:1:1101:9207:3899#0/1 adapter start: 64
+TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAAC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHF
+ at SEQ:1:1101:9148:3908#0/1 adapter start: 28
+ACGACGCAATGGAGAAAGACGGAGAGCG
++
+HHHHHHHHHHHHGHHHHGHHHHHHHHHH
+ at SEQ:1:1101:9044:3916#0/1 adapter start: 78
+AACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHGHHHHHHHHHHHHFHEBFHFFEFHE
diff --git a/tests/cut/illumina5.info.txt b/tests/cut/illumina5.info.txt
new file mode 100644
index 0000000..b5a6cec
--- /dev/null
+++ b/tests/cut/illumina5.info.txt
@@ -0,0 +1,8 @@
+SEQ:1:1101:9010:3891#0/1 adapter start: 51 0 64 81 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGGGCCTAACTTCTTA GACTGCCTTAAGGACGT AAGCCAAGATGGGAAAGGTC adapt2 FFFFFEDBE at 79@@>@CBCBFDBDFDDDDD<@C>ADD at B;5:978 at CBDDFFDB4B?DB21;84 ?DDBC9DEBAB;=@<@@ B@@@@B>CCBBDE98>>0 at 7
+SEQ:1:1101:9010:3891#0/1 adapter start: 51 1 51 64 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG GCCTAACTTCTTA adapt FFFFFEDBE at 79@@>@CBCBFDBDFDDDDD<@C>ADD at B;5:978 at CBDDF FDB4B?DB21;84
+SEQ:1:1101:9240:3898#0/1 -1 CCAGCAAGGAAGCCAAGATGGGAAAGGTCATGCGGCATACGCTCGGCGCCAGTTTGAATATTAGACATAATTTATCCTCAAGTAAGGGGCCGAAGCCCCTG GHGHGHHHHGGGDHHGDCGFEEFHHGDFGEHHGFHHHHHGHEAFDHHGFHHEEFHGHFHHFHGEHFBHHFHHHH at GGGDGDFEEFC@=D?GBGFGF:FB6D
+SEQ:1:1101:9207:3899#0/1 adapter start: 64 0 77 94 TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAACGCCTAACTTCTTA GACTGCCTTAAGGACGT ATACATA adapt2 HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHFHHFHFFFFFBHHG HHHFFHHFHGGHHDEBF G<FGGDG
+SEQ:1:1101:9207:3899#0/1 adapter start: 64 1 64 77 TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAAC GCCTAACTTCTTA adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHF HHFHFFFFFBHHG
+SEQ:1:1101:9148:3908#0/1 adapter start: 28 0 41 58 ACGACGCAATGGAGAAAGACGGAGAGCGGCCTAACTTCTTA GACTGCCTTAAGGACGT CCAACGGCGTCCATCTCGAAGGAGTCGCCAGCGATAACCGGAG adapt2 HHHHHHHHHHHHGHHHHGHHHHHHHHHHHHHHHHHHHHHHH HHHHGHHHHDHDHHFHH HHHFFFFFHHHEFBEGEGGFFFHHHFHHHHHHFHHEHHGHEHD
+SEQ:1:1101:9148:3908#0/1 adapter start: 28 1 28 41 ACGACGCAATGGAGAAAGACGGAGAGCG GCCTAACTTCTTA adapt HHHHHHHHHHHHGHHHHGHHHHHHHHHH HHHHHHHHHHHHH
+SEQ:1:1101:9044:3916#0/1 adapter start: 78 1 78 91 AACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGA GCCTAACTTCTTA GACTGCCTTA adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHGHHHHHHHHHHHHFHEBFHFFEFHE FHHGHFHHHHGGH GHHFHGGGHG
diff --git a/tests/cut/illumina64.fastq b/tests/cut/illumina64.fastq
new file mode 100644
index 0000000..bbfce73
--- /dev/null
+++ b/tests/cut/illumina64.fastq
@@ -0,0 +1,80 @@
+ at 14569
+AAGTTTATTCCTGGACGAAGGAAGAAAAGGCCAGATGGGAAACAAGAACAAGCCCCTGTTGAAGACGCAGGGCC
++
+cceeeeceeeee`dedbdbdb_^b`abU_cacadabd`dLMZ[XTcT^a^adaaaddcd`aL^`^_`Y\]^`Y_
+ at 19211
+AGA
++
+^\`
+ at 9180
+GAGGG
++
+b`bLb
+ at 19132
+TGTGATTATCCACTGGTATAT
++
+Z[QZZLZ[]J[SHZNaZ[_Ia
+ at 15868
+CTGCCAAGGCTGCCCCCAAA
++
+`c`cc\`\Lb]bL`[`a]L`
+ at 1424
+GGCCCCAGACTTGCTCCCCCAACAAGGACAATGTCCAAGGAGTGTCCCC
++
+eeeeeeeea`bbdaaadad`Oaaaaccada_aa_d`_X`_^`[`_[_W^
+ at 7855
+GTGGGGGCT
++
+]^\]FW]Z`
+ at 17943
+ACATGGGACCAGAAAACACCACCAGGGGTTTGGGGCTGTCCTGAG
++
+ccc`\^`aba\b^`\FR`OOPYG[[W```[Ra_RR_\]\\P\_H_
+ at 11100
+CGGATAACTGAAAATGCATTTTTAACGCCATGACCGTGTCTCAAGGACCCGCTGTGGAAG
++
+b`b_b_a\bc^Tabadaddcddd``bdaa_^aJ\^_\]\\__O[___L^\_aaa^^^UJ^
+ at 15663
+AGGT
++
+aaKa
+ at 4698
+CCAATTGGCACCCCTCTGCCTTCAGCCATT
++
+cccc\`ccc\caccZccccc]^`LY\bL_b
+ at 20649
+TCTGGACTGGATCTTTAGGATGGTGGAGATGATCTGGATGTAGGACAAAAGAACCAGGCAGAAGGGTG
++
+eeeeeaddadacdddebeccdddadd\^abbT_]bccTac]]b]L^][]Ve[^ZaY_^_^`\\Y]^Y`
+ at 17259
+
++
+
+ at 6003
+CTTCAACTCATCTTGTTATTAATACCATCAATATCCCATGAGGCTCATAAAACGAGTCTTTCTTCTTGGAAACATGACCAAGATTGGGCAAACGT
++
+fffffffffffffffffdffecfcefeffdcfdeeebbbdbccccc\db\`^aa`^Y^^^cbcbaa`bbWY^^^__S_YYR]GWY]\]]XX\_`S
+ at 4118
+TCAAATTGTACTGCAAAGAAGGTCCCAGCTGGTCTCTTCTGGGAGTGATCTAACTAACTTAAG
++
+dc^ddeeeeeedeee`ceceddadadddcbde_dedc_ec_a^^b\b\\]VIPZY^T^^^\L_
+ at 18416
+GTGGGGAAGCCGAAGAAGCAGCGGAGATCGATTGTAAGAACGACG
++
+dddacaabdbea\d^cce\da`dd_^__`a`a`b[_^__^\^^^_
+ at 20115
+TGAAAAAGGAAAACATGGTAGTTTTCTTGTATGAGAGAGCCAGAGCCACCTTGGAGATTTTGTTCTCTCTGTGCG
++
+ed^eeafffaddfecdddabc^_badd`bd_ddadaa^bbcad\d\__^_\aaa_aY____aaN_\cdc\^aaYb
+ at 16139
+TCATCCGAAGAGTTGGCAGGCCCTGTGAATTGTGAAAACAGTATACCCACCCCTTTCCC
++
+cabacacY^c\daaddaadad^\ad_a\Y`[ZQ]Y^^OYQ^X^YT\\]U\^RRX^\YJ^
+ at 14123
+GATTTGGGGAAAGGAAACAATAGTTGAGTTTGGGCCACGGGAAATTCAAGATGCCTGGTATGTC
++
+cccccccac^bYbbT_aa_Yb^^Ta\\^]]aaTaaaaab\b\XL`VZZV]QYYY[aa^^^^_^^
+ at 8766
+ACCTGTAAGGTCCGCTCCTGGTGGACACCCACGAAGTCCAGGGCCTCAGGCAGGAAGTTGTAGCGCAGAGTTTTGAGCAGCTGCTCCATC
++
+fcfffffcffeffeeefdefddeecdccacddfdYd`d^\_^`\_abbc\b[ba^Y^Z_^^H^Z_^Y_Y_OKWPZR]]Z]`Z``Z^UHZ^
diff --git a/tests/cut/interleaved.fastq b/tests/cut/interleaved.fastq
new file mode 100644
index 0000000..081a90f
--- /dev/null
+++ b/tests/cut/interleaved.fastq
@@ -0,0 +1,16 @@
+ at read1/1 some text
+TTATTTGTCTCCAGC
++
+##HHHHHHHHHHHHH
+ at read1/2 other text
+GCTGGAGACAAATAA
++
+HHHHHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACA
++
+HHHHHHHHHHHHHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGG
++
+#HHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/issue46.fasta b/tests/cut/issue46.fasta
new file mode 100644
index 0000000..0bc0403
--- /dev/null
+++ b/tests/cut/issue46.fasta
@@ -0,0 +1,2 @@
+>readname
+A
diff --git a/tests/cut/linked.fasta b/tests/cut/linked.fasta
new file mode 100644
index 0000000..c010e80
--- /dev/null
+++ b/tests/cut/linked.fasta
@@ -0,0 +1,10 @@
+>r1 5' adapter and 3' adapter
+CCCCCCCCCC
+>r5 only 5' adapter
+CCCCCCCCCCGGGGGGG
+>r3 5' adapter, partial 3' adapter
+CCCGGCCCCC
+>r4 only 3' adapter
+GGGGGGGGGGCCCCCCCCCCTTTTTTTTTTGGGGGGG
+>r2 without any adapter
+GGGGGGGGGGGGGGGGGGG
diff --git a/tests/cut/lowercase.fastq b/tests/cut/lowercase.fastq
new file mode 100644
index 0000000..a3437d1
--- /dev/null
+++ b/tests/cut/lowercase.fastq
@@ -0,0 +1,12 @@
+ at prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGA
++
+)3%)&&&&!.1&(6:<'67..*,:
+ at prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCC
++
+;<:&:A;A!9<<<,7:<=3=;:
+ at prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/cut/lowqual.fastq b/tests/cut/lowqual.fastq
new file mode 100644
index 0000000..58c5a65
--- /dev/null
+++ b/tests/cut/lowqual.fastq
@@ -0,0 +1,8 @@
+ at first_sequence
+
++
+
+ at second_sequence
+
++
+
diff --git a/tests/cut/maxlen.fa b/tests/cut/maxlen.fa
new file mode 100644
index 0000000..8b4729b
--- /dev/null
+++ b/tests/cut/maxlen.fa
@@ -0,0 +1,14 @@
+>read_length0a
+T
+>read_length0b
+T
+>read_length1
+T2
+>read_length2
+T02
+>read_length3
+T302
+>read_length4
+T3302
+>read_length5
+T23302
diff --git a/tests/cut/maxn0.2.fasta b/tests/cut/maxn0.2.fasta
new file mode 100644
index 0000000..0255fc7
--- /dev/null
+++ b/tests/cut/maxn0.2.fasta
@@ -0,0 +1,6 @@
+>r1
+
+>r3
+AAAA
+>r4
+AAAAN
diff --git a/tests/cut/maxn0.4.fasta b/tests/cut/maxn0.4.fasta
new file mode 100644
index 0000000..9c830e5
--- /dev/null
+++ b/tests/cut/maxn0.4.fasta
@@ -0,0 +1,8 @@
+>r1
+
+>r3
+AAAA
+>r4
+AAAAN
+>r5
+AAANN
diff --git a/tests/cut/maxn0.fasta b/tests/cut/maxn0.fasta
new file mode 100644
index 0000000..d448df2
--- /dev/null
+++ b/tests/cut/maxn0.fasta
@@ -0,0 +1,4 @@
+>r1
+
+>r3
+AAAA
diff --git a/tests/cut/maxn1.fasta b/tests/cut/maxn1.fasta
new file mode 100644
index 0000000..4edae80
--- /dev/null
+++ b/tests/cut/maxn1.fasta
@@ -0,0 +1,8 @@
+>r1
+
+>r2
+N
+>r3
+AAAA
+>r4
+AAAAN
diff --git a/tests/cut/maxn2.fasta b/tests/cut/maxn2.fasta
new file mode 100644
index 0000000..3eb7ba2
--- /dev/null
+++ b/tests/cut/maxn2.fasta
@@ -0,0 +1,10 @@
+>r1
+
+>r2
+N
+>r3
+AAAA
+>r4
+AAAAN
+>r5
+AAANN
diff --git a/tests/cut/minlen.fa b/tests/cut/minlen.fa
new file mode 100644
index 0000000..fa9b0fe
--- /dev/null
+++ b/tests/cut/minlen.fa
@@ -0,0 +1,16 @@
+>read_length5
+T23302
+>read_length6
+T023302
+>read_length7
+T1023302
+>read_length8
+T11023302
+>read_length9
+T111023302
+>read_length10
+T2111023302
+>read_length11
+T02111023302
+>read_length12
+T002111023302
diff --git a/tests/cut/minlen.noprimer.fa b/tests/cut/minlen.noprimer.fa
new file mode 100644
index 0000000..1befe6e
--- /dev/null
+++ b/tests/cut/minlen.noprimer.fa
@@ -0,0 +1,14 @@
+>read_length6
+23302
+>read_length7
+023302
+>read_length8
+1023302
+>read_length9
+11023302
+>read_length10
+111023302
+>read_length11
+2111023302
+>read_length12
+02111023302
diff --git a/tests/cut/nextseq.fastq b/tests/cut/nextseq.fastq
new file mode 100644
index 0000000..fad6929
--- /dev/null
+++ b/tests/cut/nextseq.fastq
@@ -0,0 +1,8 @@
+ at NS500350:251:HLM7JBGXX:1:11101:12075:1120 1:N:0:TACAGC
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTACAGCATCTCGTATTCCGTCTTCTGCTTGAAAAAAAA
++
+AAAAAEEEEEEAEEEEAEAEEEEEEAEEEEEEEEEEEEEEE///E/EE////AAEE/E//////EEEEEEE
+ at NS500350:251:HLM7JBGXX:1:11101:22452:1121 1:N:0:TACAGC
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTACAGCATCGCGTATGCCGTCTTATGCTTGAAAAAAAAA
++
+AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/////E/EE//E6///E//A//E//EEEEEEEE
diff --git a/tests/cut/no-trim.fastq b/tests/cut/no-trim.fastq
new file mode 100644
index 0000000..d3668fd
--- /dev/null
+++ b/tests/cut/no-trim.fastq
@@ -0,0 +1,4 @@
+ at prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/cut/no_indels.fasta b/tests/cut/no_indels.fasta
new file mode 100644
index 0000000..7c56412
--- /dev/null
+++ b/tests/cut/no_indels.fasta
@@ -0,0 +1,18 @@
+>3p_orig
+TGAACATAGC
+>3p_mism
+TGAACATAGC
+>3p_del
+TGAACATAGCTTAACATATAACCG
+>3p_ins
+TGAACATAGCTTAGGACATATAACCG
+>3p_frontins
+TAGACATATAACCG
+>5p_orig
+TACTGCTTCTCGAA
+>5p_mism
+TACTGCTTCTCGAA
+>5p_del
+TCCTCGAGATGCCATACTGCTTCTCGAA
+>5p_ins
+TCCTCGAGATATGCCATACTGCTTCTCGAA
diff --git a/tests/cut/overlapa.fa b/tests/cut/overlapa.fa
new file mode 100644
index 0000000..a4ad60d
--- /dev/null
+++ b/tests/cut/overlapa.fa
@@ -0,0 +1,40 @@
+>read1
+T0021110233021
+>read2
+T0021110233021
+>read3
+T0021110233021
+>read4
+T0021110233021
+>read5
+T0021110233021
+>read6
+T0021110233021
+>read7
+T0021110233021
+>read8
+T0021110233021
+>read9
+T0021110233021
+>read10
+T0021110233021330201030
+>read11
+T002111023302133020103
+>read12
+T00211102330213302010
+>read13
+T0021110233021330201
+>read14
+T002111023302133020
+>read15
+T00211102330213302
+>read16
+T0021110233021330
+>read17
+T002111023302133
+>read18
+T00211102330213
+>read19
+T0021110233021
+>read20
+T002111023302
diff --git a/tests/cut/overlapb.fa b/tests/cut/overlapb.fa
new file mode 100644
index 0000000..decf1d3
--- /dev/null
+++ b/tests/cut/overlapb.fa
@@ -0,0 +1,38 @@
+>adaptlen18
+ATACTTACCCGTA
+>adaptlen17
+ATACTTACCCGTA
+>adaptlen16
+ATACTTACCCGTA
+>adaptlen15
+ATACTTACCCGTA
+>adaptlen14
+ATACTTACCCGTA
+>adaptlen13
+ATACTTACCCGTA
+>adaptlen12
+ATACTTACCCGTA
+>adaptlen11
+ATACTTACCCGTA
+>adaptlen10
+ATACTTACCCGTA
+>adaptlen9
+TCTCCGTCGATACTTACCCGTA
+>adaptlen8
+CTCCGTCGATACTTACCCGTA
+>adaptlen7
+TCCGTCGATACTTACCCGTA
+>adaptlen6
+CCGTCGATACTTACCCGTA
+>adaptlen5
+CGTCGATACTTACCCGTA
+>adaptlen4
+GTCGATACTTACCCGTA
+>adaptlen3
+TCGATACTTACCCGTA
+>adaptlen2
+CGATACTTACCCGTA
+>adaptlen1
+GATACTTACCCGTA
+>adaptlen0
+ATACTTACCCGTA
diff --git a/tests/cut/paired-filterboth.1.fastq b/tests/cut/paired-filterboth.1.fastq
new file mode 100644
index 0000000..a8b2b28
--- /dev/null
+++ b/tests/cut/paired-filterboth.1.fastq
@@ -0,0 +1,16 @@
+ at read1/1 some text
+TTATTTGTCTCCAGC
++
+##HHHHHHHHHHHHH
+ at read2/1
+CAACAGGCCACA
++
+HHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACA
++
+HHHHHHHHHHHHHHHHHHHH
+ at read4/1
+GACAGGCCGTTTGAATGTTGACGGGATGTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/paired-filterboth.2.fastq b/tests/cut/paired-filterboth.2.fastq
new file mode 100644
index 0000000..655d545
--- /dev/null
+++ b/tests/cut/paired-filterboth.2.fastq
@@ -0,0 +1,16 @@
+ at read1/2 other text
+GCTGGAGACAAATAACAGT
++
+HHHHHHHHHHHHHHHHHHH
+ at read2/2
+TGTGGCCTGTTGCAGT
++
+###HHHHHHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGGCAGTG
++
+#HHHHHHHHHHHHHHHHHHHHHHHH
+ at read4/2
+CATCCCGTCAACATTCAAACGGCCTGTCCA
++
+HH############################
diff --git a/tests/cut/paired-m27.1.fastq b/tests/cut/paired-m27.1.fastq
new file mode 100644
index 0000000..3f2d733
--- /dev/null
+++ b/tests/cut/paired-m27.1.fastq
@@ -0,0 +1,16 @@
+ at read1/1 some text
+TTATTTGTCTCCAGCTTAGACATATCGCCT
++
+##HHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read2/1
+CAACAGGCCACATTAGACATATCGGATGGT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACATTAGACA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read4/1
+GACAGGCCGTTTGAATGTTGACGGGATGTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/paired-m27.2.fastq b/tests/cut/paired-m27.2.fastq
new file mode 100644
index 0000000..808df31
--- /dev/null
+++ b/tests/cut/paired-m27.2.fastq
@@ -0,0 +1,16 @@
+ at read1/2 other text
+GCTGGAGACAAATAACAGTGGAGTAGTTTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read2/2
+TGTGGCCTGTTGCAGTGGAGTAACTCCAGC
++
+###HHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGGCAGTG
++
+#HHHHHHHHHHHHHHHHHHHHHHHH
+ at read4/2
+CATCCCGTCAACATTCAAACGGCCTGTCCA
++
+HH############################
diff --git a/tests/cut/paired-onlyA.1.fastq b/tests/cut/paired-onlyA.1.fastq
new file mode 100644
index 0000000..3f2d733
--- /dev/null
+++ b/tests/cut/paired-onlyA.1.fastq
@@ -0,0 +1,16 @@
+ at read1/1 some text
+TTATTTGTCTCCAGCTTAGACATATCGCCT
++
+##HHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read2/1
+CAACAGGCCACATTAGACATATCGGATGGT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACATTAGACA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read4/1
+GACAGGCCGTTTGAATGTTGACGGGATGTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/paired-onlyA.2.fastq b/tests/cut/paired-onlyA.2.fastq
new file mode 100644
index 0000000..15354e0
--- /dev/null
+++ b/tests/cut/paired-onlyA.2.fastq
@@ -0,0 +1,16 @@
+ at read1/2 other text
+GCTGGAGACAAATAA
++
+HHHHHHHHHHHHHHH
+ at read2/2
+TGTGGCCTGTTG
++
+###HHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGG
++
+#HHHHHHHHHHHHHHHHHHH
+ at read4/2
+CATCCCGTCAACATTCAAACGGCCTGTCCA
++
+HH############################
diff --git a/tests/cut/paired-separate.1.fastq b/tests/cut/paired-separate.1.fastq
new file mode 100644
index 0000000..a8b2b28
--- /dev/null
+++ b/tests/cut/paired-separate.1.fastq
@@ -0,0 +1,16 @@
+ at read1/1 some text
+TTATTTGTCTCCAGC
++
+##HHHHHHHHHHHHH
+ at read2/1
+CAACAGGCCACA
++
+HHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACA
++
+HHHHHHHHHHHHHHHHHHHH
+ at read4/1
+GACAGGCCGTTTGAATGTTGACGGGATGTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/paired-separate.2.fastq b/tests/cut/paired-separate.2.fastq
new file mode 100644
index 0000000..15354e0
--- /dev/null
+++ b/tests/cut/paired-separate.2.fastq
@@ -0,0 +1,16 @@
+ at read1/2 other text
+GCTGGAGACAAATAA
++
+HHHHHHHHHHHHHHH
+ at read2/2
+TGTGGCCTGTTG
++
+###HHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGG
++
+#HHHHHHHHHHHHHHHHHHH
+ at read4/2
+CATCCCGTCAACATTCAAACGGCCTGTCCA
++
+HH############################
diff --git a/tests/cut/paired-too-short.1.fastq b/tests/cut/paired-too-short.1.fastq
new file mode 100644
index 0000000..64322e2
--- /dev/null
+++ b/tests/cut/paired-too-short.1.fastq
@@ -0,0 +1,4 @@
+ at read2/1
+CAACAGGCCACA
++
+HHHHHHHHHHHH
diff --git a/tests/cut/paired-too-short.2.fastq b/tests/cut/paired-too-short.2.fastq
new file mode 100644
index 0000000..96d2253
--- /dev/null
+++ b/tests/cut/paired-too-short.2.fastq
@@ -0,0 +1,4 @@
+ at read2/2
+TGTGGCCTGTTG
++
+###HHHHHHHHH
diff --git a/tests/cut/paired-trimmed.1.fastq b/tests/cut/paired-trimmed.1.fastq
new file mode 100644
index 0000000..fb3f459
--- /dev/null
+++ b/tests/cut/paired-trimmed.1.fastq
@@ -0,0 +1,12 @@
+ at read1/1 some text
+TTATTTGTCTCCAGC
++
+##HHHHHHHHHHHHH
+ at read2/1
+CAACAGGCCACA
++
+HHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACA
++
+HHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/paired-trimmed.2.fastq b/tests/cut/paired-trimmed.2.fastq
new file mode 100644
index 0000000..1feef27
--- /dev/null
+++ b/tests/cut/paired-trimmed.2.fastq
@@ -0,0 +1,12 @@
+ at read1/2 other text
+GCTGGAGACAAATAACAGTGGAGTAGTTTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read2/2
+TGTGGCCTGTTGCAGTGGAGTAACTCCAGC
++
+###HHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGGCAGTG
++
+#HHHHHHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/paired-untrimmed.1.fastq b/tests/cut/paired-untrimmed.1.fastq
new file mode 100644
index 0000000..8ab53bd
--- /dev/null
+++ b/tests/cut/paired-untrimmed.1.fastq
@@ -0,0 +1,4 @@
+ at read4/1
+GACAGGCCGTTTGAATGTTGACGGGATGTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/paired-untrimmed.2.fastq b/tests/cut/paired-untrimmed.2.fastq
new file mode 100644
index 0000000..ca52d30
--- /dev/null
+++ b/tests/cut/paired-untrimmed.2.fastq
@@ -0,0 +1,4 @@
+ at read4/2
+CATCCCGTCAACATTCAAACGGCCTGTCCA
++
+HH############################
diff --git a/tests/cut/paired.1.fastq b/tests/cut/paired.1.fastq
new file mode 100644
index 0000000..d6f246d
--- /dev/null
+++ b/tests/cut/paired.1.fastq
@@ -0,0 +1,12 @@
+ at read1/1 some text
+TTATTTGTCTCCAGC
++
+##HHHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACA
++
+HHHHHHHHHHHHHHHHHHHH
+ at read4/1
+GACAGGCCGTTTGAATGTTGACGGGATGTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/paired.2.fastq b/tests/cut/paired.2.fastq
new file mode 100644
index 0000000..eb4df03
--- /dev/null
+++ b/tests/cut/paired.2.fastq
@@ -0,0 +1,12 @@
+ at read1/2 other text
+GCTGGAGACAAATAA
++
+HHHHHHHHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGG
++
+#HHHHHHHHHHHHHHHHHHH
+ at read4/2
+CATCCCGTCAACATTCAAACGGCCTGTCCA
++
+HH############################
diff --git a/tests/cut/paired.m14.1.fastq b/tests/cut/paired.m14.1.fastq
new file mode 100644
index 0000000..d6f246d
--- /dev/null
+++ b/tests/cut/paired.m14.1.fastq
@@ -0,0 +1,12 @@
+ at read1/1 some text
+TTATTTGTCTCCAGC
++
+##HHHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACA
++
+HHHHHHHHHHHHHHHHHHHH
+ at read4/1
+GACAGGCCGTTTGAATGTTGACGGGATGTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/paired.m14.2.fastq b/tests/cut/paired.m14.2.fastq
new file mode 100644
index 0000000..3cb5248
--- /dev/null
+++ b/tests/cut/paired.m14.2.fastq
@@ -0,0 +1,12 @@
+ at read1/2 other text
+GCTGGAGACAAATAACAGTGGAGTAGTTTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGGCAGTG
++
+#HHHHHHHHHHHHHHHHHHHHHHHH
+ at read4/2
+CATCCCGTCAACATTCAAACGGCCTGTCCA
++
+HH############################
diff --git a/tests/cut/pairedq.1.fastq b/tests/cut/pairedq.1.fastq
new file mode 100644
index 0000000..e248176
--- /dev/null
+++ b/tests/cut/pairedq.1.fastq
@@ -0,0 +1,8 @@
+ at read1/1 some text
+TTATTTGTCTCCAGC
++
+##HHHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACA
++
+HHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/pairedq.2.fastq b/tests/cut/pairedq.2.fastq
new file mode 100644
index 0000000..306314e
--- /dev/null
+++ b/tests/cut/pairedq.2.fastq
@@ -0,0 +1,8 @@
+ at read1/2 other text
+GCTGGAGACAAATAA
++
+HHHHHHHHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGG
++
+#HHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/pairedu.1.fastq b/tests/cut/pairedu.1.fastq
new file mode 100644
index 0000000..7688970
--- /dev/null
+++ b/tests/cut/pairedu.1.fastq
@@ -0,0 +1,16 @@
+ at read1/1 some text
+TTTGTCTCCAGCTTAGACATATCGCC
++
+HHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read2/1
+CAGGCCACATTAGACATATCGGATGG
++
+HHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/1
+ACTTGATATTAATAACATTAGAC
++
+HHHHHHHHHHHHHHHHHHHHHHH
+ at read4/1
+AGGCCGTTTGAATGTTGACGGGATGT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHH
diff --git a/tests/cut/pairedu.2.fastq b/tests/cut/pairedu.2.fastq
new file mode 100644
index 0000000..dbd88d7
--- /dev/null
+++ b/tests/cut/pairedu.2.fastq
@@ -0,0 +1,16 @@
+ at read1/2 other text
+GAGACAAATAACAGTGGAGTAGTT
++
+HHHHHHHHHHHHHHHHHHHHHHHH
+ at read2/2
+GCCTGTTGCAGTGGAGTAACTCCA
++
+HHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/2
+ATTAATATCAAGTTGGCAG
++
+HHHHHHHHHHHHHHHHHHH
+ at read4/2
+CCGTCAACATTCAAACGGCCTGTC
++
+########################
diff --git a/tests/cut/plus.fastq b/tests/cut/plus.fastq
new file mode 100644
index 0000000..35849f8
--- /dev/null
+++ b/tests/cut/plus.fastq
@@ -0,0 +1,8 @@
+ at first_sequence some other text
+SEQUENCE1
++first_sequence some other text
+:6;;8<=:<
+ at second_sequence and more text
+SEQUENCE2
++second_sequence and more text
+83<??:(61
diff --git a/tests/cut/polya.fasta b/tests/cut/polya.fasta
new file mode 100644
index 0000000..9b12d5c
--- /dev/null
+++ b/tests/cut/polya.fasta
@@ -0,0 +1,2 @@
+>polyAlong
+CTTAGTTCAATWTTAACCAAACTTCAGAACAG
diff --git a/tests/cut/rest.fa b/tests/cut/rest.fa
new file mode 100644
index 0000000..79c64cd
--- /dev/null
+++ b/tests/cut/rest.fa
@@ -0,0 +1,18 @@
+>read1
+TESTING
+>read2
+TESTING
+>read3
+TESTING
+>read4
+TESTING
+>read5
+TESTING
+>read6
+SOMETHING
+>read7
+SOMETHING
+>read8
+REST
+>read9
+NOREST
diff --git a/tests/cut/restfront.fa b/tests/cut/restfront.fa
new file mode 100644
index 0000000..8b51e6c
--- /dev/null
+++ b/tests/cut/restfront.fa
@@ -0,0 +1,18 @@
+>read1
+REST1
+>read2
+RESTING
+>read3
+
+>read4
+RESTLESS
+>read5
+RESTORE
+>read6
+SOMETHING
+>read7
+SOMETHING
+>read8
+SOMETHING
+>read9
+NOREST
diff --git a/tests/cut/s_1_sequence.txt b/tests/cut/s_1_sequence.txt
new file mode 100644
index 0000000..f728223
--- /dev/null
+++ b/tests/cut/s_1_sequence.txt
@@ -0,0 +1,8 @@
+ at first_sequence
+SEQUENCE1
++
+:6;;8<=:<
+ at second_sequence
+SEQUENCE2
++
+83<??:(61
diff --git a/tests/cut/small.fasta b/tests/cut/small.fasta
new file mode 100644
index 0000000..dde4ba1
--- /dev/null
+++ b/tests/cut/small.fasta
@@ -0,0 +1,6 @@
+>prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGA
+>prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCC
+>prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
diff --git a/tests/cut/small.fastq b/tests/cut/small.fastq
new file mode 100644
index 0000000..a3437d1
--- /dev/null
+++ b/tests/cut/small.fastq
@@ -0,0 +1,12 @@
+ at prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGA
++
+)3%)&&&&!.1&(6:<'67..*,:
+ at prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCC
++
+;<:&:A;A!9<<<,7:<=3=;:
+ at prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/cut/small.trimmed.fastq b/tests/cut/small.trimmed.fastq
new file mode 100644
index 0000000..ecb1729
--- /dev/null
+++ b/tests/cut/small.trimmed.fastq
@@ -0,0 +1,8 @@
+ at prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGA
++
+)3%)&&&&!.1&(6:<'67..*,:
+ at prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCC
++
+;<:&:A;A!9<<<,7:<=3=;:
diff --git a/tests/cut/small.untrimmed.fastq b/tests/cut/small.untrimmed.fastq
new file mode 100644
index 0000000..d3668fd
--- /dev/null
+++ b/tests/cut/small.untrimmed.fastq
@@ -0,0 +1,4 @@
+ at prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/cut/solid-no-zerocap.fastq b/tests/cut/solid-no-zerocap.fastq
new file mode 100644
index 0000000..c666d5c
--- /dev/null
+++ b/tests/cut/solid-no-zerocap.fastq
@@ -0,0 +1,120 @@
+ at 1_13_85_F3
+T110020300.0113010210002110102330021
++
+7&9<&77)& <7))%4'657-1+9;9,.<8);.;8
+ at 1_13_573_F3
+T312311200.30213011011132
++
+6)3%)&&&& .1&(6:<'67..*,
+ at 1_13_1259_F3
+T002112130.201222332211
++
+=;<:&:A;A 9<<<,7:<=3=;
+ at 1_13_1440_F3
+T110020313.1113211010332111302330001
++
+=<=A:A=57 7<';<6?5;;6:+:=)71>70<,=:
+ at 1_14_177_F3
+T31330222020233321121323302013303311
++
+:8957;;54)'98924905;;)6:7;1:3<88(9:
+ at 1_14_238_F3
+T0133103120031002212223
++
+?><5=;<<<12>=<;1;;=5);
+ at 1_15_1098_F3
+T32333033222233020223032312232220332
++
+#,##(#5##*#($$'#.##)$&#%)$1##-$&##%
+ at 1_16_404_F3
+T03310320002130202331112
++
+78;:;;><>9=9;<<2=><<1;5
+ at 1_16_904_F3
+T21230102331022312232132021122111212
++
+9>=::6;;99=+/'$+#.#&%$&'(($1*$($.#.
+ at 1_16_1315_F3
+T032312311122103330103103
++
+<9<8A?>?::;6&,%;6/)8<<#/
+ at 1_16_1595_F3
+T22323211312111230022210011213302012
++
+>,<=<>@6<;?<=>:/=.>&;;8;)17:=&,>1=+
+ at 1_17_1379_F3
+T32011212111223230232132311321200123
++
+/-1179<1;>>8:':7-%/::0&+=<29,7<8(,2
+ at 1_18_1692_F3
+T12322233031100211233323300112200210
++
+.#(###5%)%2)',2&:+#+&5,($/1#&4&))$6
+ at 1_19_171_F3
+T10101101220213201111011320201230032
++
+)6:65/=3*:(8%)%2>&8&%;%0&#;$3$&:$#&
+ at 1_22_72_F3
+T13303032323221212301322233320210233
++
+3/#678<:.=9::6:(<538295;9+;&*;)+',&
+ at 1_22_1377_F3
+T22221333311222312201132312022322300
++
+)##0%.$.1*%,)95+%%14%$#8-###9-()#9+
+ at 1_23_585_F3
+T300103103101303121221
++
+>55;8><96/18?)<3<58<5
+ at 1_23_809_F3
+T13130101101021211013220302223302112
++
+:7<59@;<<5;/9;=<;7::.)&&&827(+221%(
+ at 1_24_138_F3
+T33211130100120323002
++
+6)68/;906#,25/&;<$0+
+ at 1_24_206_F3
+T33330332002223002020303331321221000
++
+))4(&)9592)#)694(,)292:(=7$.18,()65
+ at 1_25_143_F3
+T23202003031200220301303302012203132
++
+:4;/#&<9;&*;95-7;85&;587#16>%&,9<2&
+ at 1_25_1866_F3
+T03201321022131101112012330221130311
++
+=<>9;<@7?(=6,<&?=6=(=<641:?'<1=;':4
+ at 1_27_584_F3
+T10010330110103213112323303012103101
++
+82'('*.-8+%#2)(-&3.,.2,),+.':&,'(&/
+ at 1_27_1227_F3
+T02003022123001003201002031303302011
++
+492:;>A:<;34<<=);:<<;9=7<3::<::3=>'
+ at 1_27_1350_F3
+T13130101101021211013220222221301231
++
+95,)<(4./;<938=64=+2/,.4),3':97#33&
+ at 1_29_477_F3
+T13130101101021211013300302223003030
++
+94=55:75=+:/7><968;;#&+$#3&6,#1#4#'
+ at 1_30_882_F3
+T20102033000233
++
+2(+-:-3<;5##/;
+ at 1_31_221_F3
+T03301311201100030300100233220102031
++
+89>9>5<139/,&:7969972.274&%:78&&746
+ at 1_31_1313_F3
+T0133113130033012232100010101
++
+;3<7=7::)5*4=&;<7>4;795065;9
+ at 1_529_129_F3
+T132222301020322102101322221322302.3302.3.3..221..3
++
+>>%/((B6-&5A0:6)>;'1)B*38/?(5=%B+ &<-9 % @ )%) (
diff --git a/tests/cut/solid.fasta b/tests/cut/solid.fasta
new file mode 100644
index 0000000..5428e58
--- /dev/null
+++ b/tests/cut/solid.fasta
@@ -0,0 +1,4 @@
+>problem1
+T0112021202222201123121023103020
+>problem2
+T20201030313112322220210
diff --git a/tests/cut/solid.fastq b/tests/cut/solid.fastq
new file mode 100644
index 0000000..ab2927a
--- /dev/null
+++ b/tests/cut/solid.fastq
@@ -0,0 +1,120 @@
+ at 1_13_85_F3
+T110020300.0113010210002110102330021
++
+7&9<&77)&!<7))%4'657-1+9;9,.<8);.;8
+ at 1_13_573_F3
+T312311200.30213011011132
++
+6)3%)&&&&!.1&(6:<'67..*,
+ at 1_13_1259_F3
+T002112130.201222332211
++
+=;<:&:A;A!9<<<,7:<=3=;
+ at 1_13_1440_F3
+T110020313.1113211010332111302330001
++
+=<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at 1_14_177_F3
+T31330222020233321121323302013303311
++
+:8957;;54)'98924905;;)6:7;1:3<88(9:
+ at 1_14_238_F3
+T0133103120031002212223
++
+?><5=;<<<12>=<;1;;=5);
+ at 1_15_1098_F3
+T32333033222233020223032312232220332
++
+#,##(#5##*#($$'#.##)$&#%)$1##-$&##%
+ at 1_16_404_F3
+T03310320002130202331112
++
+78;:;;><>9=9;<<2=><<1;5
+ at 1_16_904_F3
+T21230102331022312232132021122111212
++
+9>=::6;;99=+/'$+#.#&%$&'(($1*$($.#.
+ at 1_16_1315_F3
+T032312311122103330103103
++
+<9<8A?>?::;6&,%;6/)8<<#/
+ at 1_16_1595_F3
+T22323211312111230022210011213302012
++
+>,<=<>@6<;?<=>:/=.>&;;8;)17:=&,>1=+
+ at 1_17_1379_F3
+T32011212111223230232132311321200123
++
+/-1179<1;>>8:':7-%/::0&+=<29,7<8(,2
+ at 1_18_1692_F3
+T12322233031100211233323300112200210
++
+.#(###5%)%2)',2&:+#+&5,($/1#&4&))$6
+ at 1_19_171_F3
+T10101101220213201111011320201230032
++
+)6:65/=3*:(8%)%2>&8&%;%0&#;$3$&:$#&
+ at 1_22_72_F3
+T13303032323221212301322233320210233
++
+3/#678<:.=9::6:(<538295;9+;&*;)+',&
+ at 1_22_1377_F3
+T22221333311222312201132312022322300
++
+)##0%.$.1*%,)95+%%14%$#8-###9-()#9+
+ at 1_23_585_F3
+T300103103101303121221
++
+>55;8><96/18?)<3<58<5
+ at 1_23_809_F3
+T13130101101021211013220302223302112
++
+:7<59@;<<5;/9;=<;7::.)&&&827(+221%(
+ at 1_24_138_F3
+T33211130100120323002
++
+6)68/;906#,25/&;<$0+
+ at 1_24_206_F3
+T33330332002223002020303331321221000
++
+))4(&)9592)#)694(,)292:(=7$.18,()65
+ at 1_25_143_F3
+T23202003031200220301303302012203132
++
+:4;/#&<9;&*;95-7;85&;587#16>%&,9<2&
+ at 1_25_1866_F3
+T03201321022131101112012330221130311
++
+=<>9;<@7?(=6,<&?=6=(=<641:?'<1=;':4
+ at 1_27_584_F3
+T10010330110103213112323303012103101
++
+82'('*.-8+%#2)(-&3.,.2,),+.':&,'(&/
+ at 1_27_1227_F3
+T02003022123001003201002031303302011
++
+492:;>A:<;34<<=);:<<;9=7<3::<::3=>'
+ at 1_27_1350_F3
+T13130101101021211013220222221301231
++
+95,)<(4./;<938=64=+2/,.4),3':97#33&
+ at 1_29_477_F3
+T13130101101021211013300302223003030
++
+94=55:75=+:/7><968;;#&+$#3&6,#1#4#'
+ at 1_30_882_F3
+T20102033000233
++
+2(+-:-3<;5##/;
+ at 1_31_221_F3
+T03301311201100030300100233220102031
++
+89>9>5<139/,&:7969972.274&%:78&&746
+ at 1_31_1313_F3
+T0133113130033012232100010101
++
+;3<7=7::)5*4=&;<7>4;795065;9
+ at 1_529_129_F3
+T132222301020322102101322221322302.3302.3.3..221..3
++
+>>%/((B6-&5A0:6)>;'1)B*38/?(5=%B+!&<-9!%!@!!)%)!!(
diff --git a/tests/cut/solid5p-anchored.fasta b/tests/cut/solid5p-anchored.fasta
new file mode 100644
index 0000000..a779451
--- /dev/null
+++ b/tests/cut/solid5p-anchored.fasta
@@ -0,0 +1,32 @@
+>read1
+212322332333012001112122203233202221000211
+>read2
+01212322332333200121311212133113001311002032
+>read3
+2201212322332333211133003002232323010012320300
+>read4
+02010102312033021011121312131
+>read5
+21313210102120020302022233110
+>read6
+31203203013323021010020301321
+>read7
+1301020302201212322332333203020130202120211322010013211
+>read8
+310321030130120302201212322332333232202123123111113113003200330
+>read9
+002132103320302201212322332333020123133023120320131020333011
+>read10
+0322031320033220302201212322332333201130233321321011303133231200
+>read11
+02010102312033021011121312131
+>read12
+1
+>read13
+
+>read14
+
+>read15
+
+>read16
+
diff --git a/tests/cut/solid5p-anchored.fastq b/tests/cut/solid5p-anchored.fastq
new file mode 100644
index 0000000..c1da73d
--- /dev/null
+++ b/tests/cut/solid5p-anchored.fastq
@@ -0,0 +1,64 @@
+ at read1
+212322332333012001112122203233202221000211
++
+58)2";%4A,8>0;9C\'?276>#)49"<,>?/\'!A4$.%+
+ at read2
+01212322332333200121311212133113001311002032
++
+4<@;(<3.37/''=:-9AA<&C2%$$;?A&5!C69:?-;&;65.
+ at read3
+2201212322332333211133003002232323010012320300
++
+!<A-BB&A/)'103&2$!00>#97*B.0A-@(*","B3><4&16(:
+ at read4
+02010102312033021011121312131
++
+&-81+%)7;<)6?83!&CB9"9B6307=&
+ at read5
+21313210102120020302022233110
++
+9)27,(-*=,#4:;"/4++5<, at -784*'
+ at read6
+31203203013323021010020301321
++
+!.;:C%97@>75-";';*)A67CCC")$*
+ at read7
+1301020302201212322332333203020130202120211322010013211
++
+;0B at A"98!<=!*;5;650;';79!+8,4(2=+98:B at C@:+3*>2+6+2++C0.
+ at read8
+310321030130120302201212322332333232202123123111113113003200330
++
+/$-"=6+1.8?AB!?'#.585 at 6:47@?>.315A-'9<%">6,+)*,)1-;:(691>?C)4A;
+ at read9
+002132103320302201212322332333020123133023120320131020333011
++
+&?527&:=;6 at 6@03%95(-0#$:B8::B*4?@&)6>79C>)6C'5-#<!B:>0:A8+2*
+ at read10
+0322031320033220302201212322332333201130233321321011303133231200
++
+53)>2.+9?7%=&21;8!820961%3#0'5C.28347,2(55*1.,>%:(1A'A5=@7&&5?4'
+ at read11
+02010102312033021011121312131
++
+8B"195'@,@&:5=7;!&-9:%<!)>((>
+ at read12
+1
++
+C
+ at read13
+
++
+
+ at read14
+
++
+
+ at read15
+
++
+
+ at read16
+
++
+
diff --git a/tests/cut/solid5p-anchored.notrim.fasta b/tests/cut/solid5p-anchored.notrim.fasta
new file mode 100644
index 0000000..bdfe76d
--- /dev/null
+++ b/tests/cut/solid5p-anchored.notrim.fasta
@@ -0,0 +1,32 @@
+>read1
+T1212322332333012001112122203233202221000211
+>read2
+T201212322332333200121311212133113001311002032
+>read3
+T02201212322332333211133003002232323010012320300
+>read4
+T302010102312033021011121312131
+>read5
+T121313210102120020302022233110
+>read6
+T331203203013323021010020301321
+>read7
+T21301020302201212322332333203020130202120211322010013211
+>read8
+T2310321030130120302201212322332333232202123123111113113003200330
+>read9
+T0002132103320302201212322332333020123133023120320131020333011
+>read10
+T00322031320033220302201212322332333201130233321321011303133231200
+>read11
+T402010102312033021011121312131
+>read12
+T11
+>read13
+T1
+>read14
+T
+>read15
+T
+>read16
+T
diff --git a/tests/cut/solid5p-anchored.notrim.fastq b/tests/cut/solid5p-anchored.notrim.fastq
new file mode 100644
index 0000000..946aa9c
--- /dev/null
+++ b/tests/cut/solid5p-anchored.notrim.fastq
@@ -0,0 +1,64 @@
+ at read1
+T1212322332333012001112122203233202221000211
++
+:58)2";%4A,8>0;9C\'?276>#)49"<,>?/\'!A4$.%+
+ at read2
+T201212322332333200121311212133113001311002032
++
+44<@;(<3.37/''=:-9AA<&C2%$$;?A&5!C69:?-;&;65.
+ at read3
+T02201212322332333211133003002232323010012320300
++
+2!<A-BB&A/)'103&2$!00>#97*B.0A-@(*","B3><4&16(:
+ at read4
+T302010102312033021011121312131
++
+<&-81+%)7;<)6?83!&CB9"9B6307=&
+ at read5
+T121313210102120020302022233110
++
+$9)27,(-*=,#4:;"/4++5<, at -784*'
+ at read6
+T331203203013323021010020301321
++
+4!.;:C%97@>75-";';*)A67CCC")$*
+ at read7
+T21301020302201212322332333203020130202120211322010013211
++
+,;0B at A"98!<=!*;5;650;';79!+8,4(2=+98:B at C@:+3*>2+6+2++C0.
+ at read8
+T2310321030130120302201212322332333232202123123111113113003200330
++
+C/$-"=6+1.8?AB!?'#.585 at 6:47@?>.315A-'9<%">6,+)*,)1-;:(691>?C)4A;
+ at read9
+T0002132103320302201212322332333020123133023120320131020333011
++
+(&?527&:=;6 at 6@03%95(-0#$:B8::B*4?@&)6>79C>)6C'5-#<!B:>0:A8+2*
+ at read10
+T00322031320033220302201212322332333201130233321321011303133231200
++
+&53)>2.+9?7%=&21;8!820961%3#0'5C.28347,2(55*1.,>%:(1A'A5=@7&&5?4'
+ at read11
+T402010102312033021011121312131
++
+&8B"195'@,@&:5=7;!&-9:%<!)>((>
+ at read12
+T11
++
+?C
+ at read13
+T1
++
+C
+ at read14
+T
++
+
+ at read15
+T
++
+
+ at read16
+T
++
+
diff --git a/tests/cut/solid5p.fasta b/tests/cut/solid5p.fasta
new file mode 100644
index 0000000..29c26a6
--- /dev/null
+++ b/tests/cut/solid5p.fasta
@@ -0,0 +1,32 @@
+>read1
+12001112122203233202221000211
+>read2
+00121311212133113001311002032
+>read3
+11133003002232323010012320300
+>read4
+02010102312033021011121312131
+>read5
+21313210102120020302022233110
+>read6
+31203203013323021010020301321
+>read7
+03020130202120211322010013211
+>read8
+32202123123111113113003200330
+>read9
+20123133023120320131020333011
+>read10
+01130233321321011303133231200
+>read11
+02010102312033021011121312131
+>read12
+1
+>read13
+
+>read14
+
+>read15
+
+>read16
+
diff --git a/tests/cut/solid5p.fastq b/tests/cut/solid5p.fastq
new file mode 100644
index 0000000..5849d87
--- /dev/null
+++ b/tests/cut/solid5p.fastq
@@ -0,0 +1,64 @@
+ at read1
+12001112122203233202221000211
++
+;9C\'?276>#)49"<,>?/\'!A4$.%+
+ at read2
+00121311212133113001311002032
++
+-9AA<&C2%$$;?A&5!C69:?-;&;65.
+ at read3
+11133003002232323010012320300
++
+!00>#97*B.0A-@(*","B3><4&16(:
+ at read4
+02010102312033021011121312131
++
+&-81+%)7;<)6?83!&CB9"9B6307=&
+ at read5
+21313210102120020302022233110
++
+9)27,(-*=,#4:;"/4++5<, at -784*'
+ at read6
+31203203013323021010020301321
++
+!.;:C%97@>75-";';*)A67CCC")$*
+ at read7
+03020130202120211322010013211
++
+8,4(2=+98:B at C@:+3*>2+6+2++C0.
+ at read8
+32202123123111113113003200330
++
+-'9<%">6,+)*,)1-;:(691>?C)4A;
+ at read9
+20123133023120320131020333011
++
+?@&)6>79C>)6C'5-#<!B:>0:A8+2*
+ at read10
+01130233321321011303133231200
++
+47,2(55*1.,>%:(1A'A5=@7&&5?4'
+ at read11
+02010102312033021011121312131
++
+8B"195'@,@&:5=7;!&-9:%<!)>((>
+ at read12
+1
++
+C
+ at read13
+
++
+
+ at read14
+
++
+
+ at read15
+
++
+
+ at read16
+
++
+
diff --git a/tests/cut/solidbfast.fastq b/tests/cut/solidbfast.fastq
new file mode 100644
index 0000000..c9117c4
--- /dev/null
+++ b/tests/cut/solidbfast.fastq
@@ -0,0 +1,120 @@
+ at abc:1_13_85
+T110020300.0113010210002110102330021
++
+7&9<&77)&!<7))%4'657-1+9;9,.<8);.;8
+ at abc:1_13_573
+T312311200.30213011011132
++
+6)3%)&&&&!.1&(6:<'67..*,
+ at abc:1_13_1259
+T002112130.201222332211
++
+=;<:&:A;A!9<<<,7:<=3=;
+ at abc:1_13_1440
+T110020313.1113211010332111302330001
++
+=<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at abc:1_14_177
+T31330222020233321121323302013303311
++
+:8957;;54)'98924905;;)6:7;1:3<88(9:
+ at abc:1_14_238
+T0133103120031002212223
++
+?><5=;<<<12>=<;1;;=5);
+ at abc:1_15_1098
+T32333033222233020223032312232220332
++
+#,##(#5##*#($$'#.##)$&#%)$1##-$&##%
+ at abc:1_16_404
+T03310320002130202331112
++
+78;:;;><>9=9;<<2=><<1;5
+ at abc:1_16_904
+T21230102331022312232132021122111212
++
+9>=::6;;99=+/'$+#.#&%$&'(($1*$($.#.
+ at abc:1_16_1315
+T032312311122103330103103
++
+<9<8A?>?::;6&,%;6/)8<<#/
+ at abc:1_16_1595
+T22323211312111230022210011213302012
++
+>,<=<>@6<;?<=>:/=.>&;;8;)17:=&,>1=+
+ at abc:1_17_1379
+T32011212111223230232132311321200123
++
+/-1179<1;>>8:':7-%/::0&+=<29,7<8(,2
+ at abc:1_18_1692
+T12322233031100211233323300112200210
++
+.#(###5%)%2)',2&:+#+&5,($/1#&4&))$6
+ at abc:1_19_171
+T10101101220213201111011320201230032
++
+)6:65/=3*:(8%)%2>&8&%;%0&#;$3$&:$#&
+ at abc:1_22_72
+T13303032323221212301322233320210233
++
+3/#678<:.=9::6:(<538295;9+;&*;)+',&
+ at abc:1_22_1377
+T22221333311222312201132312022322300
++
+)##0%.$.1*%,)95+%%14%$#8-###9-()#9+
+ at abc:1_23_585
+T300103103101303121221
++
+>55;8><96/18?)<3<58<5
+ at abc:1_23_809
+T13130101101021211013220302223302112
++
+:7<59@;<<5;/9;=<;7::.)&&&827(+221%(
+ at abc:1_24_138
+T33211130100120323002
++
+6)68/;906#,25/&;<$0+
+ at abc:1_24_206
+T33330332002223002020303331321221000
++
+))4(&)9592)#)694(,)292:(=7$.18,()65
+ at abc:1_25_143
+T23202003031200220301303302012203132
++
+:4;/#&<9;&*;95-7;85&;587#16>%&,9<2&
+ at abc:1_25_1866
+T03201321022131101112012330221130311
++
+=<>9;<@7?(=6,<&?=6=(=<641:?'<1=;':4
+ at abc:1_27_584
+T10010330110103213112323303012103101
++
+82'('*.-8+%#2)(-&3.,.2,),+.':&,'(&/
+ at abc:1_27_1227
+T02003022123001003201002031303302011
++
+492:;>A:<;34<<=);:<<;9=7<3::<::3=>'
+ at abc:1_27_1350
+T13130101101021211013220222221301231
++
+95,)<(4./;<938=64=+2/,.4),3':97#33&
+ at abc:1_29_477
+T13130101101021211013300302223003030
++
+94=55:75=+:/7><968;;#&+$#3&6,#1#4#'
+ at abc:1_30_882
+T20102033000233
++
+2(+-:-3<;5##/;
+ at abc:1_31_221
+T03301311201100030300100233220102031
++
+89>9>5<139/,&:7969972.274&%:78&&746
+ at abc:1_31_1313
+T0133113130033012232100010101
++
+;3<7=7::)5*4=&;<7>4;795065;9
+ at abc:1_529_129
+T132222301020322102101322221322302.3302.3.3..221..3
++
+>>%/((B6-&5A0:6)>;'1)B*38/?(5=%B+!&<-9!%!@!!)%)!!(
diff --git a/tests/cut/solidmaq.fastq b/tests/cut/solidmaq.fastq
new file mode 100644
index 0000000..195be3d
--- /dev/null
+++ b/tests/cut/solidmaq.fastq
@@ -0,0 +1,120 @@
+ at 552:1_13_85/1
+CAAGATAANACCTACAGCAAAGCCACAGTTAAGC
++
+&9<&77)&!<7))%4'657-1+9;9,.<8);.;8
+ at 552:1_13_573/1
+CGTCCGAANTAGCTACCACCCTG
++
+)3%)&&&&!.1&(6:<'67..*,
+ at 552:1_13_1259/1
+AGCCGCTANGACGGGTTGGCC
++
+;<:&:A;A!9<<<,7:<=3=;
+ at 552:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at 552:1_14_177/1
+CTTAGGGAGAGTTTGCCGCTGTTAGACTTATTCC
++
+8957;;54)'98924905;;)6:7;1:3<88(9:
+ at 552:1_14_238/1
+CTTCATCGAATCAAGGCGGGT
++
+><5=;<<<12>=<;1;;=5);
+ at 552:1_15_1098/1
+GTTTATTGGGGTTAGAGGTATGTCGGTGGGATTG
++
+,##(#5##*#($$'#.##)$&#%)$1##-$&##%
+ at 552:1_16_404/1
+TTCATGAAAGCTAGAGTTCCCG
++
+8;:;;><>9=9;<<2=><<1;5
+ at 552:1_16_904/1
+CGTACAGTTCAGGTCGGTGCTGAGCCGGCCCGCG
++
+>=::6;;99=+/'$+#.#&%$&'(($1*$($.#.
+ at 552:1_16_1315/1
+TGTCGTCCCGGCATTTACATCAT
++
+9<8A?>?::;6&,%;6/)8<<#/
+ at 552:1_16_1595/1
+GTGTGCCTCGCCCGTAAGGGCAACCGCTTAGACG
++
+,<=<>@6<;?<=>:/=.>&;;8;)17:=&,>1=+
+ at 552:1_17_1379/1
+GACCGCGCCCGGTGTAGTGCTGTCCTGCGAACGT
++
+-1179<1;>>8:':7-%/::0&+=<29,7<8(,2
+ at 552:1_18_1692/1
+GTGGGTTATCCAAGCCGTTTGTTAACCGGAAGCA
++
+#(###5%)%2)',2&:+#+&5,($/1#&4&))$6
+ at 552:1_19_171/1
+ACACCACGGAGCTGACCCCACCTGAGACGTAATG
++
+6:65/=3*:(8%)%2>&8&%;%0&#;$3$&:$#&
+ at 552:1_22_72/1
+TTATATGTGTGGCGCGTACTGGGTTTGAGCAGTT
++
+/#678<:.=9::6:(<538295;9+;&*;)+',&
+ at 552:1_22_1377/1
+GGGCTTTTCCGGGTCGGACCTGTCGAGGTGGTAA
++
+##0%.$.1*%,)95+%%14%$#8-###9-()#9+
+ at 552:1_23_585/1
+AACATCATCACTATCGCGGC
++
+55;8><96/18?)<3<58<5
+ at 552:1_23_809/1
+TCTACACCACAGCGCCACTGGATAGGGTTAGCCG
++
+7<59@;<<5;/9;=<;7::.)&&&827(+221%(
+ at 552:1_24_138/1
+TGCCCTACAACGATGTAAG
++
+)68/;906#,25/&;<$0+
+ at 552:1_24_206/1
+TTTATTGAAGGGTAAGAGATATTTCTGCGGCAAA
++
+)4(&)9592)#)694(,)292:(=7$.18,()65
+ at 552:1_25_143/1
+TGAGAATATCGAAGGATACTATTAGACGGATCTG
++
+4;/#&<9;&*;95-7;85&;587#16>%&,9<2&
+ at 552:1_25_1866/1
+TGACTGCAGGCTCCACCCGACGTTAGGCCTATCC
++
+<>9;<@7?(=6,<&?=6=(=<641:?'<1=;':4
+ at 552:1_27_584/1
+AACATTACCACATGCTCCGTGTTATACGCATCAC
++
+2'('*.-8+%#2)(-&3.,.2,),+.':&,'(&/
+ at 552:1_27_1227/1
+GAATAGGCGTAACAATGACAAGATCTATTAGACC
++
+92:;>A:<;34<<=);:<<;9=7<3::<::3=>'
+ at 552:1_27_1350/1
+TCTACACCACAGCGCCACTGGAGGGGGCTACGTC
++
+5,)<(4./;<938=64=+2/,.4),3':97#33&
+ at 552:1_29_477/1
+TCTACACCACAGCGCCACTTAATAGGGTAATATA
++
+4=55:75=+:/7><968;;#&+$#3&6,#1#4#'
+ at 552:1_30_882/1
+ACAGATTAAAGTT
++
+(+-:-3<;5##/;
+ at 552:1_31_221/1
+TTACTCCGACCAAATATAACAAGTTGGACAGATC
++
+9>9>5<139/,&:7969972.274&%:78&&746
+ at 552:1_31_1313/1
+CTTCCTCTAATTACGGTGCAAACACAC
++
+3<7=7::)5*4=&;<7>4;795065;9
+ at 552:1_529_129/1
+TGGGGTACAGATGGCAGCACTGGGGCTGGTAGNTTAGNTNTNNGGCNNT
++
+>%/((B6-&5A0:6)>;'1)B*38/?(5=%B+!&<-9!%!@!!)%)!!(
diff --git a/tests/cut/solidqual.fastq b/tests/cut/solidqual.fastq
new file mode 100644
index 0000000..80f4714
--- /dev/null
+++ b/tests/cut/solidqual.fastq
@@ -0,0 +1,120 @@
+ at 1_13_85_F3
+T110020300.0113010210002110102330021
++
+7&9<&77)&!<7))%4'657-1+9;9,.<8);.;8
+ at 1_13_573_F3
+T312311200.3021301101113203302010003
++
+6)3%)&&&&!.1&(6:<'67..*,:75)'77&&&5
+ at 1_13_1259_F3
+T002112130.201222332211133020123031
++
+=;<:&:A;A!9<<<,7:<=3=;:<&<?<?8<;=<
+ at 1_13_1440_F3
+T110020313.1113211010332111302330001
++
+=<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at 1_14_177_F3
+T31330222020233321121323302013303311
++
+:8957;;54)'98924905;;)6:7;1:3<88(9:
+ at 1_14_238_F3
+T01331031200310022122230330201030313
++
+?><5=;<<<12>=<;1;;=5);.;14:0>2;:3;7
+ at 1_15_1098_F3
+T
++
+
+ at 1_16_404_F3
+T03310320002130202331112133020103031
++
+78;:;;><>9=9;<<2=><<1;58;9<<;>(<;<;
+ at 1_16_904_F3
+T21230102331022312232132021122111212
++
+9>=::6;;99=+/'$+#.#&%$&'(($1*$($.#.
+ at 1_16_1315_F3
+T0323123111221033301031032330201000
++
+<9<8A?>?::;6&,%;6/)8<<#/;79(448&*.
+ at 1_16_1595_F3
+T22323211312111230022210011213302012
++
+>,<=<>@6<;?<=>:/=.>&;;8;)17:=&,>1=+
+ at 1_17_1379_F3
+T32011212111223230232132311321200123
++
+/-1179<1;>>8:':7-%/::0&+=<29,7<8(,2
+ at 1_18_1692_F3
+T12322233031100211233323300112200210
++
+.#(###5%)%2)',2&:+#+&5,($/1#&4&))$6
+ at 1_19_171_F3
+T10101101220213201111011320201230
++
+)6:65/=3*:(8%)%2>&8&%;%0&#;$3$&:
+ at 1_22_72_F3
+T133030323232212123013222333202
++
+3/#678<:.=9::6:(<538295;9+;&*;
+ at 1_22_1377_F3
+T22221333311222312201132312022322300
++
+)##0%.$.1*%,)95+%%14%$#8-###9-()#9+
+ at 1_23_585_F3
+T30010310310130312122123302013303131
++
+>55;8><96/18?)<3<58<5:;96=7:1=8=:-<
+ at 1_23_809_F3
+T131301011010212110132203022233021
++
+:7<59@;<<5;/9;=<;7::.)&&&827(+221
+ at 1_24_138_F3
+T3321113010012032300203302012303131
++
+6)68/;906#,25/&;<$0+250#2,<)5,9/+7
+ at 1_24_206_F3
+T33330332002223002020303331321221000
++
+))4(&)9592)#)694(,)292:(=7$.18,()65
+ at 1_25_143_F3
+T2320200303120022030130330201220313
++
+:4;/#&<9;&*;95-7;85&;587#16>%&,9<2
+ at 1_25_1866_F3
+T03201321022131101112012330221130311
++
+=<>9;<@7?(=6,<&?=6=(=<641:?'<1=;':4
+ at 1_27_584_F3
+T10010330110103213112323303012103101
++
+82'('*.-8+%#2)(-&3.,.2,),+.':&,'(&/
+ at 1_27_1227_F3
+T0200302212300100320100203130330201
++
+492:;>A:<;34<<=);:<<;9=7<3::<::3=>
+ at 1_27_1350_F3
+T1313010110102121101322022222130123
++
+95,)<(4./;<938=64=+2/,.4),3':97#33
+ at 1_29_477_F3
+T13130101101021211013300302223
++
+94=55:75=+:/7><968;;#&+$#3&6,
+ at 1_30_882_F3
+T20102033000233133320103031311233200
++
+2(+-:-3<;5##/;:(%&84'#:,?3&&8>-();5
+ at 1_31_221_F3
+T03301311201100030300100233220102031
++
+89>9>5<139/,&:7969972.274&%:78&&746
+ at 1_31_1313_F3
+T01331131300330122321000101010330201
++
+;3<7=7::)5*4=&;<7>4;795065;9';896'=
+ at 1_529_129_F3
+T132222301020322102101322221322302.3302
++
+>>%/((B6-&5A0:6)>;'1)B*38/?(5=%B+!&<-9
diff --git a/tests/cut/sra.fastq b/tests/cut/sra.fastq
new file mode 100644
index 0000000..ea95638
--- /dev/null
+++ b/tests/cut/sra.fastq
@@ -0,0 +1,24 @@
+ at 1_13_85_F3
+T110020300.0113010210002110102330021
++
+7&9<&77)&!<7))%4'657-1+9;9,.<8);.;8
+ at 1_13_573_F3
+T312311200.30213011011132
++
+6)3%)&&&&!.1&(6:<'67..*,
+ at 1_13_1259_F3
+T002112130.201222332211
++
+=;<:&:A;A!9<<<,7:<=3=;
+ at 1_13_1440_F3
+T110020313.1113211010332111302330001
++
+=<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at 1_14_177_F3
+T31330222020233321121323302013303311
++
+:8957;;54)'98924905;;)6:7;1:3<88(9:
+ at 1_14_238_F3
+T0133103120031002212223
++
+?><5=;<<<12>=<;1;;=5);
diff --git a/tests/cut/stripped.fasta b/tests/cut/stripped.fasta
new file mode 100644
index 0000000..2ca63a2
--- /dev/null
+++ b/tests/cut/stripped.fasta
@@ -0,0 +1,4 @@
+>first
+SEQUENCE1
+>second
+SEQUENCE2
diff --git a/tests/cut/suffix.fastq b/tests/cut/suffix.fastq
new file mode 100644
index 0000000..72392e0
--- /dev/null
+++ b/tests/cut/suffix.fastq
@@ -0,0 +1,120 @@
+ at 1_13_85_my_suffix_no_adapter
+T110020300.0113010210002110102330021
++
+7&9<&77)&!<7))%4'657-1+9;9,.<8);.;8
+ at 1_13_573_my_suffix_1
+T312311200.30213011011132
++
+6)3%)&&&&!.1&(6:<'67..*,
+ at 1_13_1259_my_suffix_1
+T002112130.201222332211
++
+=;<:&:A;A!9<<<,7:<=3=;
+ at 1_13_1440_my_suffix_no_adapter
+T110020313.1113211010332111302330001
++
+=<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at 1_14_177_my_suffix_no_adapter
+T31330222020233321121323302013303311
++
+:8957;;54)'98924905;;)6:7;1:3<88(9:
+ at 1_14_238_my_suffix_1
+T0133103120031002212223
++
+?><5=;<<<12>=<;1;;=5);
+ at 1_15_1098_my_suffix_no_adapter
+T32333033222233020223032312232220332
++
+#,##(#5##*#($$'#.##)$&#%)$1##-$&##%
+ at 1_16_404_my_suffix_1
+T03310320002130202331112
++
+78;:;;><>9=9;<<2=><<1;5
+ at 1_16_904_my_suffix_no_adapter
+T21230102331022312232132021122111212
++
+9>=::6;;99=+/'$+#.#&%$&'(($1*$($.#.
+ at 1_16_1315_my_suffix_1
+T032312311122103330103103
++
+<9<8A?>?::;6&,%;6/)8<<#/
+ at 1_16_1595_my_suffix_no_adapter
+T22323211312111230022210011213302012
++
+>,<=<>@6<;?<=>:/=.>&;;8;)17:=&,>1=+
+ at 1_17_1379_my_suffix_no_adapter
+T32011212111223230232132311321200123
++
+/-1179<1;>>8:':7-%/::0&+=<29,7<8(,2
+ at 1_18_1692_my_suffix_no_adapter
+T12322233031100211233323300112200210
++
+.#(###5%)%2)',2&:+#+&5,($/1#&4&))$6
+ at 1_19_171_my_suffix_no_adapter
+T10101101220213201111011320201230032
++
+)6:65/=3*:(8%)%2>&8&%;%0&#;$3$&:$#&
+ at 1_22_72_my_suffix_no_adapter
+T13303032323221212301322233320210233
++
+3/#678<:.=9::6:(<538295;9+;&*;)+',&
+ at 1_22_1377_my_suffix_no_adapter
+T22221333311222312201132312022322300
++
+)##0%.$.1*%,)95+%%14%$#8-###9-()#9+
+ at 1_23_585_my_suffix_1
+T300103103101303121221
++
+>55;8><96/18?)<3<58<5
+ at 1_23_809_my_suffix_no_adapter
+T13130101101021211013220302223302112
++
+:7<59@;<<5;/9;=<;7::.)&&&827(+221%(
+ at 1_24_138_my_suffix_1
+T33211130100120323002
++
+6)68/;906#,25/&;<$0+
+ at 1_24_206_my_suffix_no_adapter
+T33330332002223002020303331321221000
++
+))4(&)9592)#)694(,)292:(=7$.18,()65
+ at 1_25_143_my_suffix_no_adapter
+T23202003031200220301303302012203132
++
+:4;/#&<9;&*;95-7;85&;587#16>%&,9<2&
+ at 1_25_1866_my_suffix_no_adapter
+T03201321022131101112012330221130311
++
+=<>9;<@7?(=6,<&?=6=(=<641:?'<1=;':4
+ at 1_27_584_my_suffix_no_adapter
+T10010330110103213112323303012103101
++
+82'('*.-8+%#2)(-&3.,.2,),+.':&,'(&/
+ at 1_27_1227_my_suffix_no_adapter
+T02003022123001003201002031303302011
++
+492:;>A:<;34<<=);:<<;9=7<3::<::3=>'
+ at 1_27_1350_my_suffix_no_adapter
+T13130101101021211013220222221301231
++
+95,)<(4./;<938=64=+2/,.4),3':97#33&
+ at 1_29_477_my_suffix_no_adapter
+T13130101101021211013300302223003030
++
+94=55:75=+:/7><968;;#&+$#3&6,#1#4#'
+ at 1_30_882_my_suffix_1
+T20102033000233
++
+2(+-:-3<;5##/;
+ at 1_31_221_my_suffix_no_adapter
+T03301311201100030300100233220102031
++
+89>9>5<139/,&:7969972.274&%:78&&746
+ at 1_31_1313_my_suffix_1
+T0133113130033012232100010101
++
+;3<7=7::)5*4=&;<7>4;795065;9
+ at 1_529_129_my_suffix_no_adapter
+T132222301020322102101322221322302.3302.3.3..221..3
++
+>>%/((B6-&5A0:6)>;'1)B*38/?(5=%B+!&<-9!%!@!!)%)!!(
diff --git a/tests/cut/trimN3.fasta b/tests/cut/trimN3.fasta
new file mode 100644
index 0000000..c05f5ed
--- /dev/null
+++ b/tests/cut/trimN3.fasta
@@ -0,0 +1,2 @@
+>read1
+CAGTCGGTCCTGAGAGATGGGCGAGCGCTGG
diff --git a/tests/cut/trimN5.fasta b/tests/cut/trimN5.fasta
new file mode 100644
index 0000000..b1faa5f
--- /dev/null
+++ b/tests/cut/trimN5.fasta
@@ -0,0 +1,2 @@
+>read1
+GGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAG
diff --git a/tests/cut/twoadapters.fasta b/tests/cut/twoadapters.fasta
new file mode 100644
index 0000000..c03a129
--- /dev/null
+++ b/tests/cut/twoadapters.fasta
@@ -0,0 +1,6 @@
+>read1
+GATCCTCCTGGAGCTGGCTGATACCAGTATACCAGTGCTGATTGTTG
+>read2
+CTCGAGAATTCTGGATCCTCTCTTCTGCTACCTTTGGGATTTGCTTGCTCTTG
+>read3 (no adapter)
+AATGAAGGTTGTAACCATAACAGGAAGTCATGCGCATTTAGTCGAGCACGTAAGTTCATACGGAAATGGGTAAG
diff --git a/tests/cut/twoadapters.first.fasta b/tests/cut/twoadapters.first.fasta
new file mode 100644
index 0000000..aab7419
--- /dev/null
+++ b/tests/cut/twoadapters.first.fasta
@@ -0,0 +1,2 @@
+>read1
+GATCCTCCTGGAGCTGGCTGATACCAGTATACCAGTGCTGATTGTTG
diff --git a/tests/cut/twoadapters.second.fasta b/tests/cut/twoadapters.second.fasta
new file mode 100644
index 0000000..2c491d3
--- /dev/null
+++ b/tests/cut/twoadapters.second.fasta
@@ -0,0 +1,2 @@
+>read2
+CTCGAGAATTCTGGATCCTCTCTTCTGCTACCTTTGGGATTTGCTTGCTCTTG
diff --git a/tests/cut/twoadapters.unknown.fasta b/tests/cut/twoadapters.unknown.fasta
new file mode 100644
index 0000000..88f7875
--- /dev/null
+++ b/tests/cut/twoadapters.unknown.fasta
@@ -0,0 +1,2 @@
+>read3 (no adapter)
+AATGAAGGTTGTAACCATAACAGGAAGTCATGCGCATTTAGTCGAGCACGTAAGTTCATACGGAAATGGGTAAG
diff --git a/tests/cut/unconditional-back.fastq b/tests/cut/unconditional-back.fastq
new file mode 100644
index 0000000..d03f33e
--- /dev/null
+++ b/tests/cut/unconditional-back.fastq
@@ -0,0 +1,12 @@
+ at prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGATTAGA
++
+)3%)&&&&!.1&(6:<'67..*,:75)'7
+ at prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCCTTAGACG
++
+;<:&:A;A!9<<<,7:<=3=;:<&<?<?8
+ at prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGT
++
+<=A:A=57!7<';<6?5;;6:+:=)71>7
diff --git a/tests/cut/unconditional-both.fastq b/tests/cut/unconditional-both.fastq
new file mode 100644
index 0000000..303b042
--- /dev/null
+++ b/tests/cut/unconditional-both.fastq
@@ -0,0 +1,12 @@
+ at prefix:1_13_573/1
+GAANTAGCTACCACCCTGATTAGA
++
+&&&!.1&(6:<'67..*,:75)'7
+ at prefix:1_13_1259/1
+CTANGACGGGTTGGCCCTTAGACG
++
+A;A!9<<<,7:<=3=;:<&<?<?8
+ at prefix:1_13_1440/1
+TCTNCCCTGCCACATTGCCCTAGT
++
+=57!7<';<6?5;;6:+:=)71>7
diff --git a/tests/cut/unconditional-front.fastq b/tests/cut/unconditional-front.fastq
new file mode 100644
index 0000000..383b3db
--- /dev/null
+++ b/tests/cut/unconditional-front.fastq
@@ -0,0 +1,12 @@
+ at prefix:1_13_573/1
+GAANTAGCTACCACCCTGATTAGACAAAT
++
+&&&!.1&(6:<'67..*,:75)'77&&&5
+ at prefix:1_13_1259/1
+CTANGACGGGTTGGCCCTTAGACGTATCT
++
+A;A!9<<<,7:<=3=;:<&<?<?8<;=<&
+ at prefix:1_13_1440/1
+TCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/cut/wildcard.fa b/tests/cut/wildcard.fa
new file mode 100644
index 0000000..2dae07a
--- /dev/null
+++ b/tests/cut/wildcard.fa
@@ -0,0 +1,4 @@
+>1
+TGCATGCA
+>2
+TGCATGCA
diff --git a/tests/cut/wildcardN.fa b/tests/cut/wildcardN.fa
new file mode 100644
index 0000000..ef44dbc
--- /dev/null
+++ b/tests/cut/wildcardN.fa
@@ -0,0 +1,6 @@
+>perfect
+TTT
+>withN
+TTT
+>1mism
+TTTGGGGCGG
diff --git a/tests/cut/wildcard_adapter.fa b/tests/cut/wildcard_adapter.fa
new file mode 100644
index 0000000..27d5dab
--- /dev/null
+++ b/tests/cut/wildcard_adapter.fa
@@ -0,0 +1,8 @@
+>1
+
+>2
+
+>3b
+TGGCTGGCC
+>4b
+TGGCTGGCC
diff --git a/tests/cut/wildcard_adapter_anywhere.fa b/tests/cut/wildcard_adapter_anywhere.fa
new file mode 100644
index 0000000..8ba6688
--- /dev/null
+++ b/tests/cut/wildcard_adapter_anywhere.fa
@@ -0,0 +1,8 @@
+>1
+TGCATGCA
+>2
+TGCATGCA
+>3b
+TGGCTGGCC
+>4b
+TGGCTGGCC
diff --git a/tests/data/454.fa b/tests/data/454.fa
new file mode 100644
index 0000000..92caddf
--- /dev/null
+++ b/tests/data/454.fa
@@ -0,0 +1,118 @@
+>000163_1255_2627 length=52 uaccno=E0R4ISW01DCIQD
+CCATCTCATCCCTGCGTGTCCCATCTGTTCCCTTCCTTGTCTCAGTGTGGTG
+>000652_1085_0667 length=122 uaccno=E0R4ISW01CXJXP
+ATTGAAGAGGTTGGTAAGTTTTAAGTTGGTAGGTGGTTGGGGAGTGGTTGGAGAGGAGTTGTTGGGAGTTTGTGTCCTGCTGAGACACGCAACGGGGATAGGCAAGGCACACAGGGGATAGG
+>000653_1285_1649 length=135 uaccno=E0R4ISW01DE4SJ
+AATTAGTCGAGCGTTGTGGTGGGTATTTGTAATTTTAGCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGGAGGTTGCTGAGACACGCAACAGGAGATAGGCAAGGCACACAGGGGATAGG
+>000902_0715_2005 length=92 uaccno=E0R4ISW01B03K3
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG
+>001146_1255_0340 length=92 uaccno=E0R4ISW01DCGYU
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG
+>001210_1147_1026 length=171 uaccno=E0R4ISW01C2Z5W
+TAGGGAGGTGGTGAGTGTTGTGTGTTTAGATTGTGTGTGGTGGTTGGGAGTGGGAGTTGTATTTTAGGGTGTGGGTTGGGAGAGTGAAAGTTGTGGGTGTTTTGGATGGTGGGTTAGGTGGTTGTGCCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>001278_1608_2022 length=109 uaccno=E0R4ISW01D7HW4
+CACACACACTCTTCCCCATACCTACTCACACACACACACACACACACAAACATACACAAATAATTCTGAGACACGCAACAGGAGATAGGCAAGGCACACAGGGGATAGG
+>001333_1518_1176 length=142 uaccno=E0R4ISW01DZKTM
+AATTGTCGTTTGATTGTTGGAAAGTAGAGGGTCGGGTTGGGGTAGATTCGAAAGGGGAATTTTGAGAAAAGAAATGGAGGGAGGTAGGAAAATTTTTTGCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG
+>001398_1584_1549 length=154 uaccno=E0R4ISW01D5DPB
+TAATGAAATGGAATGGAATGGAATGGAATGAAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGAAATGGAATGGAGTATAAAGGAATGGAATTACTGAGACACGCAACAGGGGAAGGCAAGGCACACAGGGGATAGG
+>001455_1136_2179 length=92 uaccno=E0R4ISW01C12AD
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG
+>001481_1165_0549 length=92 uaccno=E0R4ISW01C4KON
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG
+>001744_1376_3512 length=144 uaccno=E0R4ISW01DM5T2
+TAAGTAGGGAAGGTTTGAGGTTGTTGGTGTTGGTAGTAGGGGTGTTTTAGTTAGGGGTTGTAGTTTGTTAAGGGAATTTTATTTGAGTTTAGAATTGAGGCTGAGACACGCAAAAGGGGATAGGCAAGGCACACAGGGGATAGG
+>001893_1084_1137 length=162 uaccno=E0R4ISW01CXG4Z
+TGTATATTTTGTTGGGTTTGTATATATTGTTAGGTGTGGTTGGTGAGTTGTATTGGTGGTGGTGTAAGGTGAGTGGAAATGGGAATGGATTGTAGATATGTTGGATTTGTGGTTTTTGGTTGAGACACGAACAGGGGATAGGCAAGGCACACAGGGGATAGG
+>001927_0254_0706 length=182 uaccno=E0R4ISW01AWLLG
+TGGAATCATCTAAGGGACACAAATAGAATCATCATTGAATGGAATCGAATGGAATCATCTAATGTACTCGAATGGAATTATTATTGAATAGAATAGAATGGAATTATCGAATGGAATCAAATGGAATGTAATGGAATGCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>002007_1338_1037 length=139 uaccno=E0R4ISW01DJRTR
+GGGTTGTGTATTTGGATAGTATGTGGAAAATGGTATTAAAAAGAATTTGTAGTTGGATTGTTGGTGGTTATTTAGTTTTTGGGTAATGGGTAGATTCCTGAGACACGCAAAGGGATAGGCAAGGCACACAGGGGATAGG
+>002186_1130_0654 length=92 uaccno=E0R4ISW01C1H5C
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG
+>002282_1237_2702 length=134 uaccno=E0R4ISW01DAXWG
+AATTAGCCGGGCGTGATGGCGGGCGTTTGTAGTTTTAGTTATTCGGGAGGTTGAGGTAGGAGAATGGCGTGAATTCGGGAAGCGGAGTTTGCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG
+>002382_1259_0997 length=107 uaccno=E0R4ISW01DCT37
+TAAGGGTTGAAGCGAGGTAGGTAGTTTGTTTGTGGTTTTGTTTCGTATTTTTGTTTCGTATCCCTGAGACACGCAACAGAGGATAGGCAAGGCACACAGGGGATAGG
+>002477_0657_0655 length=174 uaccno=E0R4ISW01BVY8H
+TTTTTGGAAAGTTGGGTGGGTATAGTTTTGAGTAGTTAGAGGTATTATAATAGTATTAGGAAGTTGAATGTGAGGGTATAAGAGTTAATTTGATTTTTCGTTGATATGTTTGTTGTTTGAAGTTAGAGTGCTGAGACACGCAACAGGAGATAGGCAAGGCACACAGGGGATAGG
+>003149_1553_2333 length=170 uaccno=E0R4ISW01D2OBZ
+TATTTAGTTTTAGTTTGTTTAGGTGGTTATAGAATACGGAGTTTATGAAGTTGATTAGGAATATTATTAGTTGAATTAAGAATTGGGAAGAGAGGGGAACGGGAAGGGACGTGAGTGATTATTATTGCTGAGACACGCAAAGGGGATAGGCAAGGCACACAGGGGATAGG
+>003194_1475_2845 length=101 uaccno=E0R4ISW01DVT7J
+TATTTTGGGTTAAGTCGGGTTTAGTTGTTAGGGCGAGAAGTTAGTTGTTGACCCCTGCTGAGACACGCAAAAGGGGATAGGCAAGGCACACAGGGGATAGG
+>003206_1315_0479 length=95 uaccno=E0R4ISW01DHQPD
+GGGTTGGATAATATGATGGTGTTGGGGAATATTTAGGTATGTGGTTTGTGGCTGAGACACGCAACAGAGGATAGGCAAGGCACACAGGGGATAGG
+>003271_0173_0314 length=125 uaccno=E0R4ISW01APHAK
+GTTTATTTGTTATTTATTTTTAGGTTTAGAAGAGTGTTTGGTATTTATTGAGGATTTAGTATTTGTTAGAAGGATTGGATTCTGAGACACGCAACAGGGGGTAGGCAAGGCACACAGGGGATAGG
+>003443_1737_2250 length=67 uaccno=E0R4ISW01EITSS
+TGTAGGTTGTGTTGTAGGTTGTCCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>002633_1776_1582 length=81 uaccno=E0R4ISW01EL8JK
+CAGGGTGGATTGGGGAACACACAGTGTGGCCGCGTGATTCTGAGACACGCAACAGGGAAGGCAAGGCACACAGGGGATAGG
+>002663_0725_3154 length=126 uaccno=E0R4ISW01B1Z2S
+GCGTTTTATATTATAATTTAATATTTTGGAGGTTGGGTGCGGTGGTTTACGTTTGTAGTTTAGTATTTGGGAGGTTAAGGTAGCTGAGACACGCAACGGGGATAGGCAAGGCACACAGGGGATAGG
+>002761_1056_4055 length=121 uaccno=E0R4ISW01CU2V9
+AATTTTATTCGATTTATGTGATGATTTATTTATTTTATTTGAAGATGATTTTATTCGAGATTATTCGATGATTCCATTCCTGAGACACGCAAGGGGATAGGCAAGGCACACAGGGGATAGG
+>002843_0289_2275 length=122 uaccno=E0R4ISW01AZPE9
+ATTGAAGAGGTTGGTAAGTTTTAAGTTGGTAGGTGGTTGGGGAGTGGTTGGAGAGGAGTTGTTGGGAGTTTGTGTCCTGCTGAGACACGCAACGGGGATAGGCAAGGCACACAGGGGATAGG
+>002934_1762_2177 length=92 uaccno=E0R4ISW01EK0Q7
+GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG
+>003515_1711_1058 length=122 uaccno=E0R4ISW01EGIPG
+AATTGAATGGAATTATTATTGAATGGATTCGAATGGAATTATTATTGAATGGAATCATCGAGTGGAATCGAATGGAATCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>003541_1276_1589 length=112 uaccno=E0R4ISW01DECAV
+TAGTTTAGGGTGGTAGTTTGGATAAGGTAGTTTTACGGTTTAGTAGTAGTAGGTTAAGTAGGAAAACTGCTGAGACACGCAAAGGGGATAGGCAAGGCACACAGGGGATAGG
+>003587_1522_1804 length=152 uaccno=E0R4ISW01DZXX6
+AATTTATGTAGTGGAAGTAGGATATAAAGAATAGGTTAATGGATTTTGAGATATTAAAAAGAGTAGGAAATTAGTTGAGAGGTTAAGTAGTAGTTTATTTTAGCCACCCTGAGACACGCAACAGGAGATAGGCAAGGCACACAGGGGATAGG
+>003592_0076_0430 length=134 uaccno=E0R4ISW01AGYTC
+AATTAGTTAGGCGTGGTGGCGGGTGTTTGTAGTTTTAGTTATTCGGGAGGTTGAGGTAGGAGAATGTTGTGAATTTAGGAGGTGGAGTTTGCTGAGACACGCAACAGGGGAAGGCAAGGCACACAGGGGATAGG
+>003957_0595_0965 length=173 uaccno=E0R4ISW01BQJIV
+TAATATTAGGTGTCAATTTGACTGGATCGAGGGATGTGTGTCGGTGAGAGTCTCACTAGAGGTTGATATTTGAGTCGTTAGACTGGGAGAGGAAGACCGAACTGTCAAGTGTATGGGCGCCATCCAATTCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>003986_1127_2937 length=103 uaccno=E0R4ISW01C1AFF
+TAATGGAATGGAATTTTCGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATTACTGAGACACGCAACAGGGGAAGGCAAGGCACACAGGGGATAGG
+>004012_1559_1491 length=111 uaccno=E0R4ISW01D26M9
+TAGTGGATATAAATGGAATGGATTGGAATGGAATGGATACGAATGGAATGGATTGGAGTGGAATGGATTGACTGAGACACGCAACAGGGGGCAAGGCACACAGGGGATAGG
+>004030_1508_2061 length=166 uaccno=E0R4ISW01DYPWF
+TACGTATATACGCGTACGCGTATACGTATATACGCGTATACGTATACGCGTACGTATATATACGCGTATACGTTTACGTACGTACGCGTATATACGTACGTATACACACACGCATATGCATACTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>004038_1061_2047 length=152 uaccno=E0R4ISW01CVG5D
+AATTGATTCGAATGGAATGGATTGGAATGGAACGGATTTGAATGGAATGGATTGGAATGGAATGGATTGAATGGAATGGATTGGAGAGGATTGGATTTGAATGGAATTCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>004105_1121_0391 length=135 uaccno=E0R4ISW01C0PH1
+AATTAGTTGGGCGTGGTGGCGAGTGTTTGTAATTTTAGTTATTTAGGAGGTTGAGGTAGGAGAATTATTTGAACCCGGTAGACGGAAGTTGCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>004129_1618_3423 length=122 uaccno=E0R4ISW01D8ELT
+AATTGAATGGTATTGAAAGGTATTAATTTAGTGGAATGGAATGGAATGTATTGGAATGGAAAATAATGGAATGGAGTGCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>004203_0451_0902 length=115 uaccno=E0R4ISW01BDWC4
+TAGTTGGTGTGTTGTAATCGAGACGTAGTTGGTTGGTACGGGTTAGGGTTTTGATTGGGTTGTTGTGTTTGCTGAGACACGCAACATGGGATAGGCAAGGCACACAGGGGATAGG
+>004626_1937_0919 length=223 uaccno=E0R4ISW01E0CVD
+TAGAGTAGATAGTAGGGTTAGAGAAGGTAGGGTACGTTTAGTTTGTTAGTAAGGTTTAAGTTTTGGGTGGGAAAGGTTAGTGGCGGGAAGGGACGAAGGTGGTAATCGAGAGTAGATTTAGAGAAGTTTTTGAAGTGGGCGTTGGGAGTTTTCGAAGTATTGAGAGAGAGGAGCTTGTGCTGAGACATGCAACAGAGGATAGGCAAGGCACACAGGGGATAGG
+>004913_0641_2071 length=135 uaccno=E0R4ISW01BULRD
+AATTAGTCGAGCGTTGTGGTGGGTATTTGTAATTTTAGCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGGAGGTTGCTGAGACACGCAACAGGAGATAGGCAAGGCACACAGGGGATAGG
+>005063_0599_1983 length=127 uaccno=E0R4ISW01BQWX9
+ATGTGGTGAAGATTGGTTTTAGGTGTTTTAATGTGGATTTTCAGGGGTTTTAAAAGGGTTGGGAGAGTGAAATATATATAAGGCTGAGACACGCAAAAGGGGATAGGCAAGGCACACAGGGGATAGG
+>005140_0759_3209 length=116 uaccno=E0R4ISW01B4ZKR
+TAGTATAGAGGGTTTGTGGTCGTGAGGGTGTTGATGGCGGGAGGGTTTTGATGGTAGGAGGGCCCGTGCTGTGCTGAGACACGCAACAGGGGAAGGCAAGGCACACAGGGGATAGG
+>005351_0883_3221 length=137 uaccno=E0R4ISW01CFVHJ
+TTAGGTGTTATAGTTGAGTGAGATGTTAGTGTTTAATGGTTTTATTTAGGTTGATGGGTTAATGAGGGGGTATTTGATAGTTTTGAAGATTTGACTGAGACACGCAACGGGGATAGGCAAGGCACACAGGGGATAGG
+>005380_1702_1187 length=207 uaccno=E0R4ISW01EFQC1
+TAGGGTTTTTCGAGTATATATTTAGTAGTACGCTCGACTTCTCTTATATAAAGGTTTTGGTTTTTATAGGTTTTTCCATTGTGTCTGCCTGGGGGAGGGCCCTTCTCCTTCAGGATACTGTAGCTTCTCTGCGTGATAAGCCAGCATTCACGGCTTTCAGGTGCTGAGACATGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>005568_1060_1943 length=63 uaccno=E0R4ISW01CVDWP
+ATAGCGTATTTCTCACCTGCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>005740_1536_2697 length=159 uaccno=E0R4ISW01D06VV
+TAAAGAGGTGTTATTATTAGTTAGGAGAGGAGGTGGTTAGATAGTAGTGGGATTATAGGGGAATATAGAGTTGTTAGTTTAGGGATAAGGGATTGATCGATGGGTTAGGTCTCTGCTGAGACACGCAAAAGGGGATAGGCAAGGCACACAGGGGATAGG
+>005753_1884_3877 length=95 uaccno=E0R4ISW01EVRNB
+AAACTGAGTTGTGATGTTTGCATTCAACTCACAGAGTTCAACATTCCTTTAACTGAGACACGCAACAGGGTTAGGCAAGGCACACAGGGTATAGG
+>read_equals_adapter 1a
+TGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>read_equals_start_of_adapter 1b
+TGAGACACGCAACAGGGGAAAG
+>read_equals_end_of_adapter 1c
+GAAAGGCAAGGCACACAGGGGATAGG
+>read_equals_middle_of_adapter 1d
+GCAACAGGGGAAAGGCAAGGCACACAGG
+>read_ends_with_adapter 2a
+GCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG
+>read_ends_with_start_of_adapter 2b
+GCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGTGAGACACGCAACAGGGGAAAGGCAAGG
+>read_contains_adapter_in_the_middle 3
+CGTAGTTGGTTGGTACGTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGGGGTTAGGGTTTTGATTGGGTTGT
+>read_starts_with_adapter 4a
+TGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGGAAAGGTTTTGGTTTTTATAGGTTTTT
+>read_starts_with_end_of_adapter 4b
+AACAGGGGAAAGGCAAGGCACACAGGGGATAGGAAAGGTTTTGGTTTTTATAGGTTTTT
diff --git a/tests/data/E3M.fasta b/tests/data/E3M.fasta
new file mode 100644
index 0000000..daa7686
--- /dev/null
+++ b/tests/data/E3M.fasta
@@ -0,0 +1,59 @@
+>E3MFGYR02JWQ7T length=260 xy=3946_2103 region=2 run=R_2008_01_09_16_16_00_
+tcagGGTCTACATGTTGGTTAACCCGTACTGATTTGAATTGGCTCTTTGTCTTTCCAAAG
+GGAATTCATCTTCTTATGGCACACATAAAGGATAAATACAAGAATCTTCCTATTTACATC
+ACTGAAAATGGCATGGCTGAATCAAGGAATGACTCAATACCAGTCAATGAAGCCCGCAAG
+GATAGTATAAGGATTAGATACCATGATGGCCATCTTAAATTCCTTCTTCAAGCGATCAAG
+GAAGGTGTTAATTTGAAGGGGCTTa
+>E3MFGYR02JA6IL length=265 xy=3700_3115 region=2 run=R_2008_01_09_16_16_00_
+tcagTTTTTTTTGGAAAGGAAAACGGACGTACTCATAGATGGATCATACTGACGTTAGGA
+AAATAATTCATAAGACAATAAGGAAACAAAGTGTAAAAAAAAAACCTAAATGCTCAAGGA
+AAATACATAGCCATCTGAACAGATTTCTGCTGGAAGCCACATTTCTCGTAGAACGCCTTG
+TTCTCGACGCTGCAATCAAGAATCACCTTGTAGCATCCCATTGAACGCGCATGCTCCGTG
+AGGAACTTGATGATTCTCTTTCCCAAATGcc
+>E3MFGYR02JHD4H length=292 xy=3771_2095 region=2 run=R_2008_01_09_16_16_00_
+tcagAAAGACAAGTGGTATCAACGCAGAGTGGCCATTACGCCGGGGACTAGGTCATGTTA
+AGAGTGTAGCTTTGTGATGCTCTGCATCCGTCTTATGATAAAATTGAGGTTATCCTGAAA
+TAAAGTGTCTCAAACGATTTATTTTCCATTTATTGTATTTAATTTGAGTCCAAACTAGAT
+TAGAGATCTCTGTAATAAAACATGTTTGTTAGTTTAATTTCAATAACATTTAGTATTGTG
+TCGTAAAAAAAAAAAAAACGAAAAAAAAAAAAACAAAAAAAAAAACAAATGTACGGccgg
+ctagagaacg
+>E3MFGYR02GFKUC length=295 xy=2520_2738 region=2 run=R_2008_01_09_16_16_00_
+tcagCGGCCGGGCCTCTCATCGGTGGTGGAATCACTGGCCTTGTTTACGAGGTTGTCTTT
+ATCAGCCACACCCACGAGCAGCTTCCCACCACTGACTACTAGAGGGGGGGAAATGAAAAA
+TAAAAAAAAAAAATTGTGTATTATTGAATTTCTCTGGAATCTTCTTCTGTGTATGGTTTT
+CCTTCCTTGTGTTTTCTTCCTAATTCACTTTCGAGGGTTGTACTTGTTCCTTTCGTCTTA
+AATCCTTGGATGGTTGATGATCATGAAGTTCTCTTTAAAGTTAAATTATTATCATTTTG
+>E3MFGYR02FTGED length=277 xy=2268_2739 region=2 run=R_2008_01_09_16_16_00_
+tcagTGGTAATGGGGGGAAATTTAATTTTCTGATTTTATTATATATAGTTAATTGATGCT
+TTCGACGGTTTATATTTATGCGATTTGGTTTAGGTTTCAATGGAATTTTGTTGGTAGTTT
+ATATGATTGTATATAGTTATCAGCAACCTTATATTGTTTGCTTGCCTTTCTAGAGCACTC
+AGTGGAGATTTGAAACTTTGTTAGTGGAAAATTTGCAATTGTATGTTAATTGGAGATGGA
+GACAAAAAAGGAGGCAGATATTAATATTTATTTGGATATCA
+>E3MFGYR02FR9G7 length=256 xy=2255_0361 region=2 run=R_2008_01_09_16_16_00_
+tcagCTCCGTAAGAAGGTGCTGCCCGCCGTCATCGTCCGCCAGCGCAAGCCTTGGCGCCG
+AAAGGACGGTGTTTACATGTACTTCGAAGATAATGCTGGTGTTATCGTGAATCCCAAGGG
+TGAAATGAAAGGTTCTGCTATCACTGGTCCAATTGGGAAGGAGTGTGCTGATCTGTGGCC
+CAGGATTGCAAGTGCTGCCAATGCTATTGTTTAAGCTAGGATTTTAGTTTTTGTAATGTT
+TCAGCTTCTTGAAGTTGTTTc
+>E3MFGYR02GAZMS length=271 xy=2468_1618 region=2 run=R_2008_01_09_16_16_00_
+tcagAAAGAAGTAAGGTAAATAACAAACGACAGAGTGGCACATACTCCGGCAGTTCATGG
+GCAGTGACCCAGTTCAGAGAACCAAAGAACCTGAATAAGAATCTATGTCTACTGTGAATT
+TTGTGGCTTTCGTTGGAACGAAGGTAGCTTCGAAACAATAAAGTTATCTACTTCGCAATA
+TGAAGTGTTTCTGTTAGTTCTATGGTTCCTACTCCTAGCACCTCTTTTTCTTATAGAAAT
+GGACCACCGTGATTGGTACAAAAGNTGTACCTAGAtga
+>E3MFGYR02HHZ8O length=150 xy=2958_1574 region=2 run=R_2008_01_09_16_16_00_
+tcagACTTTCTTCTTTACCGTAACGTTGTTAAATTATCTGAGTATATGAAGGACCCTATT
+TGGGTTCTATAACTACAGAACATATCTCAGTCCAATAGTGACGGAATAACAATATTATAA
+ACTAGTTTAACGCTTTATGAAAAAAAAAAAAAAAgaaaaaaaaacatgtcggccgctgag
+acacgcaacaggggataggcaaggcacacaggggataggnn
+>E3MFGYR02GPGB1 length=221 xy=2633_0607 region=2 run=R_2008_01_09_16_16_00_
+tcagAAGCAGTGGTATCAACGCAGAGTGGCCATTACGGCCGGGTCTGATGAGTATGTGTC
+GAAGATCCCAAATAACAAGGTTGGTCTTGTAATTGGTAAAGGTGGAGAAACAATAAAGAA
+TATGCAAGCTTCAACTGGAGCAAGAATTCAGGTGATTCCTCTTCATCTTCCACCTGGTGA
+CACATCTACCAAAAAAAAAAAAAAAAAAAAACCAAATGTCGGCCGctgagacacgcaaca
+gggataggcaaggcacacaggggataggn
+>E3MFGYR02F7Z7G length=130 xy=2434_1658 region=2 run=R_2008_01_09_16_16_00_
+tcagAATCATCCACTTTTTAACGTTTTGTTTTGTTCATCTCTTAACAACAATTCTAGGGC
+GACAGAGAGAGTAAGTACCCACTAACCAGTCCCCAAGTACCAAAATAACAATTTAAACAA
+CAAAACACAAACAGatcttatcaacaaaactcaaagttcctaactgagacacgcaacagg
+ggataagacaaggcacacaggggataggnnnnnnnnnnn
diff --git a/tests/data/E3M.qual b/tests/data/E3M.qual
new file mode 100644
index 0000000..908e628
--- /dev/null
+++ b/tests/data/E3M.qual
@@ -0,0 +1,59 @@
+>E3MFGYR02JWQ7T length=260 xy=3946_2103 region=2 run=R_2008_01_09_16_16_00_
+23 24 26 38 31 11 27 28 25 28 22 25 27 28 36 27 32 22 33 23 27 16 40 33 18 28 28 24 25 20 26 26 37 31 10 21 27 16 36 28 32 22 27 26 28 37 30 9 28 27 26 36 29 8 33 23 37 30 9 37
+30 9 34 26 32 22 28 28 28 22 33 23 28 31 21 28 26 33 23 28 27 28 28 28 21 25 37 33 16 34 28 25 28 37 33 17 28 28 27 34 27 25 30 25 26 24 34 27 34 27 23 28 36 32 14 24 28 27 27 23
+26 25 27 25 36 32 18 1 27 29 21 26 24 27 31 22 27 26 26 34 26 28 27 33 26 34 26 33 26 28 26 27 27 27 27 28 19 25 25 31 23 28 28 28 27 33 26 26 26 27 18 21 35 31 12 21 28 34 28 32
+26 27 27 23 25 27 28 26 34 28 34 28 27 34 28 28 26 28 26 19 32 27 28 25 27 27 26 33 25 34 28 24 28 21 30 21 37 33 16 23 12 27 18 27 18 25 34 28 24 30 22 22 23 28 27 25 26 34 28 33
+26 19 6 34 28 25 25 32 27 34 28 37 33 17 25 34 28 36 32 18 2 17 24 14 17
+>E3MFGYR02JA6IL length=265 xy=3700_3115 region=2 run=R_2008_01_09_16_16_00_
+24 24 26 28 45 32 22 17 12 9 5 1 36 28 40 34 15 36 27 42 35 21 6 28 34 24 27 28 28 21 28 28 28 28 25 27 28 28 28 27 36 28 27 28 28 24 28 28 28 28 28 24 28 28 36 27 28 36 28 43
+36 22 10 28 19 5 36 28 28 25 28 37 28 28 12 28 33 26 28 24 11 35 26 41 34 15 27 40 33 18 28 28 24 24 44 26 17 13 10 7 6 4 2 1 22 9 27 36 33 17 27 26 26 27 28 30 22 33 26 36
+33 19 4 25 18 27 24 22 24 26 31 23 27 24 28 25 25 31 23 27 27 28 26 32 28 7 27 23 24 25 26 33 25 32 24 24 34 26 25 23 27 33 29 8 25 25 26 25 26 25 27 29 20 28 26 32 24 33 25 25
+29 20 24 26 28 23 25 26 26 27 25 27 27 27 18 27 28 31 23 27 31 23 27 23 27 33 27 34 27 27 26 28 26 27 28 27 37 33 15 24 33 26 27 27 18 26 25 27 27 27 25 28 26 27 25 34 28 27 24 27
+25 34 28 31 23 22 34 28 26 27 27 28 27 34 28 25 25 23 36 32 14 37 33 17 37 33 17 23 25 25 15
+>E3MFGYR02JHD4H length=292 xy=3771_2095 region=2 run=R_2008_01_09_16_16_00_
+19 23 27 28 41 34 16 27 27 27 27 16 28 22 33 23 23 28 27 27 36 28 28 28 28 22 26 26 28 26 34 24 36 27 26 37 28 28 27 28 36 28 43 36 22 9 24 21 26 28 36 27 27 28 28 28 27 37 28 36
+27 28 24 28 27 27 28 24 28 28 40 33 14 26 21 28 27 28 27 28 23 27 27 28 27 27 26 33 25 27 26 25 34 27 28 28 27 28 28 38 34 22 10 34 28 27 27 34 27 34 28 27 27 33 27 27 28 35 30 11
+28 37 33 17 27 28 26 27 27 23 25 36 32 14 27 27 24 32 28 7 28 36 32 19 3 30 21 22 37 33 15 21 34 27 28 22 26 36 33 17 34 28 37 33 17 26 21 26 24 34 27 35 31 12 20 27 27 28 25 34
+28 27 25 27 27 25 27 28 27 28 23 28 27 28 20 28 38 34 22 9 23 24 28 28 36 32 13 27 19 7 20 26 37 33 17 21 9 37 33 17 23 32 25 22 29 21 27 24 34 30 10 28 26 25 28 33 26 23 21 27
+28 27 26 23 32 20 11 7 5 3 2 1 1 1 1 1 1 1 20 25 33 21 13 8 6 4 3 2 2 1 1 1 1 23 34 25 16 11 9 7 5 4 3 1 1 21 37 33 17 21 27 25 28 28 34 27 32 27 21 9
+17 25 20 27 18 17 32 24 17 16
+>E3MFGYR02GFKUC length=295 xy=2520_2738 region=2 run=R_2008_01_09_16_16_00_
+24 23 24 27 28 36 28 37 28 39 32 13 34 25 22 28 27 28 26 28 28 37 28 28 36 28 26 36 28 36 28 27 28 27 28 26 36 28 36 28 35 26 28 41 34 17 28 28 28 27 36 28 37 28 28 27 28 41 34 16
+25 28 28 26 27 36 28 28 27 28 41 34 17 28 25 28 28 27 28 27 26 27 34 27 37 33 17 25 33 27 26 27 27 28 25 28 28 27 27 25 27 26 28 38 32 23 17 12 8 2 37 33 17 28 26 38 34 23 12 1
+28 34 23 15 10 8 6 4 3 2 1 1 1 31 23 28 26 26 28 26 34 27 24 34 27 28 34 27 37 33 16 27 24 25 28 34 27 34 27 28 28 34 26 26 34 28 27 27 28 27 28 27 28 28 34 28 38 34 23 11
+34 28 34 27 34 26 34 28 28 27 26 38 35 22 9 27 30 22 33 26 28 34 28 34 28 28 27 28 37 33 15 25 27 23 32 27 6 32 25 28 22 26 26 32 24 27 33 26 26 17 34 30 11 28 26 27 22 33 26 34
+30 10 26 30 22 34 28 33 25 26 27 34 28 31 26 24 28 28 28 28 26 28 28 27 28 32 24 26 34 26 27 28 26 34 30 10 32 28 7 27 33 25 35 31 12 34 27 25 30 22 23 28 27 23 38 34 23 11 26
+>E3MFGYR02FTGED length=277 xy=2268_2739 region=2 run=R_2008_01_09_16_16_00_
+21 24 28 24 28 35 27 28 35 28 28 44 35 24 16 9 2 41 34 17 40 34 15 34 26 43 36 22 9 28 25 26 26 41 34 20 5 26 37 28 27 27 28 28 28 28 28 28 37 28 36 28 37 28 28 28 27 26 26 38
+31 11 28 24 28 28 36 27 36 29 8 26 27 28 36 29 8 27 28 27 28 28 24 34 27 5 32 22 40 33 14 28 37 28 41 34 16 28 32 24 23 34 28 34 27 38 34 22 9 27 34 28 34 27 27 26 26 36 32 13
+28 27 26 28 28 25 34 26 27 28 28 27 28 23 27 28 34 26 27 25 27 26 28 23 32 24 34 28 33 26 28 26 27 27 18 25 36 32 13 27 27 32 24 27 32 25 35 31 12 27 28 26 27 21 27 27 27 26 28 28
+27 26 28 33 25 22 28 28 37 33 17 26 37 33 17 20 36 32 14 28 34 27 26 27 28 34 28 38 34 22 8 37 33 15 27 28 34 27 33 26 27 26 27 28 28 33 25 34 28 34 28 34 26 24 24 28 25 34 28 28
+27 25 23 38 33 24 17 11 5 34 28 25 31 26 22 27 27 27 26 22 34 26 34 27 26 24 34 30 11 19 37 33 15 34 28 27 25 28 25 27 27
+>E3MFGYR02FR9G7 length=256 xy=2255_0361 region=2 run=R_2008_01_09_16_16_00_
+21 22 26 28 28 24 35 26 27 28 36 28 28 37 28 36 27 28 28 26 25 24 37 30 9 28 36 28 28 21 28 26 28 28 28 28 36 28 28 35 26 27 25 25 28 28 36 28 23 31 20 32 22 29 18 27 27 34 25 28
+39 33 13 36 27 28 28 35 25 28 28 40 34 15 27 28 28 27 27 28 28 28 34 28 27 27 34 28 27 27 27 34 27 28 28 28 27 34 27 27 28 34 26 28 27 27 27 27 28 34 27 27 35 31 11 34 27 34 30 10
+28 27 34 30 10 27 28 37 33 15 33 25 33 26 26 28 26 27 27 27 28 26 26 28 27 34 27 26 31 23 34 28 34 28 37 33 15 34 28 34 28 27 23 27 28 27 27 28 23 28 27 25 27 24 27 22 34 28 37 33
+16 26 33 26 25 34 26 25 28 33 25 27 27 23 27 28 28 32 24 34 27 27 27 27 28 27 29 20 27 33 28 8 32 27 23 28 25 24 34 28 26 38 34 22 9 27 26 38 34 23 13 3 27 26 34 28 26 28 36 32
+14 23 28 27 20 33 25 28 30 22 26 33 25 23 34 28 23 34 30 10 27
+>E3MFGYR02GAZMS length=271 xy=2468_1618 region=2 run=R_2008_01_09_16_16_00_
+18 25 28 28 40 34 17 19 33 26 21 17 34 24 31 21 28 41 34 17 28 37 28 28 41 34 17 27 27 21 28 18 24 23 26 25 31 20 28 26 27 28 23 25 27 25 33 23 30 20 28 28 26 31 21 27 28 23 38 31
+11 28 28 28 28 28 26 39 33 13 28 28 35 25 28 26 28 27 28 35 26 36 27 35 31 11 28 32 24 34 28 26 25 34 28 28 34 28 24 33 25 27 27 28 26 27 27 26 27 27 27 27 27 26 27 28 34 27 38 34
+22 10 25 23 32 25 28 37 33 16 26 26 29 20 33 26 27 18 27 25 23 13 32 24 27 22 24 27 34 28 27 27 36 32 14 27 27 18 26 33 29 8 28 34 27 23 26 28 27 28 27 32 24 28 27 23 34 26 25 27
+27 24 34 28 26 25 27 36 32 17 25 25 27 33 27 27 27 34 28 28 28 27 25 34 28 33 27 34 28 28 27 23 25 34 28 27 27 27 28 27 34 27 20 23 38 34 24 15 7 26 22 11 28 27 23 26 36 32 14 22
+34 28 28 33 27 27 30 22 25 22 24 27 34 28 34 28 26 26 27 37 33 20 6 28 0 25 28 27 24 34 28 25 28 28 27 25 26 26
+>E3MFGYR02HHZ8O length=150 xy=2958_1574 region=2 run=R_2008_01_09_16_16_00_
+22 22 25 23 25 28 41 34 17 28 37 28 28 35 28 6 24 30 19 28 25 32 22 27 25 37 28 28 27 15 38 31 11 36 28 27 24 28 28 27 20 28 23 26 25 22 19 28 35 26 34 25 26 41 34 17 26 28 36 29
+7 36 29 8 35 26 28 28 28 24 33 23 28 24 27 27 23 25 34 24 26 24 28 27 22 28 26 28 24 27 28 34 27 34 27 26 27 28 26 27 28 28 34 28 31 23 25 30 22 27 29 21 26 27 34 27 28 26 37 33
+17 17 26 18 28 34 30 11 19 6 27 24 27 35 30 11 27 22 28 32 19 11 6 4 3 2 1 1 1 1 1 1 1 1 27 36 28 19 14 11 8 6 4 2 19 19 27 27 28 27 33 26 33 26 25 27 25 28 26 22
+28 25 27 27 28 25 34 28 28 24 38 34 21 7 28 25 17 33 26 26 31 26 34 27 27 27 27 26 26 28 38 34 23 12 27 28 25 33 27 0 0
+>E3MFGYR02GPGB1 length=221 xy=2633_0607 region=2 run=R_2008_01_09_16_16_00_
+21 24 27 28 36 28 28 28 26 28 28 36 28 28 27 24 28 36 27 28 28 28 23 27 27 28 28 37 28 36 27 27 37 28 28 28 37 28 36 27 41 34 17 28 28 28 28 27 28 28 28 26 28 28 28 28 28 28 28 28
+28 37 28 28 27 28 39 32 13 41 34 16 28 37 28 28 34 28 34 28 34 28 34 27 27 26 34 28 27 27 34 28 34 28 34 28 27 37 33 15 34 27 28 34 28 28 28 37 33 16 28 34 26 27 37 33 16 27 34 27
+26 27 27 27 28 34 28 26 23 34 27 25 34 27 28 26 34 28 27 25 28 34 27 27 33 26 34 28 27 28 34 27 27 27 27 34 28 34 27 25 26 34 27 26 24 27 28 34 27 32 24 27 31 23 28 34 27 27 25 28
+27 25 27 27 27 28 27 17 32 24 35 16 8 4 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 21 9 36 31 13 24 27 26 28 34 28 34 27 19 22 23 19 28 28 26 26 20 23 22 26 34 27 25 25
+36 32 17 27 27 24 24 14 21 34 27 31 23 23 28 22 27 27 28 36 32 18 2 27 27 22 25 15 0
+>E3MFGYR02F7Z7G length=130 xy=2434_1658 region=2 run=R_2008_01_09_16_16_00_
+22 21 23 28 26 15 12 21 28 21 36 28 27 27 43 35 23 12 1 36 28 27 27 41 34 20 5 28 43 36 22 9 27 35 26 28 26 27 26 28 22 33 26 37 28 26 36 27 28 35 27 31 20 26 28 13 38 32 12 26
+23 24 27 28 27 22 25 28 19 27 28 20 36 27 25 20 26 41 34 17 28 28 17 36 28 35 27 20 28 28 43 36 22 8 33 26 25 27 27 31 26 38 34 22 10 25 34 28 26 34 27 32 27 5 37 33 17 20 23 13
+27 37 33 19 4 27 28 20 37 33 17 24 26 23 27 21 26 33 26 26 27 28 34 27 21 38 34 21 7 28 25 24 37 33 17 28 34 28 32 24 27 33 27 27 20 28 27 27 22 28 19 25 22 28 32 26 27 23 37 33
+20 5 24 24 34 28 28 11 26 30 25 33 26 28 25 22 26 27 27 38 34 23 11 28 26 28 34 26 0 0 0 0 0 0 0 0 0 0 0
diff --git a/tests/data/adapter.fasta b/tests/data/adapter.fasta
new file mode 100644
index 0000000..3519ebc
--- /dev/null
+++ b/tests/data/adapter.fasta
@@ -0,0 +1,4 @@
+>adapter1
+GCCGAACTTCTTAGACTGCCTTAAGGACGT
+>adapter2
+CAGGTATATCGA
diff --git a/tests/data/anchored-back.fasta b/tests/data/anchored-back.fasta
new file mode 100644
index 0000000..651f3fb
--- /dev/null
+++ b/tests/data/anchored-back.fasta
@@ -0,0 +1,8 @@
+>read1
+sequenceBACKADAPTER
+>read2
+sequenceBACKADAPTERblabla
+>read3
+sequenceBACKADA
+>read4
+sequenceBECKADAPTER
diff --git a/tests/data/anchored.fasta b/tests/data/anchored.fasta
new file mode 100644
index 0000000..2af20a4
--- /dev/null
+++ b/tests/data/anchored.fasta
@@ -0,0 +1,8 @@
+>read1
+FRONTADAPTsequence
+>read2
+blablaFRONTADAPTsequence
+>read3
+NTADAPTsequence
+>read4
+FRINTADAPTsequence
diff --git a/tests/data/anchored_no_indels.fasta b/tests/data/anchored_no_indels.fasta
new file mode 100644
index 0000000..dcf626a
--- /dev/null
+++ b/tests/data/anchored_no_indels.fasta
@@ -0,0 +1,12 @@
+>no_mismatch (adapter: TTAGACATAT)
+TTAGACATATGAGGTCAG
+>one_mismatch
+TAAGACATATGAGGTCAG
+>two_mismatches
+TAAGACGTATGAGGTCAG
+>insertion
+ATTAGACATATGAGGTCAG
+>deletion
+TAGACATATGAGGTCAG
+>mismatch_plus_wildcard
+TNAGACGTATGAGGTCAG
diff --git a/tests/data/anywhere_repeat.fastq b/tests/data/anywhere_repeat.fastq
new file mode 100644
index 0000000..120d100
--- /dev/null
+++ b/tests/data/anywhere_repeat.fastq
@@ -0,0 +1,28 @@
+ at prefix:1_13_1400/1
+CGTCCGAANTAGCTACCACCCTGATTAGACAAAT
++
+)3%)&&&&!.1&(6:<'67..*,:75)'77&&&5
+ at prefix:1_13_1500/1
+CAAGACAAGACCTGCCACATTGCCCTAGTATTAA
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at prefix:1_13_1550/1
+CAAGACAAGACCTGCCACATTGCCCTAGTCAAGA
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at prefix:1_13_1600/1
+CAAGATGTCCCCTGCCACATTGCCCTAGTCAAGA
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at prefix:1_13_1700/1
+CAAGATGTCCCCTGCCACATTGCCCTAGTTTATT
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at prefix:1_13_1800/1
+GTTCATGTCCCCTGCCACATTGCCCTAGTTTATT
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
+ at prefix:1_13_1900/1
+ATGGCTGTCCCCTGCCACATTGCCCTAGTCAAGA
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
\ No newline at end of file
diff --git a/tests/data/dos.fastq b/tests/data/dos.fastq
new file mode 100644
index 0000000..6b1ecec
--- /dev/null
+++ b/tests/data/dos.fastq
@@ -0,0 +1,12 @@
+ at prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGATTAGACAAAT
++
+)3%)&&&&!.1&(6:<'67..*,:75)'77&&&5
+ at prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT
++
+;<:&:A;A!9<<<,7:<=3=;:<&<?<?8<;=<&
+ at prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/data/empty.fastq b/tests/data/empty.fastq
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/example.fa b/tests/data/example.fa
new file mode 100644
index 0000000..b1fc713
--- /dev/null
+++ b/tests/data/example.fa
@@ -0,0 +1,18 @@
+>read1
+MYSEQUENCEADAPTER
+>read2
+MYSEQUENCEADAP
+>read3
+MYSEQUENCEADAPTERSOMETHINGELSE
+>read4
+MYSEQUENCEADABTER
+>read5
+MYSEQUENCEADAPTR
+>read6
+MYSEQUENCEADAPPTER
+>read7
+ADAPTERMYSEQUENCE
+>read8
+PTERMYSEQUENCE
+>read9
+SOMETHINGADAPTERMYSEQUENCE
diff --git a/tests/data/illumina.fastq.gz b/tests/data/illumina.fastq.gz
new file mode 100644
index 0000000..23f9a93
Binary files /dev/null and b/tests/data/illumina.fastq.gz differ
diff --git a/tests/data/illumina5.fastq b/tests/data/illumina5.fastq
new file mode 100644
index 0000000..c915c8d
--- /dev/null
+++ b/tests/data/illumina5.fastq
@@ -0,0 +1,20 @@
+ at SEQ:1:1101:9010:3891#0/1 adapter start: 51
+ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGGGCCTAACTTCTTAGACTGCCTTAAGGACGTAAGCCAAGATGGGAAAGGTC
++
+FFFFFEDBE at 79@@>@CBCBFDBDFDDDDD<@C>ADD at B;5:978 at CBDDFFDB4B?DB21;84?DDBC9DEBAB;=@<@@B@@@@B>CCBBDE98>>0 at 7
+ at SEQ:1:1101:9240:3898#0/1
+CCAGCAAGGAAGCCAAGATGGGAAAGGTCATGCGGCATACGCTCGGCGCCAGTTTGAATATTAGACATAATTTATCCTCAAGTAAGGGGCCGAAGCCCCTG
++
+GHGHGHHHHGGGDHHGDCGFEEFHHGDFGEHHGFHHHHHGHEAFDHHGFHHEEFHGHFHHFHGEHFBHHFHHHH at GGGDGDFEEFC@=D?GBGFGF:FB6D
+ at SEQ:1:1101:9207:3899#0/1 adapter start: 64
+TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAACGCCTAACTTCTTAGACTGCCTTAAGGACGTATACATA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHFHHFHFFFFFBHHGHHHFFHHFHGGHHDEBFG<FGGDG
+ at SEQ:1:1101:9148:3908#0/1 adapter start: 28
+ACGACGCAATGGAGAAAGACGGAGAGCGGCCTAACTTCTTAGACTGCCTTAAGGACGTCCAACGGCGTCCATCTCGAAGGAGTCGCCAGCGATAACCGGAG
++
+HHHHHHHHHHHHGHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHDHDHHFHHHHHFFFFFHHHEFBEGEGGFFFHHHFHHHHHHFHHEHHGHEHD
+ at SEQ:1:1101:9044:3916#0/1 adapter start: 78
+AACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGAGCCTAACTTCTTAGACTGCCTTA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHGHHHHHHHHHHHHFHEBFHFFEFHEFHHGHFHHHHGGHGHHFHGGGHG
diff --git a/tests/data/illumina64.fastq b/tests/data/illumina64.fastq
new file mode 100644
index 0000000..bc5b102
--- /dev/null
+++ b/tests/data/illumina64.fastq
@@ -0,0 +1,80 @@
+ at 14569
+AAGTTTATTCCTGGACGAAGGAAGAAAAGGCCAGATGGGAAACAAGAACAAGCCCCTGTTGAAGACGCAGGGCCAACAGGGGCCAACGAAGCTGC
++
+cceeeeceeeee`dedbdbdb_^b`abU_cacadabd`dLMZ[XTcT^a^adaaaddcd`aL^`^_`Y\]^`Y_BBBBBBBBBBBBBBBBBBBBB
+ at 19211
+AGAGGGCGTGTGATTGCTGGATGTGGGCGGGGGGCCGGGGGAGCCCCATGGGCAGGAGACCTGAGAGCCAGGCGGTGAGGCACTATGAACGCGAG
++
+^\`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 9180
+GAGGGGCAGCGACTAGTCACCGGACCTGTCAGGCAAGCATAAGCCGTGCGTCAGCACCACGCTGACGGTGCTCCCGCACTCGCGGGACGCGCCAC
++
+b`bLbBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 19132
+TGTGATTATCCACTGGTATATCGGCGTGCCGTCCGCACGAGGAAAAAAGGCATTATTGTTGTGGATCTGTACCATCGTTTGTCCCGTTACCCTTC
++
+Z[QZZLZ[]J[SHZNaZ[_IaBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 15868
+CTGCCAAGGCTGCCCCCAAACCTGGCCCTCCGCGCACCCCACCACGGATCCTGACGTCCTGTCCCCCGCGGCTATGACAGCCAAGTCCCGTCAGC
++
+`c`cc\`\Lb]bL`[`a]L`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 1424
+GGCCCCAGACTTGCTCCCCCAACAAGGACAATGTCCAAGGAGTGTCCCCTGGGAAGGGTGGGCCTCCCCAGGTGCGGGCGGTGGGCACTGCCCCC
++
+eeeeeeeea`bbdaaadad`Oaaaaccada_aa_d`_X`_^`[`_[_W^BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 7855
+GTGGGGGCTACAATGTGGCTCCAAGTTTTTTCCCGGGAGGTAAGGCCGGGAGCCCCCGCCCTGAGGGGGCGGGAAAGAGGAAGCCCGACGCGGAC
++
+]^\]FW]Z`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 17943
+ACATGGGACCAGAAAACACCACCAGGGGTTTGGGGCTGTCCTGAGGCTCGGGTAGCAAGCAGCGGGGCTCCGTGTCCAAGCACGCCGGTGTCACC
++
+ccc`\^`aba\b^`\FR`OOPYG[[W```[Ra_RR_\]\\P\_H_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 11100
+CGGATAACTGAAAATGCATTTTTAACGCCATGACCGTGTCTCAAGGACCCGCTGTGGAAGGGGCGCCGCAGCCAGAAGCTGGCCATGTCAGCGCG
++
+b`b_b_a\bc^Tabadaddcddd``bdaa_^aJ\^_\]\\__O[___L^\_aaa^^^UJ^BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 15663
+AGGTGAAGTGGCAGGAGGACCGCCGGAAGAAGCTCTTCAGAACTCAGGGGGAGGGGGAAAGCAGAAACCAGAAGTCCAGTGAGCAGGGGGCTGAG
++
+aaKaBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 4698
+CCAATTGGCACCCCTCTGCCTTCAGCCATTCCCTCTGGCTACTGCTCTCTGGTCGGGGCGCCTGGGCGACAGACTCTCTCCCCCCACCCCCCCGC
++
+cccc\`ccc\caccZccccc]^`LY\bL_bBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 20649
+TCTGGACTGGATCTTTAGGATGGTGGAGATGATCTGGATGTAGGACAAAAGAACCAGGCAGAAGGGTGTCATCAGAAGAACACTGCTAGACACCA
++
+eeeeeaddadacdddebeccdddadd\^abbT_]bccTac]]b]L^][]Ve[^ZaY_^_^`\\Y]^Y`BBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 17259
+GCCTTGTGTTGTTCCTGGCATCACCGCAGGGAGCCCTGGGGGGCCAGGCGGGCGCTGACCCTGGGCACTGCCGCGCCTGGAGGGGCTGAGCACCG
++
+BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 6003
+CTTCAACTCATCTTGTTATTAATACCATCAATATCCCATGAGGCTCATAAAACGAGTCTTTCTTCTTGGAAACATGACCAAGATTGGGCAAACGT
++
+fffffffffffffffffdffecfcefeffdcfdeeebbbdbccccc\db\`^aa`^Y^^^cbcbaa`bbWY^^^__S_YYR]GWY]\]]XX\_`S
+ at 4118
+TCAAATTGTACTGCAAAGAAGGTCCCAGCTGGTCTCTTCTGGGAGTGATCTAACTAACTTAAGCTGACCCTGTGACTGGCTGAGGATAATCCCTT
++
+dc^ddeeeeeedeee`ceceddadadddcbde_dedc_ec_a^^b\b\\]VIPZY^T^^^\L_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 18416
+GTGGGGAAGCCGAAGAAGCAGCGGAGATCGATTGTAAGAACGACGTCCATGACCAGGGTTGGTGGAGACTGCTTCTCTGCATGCGGGGGAAGGCG
++
+dddacaabdbea\d^cce\da`dd_^__`a`a`b[_^__^\^^^_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 20115
+TGAAAAAGGAAAACATGGTAGTTTTCTTGTATGAGAGAGCCAGAGCCACCTTGGAGATTTTGTTCTCTCTGTGCGCACCAGTGATGACACAGGGG
++
+ed^eeafffaddfecdddabc^_badd`bd_ddadaa^bbcad\d\__^_\aaa_aY____aaN_\cdc\^aaYbBBBBBBBBBBBBBBBBBBBB
+ at 16139
+TCATCCGAAGAGTTGGCAGGCCCTGTGAATTGTGAAAACAGTATACCCACCCCTTTCCCGGAGCAGGACGCTGAATGTCCAGAGGATGCCAGACC
++
+cabacacY^c\daaddaadad^\ad_a\Y`[ZQ]Y^^OYQ^X^YT\\]U\^RRX^\YJ^BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 14123
+GATTTGGGGAAAGGAAACAATAGTTGAGTTTGGGCCACGGGAAATTCAAGATGCCTGGTATGTCAAGTCTGGCAGTTGAAGCAGCAGGGCTGGCG
++
+cccccccac^bYbbT_aa_Yb^^Ta\\^]]aaTaaaaab\b\XL`VZZV]QYYY[aa^^^^_^^BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+ at 8766
+ACCTGTAAGGTCCGCTCCTGGTGGACACCCACGAAGTCCAGGGCCTCAGGCAGGAAGTTGTAGCGCAGAGTTTTGAGCAGCTGCTCCATCAGGGA
++
+fcfffffcffeffeeefdefddeecdccacddfdYd`d^\_^`\_abbc\b[ba^Y^Z_^^H^Z_^Y_Y_OKWPZR]]Z]`Z``Z^UHZ^BBBBB
diff --git a/tests/data/interleaved.fastq b/tests/data/interleaved.fastq
new file mode 100644
index 0000000..1da3fdb
--- /dev/null
+++ b/tests/data/interleaved.fastq
@@ -0,0 +1,32 @@
+ at read1/1 some text
+TTATTTGTCTCCAGCTTAGACATATCGCCT
++
+##HHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read1/2 other text
+GCTGGAGACAAATAACAGTGGAGTAGTTTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read2/1
+CAACAGGCCACATTAGACATATCGGATGGT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read2/2
+TGTGGCCTGTTGCAGTGGAGTAACTCCAGC
++
+###HHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACATTAGACA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGGCAGTG
++
+#HHHHHHHHHHHHHHHHHHHHHHHH
+ at read4/1
+GACAGGCCGTTTGAATGTTGACGGGATGTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read4/2
+CATCCCGTCAACATTCAAACGGCCTGTCCA
++
+HH############################
diff --git a/tests/data/issue46.fasta b/tests/data/issue46.fasta
new file mode 100644
index 0000000..50c9ce5
--- /dev/null
+++ b/tests/data/issue46.fasta
@@ -0,0 +1,2 @@
+>readname
+CGTGA
diff --git a/tests/data/lengths.fa b/tests/data/lengths.fa
new file mode 100644
index 0000000..c03f249
--- /dev/null
+++ b/tests/data/lengths.fa
@@ -0,0 +1,28 @@
+>read_length0a
+T330201030313112312
+>read_length0b
+T1330201030313112312
+>read_length1
+T21330201030313112312
+>read_length2
+T021330201030313112312
+>read_length3
+T3021330201030313112312
+>read_length4
+T33021330201030313112312
+>read_length5
+T233021330201030313112312
+>read_length6
+T0233021330201030313112312
+>read_length7
+T10233021330201030313112312
+>read_length8
+T110233021330201030313112312
+>read_length9
+T1110233021330201030313112312
+>read_length10
+T21110233021330201030313112312
+>read_length11
+T021110233021330201030313112312
+>read_length12
+T0021110233021330201030313112312
diff --git a/tests/data/linked.fasta b/tests/data/linked.fasta
new file mode 100644
index 0000000..5d21f89
--- /dev/null
+++ b/tests/data/linked.fasta
@@ -0,0 +1,10 @@
+>r1 5' adapter and 3' adapter
+AAAAAAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG
+>r5 only 5' adapter
+AAAAAAAAAACCCCCCCCCCGGGGGGG
+>r3 5' adapter, partial 3' adapter
+AAAAAAAAAACCCGGCCCCCTTTTT
+>r4 only 3' adapter
+GGGGGGGGGGCCCCCCCCCCTTTTTTTTTTGGGGGGG
+>r2 without any adapter
+GGGGGGGGGGGGGGGGGGG
diff --git a/tests/data/lowqual.fastq b/tests/data/lowqual.fastq
new file mode 100644
index 0000000..7d7d92b
--- /dev/null
+++ b/tests/data/lowqual.fastq
@@ -0,0 +1,8 @@
+ at first_sequence
+SEQUENCE1
++
+#########
+ at second_sequence
+SEQUENCE2
++
+#########
diff --git a/tests/data/maxn.fasta b/tests/data/maxn.fasta
new file mode 100644
index 0000000..1110d12
--- /dev/null
+++ b/tests/data/maxn.fasta
@@ -0,0 +1,12 @@
+>r1
+
+>r2
+N
+>r3
+AAAA
+>r4
+AAAAN
+>r5
+AAANN
+>r6
+AANNN
diff --git a/tests/data/multiblock.fastq.gz b/tests/data/multiblock.fastq.gz
new file mode 100644
index 0000000..8c38897
Binary files /dev/null and b/tests/data/multiblock.fastq.gz differ
diff --git a/tests/data/nextseq.fastq b/tests/data/nextseq.fastq
new file mode 100644
index 0000000..0b6acc1
--- /dev/null
+++ b/tests/data/nextseq.fastq
@@ -0,0 +1,8 @@
+ at NS500350:251:HLM7JBGXX:1:11101:12075:1120 1:N:0:TACAGC
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTACAGCATCTCGTATTCCGTCTTCTGCTTGAAAAAAAAAAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
++
+AAAAAEEEEEEAEEEEAEAEEEEEEAEEEEEEEEEEEEEEE///E/EE////AAEE/E//////EEEEEEE6///////E6EEA/AEAEAE6EEEEEEEEEEEEAEAA/E/EEEEA//EEEEEAEAEE/EEEAEEEE<E/AEEEEE/EEE
+ at NS500350:251:HLM7JBGXX:1:11101:22452:1121 1:N:0:TACAGC
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTACAGCATCGCGTATGCCGTCTTATGCTTGAAAAAAAAAAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
++
+AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/////E/EE//E6///E//A//E//EEEEEEEE6//EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE<EEEEEEEE
diff --git a/tests/data/no_indels.fasta b/tests/data/no_indels.fasta
new file mode 100644
index 0000000..7a6afb7
--- /dev/null
+++ b/tests/data/no_indels.fasta
@@ -0,0 +1,20 @@
+# 3' adapter: TTAGACATAT
+# 5' adapter: GAGATTGCCA
+>3p_orig
+TGAACATAGCTTAGACATATAACCG
+>3p_mism
+TGAACATAGCTTACACATATAACCG
+>3p_del
+TGAACATAGCTTAACATATAACCG
+>3p_ins
+TGAACATAGCTTAGGACATATAACCG
+>3p_frontins
+TAGACATATAACCG
+>5p_orig
+TCCTCGAGATTGCCATACTGCTTCTCGAA
+>5p_mism
+TCCTCGAGATAGCCATACTGCTTCTCGAA
+>5p_del
+TCCTCGAGATGCCATACTGCTTCTCGAA
+>5p_ins
+TCCTCGAGATATGCCATACTGCTTCTCGAA
diff --git a/tests/data/overlapa.fa b/tests/data/overlapa.fa
new file mode 100644
index 0000000..3a4fac7
--- /dev/null
+++ b/tests/data/overlapa.fa
@@ -0,0 +1,40 @@
+>read1
+T0021110233021330201030313112312
+>read2
+T002111023302133020103031311231
+>read3
+T00211102330213302010303131123
+>read4
+T0021110233021330201030313112
+>read5
+T002111023302133020103031311
+>read6
+T00211102330213302010303131
+>read7
+T0021110233021330201030313
+>read8
+T002111023302133020103031
+>read9
+T00211102330213302010303
+>read10
+T0021110233021330201030
+>read11
+T002111023302133020103
+>read12
+T00211102330213302010
+>read13
+T0021110233021330201
+>read14
+T002111023302133020
+>read15
+T00211102330213302
+>read16
+T0021110233021330
+>read17
+T002111023302133
+>read18
+T00211102330213
+>read19
+T0021110233021
+>read20
+T002111023302
diff --git a/tests/data/overlapb.fa b/tests/data/overlapb.fa
new file mode 100644
index 0000000..c268fc3
--- /dev/null
+++ b/tests/data/overlapb.fa
@@ -0,0 +1,38 @@
+>adaptlen18
+TTAGACATATCTCCGTCGATACTTACCCGTA
+>adaptlen17
+TAGACATATCTCCGTCGATACTTACCCGTA
+>adaptlen16
+AGACATATCTCCGTCGATACTTACCCGTA
+>adaptlen15
+GACATATCTCCGTCGATACTTACCCGTA
+>adaptlen14
+ACATATCTCCGTCGATACTTACCCGTA
+>adaptlen13
+CATATCTCCGTCGATACTTACCCGTA
+>adaptlen12
+ATATCTCCGTCGATACTTACCCGTA
+>adaptlen11
+TATCTCCGTCGATACTTACCCGTA
+>adaptlen10
+ATCTCCGTCGATACTTACCCGTA
+>adaptlen9
+TCTCCGTCGATACTTACCCGTA
+>adaptlen8
+CTCCGTCGATACTTACCCGTA
+>adaptlen7
+TCCGTCGATACTTACCCGTA
+>adaptlen6
+CCGTCGATACTTACCCGTA
+>adaptlen5
+CGTCGATACTTACCCGTA
+>adaptlen4
+GTCGATACTTACCCGTA
+>adaptlen3
+TCGATACTTACCCGTA
+>adaptlen2
+CGATACTTACCCGTA
+>adaptlen1
+GATACTTACCCGTA
+>adaptlen0
+ATACTTACCCGTA
diff --git a/tests/data/paired.1.fastq b/tests/data/paired.1.fastq
new file mode 100644
index 0000000..3f2d733
--- /dev/null
+++ b/tests/data/paired.1.fastq
@@ -0,0 +1,16 @@
+ at read1/1 some text
+TTATTTGTCTCCAGCTTAGACATATCGCCT
++
+##HHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read2/1
+CAACAGGCCACATTAGACATATCGGATGGT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/1
+CCAACTTGATATTAATAACATTAGACA
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read4/1
+GACAGGCCGTTTGAATGTTGACGGGATGTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
diff --git a/tests/data/paired.2.fastq b/tests/data/paired.2.fastq
new file mode 100644
index 0000000..808df31
--- /dev/null
+++ b/tests/data/paired.2.fastq
@@ -0,0 +1,16 @@
+ at read1/2 other text
+GCTGGAGACAAATAACAGTGGAGTAGTTTT
++
+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read2/2
+TGTGGCCTGTTGCAGTGGAGTAACTCCAGC
++
+###HHHHHHHHHHHHHHHHHHHHHHHHHHH
+ at read3/2
+TGTTATTAATATCAAGTTGGCAGTG
++
+#HHHHHHHHHHHHHHHHHHHHHHHH
+ at read4/2
+CATCCCGTCAACATTCAAACGGCCTGTCCA
++
+HH############################
diff --git a/tests/data/plus.fastq b/tests/data/plus.fastq
new file mode 100644
index 0000000..35849f8
--- /dev/null
+++ b/tests/data/plus.fastq
@@ -0,0 +1,8 @@
+ at first_sequence some other text
+SEQUENCE1
++first_sequence some other text
+:6;;8<=:<
+ at second_sequence and more text
+SEQUENCE2
++second_sequence and more text
+83<??:(61
diff --git a/tests/data/polya.fasta b/tests/data/polya.fasta
new file mode 100644
index 0000000..4f02229
--- /dev/null
+++ b/tests/data/polya.fasta
@@ -0,0 +1,6 @@
+>polyA
+AAACTTCAGAACAGAAAAAAAAAAAAAAAAAAAAA
+>polyAlong
+CTTAGTTCAATWTTAACCAAACTTCAGAACAGAAAAAAAAAAAAAAAAAAAAAGAAAAAAAAAAAAAAAAAAAA
+>polyA2
+AAACTTAACAAGAACAAGAAAAAAAAAAAAAAAAAAAAA
diff --git a/tests/data/prefix-adapter.fasta b/tests/data/prefix-adapter.fasta
new file mode 100644
index 0000000..b56e57b
--- /dev/null
+++ b/tests/data/prefix-adapter.fasta
@@ -0,0 +1,2 @@
+>prefixadapter
+^FRONTADAPT
diff --git a/tests/data/rest.fa b/tests/data/rest.fa
new file mode 100644
index 0000000..31277ed
--- /dev/null
+++ b/tests/data/rest.fa
@@ -0,0 +1,18 @@
+>read1
+TESTINGADAPTERREST1
+>read2
+TESTINGADAPTERRESTING
+>read3
+TESTINGADAPTER
+>read4
+TESTINGADAPTERRESTLESS
+>read5
+TESTINGADAPTERRESTORE
+>read6
+ADAPTERSOMETHING
+>read7
+DAPTERSOMETHING
+>read8
+RESTADAPTERSOMETHING
+>read9
+NOREST
diff --git a/tests/data/rest.txt b/tests/data/rest.txt
new file mode 100644
index 0000000..31b1941
--- /dev/null
+++ b/tests/data/rest.txt
@@ -0,0 +1,5 @@
+REST1 read1
+RESTING read2
+RESTLESS read4
+RESTORE read5
+SOMETHING read8
diff --git a/tests/data/restfront.txt b/tests/data/restfront.txt
new file mode 100644
index 0000000..3cdba2f
--- /dev/null
+++ b/tests/data/restfront.txt
@@ -0,0 +1,6 @@
+TESTING read1
+TESTING read2
+TESTING read3
+TESTING read4
+TESTING read5
+REST read8
diff --git a/tests/data/s_1_sequence.txt.gz b/tests/data/s_1_sequence.txt.gz
new file mode 100644
index 0000000..3967383
Binary files /dev/null and b/tests/data/s_1_sequence.txt.gz differ
diff --git a/tests/data/simple.fasta b/tests/data/simple.fasta
new file mode 100644
index 0000000..e5c1d4c
--- /dev/null
+++ b/tests/data/simple.fasta
@@ -0,0 +1,7 @@
+# a comment
+# another one
+>first_sequence
+SEQUENCE1
+>second_sequence
+SEQUEN
+CE2
diff --git a/tests/data/simple.fastq b/tests/data/simple.fastq
new file mode 100644
index 0000000..f728223
--- /dev/null
+++ b/tests/data/simple.fastq
@@ -0,0 +1,8 @@
+ at first_sequence
+SEQUENCE1
++
+:6;;8<=:<
+ at second_sequence
+SEQUENCE2
++
+83<??:(61
diff --git a/tests/data/small.fastq b/tests/data/small.fastq
new file mode 100644
index 0000000..767ca22
--- /dev/null
+++ b/tests/data/small.fastq
@@ -0,0 +1,12 @@
+ at prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGATTAGACAAAT
++
+)3%)&&&&!.1&(6:<'67..*,:75)'77&&&5
+ at prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT
++
+;<:&:A;A!9<<<,7:<=3=;:<&<?<?8<;=<&
+ at prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/data/small.fastq.bz2 b/tests/data/small.fastq.bz2
new file mode 100644
index 0000000..d71a53a
Binary files /dev/null and b/tests/data/small.fastq.bz2 differ
diff --git a/tests/data/small.fastq.gz b/tests/data/small.fastq.gz
new file mode 100644
index 0000000..f843389
Binary files /dev/null and b/tests/data/small.fastq.gz differ
diff --git a/tests/data/small.fastq.xz b/tests/data/small.fastq.xz
new file mode 100644
index 0000000..a7f38cb
Binary files /dev/null and b/tests/data/small.fastq.xz differ
diff --git a/tests/data/small.myownextension b/tests/data/small.myownextension
new file mode 100644
index 0000000..767ca22
--- /dev/null
+++ b/tests/data/small.myownextension
@@ -0,0 +1,12 @@
+ at prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGATTAGACAAAT
++
+)3%)&&&&!.1&(6:<'67..*,:75)'77&&&5
+ at prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT
++
+;<:&:A;A!9<<<,7:<=3=;:<&<?<?8<;=<&
+ at prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
diff --git a/tests/data/solid.csfasta b/tests/data/solid.csfasta
new file mode 100644
index 0000000..1045429
--- /dev/null
+++ b/tests/data/solid.csfasta
@@ -0,0 +1,63 @@
+# Tue May 5 13:57:32 2009 /share/apps/corona/bin/filter_fasta.pl --output=/data/results/s0103/s0103_20090430_552to561_2_2/552to561/results.01/primary.20090505091459275 --name=s0103_20090430_552to561_2_2_552to561 --tag=F3 --minlength=35 --mincalls=25 --prefix=T /data/results/s0103/s0103_20090430_552to561_2_2/552to561/jobs/postPrimerSetPrimary.197/rawseq
+# Cwd: /state/partition1/home/pipeline
+# Title: s0103_20090430_552to561_2_2_552to561
+>1_13_85_F3
+T110020300.0113010210002110102330021
+>1_13_573_F3
+T312311200.3021301101113203302010003
+>1_13_1259_F3
+T002112130.2012223322111330201230313
+>1_13_1440_F3
+T110020313.1113211010332111302330001
+>1_14_177_F3
+T31330222020233321121323302013303311
+>1_14_238_F3
+T01331031200310022122230330201030313
+>1_15_1098_F3
+T32333033222233020223032312232220332
+>1_16_404_F3
+T03310320002130202331112133020103031
+>1_16_904_F3
+T21230102331022312232132021122111212
+>1_16_1315_F3
+T03231231112210333010310323302010003
+>1_16_1595_F3
+T22323211312111230022210011213302012
+>1_17_1379_F3
+T32011212111223230232132311321200123
+>1_18_1692_F3
+T12322233031100211233323300112200210
+>1_19_171_F3
+T10101101220213201111011320201230032
+>1_22_72_F3
+T13303032323221212301322233320210233
+>1_22_1377_F3
+T22221333311222312201132312022322300
+>1_23_585_F3
+T30010310310130312122123302013303131
+>1_23_809_F3
+T13130101101021211013220302223302112
+>1_24_138_F3
+T33211130100120323002033020123031311
+>1_24_206_F3
+T33330332002223002020303331321221000
+>1_25_143_F3
+T23202003031200220301303302012203132
+>1_25_1866_F3
+T03201321022131101112012330221130311
+>1_27_584_F3
+T10010330110103213112323303012103101
+>1_27_1227_F3
+T02003022123001003201002031303302011
+>1_27_1350_F3
+T13130101101021211013220222221301231
+>1_29_477_F3
+T13130101101021211013300302223003030
+>1_30_882_F3
+T20102033000233133320103031311233200
+>1_31_221_F3
+T03301311201100030300100233220102031
+>1_31_1313_F3
+T01331131300330122321000101010330201
+>1_529_129_F3
+T132222301020322102101322221322302.3302.3.3..221..3
diff --git a/tests/data/solid.fasta b/tests/data/solid.fasta
new file mode 100644
index 0000000..f9f1777
--- /dev/null
+++ b/tests/data/solid.fasta
@@ -0,0 +1,4 @@
+>problem1
+T01120212022222011231210231030201330
+>problem2
+T20201030313112322220210033020133031
diff --git a/tests/data/solid.fastq b/tests/data/solid.fastq
new file mode 100644
index 0000000..82c7b7a
--- /dev/null
+++ b/tests/data/solid.fastq
@@ -0,0 +1,120 @@
+@1_13_85_F3
+T110020300.0113010210002110102330021
++
+7&9<&77)& <7))%4'657-1+9;9,.<8);.;8
+@1_13_573_F3
+T312311200.3021301101113203302010003
++
+6)3%)&&&& .1&(6:<'67..*,:75)'77&&&5
+@1_13_1259_F3
+T002112130.2012223322111330201230313
++
+=;<:&:A;A 9<<<,7:<=3=;:<&<?<?8<;=<&
+@1_13_1440_F3
+T110020313.1113211010332111302330001
++
+=<=A:A=57 7<';<6?5;;6:+:=)71>70<,=:
+@1_14_177_F3
+T31330222020233321121323302013303311
++
+:8957;;54)'98924905;;)6:7;1:3<88(9:
+@1_14_238_F3
+T01331031200310022122230330201030313
++
+?><5=;<<<12>=<;1;;=5);.;14:0>2;:3;7
+@1_15_1098_F3
+T32333033222233020223032312232220332
++
+#,##(#5##*#($$'#.##)$&#%)$1##-$&##%
+@1_16_404_F3
+T03310320002130202331112133020103031
++
+78;:;;><>9=9;<<2=><<1;58;9<<;>(<;<;
+@1_16_904_F3
+T21230102331022312232132021122111212
++
+9>=::6;;99=+/'$+#.#&%$&'(($1*$($.#.
+@1_16_1315_F3
+T03231231112210333010310323302010003
++
+<9<8A?>?::;6&,%;6/)8<<#/;79(448&*.)
+@1_16_1595_F3
+T22323211312111230022210011213302012
++
+>,<=<>@6<;?<=>:/=.>&;;8;)17:=&,>1=+
+@1_17_1379_F3
+T32011212111223230232132311321200123
++
+/-1179<1;>>8:':7-%/::0&+=<29,7<8(,2
+@1_18_1692_F3
+T12322233031100211233323300112200210
++
+.#(###5%)%2)',2&:+#+&5,($/1#&4&))$6
+@1_19_171_F3
+T10101101220213201111011320201230032
++
+)6:65/=3*:(8%)%2>&8&%;%0&#;$3$&:$#&
+@1_22_72_F3
+T13303032323221212301322233320210233
++
+3/#678<:.=9::6:(<538295;9+;&*;)+',&
+@1_22_1377_F3
+T22221333311222312201132312022322300
++
+)##0%.$.1*%,)95+%%14%$#8-###9-()#9+
+@1_23_585_F3
+T30010310310130312122123302013303131
++
+>55;8><96/18?)<3<58<5:;96=7:1=8=:-<
+@1_23_809_F3
+T13130101101021211013220302223302112
++
+:7<59@;<<5;/9;=<;7::.)&&&827(+221%(
+@1_24_138_F3
+T33211130100120323002033020123031311
++
+6)68/;906#,25/&;<$0+250#2,<)5,9/+7)
+@1_24_206_F3
+T33330332002223002020303331321221000
++
+))4(&)9592)#)694(,)292:(=7$.18,()65
+@1_25_143_F3
+T23202003031200220301303302012203132
++
+:4;/#&<9;&*;95-7;85&;587#16>%&,9<2&
+@1_25_1866_F3
+T03201321022131101112012330221130311
++
+=<>9;<@7?(=6,<&?=6=(=<641:?'<1=;':4
+@1_27_584_F3
+T10010330110103213112323303012103101
++
+82'('*.-8+%#2)(-&3.,.2,),+.':&,'(&/
+@1_27_1227_F3
+T02003022123001003201002031303302011
++
+492:;>A:<;34<<=);:<<;9=7<3::<::3=>'
+@1_27_1350_F3
+T13130101101021211013220222221301231
++
+95,)<(4./;<938=64=+2/,.4),3':97#33&
+@1_29_477_F3
+T13130101101021211013300302223003030
++
+94=55:75=+:/7><968;;#&+$#3&6,#1#4#'
+@1_30_882_F3
+T20102033000233133320103031311233200
++
+2(+-:-3<;5##/;:(%&84'#:,?3&&8>-();5
+@1_31_221_F3
+T03301311201100030300100233220102031
++
+89>9>5<139/,&:7969972.274&%:78&&746
+@1_31_1313_F3
+T01331131300330122321000101010330201
++
+;3<7=7::)5*4=&;<7>4;795065;9';896'=
+@1_529_129_F3
+T132222301020322102101322221322302.3302.3.3..221..3
++
+>>%/((B6-&5A0:6)>;'1)B*38/?(5=%B+ &<-9 % @  )%)  (
diff --git a/tests/data/solid.qual b/tests/data/solid.qual
new file mode 100644
index 0000000..f7c5c43
--- /dev/null
+++ b/tests/data/solid.qual
@@ -0,0 +1,63 @@
+# Tue May 5 13:57:32 2009 /share/apps/corona/bin/filter_fasta.pl --output=/data/results/s0103/s0103_20090430_552to561_2_2/552to561/results.01/primary.20090505091459275 --name=s0103_20090430_552to561_2_2_552to561 --tag=F3 --minlength=35 --mincalls=25 --prefix=T /data/results/s0103/s0103_20090430_552to561_2_2/552to561/jobs/postPrimerSetPrimary.197/rawseq
+# Cwd: /state/partition1/home/pipeline
+# Title: s0103_20090430_552to561_2_2_552to561
+>1_13_85_F3
+22 5 24 27 5 22 22 8 5 -1 27 22 8 8 4 19 6 21 20 22 12 16 10 24 26 24 11 13 27 23 8 26 13 26 23
+>1_13_573_F3
+21 8 18 4 8 5 5 5 5 -1 13 16 5 7 21 25 27 6 21 22 13 13 9 11 25 22 20 8 6 22 22 5 5 5 20
+>1_13_1259_F3
+28 26 27 25 5 25 32 26 32 -1 24 27 27 27 11 22 25 27 28 18 28 26 25 27 5 27 30 27 30 23 27 26 28 27 5
+>1_13_1440_F3
+28 27 28 32 25 32 28 20 22 -1 22 27 6 26 27 21 30 20 26 26 21 25 10 25 28 8 22 16 29 22 15 27 11 28 25
+>1_14_177_F3
+25 23 24 20 22 26 26 20 19 8 6 24 23 24 17 19 24 15 20 26 26 8 21 25 22 26 16 25 18 27 23 23 7 24 25
+>1_14_238_F3
+30 29 27 20 28 26 27 27 27 16 17 29 28 27 26 16 26 26 28 20 8 26 13 26 16 19 25 15 29 17 26 25 18 26 22
+>1_15_1098_F3
+2 11 2 2 7 2 20 2 2 9 2 7 3 3 6 2 13 2 2 8 3 5 2 4 8 3 16 2 2 12 3 5 2 2 4
+>1_16_404_F3
+22 23 26 25 26 26 29 27 29 24 28 24 26 27 27 17 28 29 27 27 16 26 20 23 26 24 27 27 26 29 7 27 26 27 26
+>1_16_904_F3
+24 29 28 25 25 21 26 26 24 24 28 10 14 6 3 10 2 13 2 5 4 3 5 6 7 7 3 16 9 3 7 3 13 2 13
+>1_16_1315_F3
+27 24 27 23 32 30 29 30 25 25 26 21 5 11 4 26 21 14 8 23 27 27 2 14 26 22 24 7 19 19 23 5 9 13 8
+>1_16_1595_F3
+29 11 27 28 27 29 31 21 27 26 30 27 28 29 25 14 28 13 29 5 26 26 23 26 8 16 22 25 28 5 11 29 16 28 10
+>1_17_1379_F3
+14 12 16 16 22 24 27 16 26 29 29 23 25 6 25 22 12 4 14 25 25 15 5 10 28 27 17 24 11 22 27 23 7 11 17
+>1_18_1692_F3
+13 2 7 2 2 2 20 4 8 4 17 8 6 11 17 5 25 10 2 10 5 20 11 7 3 14 16 2 5 19 5 8 8 3 21
+>1_19_171_F3
+8 21 25 21 20 14 28 18 9 25 7 23 4 8 4 17 29 5 23 5 4 26 4 15 5 2 26 3 18 3 5 25 3 2 5
+>1_22_72_F3
+18 14 2 21 22 23 27 25 13 28 24 25 25 21 25 7 27 20 18 23 17 24 20 26 24 10 26 5 9 26 8 10 6 11 5
+>1_22_1377_F3
+8 2 2 15 4 13 3 13 16 9 4 11 8 24 20 10 4 4 16 19 4 3 2 23 12 2 2 2 24 12 7 8 2 24 10
+>1_23_585_F3
+29 20 20 26 23 29 27 24 21 14 16 23 30 8 27 18 27 20 23 27 20 25 26 24 21 28 22 25 16 28 23 28 25 12 27
+>1_23_809_F3
+25 22 27 20 24 31 26 27 27 20 26 14 24 26 28 27 26 22 25 25 13 8 5 5 5 23 17 22 7 10 17 17 16 4 7
+>1_24_138_F3
+21 8 21 23 14 26 24 15 21 2 11 17 20 14 5 26 27 3 15 10 17 20 15 2 17 11 27 8 20 11 24 14 10 22 8
+>1_24_206_F3
+8 8 19 7 5 8 24 20 24 17 8 2 8 21 24 19 7 11 8 17 24 17 25 7 28 22 3 13 16 23 11 7 8 21 20
+>1_25_143_F3
+25 19 26 14 2 5 27 24 26 5 9 26 24 20 12 22 26 23 20 5 26 20 23 22 2 16 21 29 4 5 11 24 27 17 5
+>1_25_1866_F3
+28 27 29 24 26 27 31 22 30 7 28 21 11 27 5 30 28 21 28 7 28 27 21 19 16 25 30 6 27 16 28 26 6 25 19
+>1_27_584_F3
+23 17 6 7 6 9 13 12 23 10 4 2 17 8 7 12 5 18 13 11 13 17 11 8 11 10 13 6 25 5 11 6 7 5 14
+>1_27_1227_F3
+19 24 17 25 26 29 32 25 27 26 18 19 27 27 28 8 26 25 27 27 26 24 28 22 27 18 25 25 27 25 25 18 28 29 6
+>1_27_1350_F3
+24 20 11 8 27 7 19 13 14 26 27 24 18 23 28 21 19 28 10 17 14 11 13 19 8 11 18 6 25 24 22 2 18 18 5
+>1_29_477_F3
+24 19 28 20 20 25 22 20 28 10 25 14 22 29 27 24 21 23 26 26 2 5 10 3 2 18 5 21 11 2 16 2 19 2 6
+>1_30_882_F3
+17 7 10 12 25 12 18 27 26 20 2 2 14 26 25 7 4 5 23 19 6 2 25 11 30 18 5 5 23 29 12 7 8 26 20
+>1_31_221_F3
+23 24 29 24 29 20 27 16 18 24 14 11 5 25 22 24 21 24 24 22 17 13 17 22 19 5 4 25 22 23 5 5 22 19 21
+>1_31_1313_F3
+26 18 27 22 28 22 25 25 8 20 9 19 28 5 26 27 22 29 19 26 22 24 20 15 21 20 26 24 6 26 23 24 21 6 28
+>1_529_129_F3
+29 29 4 14 7 7 33 21 12 5 20 32 15 25 21 8 29 26 6 16 8 33 9 18 23 14 30 7 20 28 4 33 10 -1 5 27 12 24 -1 4 -1 31 -1 -1 8 4 8 -1 -1 7
diff --git a/tests/data/solid5p.fasta b/tests/data/solid5p.fasta
new file mode 100644
index 0000000..6b6d2cb
--- /dev/null
+++ b/tests/data/solid5p.fasta
@@ -0,0 +1,34 @@
+# used adapter: CCGGAGGTCAGCTCGCTATA
+# in colorspace: C0302201212322332333
+>read1
+T1212322332333012001112122203233202221000211
+>read2
+T201212322332333200121311212133113001311002032
+>read3
+T02201212322332333211133003002232323010012320300
+>read4
+T0302201212322332333002010102312033021011121312131
+>read5
+T20302201212322332333221313210102120020302022233110
+>read6
+T20302211212322332333031203203013323021010020301321
+>read7
+T21301020302201212322332333203020130202120211322010013211
+>read8
+T2310321030130120302201212322332333232202123123111113113003200330
+>read9
+T0002132103320302201212322332333020123133023120320131020333011
+>read10
+T00322031320033220302201212322332333201130233321321011303133231200
+>read11
+T0302201212322332333.02010102312033021011121312131
+>read12
+T030220121232233233321
+>read13
+T03022012123223323332
+>read14
+T0302201212322332333
+>read15
+T030220121232233233
+>read16
+T030220121232233233
diff --git a/tests/data/solid5p.fastq b/tests/data/solid5p.fastq
new file mode 100644
index 0000000..1efba0a
--- /dev/null
+++ b/tests/data/solid5p.fastq
@@ -0,0 +1,64 @@
+@read1
+T1212322332333012001112122203233202221000211
++
+:58)2";%4A,8>0;9C\'?276>#)49"<,>?/\'!A4$.%+
+@read2
+T201212322332333200121311212133113001311002032
++
+44<@;(<3.37/''=:-9AA<&C2%$$;?A&5!C69:?-;&;65.
+@read3
+T02201212322332333211133003002232323010012320300
++
+2!<A-BB&A/)'103&2$!00>#97*B.0A-@(*","B3><4&16(:
+@read4
+T0302201212322332333002010102312033021011121312131
++
+74-:$-;&@>@0581-82'<&-81+%)7;<)6?83!&CB9"9B6307=&
+@read5
+T20302201212322332333221313210102120020302022233110
++
+';4!-6?0$45.C#B+$(4+$9)27,(-*=,#4:;"/4++5<,@-784*'
+@read6
+T20302211212322332333031203203013323021010020301321
++
++3"85:2=3<")$66*#4".4!.;:C%97@>75-";';*)A67CCC")$*
+@read7
+T21301020302201212322332333203020130202120211322010013211
++
+,;0B@A"98!<=!*;5;650;';79!+8,4(2=+98:B@C@:+3*>2+6+2++C0.
+@read8
+T2310321030130120302201212322332333232202123123111113113003200330
++
+C/$-"=6+1.8?AB!?'#.585@6:47@?>.315A-'9<%">6,+)*,)1-;:(691>?C)4A;
+@read9
+T0002132103320302201212322332333020123133023120320131020333011
++
+(&?527&:=;6@6@03%95(-0#$:B8::B*4?@&)6>79C>)6C'5-#<!B:>0:A8+2*
+@read10
+T00322031320033220302201212322332333201130233321321011303133231200
++
+&53)>2.+9?7%=&21;8!820961%3#0'5C.28347,2(55*1.,>%:(1A'A5=@7&&5?4'
+@read11
+T0302201212322332333.02010102312033021011121312131
++
+6=@!85+6<A(&#@7"'C:&8B"195'@,@&:5=7;!&-9:%<!)>((>
+@read12
+T030220121232233233321
++
+4&1.?+<-0(!(;://+0@?C
+@read13
+T03022012123223323332
++
+!&,>"772,,/2/2A1C%5C
+@read14
+T0302201212322332333
++
+@%$#B$A0B0&((<C*+.A
+@read15
+T030220121232233233
++
+?B=,A#5"*?7268++:2
+@read16
+T030220121232233233
++
+C=C=C:=+@77@723!C5
diff --git a/tests/data/sra.fastq b/tests/data/sra.fastq
new file mode 100644
index 0000000..a92a89c
--- /dev/null
+++ b/tests/data/sra.fastq
@@ -0,0 +1,24 @@
+@1_13_85_F3
+T110020300.0113010210002110102330021
++
+!7&9<&77)& <7))%4'657-1+9;9,.<8);.;8
+@1_13_573_F3
+T312311200.3021301101113203302010003
++
+!6)3%)&&&& .1&(6:<'67..*,:75)'77&&&5
+@1_13_1259_F3
+T002112130.2012223322111330201230313
++
+!=;<:&:A;A 9<<<,7:<=3=;:<&<?<?8<;=<&
+@1_13_1440_F3
+T110020313.1113211010332111302330001
++
+!=<=A:A=57 7<';<6?5;;6:+:=)71>70<,=:
+@1_14_177_F3
+T31330222020233321121323302013303311
++
+!:8957;;54)'98924905;;)6:7;1:3<88(9:
+@1_14_238_F3
+T01331031200310022122230330201030313
++
+!?><5=;<<<12>=<;1;;=5);.;14:0>2;:3;7
diff --git a/tests/data/suffix-adapter.fasta b/tests/data/suffix-adapter.fasta
new file mode 100644
index 0000000..65c68c3
--- /dev/null
+++ b/tests/data/suffix-adapter.fasta
@@ -0,0 +1,2 @@
+>suffixadapter
+BACKADAPTER$
diff --git a/tests/data/toolong.fa b/tests/data/toolong.fa
new file mode 100644
index 0000000..79a9a79
--- /dev/null
+++ b/tests/data/toolong.fa
@@ -0,0 +1,14 @@
+>read_length6
+T023302
+>read_length7
+T1023302
+>read_length8
+T11023302
+>read_length9
+T111023302
+>read_length10
+T2111023302
+>read_length11
+T02111023302
+>read_length12
+T002111023302
diff --git a/tests/data/tooshort.fa b/tests/data/tooshort.fa
new file mode 100644
index 0000000..a5e4711
--- /dev/null
+++ b/tests/data/tooshort.fa
@@ -0,0 +1,12 @@
+>read_length0a
+T
+>read_length0b
+T
+>read_length1
+T2
+>read_length2
+T02
+>read_length3
+T302
+>read_length4
+T3302
diff --git a/tests/data/tooshort.noprimer.fa b/tests/data/tooshort.noprimer.fa
new file mode 100644
index 0000000..e5e22b4
--- /dev/null
+++ b/tests/data/tooshort.noprimer.fa
@@ -0,0 +1,14 @@
+>read_length0a
+
+>read_length0b
+
+>read_length1
+
+>read_length2
+2
+>read_length3
+02
+>read_length4
+302
+>read_length5
+3302
diff --git a/tests/data/trimN3.fasta b/tests/data/trimN3.fasta
new file mode 100644
index 0000000..d936bad
--- /dev/null
+++ b/tests/data/trimN3.fasta
@@ -0,0 +1,2 @@
+>read1
+CAGTCGGTCCTGAGAGATGGGCGAGCGCTGGNANNNNNNNG
diff --git a/tests/data/trimN5.fasta b/tests/data/trimN5.fasta
new file mode 100644
index 0000000..ce681fe
--- /dev/null
+++ b/tests/data/trimN5.fasta
@@ -0,0 +1,2 @@
+>read1
+NGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAG
diff --git a/tests/data/twoadapters.fasta b/tests/data/twoadapters.fasta
new file mode 100644
index 0000000..68d59a1
--- /dev/null
+++ b/tests/data/twoadapters.fasta
@@ -0,0 +1,6 @@
+>read1
+GATCCTCCTGGAGCTGGCTGATACCAGTATACCAGTGCTGATTGTTGAATTTCAGGAATTTCTCAAGCTCGGTAGC
+>read2
+CTCGAGAATTCTGGATCCTCTCTTCTGCTACCTTTGGGATTTGCTTGCTCTTGGTTCTCTAGTTCTTGTAGTGGTG
+>read3 (no adapter)
+AATGAAGGTTGTAACCATAACAGGAAGTCATGCGCATTTAGTCGAGCACGTAAGTTCATACGGAAATGGGTAAG
diff --git a/tests/data/wildcard.fa b/tests/data/wildcard.fa
new file mode 100644
index 0000000..f482927
--- /dev/null
+++ b/tests/data/wildcard.fa
@@ -0,0 +1,4 @@
+>1
+ANGTACGTTGCATGCA
+>2
+ACGTANGTTGCATGCA
diff --git a/tests/data/wildcardN.fa b/tests/data/wildcardN.fa
new file mode 100644
index 0000000..5c15266
--- /dev/null
+++ b/tests/data/wildcardN.fa
@@ -0,0 +1,6 @@
+>perfect
+TTTGGGGGGG
+>withN
+TTTGGNGGGG
+>1mism
+TTTGGGGCGG
diff --git a/tests/data/wildcard_adapter.fa b/tests/data/wildcard_adapter.fa
new file mode 100644
index 0000000..a62b84c
--- /dev/null
+++ b/tests/data/wildcard_adapter.fa
@@ -0,0 +1,8 @@
+>1
+ACGTAAAACGTTGCATGCA
+>2
+ACGTGGGACGTTGCATGCA
+>3b
+TGGCTGGCCACGTCCCACGTAA
+>4b
+TGGCTGGCCACGTTTTACGTCC
diff --git a/tests/data/withplus.fastq b/tests/data/withplus.fastq
new file mode 100644
index 0000000..b71fc07
--- /dev/null
+++ b/tests/data/withplus.fastq
@@ -0,0 +1,8 @@
+@first_sequence
+SEQUENCE1
++this is different
+:6;;8<=:<
+@second_sequence
+SEQUENCE2
++also different
+83<??:(61
diff --git a/tests/testadapters.py b/tests/testadapters.py
new file mode 100644
index 0000000..4d3147b
--- /dev/null
+++ b/tests/testadapters.py
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+from nose.tools import raises, assert_raises
+
+from cutadapt.seqio import Sequence
+from cutadapt.adapters import (Adapter, Match, ColorspaceAdapter, FRONT, BACK,
+ parse_braces, LinkedAdapter)
+
+def test_issue_52():
+ adapter = Adapter(
+ sequence='GAACTCCAGTCACNNNNN',
+ where=BACK,
+ max_error_rate=0.12,
+ min_overlap=5,
+ read_wildcards=False,
+ adapter_wildcards=True)
+ read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
+ am = Match(astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2, front=None, adapter=adapter, read=read)
+ assert am.wildcards() == 'GGC'
+ """
+ The result above should actually be 'CGGC' since the correct
+ alignment is this one:
+
+ adapter GAACTCCAGTCACNNNNN
+ mismatches X X
+ read CCCCAGAACTACAGTC-CCGGC
+
+ Since we do not keep the alignment, guessing 'GGC' is the best we
+ can currently do.
+ """
+
+
+def test_issue_80():
+ # This issue turned out to not be an actual issue with the alignment
+ # algorithm. The following alignment is found because it has more matches
+ # than the 'obvious' one:
+ #
+ # TCGTATGCCGTCTTC
+ # =========X==XX=
+ # TCGTATGCCCTC--C
+ #
+ # This is correct, albeit a little surprising, since an alignment without
+ # indels would have only two errors.
+
+ adapter = Adapter(
+ sequence="TCGTATGCCGTCTTC",
+ where=BACK,
+ max_error_rate=0.2,
+ min_overlap=3,
+ read_wildcards=False,
+ adapter_wildcards=False)
+ read = Sequence(name="seq2", sequence="TCGTATGCCCTCC")
+ result = adapter.match_to(read)
+ assert result.errors == 3, result
+ assert result.astart == 0, result
+ assert result.astop == 15, result
+
+
+def test_str():
+ a = Adapter('ACGT', where=BACK, max_error_rate=0.1)
+ str(a)
+ str(a.match_to(Sequence(name='seq', sequence='TTACGT')))
+ ca = ColorspaceAdapter('0123', where=BACK, max_error_rate=0.1)
+ str(ca)
+
+
+@raises(ValueError)
+def test_color():
+ ColorspaceAdapter('0123', where=FRONT, max_error_rate=0.1)
+
+
+def test_parse_braces():
+ assert parse_braces('') == ''
+ assert parse_braces('A') == 'A'
+ assert parse_braces('A{0}') == ''
+ assert parse_braces('A{1}') == 'A'
+ assert parse_braces('A{2}') == 'AA'
+ assert parse_braces('A{2}C') == 'AAC'
+ assert parse_braces('ACGTN{3}TGACCC') == 'ACGTNNNTGACCC'
+ assert parse_braces('ACGTN{10}TGACCC') == 'ACGTNNNNNNNNNNTGACCC'
+ assert parse_braces('ACGTN{3}TGA{4}CCC') == 'ACGTNNNTGAAAACCC'
+ assert parse_braces('ACGTN{0}TGA{4}CCC') == 'ACGTTGAAAACCC'
+
+
+def test_parse_braces_fail():
+ for expression in ['{', '}', '{}', '{5', '{1}', 'A{-7}', 'A{', 'A{1', 'N{7', 'AN{7', 'A{4{}',
+ 'A{4}{3}', 'A{b}', 'A{6X}', 'A{X6}']:
+ assert_raises(ValueError, lambda: parse_braces(expression))
+
+
+def test_linked_adapter():
+ linked_adapter = LinkedAdapter('AAAA', 'TTTT')
+ sequence = Sequence(name='seq', sequence='AAAACCCCCTTTT')
+ match = linked_adapter.match_to(sequence)
+ trimmed = linked_adapter.trimmed(match)
+ assert trimmed.name == 'seq'
+ assert trimmed.sequence == 'CCCCC'
+
+
+def test_info_record():
+ adapter = Adapter(
+ sequence='GAACTCCAGTCACNNNNN',
+ where=BACK,
+ max_error_rate=0.12,
+ min_overlap=5,
+ read_wildcards=False,
+ adapter_wildcards=True,
+ name="Foo")
+ read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
+ am = Match(astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2, front=None,
+ adapter=adapter, read=read)
+ print(am.get_info_record())
+ assert am.get_info_record() == (
+ "abc",
+ 2,
+ 5,
+ 21,
+ 'CCCCA',
+ 'GAACTACAGTCCCGGC',
+ '',
+ 'Foo',
+ '',
+ '',
+ ''
+ )
diff --git a/tests/testalign.py b/tests/testalign.py
new file mode 100644
index 0000000..0ede180
--- /dev/null
+++ b/tests/testalign.py
@@ -0,0 +1,123 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+
+from cutadapt.align import (locate, compare_prefixes, compare_suffixes,
+ Aligner)
+from cutadapt.adapters import BACK
+
+
+class TestAligner():
+ def test(self):
+ reference = 'CTCCAGCTTAGACATATC'
+ aligner = Aligner(reference, 0.1, flags=BACK)
+ aligner.locate('CC')
+
+ def test_100_percent_error_rate(self):
+ reference = 'GCTTAGACATATC'
+ aligner = Aligner(reference, 1.0, flags=BACK)
+ aligner.locate('CAA')
+
+
+def test_polya():
+ s = 'AAAAAAAAAAAAAAAAA'
+ t = 'ACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
+ result = locate(s, t, 0.0, BACK)
+ #start_s, stop_s, start_t, stop_t, matches, cost = result
+ assert result == (0, len(s), 4, 4 + len(s), len(s), 0)
+
+
+# Sequences with IUPAC wildcards
+# R=A|G, Y=C|T, S=G|C, W=A|T, K=G|T, M=A|C, B=C|G|T, D=A|G|T, H=A|C|T, V=A|C|G,
+# N=A|C|G|T, X={}
+WILDCARD_SEQUENCES = [
+ 'CCCATTGATC', # original sequence without wildcards
+ 'CCCRTTRATC', # R=A|G
+ 'YCCATYGATC', # Y=C|T
+ 'CSSATTSATC', # S=G|C
+ 'CCCWWWGATC', # W=A|T
+ 'CCCATKKATC', # K=G|T
+ 'CCMATTGMTC', # M=A|C
+ 'BCCATTBABC', # B=C|G|T
+ 'BCCATTBABC', # B
+ 'CCCDTTDADC', # D=A|G|T
+ 'CHCATHGATC', # H=A|C|T
+ 'CVCVTTVATC', # V=A|C|G
+ 'CCNATNGATC', # N=A|C|G|T
+ 'CCCNTTNATC', # N
+# 'CCCXTTXATC', # X
+]
+
+
+def test_compare_prefixes():
+ assert compare_prefixes('AAXAA', 'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1)
+ assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True) == (0, 5, 0, 5, 5, 0)
+ assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True) == (0, 5, 0, 5, 5, 0)
+ assert compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2)
+
+ a = WILDCARD_SEQUENCES[0]
+ for s in WILDCARD_SEQUENCES:
+ r = s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
+ result = compare_prefixes(a, r, wildcard_query=True)
+ assert result == (0, 10, 0, 10, 10, 0), result
+
+ result = compare_prefixes(r, a, wildcard_ref=True)
+ assert result == (0, 10, 0, 10, 10, 0)
+
+ for s in WILDCARD_SEQUENCES:
+ for t in WILDCARD_SEQUENCES:
+ r = s + 'GCCAGGG'
+ result = compare_prefixes(s, r, )
+ assert result == (0, 10, 0, 10, 10, 0)
+
+ result = compare_prefixes(r, s, wildcard_ref=True, wildcard_query=True)
+ assert result == (0, 10, 0, 10, 10, 0)
+
+ r = WILDCARD_SEQUENCES[0] + 'GCCAGG'
+ for wildc_ref in (False, True):
+ for wildc_query in (False, True):
+ result = compare_prefixes('CCCXTTXATC', r, wildcard_ref=wildc_ref, wildcard_query=wildc_query)
+ assert result == (0, 10, 0, 10, 8, 2)
+
+
+def test_compare_suffixes():
+ assert compare_suffixes('AAXAA', 'TTTTTTTAAAAA') == (0, 5, 7, 12, 4, 1)
+ assert compare_suffixes('AANAA', 'TTTTTTTAACAA', wildcard_ref=True) == (0, 5, 7, 12, 5, 0)
+ assert compare_suffixes('AANAA', 'TTTTTTTAACAA', wildcard_ref=True) == (0, 5, 7, 12, 5, 0)
+ assert compare_suffixes('AAAAAX', 'TTTTTTTAAAAA') == (0, 6, 6, 12, 4, 2)
+
+
+def test_wildcards_in_adapter():
+ r = 'CATCTGTCC' + WILDCARD_SEQUENCES[0] + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
+ for a in WILDCARD_SEQUENCES:
+ result = locate(a, r, 0.0, BACK, wildcard_ref=True)
+ assert result == (0, 10, 9, 19, 10, 0), result
+
+ a = 'CCCXTTXATC'
+ result = locate(a, r, 0.0, BACK, wildcard_ref=True)
+ assert result is None
+
+
+def test_wildcards_in_read():
+ a = WILDCARD_SEQUENCES[0]
+ for s in WILDCARD_SEQUENCES:
+ r = 'CATCTGTCC' + s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
+ result = locate(a, r, 0.0, BACK, wildcard_query=True)
+ if 'X' in s:
+ assert result is None
+ else:
+ assert result == (0, 10, 9, 19, 10, 0), result
+
+
+def test_wildcards_in_both():
+ for a in WILDCARD_SEQUENCES:
+ for s in WILDCARD_SEQUENCES:
+ if 'X' in s or 'X' in a:
+ continue
+ r = 'CATCTGTCC' + s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
+ result = locate(a, r, 0.0, BACK, wildcard_ref=True, wildcard_query=True)
+ assert result == (0, 10, 9, 19, 10, 0), result
+
+
+def test_no_match():
+ a = locate('CTGATCTGGCCG', 'AAAAGGG', 0.1, BACK)
+ assert a is None, a
diff --git a/tests/testcolorspace.py b/tests/testcolorspace.py
new file mode 100644
index 0000000..16a9d88
--- /dev/null
+++ b/tests/testcolorspace.py
@@ -0,0 +1,140 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+
+from cutadapt.colorspace import encode, decode
+from cutadapt.scripts.cutadapt import main
+from .utils import run, datapath
+
+# If there are any unknown characters in the test sequence,
+# round tripping will only work if all characters after the
+# first unknown character are also unknown:
+# encode("TNGN") == "T444", but
+# decode("T444") == "TNNN".
+
+sequences = [
+ "",
+ "C",
+ "ACGGTC",
+ "TN",
+ "TN.",
+ "TNN.N",
+ "CCGGCAGCATTCATTACGACAACGTGGCACCGTGTTTTCTCGGTGGTA",
+ "TGCAGTTGATGATCGAAGAAAACGACATCATCAGCCAGCAAGTGC",
+ "CAGGGTTTGATGAGTGGCTGTGGGTGCTGGCGTATCCGGG"
+ ]
+
+
+def test_encode():
+ assert encode("AA") == "A0"
+ assert encode("AC") == "A1"
+ assert encode("AG") == "A2"
+ assert encode("AT") == "A3"
+ assert encode("CA") == "C1"
+ assert encode("CC") == "C0"
+ assert encode("CG") == "C3"
+ assert encode("CT") == "C2"
+ assert encode("GA") == "G2"
+ assert encode("GC") == "G3"
+ assert encode("GG") == "G0"
+ assert encode("GT") == "G1"
+ assert encode("TA") == "T3"
+ assert encode("TC") == "T2"
+ assert encode("TG") == "T1"
+ assert encode("TT") == "T0"
+
+ assert encode("TN") == "T4"
+ assert encode("NT") == "N4"
+ assert encode("NN") == "N4"
+
+ assert encode("ACGGTC") == "A13012"
+ assert encode("TTT.N") == "T0044"
+ assert encode("TTNT.N") == "T04444"
+
+
+def test_decode():
+ for s in sequences:
+ expected = s.replace('.', 'N')
+ encoded = encode(s)
+ assert decode(encoded) == expected
+ assert decode('A.') == 'AN'
+ assert decode('C.') == 'CN'
+ assert decode('G.') == 'GN'
+ assert decode('T.') == 'TN'
+
+
+def test_qualtrim_csfastaqual():
+ '''-q with csfasta/qual files'''
+ run("-c -q 10", "solidqual.fastq", "solid.csfasta", 'solid.qual')
+
+
+def test_E3M():
+ '''Read the E3M dataset'''
+ # not really colorspace, but a fasta/qual file pair
+ main(['-o', '/dev/null', datapath("E3M.fasta"), datapath("E3M.qual")])
+
+
+def test_bwa():
+ '''MAQ-/BWA-compatible output'''
+ run("-c -e 0.12 -a 330201030313112312 -x 552: --maq", "solidmaq.fastq", "solid.csfasta", 'solid.qual')
+
+
+def test_bfast():
+ '''BFAST-compatible output'''
+ run("-c -e 0.12 -a 330201030313112312 -x abc: --strip-f3", "solidbfast.fastq", "solid.csfasta", 'solid.qual')
+
+
+def test_trim_095():
+ '''some reads properly trimmed since cutadapt 0.9.5'''
+ run("-c -e 0.122 -a 330201030313112312", "solid.fasta", "solid.fasta")
+
+
+def test_solid():
+ run("-c -e 0.122 -a 330201030313112312", "solid.fastq", "solid.fastq")
+
+
+def test_solid_basespace_adapter():
+ '''colorspace adapter given in basespace'''
+ run("-c -e 0.122 -a CGCCTTGGCCGTACAGCAG", "solid.fastq", "solid.fastq")
+
+
+def test_solid5p():
+ '''test 5' colorspace adapter'''
+ # this is not a real adapter, just a random string
+ # in colorspace: C0302201212322332333
+ run("-c -e 0.1 --trim-primer -g CCGGAGGTCAGCTCGCTATA", "solid5p.fasta", "solid5p.fasta")
+
+
+def test_solid5p_prefix_notrim():
+ '''test anchored 5' colorspace adapter, no primer trimming'''
+ run("-c -e 0.1 -g ^CCGGAGGTCAGCTCGCTATA", "solid5p-anchored.notrim.fasta", "solid5p.fasta")
+
+
+def test_solid5p_prefix():
+ '''test anchored 5' colorspace adapter'''
+ run("-c -e 0.1 --trim-primer -g ^CCGGAGGTCAGCTCGCTATA", "solid5p-anchored.fasta", "solid5p.fasta")
+
+
+def test_solid5p_fastq():
+ '''test 5' colorspace adapter'''
+ # this is not a real adapter, just a random string
+ # in colorspace: C0302201212322332333
+ run("-c -e 0.1 --trim-primer -g CCGGAGGTCAGCTCGCTATA", "solid5p.fastq", "solid5p.fastq")
+
+
+def test_solid5p_prefix_notrim_fastq():
+ '''test anchored 5' colorspace adapter, no primer trimming'''
+ run("-c -e 0.1 -g ^CCGGAGGTCAGCTCGCTATA", "solid5p-anchored.notrim.fastq", "solid5p.fastq")
+
+
+def test_solid5p_prefix_fastq():
+ '''test anchored 5' colorspace adapter'''
+ run("-c -e 0.1 --trim-primer -g ^CCGGAGGTCAGCTCGCTATA", "solid5p-anchored.fastq", "solid5p.fastq")
+
+
+def test_sra_fastq():
+ '''test SRA-formatted colorspace FASTQ'''
+ run("-c -e 0.1 --format sra-fastq -a CGCCTTGGCCGTACAGCAG", "sra.fastq", "sra.fastq")
+
+
+def test_no_zero_cap():
+ run("--no-zero-cap -c -e 0.122 -a CGCCTTGGCCGTACAGCAG", "solid-no-zerocap.fastq", "solid.fastq")
diff --git a/tests/testfilters.py b/tests/testfilters.py
new file mode 100644
index 0000000..3976e72
--- /dev/null
+++ b/tests/testfilters.py
@@ -0,0 +1,42 @@
+# coding: utf-8
+"""
+Tests write output (should it return True or False or write)
+"""
+from __future__ import print_function, division, absolute_import
+
+from cutadapt.filters import NContentFilter, DISCARD, KEEP, LegacyPairedRedirector, PairedRedirector
+from cutadapt.seqio import Sequence
+
+def test_ncontentfilter():
+ # third parameter is True if read should be discarded
+ params = [
+ ('AAA', 0, KEEP),
+ ('AAA', 1, KEEP),
+ ('AAACCTTGGN', 1, KEEP),
+ ('AAACNNNCTTGGN', 0.5, KEEP),
+ ('NNNNNN', 1, DISCARD),
+ ('ANAAAA', 1/6, KEEP),
+ ('ANAAAA', 0, DISCARD)
+ ]
+ for seq, count, expected in params:
+ filter = NContentFilter(count=count)
+ _seq = Sequence('read1', seq, qualities='#'*len(seq))
+ assert filter(_seq) == expected
+
+
+def test_ncontentfilter_paired():
+ params = [
+ ('AAA', 'AAA', 0, KEEP),
+ ('AAAN', 'AAA', 0, DISCARD),
+ ('AAA', 'AANA', 0, DISCARD),
+ ('ANAA', 'AANA', 1, KEEP),
+ ]
+ for seq1, seq2, count, expected in params:
+ filter = NContentFilter(count=count)
+ filter_legacy = LegacyPairedRedirector(None, filter)
+ filter_both = PairedRedirector(None, filter)
+ read1 = Sequence('read1', seq1, qualities='#'*len(seq1))
+ read2 = Sequence('read1', seq2, qualities='#'*len(seq2))
+ assert filter_legacy(read1, read2) == filter(read1)
+ # discard entire pair if one of the reads fulfills criteria
+ assert filter_both(read1, read2) == expected
diff --git a/tests/testmodifiers.py b/tests/testmodifiers.py
new file mode 100644
index 0000000..5101755
--- /dev/null
+++ b/tests/testmodifiers.py
@@ -0,0 +1,36 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+
+from cutadapt.seqio import Sequence
+from cutadapt.modifiers import UnconditionalCutter, NEndTrimmer, QualityTrimmer
+
+def test_unconditional_cutter():
+ uc = UnconditionalCutter(length=5)
+ s = 'abcdefg'
+ assert UnconditionalCutter(length=2)(s) == 'cdefg'
+ assert UnconditionalCutter(length=-2)(s) == 'abcde'
+ assert UnconditionalCutter(length=100)(s) == ''
+ assert UnconditionalCutter(length=-100)(s) == ''
+
+
+def test_nend_trimmer():
+ trimmer = NEndTrimmer()
+ seqs = ['NNNNAAACCTTGGNNN', 'NNNNAAACNNNCTTGGNNN', 'NNNNNN']
+ trims = ['AAACCTTGG', 'AAACNNNCTTGG', '']
+ for seq, trimmed in zip(seqs, trims):
+ _seq = Sequence('read1', seq, qualities='#'*len(seq))
+ _trimmed = Sequence('read1', trimmed, qualities='#'*len(trimmed))
+ assert trimmer(_seq) == _trimmed
+
+
+def test_quality_trimmer():
+ read = Sequence('read1', 'ACGTTTACGTA', '##456789###')
+
+ qt = QualityTrimmer(10, 10, 33)
+ assert qt(read) == Sequence('read1', 'GTTTAC', '456789')
+
+ qt = QualityTrimmer(0, 10, 33)
+ assert qt(read) == Sequence('read1', 'ACGTTTAC', '##456789')
+
+ qt = QualityTrimmer(10, 0, 33)
+ assert qt(read) == Sequence('read1', 'GTTTACGTA', '456789###')
diff --git a/tests/testpaired.py b/tests/testpaired.py
new file mode 100644
index 0000000..51cc1d0
--- /dev/null
+++ b/tests/testpaired.py
@@ -0,0 +1,273 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+
+import shutil
+from nose.tools import raises
+from cutadapt.scripts import cutadapt
+from .utils import run, files_equal, datapath, cutpath, redirect_stderr, temporary_path
+
+def run_paired(params, in1, in2, expected1, expected2):
+ if type(params) is str:
+ params = params.split()
+ with temporary_path('tmp1-' + expected1) as p1:
+ with temporary_path('tmp2-' + expected2) as p2:
+ params += ['-o', p1, '-p', p2]
+ params += [datapath(in1), datapath(in2)]
+ assert cutadapt.main(params) is None
+ assert files_equal(cutpath(expected1), p1)
+ assert files_equal(cutpath(expected2), p2)
+
+
+def run_interleaved(params, inpath, expected):
+ if type(params) is str:
+ params = params.split()
+ with temporary_path(expected) as tmp:
+ params += ['--interleaved', '-o', tmp, datapath(inpath)]
+ assert cutadapt.main(params) is None
+ assert files_equal(cutpath(expected), tmp)
+
+
+def run_interleaved2(params, inpath, expected1, expected2):
+ assert False # unused function
+ if type(params) is str:
+ params = params.split()
+ with temporary_path('tmp1-' + expected1) as p1:
+ with temporary_path('tmp2-' + expected2) as p2:
+ params += ['--interleaved', '-o', p1, '-p', p2]
+ params += [datapath(inpath)]
+ assert cutadapt.main(params) is None
+ assert files_equal(cutpath(expected), p1)
+ assert files_equal(cutpath(expected), p2)
+
+
+def test_paired_separate():
+ '''test separate trimming of paired-end reads'''
+ run('-a TTAGACATAT', 'paired-separate.1.fastq', 'paired.1.fastq')
+ run('-a CAGTGGAGTA', 'paired-separate.2.fastq', 'paired.2.fastq')
+
+
+def test_paired_end_legacy():
+ '''--paired-output, not using -A/-B/-G'''
+ # the -m 14 filters out one read, which should then also be filtered out in the second output file
+ # -q 10 should not change anything: qualities in file 1 are high enough,
+ # qualities in file 2 should not be inspected.
+ run_paired('-a TTAGACATAT -m 14 -q 10',
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='paired.m14.1.fastq', expected2='paired.m14.2.fastq'
+ )
+
+
+def test_untrimmed_paired_output():
+ with temporary_path("tmp-untrimmed.1.fastq") as untrimmed1:
+ with temporary_path("tmp-untrimmed.2.fastq") as untrimmed2:
+ run_paired(['-a', 'TTAGACATAT',
+ '--untrimmed-output', untrimmed1,
+ '--untrimmed-paired-output', untrimmed2],
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='paired-trimmed.1.fastq', expected2='paired-trimmed.2.fastq'
+ )
+ assert files_equal(cutpath('paired-untrimmed.1.fastq'), untrimmed1)
+ assert files_equal(cutpath('paired-untrimmed.2.fastq'), untrimmed2)
+
+
+def test_explicit_format_with_paired():
+ # Use --format=fastq with input files whose extension is .txt
+ with temporary_path("paired.1.txt") as txt1:
+ with temporary_path("paired.2.txt") as txt2:
+ shutil.copyfile(datapath("paired.1.fastq"), txt1)
+ shutil.copyfile(datapath("paired.2.fastq"), txt2)
+ run_paired('--format=fastq -a TTAGACATAT -m 14',
+ in1=txt1, in2=txt2,
+ expected1='paired.m14.1.fastq',
+ expected2='paired.m14.2.fastq'
+ )
+
+
+def test_no_trimming_legacy():
+ # make sure that this doesn't divide by zero
+ cutadapt.main(['-a', 'XXXXX', '-o', '/dev/null', '-p', '/dev/null', datapath('paired.1.fastq'), datapath('paired.2.fastq')])
+
+
+def test_no_trimming():
+ # make sure that this doesn't divide by zero
+ cutadapt.main(['-a', 'XXXXX', '-A', 'XXXXX', '-o', '/dev/null', '-p', '/dev/null', datapath('paired.1.fastq'), datapath('paired.2.fastq')])
+
+
+@raises(SystemExit)
+def test_missing_file():
+ with redirect_stderr():
+ cutadapt.main(['-a', 'XX', '--paired-output', 'out.fastq', datapath('paired.1.fastq')])
+
+
+@raises(SystemExit)
+def test_first_too_short():
+ with temporary_path("truncated.1.fastq") as trunc1:
+ # Create a truncated file in which the last read is missing
+ with open(datapath('paired.1.fastq')) as f:
+ lines = f.readlines()
+ lines = lines[:-4]
+ with open(trunc1, 'w') as f:
+ f.writelines(lines)
+ with redirect_stderr():
+ cutadapt.main('-a XX --paired-output out.fastq'.split() + [trunc1, datapath('paired.2.fastq')])
+
+
+@raises(SystemExit)
+def test_second_too_short():
+ with temporary_path("truncated.2.fastq") as trunc2:
+ # Create a truncated file in which the last read is missing
+ with open(datapath('paired.2.fastq')) as f:
+ lines = f.readlines()
+ lines = lines[:-4]
+ with open(trunc2, 'w') as f:
+ f.writelines(lines)
+ with redirect_stderr():
+ cutadapt.main('-a XX --paired-output out.fastq'.split() + [datapath('paired.1.fastq'), trunc2])
+
+
+@raises(SystemExit)
+def test_unmatched_read_names():
+ with temporary_path("swapped.1.fastq") as swapped:
+ # Create a file in which reads 2 and 3 are swapped
+ with open(datapath('paired.1.fastq')) as f:
+ lines = f.readlines()
+ lines = lines[0:4] + lines[8:12] + lines[4:8] + lines[12:]
+ with open(swapped, 'w') as f:
+ f.writelines(lines)
+ with redirect_stderr():
+ cutadapt.main('-a XX -o out1.fastq --paired-output out2.fastq'.split() + [swapped, datapath('paired.2.fastq')])
+
+
+def test_legacy_minlength():
+ '''Ensure -m is not applied to second read in a pair in legacy mode'''
+ run_paired('-a XXX -m 27',
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='paired-m27.1.fastq', expected2='paired-m27.2.fastq'
+ )
+
+
+def test_paired_end():
+ '''single-pass paired-end with -m'''
+ run_paired('-a TTAGACATAT -A CAGTGGAGTA -m 14',
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='paired.1.fastq', expected2='paired.2.fastq'
+ )
+
+
+def test_paired_anchored_back_no_indels():
+ run_paired("-a BACKADAPTER$ -A BACKADAPTER$ -N --no-indels",
+ in1='anchored-back.fasta', in2='anchored-back.fasta',
+ expected1='anchored-back.fasta', expected2="anchored-back.fasta"
+ )
+
+
+def test_paired_end_qualtrim():
+ '''single-pass paired-end with -q and -m'''
+ run_paired('-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90',
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='pairedq.1.fastq', expected2='pairedq.2.fastq'
+ )
+
+
+def test_paired_end_qualtrim_swapped():
+ '''single-pass paired-end with -q and -m, but files swapped'''
+ run_paired('-q 20 -a CAGTGGAGTA -A TTAGACATAT -m 14',
+ in1='paired.2.fastq', in2='paired.1.fastq',
+ expected1='pairedq.2.fastq', expected2='pairedq.1.fastq'
+ )
+
+
+def test_paired_end_cut():
+ run_paired('-u 3 -u -1 -U 4 -U -2',
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='pairedu.1.fastq', expected2='pairedu.2.fastq'
+ )
+
+
+def test_paired_end_A_only():
+ run_paired('-A CAGTGGAGTA',
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='paired-onlyA.1.fastq', expected2='paired-onlyA.2.fastq'
+ )
+
+
+def test_discard_untrimmed():
+ # issue #146
+ # the first adapter is a sequence cut out from the first read
+ run_paired('-a CTCCAGCTTAGACATATC -A XXXXXXXX --discard-untrimmed',
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='empty.fastq', expected2='empty.fastq'
+ )
+
+
+def test_discard_trimmed():
+ run_paired('-A C -O 1 --discard-trimmed', # applies everywhere
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='empty.fastq', expected2='empty.fastq'
+ )
+
+
+def test_interleaved():
+ '''single-pass interleaved paired-end with -q and -m'''
+ run_interleaved('-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90',
+ inpath='interleaved.fastq', expected='interleaved.fastq'
+ )
+
+
+@raises(SystemExit)
+def test_interleaved_no_paired_output():
+ with temporary_path("temp-paired.1.fastq") as p1:
+ with temporary_path("temp-paired.2.fastq") as p2:
+ params = '-a XX --interleaved'.split()
+ with redirect_stderr():
+ params += [ '-o', p1, '-p1', p2, 'paired.1.fastq', 'paired.2.fastq']
+ cutadapt.main(params)
+
+"""
+def test_interleaved_input_paired_output():
+ '''single-pass interleaved paired-end with -q and -m, paired output'''
+ run_interleaved2('-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90',
+ inpath='interleaved.fastq', expected1='pairedq1.fastq', expected2='pairedq2.fastq'
+ )
+"""
+
+
+def test_pair_filter():
+ run_paired('--pair-filter=both -a TTAGACATAT -A GGAGTA -m 14',
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='paired-filterboth.1.fastq', expected2='paired-filterboth.2.fastq'
+ )
+
+
+def test_too_short_paired_output():
+ with temporary_path("temp-too-short.1.fastq") as p1:
+ with temporary_path("temp-too-short.2.fastq") as p2:
+ run_paired('-a TTAGACATAT -A CAGTGGAGTA -m 14 --too-short-output '
+ '{0} --too-short-paired-output {1}'.format(p1, p2),
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='paired.1.fastq', expected2='paired.2.fastq'
+ )
+ assert files_equal(cutpath('paired-too-short.1.fastq'), p1)
+ assert files_equal(cutpath('paired-too-short.2.fastq'), p2)
+
+
+def test_too_long_output():
+ with temporary_path("temp-too-long.1.fastq") as p1:
+ with temporary_path("temp-too-long.2.fastq") as p2:
+ run_paired('-a TTAGACATAT -A CAGTGGAGTA -M 14 --too-long-output '
+ '{0} --too-long-paired-output {1}'.format(p1, p2),
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='paired-too-short.1.fastq', expected2='paired-too-short.2.fastq'
+ )
+ assert files_equal(cutpath('paired.1.fastq'), p1)
+ assert files_equal(cutpath('paired.2.fastq'), p2)
+
+
+@raises(SystemExit)
+def test_too_short_output_paired_option_missing():
+ with temporary_path("temp-too-short.1.fastq") as p1:
+ run_paired('-a TTAGACATAT -A CAGTGGAGTA -m 14 --too-short-output '
+ '{0}'.format(p1),
+ in1='paired.1.fastq', in2='paired.2.fastq',
+ expected1='paired.1.fastq', expected2='paired.2.fastq'
+ )
diff --git a/tests/testqualtrim.py b/tests/testqualtrim.py
new file mode 100644
index 0000000..173b264
--- /dev/null
+++ b/tests/testqualtrim.py
@@ -0,0 +1,14 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+
+from cutadapt.seqio import Sequence
+from cutadapt.qualtrim import nextseq_trim_index
+
+def test_nextseq_trim():
+ s = Sequence('n', '', '')
+ assert nextseq_trim_index(s, cutoff=22) == 0
+ s = Sequence('n',
+ 'TCTCGTATGCCGTCTTATGCTTGAAAAAAAAAAGGGGGGGGGGGGGGGGGNNNNNNNNNNNGGNGG',
+ 'AA//EAEE//A6///E//A//EA/EEEEEEAEA//EEEEEEEEEEEEEEE###########EE#EA'
+ )
+ assert nextseq_trim_index(s, cutoff=22) == 33
diff --git a/tests/tests.py b/tests/tests.py
new file mode 100644
index 0000000..241e169
--- /dev/null
+++ b/tests/tests.py
@@ -0,0 +1,383 @@
+# coding: utf-8
+# TODO
+# test with the --output option
+# test reading from standard input
+from __future__ import print_function, division, absolute_import
+
+import os
+import sys
+from nose.tools import raises
+from cutadapt.scripts import cutadapt
+from cutadapt.compat import StringIO
+from .utils import run, files_equal, datapath, cutpath, redirect_stderr, temporary_path
+
+def test_example():
+ run('-N -b ADAPTER', 'example.fa', 'example.fa')
+
+def test_small():
+ run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'small.fastq')
+
+def test_empty():
+ '''empty input'''
+ run('-a TTAGACATATCTCCGTCG', 'empty.fastq', 'empty.fastq')
+
+def test_newlines():
+ '''DOS/Windows newlines'''
+ run('-e 0.12 -b TTAGACATATCTCCGTCG', 'dos.fastq', 'dos.fastq')
+
+def test_lowercase():
+ '''lowercase adapter'''
+ run('-b ttagacatatctccgtcg', 'lowercase.fastq', 'small.fastq')
+
+
+def test_rest():
+ '''-r/--rest-file'''
+ with temporary_path('rest.tmp') as rest_tmp:
+ run(['-b', 'ADAPTER', '-N', '-r', rest_tmp], "rest.fa", "rest.fa")
+ assert files_equal(datapath('rest.txt'), rest_tmp)
+
+
+def test_restfront():
+ with temporary_path("rest.txt") as path:
+ run(['-g', 'ADAPTER', '-N', '-r', path], "restfront.fa", "rest.fa")
+ assert files_equal(datapath('restfront.txt'), path)
+
+
+def test_discard():
+ '''--discard'''
+ run("-b TTAGACATATCTCCGTCG --discard", "discard.fastq", "small.fastq")
+
+
+def test_discard_untrimmed():
+ '''--discard-untrimmed'''
+ run('-b CAAGAT --discard-untrimmed', 'discard-untrimmed.fastq', 'small.fastq')
+
+
+def test_plus():
+ '''test if sequence name after the "+" is retained'''
+ run("-e 0.12 -b TTAGACATATCTCCGTCG", "plus.fastq", "plus.fastq")
+
+
+def test_extensiontxtgz():
+ '''automatic recognition of "_sequence.txt.gz" extension'''
+ run("-b TTAGACATATCTCCGTCG", "s_1_sequence.txt", "s_1_sequence.txt.gz")
+
+
+def test_format():
+ '''the -f/--format parameter'''
+ run("-f fastq -b TTAGACATATCTCCGTCG", "small.fastq", "small.myownextension")
+
+
+def test_minimum_length():
+ '''-m/--minimum-length'''
+ run("-c -m 5 -a 330201030313112312", "minlen.fa", "lengths.fa")
+
+
+def test_too_short():
+ '''--too-short-output'''
+ run("-c -m 5 -a 330201030313112312 --too-short-output tooshort.tmp.fa", "minlen.fa", "lengths.fa")
+ assert files_equal(datapath('tooshort.fa'), "tooshort.tmp.fa")
+ os.remove('tooshort.tmp.fa')
+
+
+def test_too_short_no_primer():
+ '''--too-short-output and --trim-primer'''
+ run("-c -m 5 -a 330201030313112312 --trim-primer --too-short-output tooshort.tmp.fa", "minlen.noprimer.fa", "lengths.fa")
+ assert files_equal(datapath('tooshort.noprimer.fa'), "tooshort.tmp.fa")
+ os.remove('tooshort.tmp.fa')
+
+
+def test_maximum_length():
+ '''-M/--maximum-length'''
+ run("-c -M 5 -a 330201030313112312", "maxlen.fa", "lengths.fa")
+
+
+def test_too_long():
+ '''--too-long-output'''
+ run("-c -M 5 --too-long-output toolong.tmp.fa -a 330201030313112312", "maxlen.fa", "lengths.fa")
+ assert files_equal(datapath('toolong.fa'), "toolong.tmp.fa")
+ os.remove('toolong.tmp.fa')
+
+
+def test_length_tag():
+ '''454 data; -n and --length-tag'''
+ run("-n 3 -e 0.1 --length-tag length= " \
+ "-b TGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG "\
+ "-b TCCATCTCATCCCTGCGTGTCCCATCTGTTCCCTCCCTGTCTCA", '454.fa', '454.fa')
+
+def test_overlap_a():
+ '''-O/--overlap with -a (-c omitted on purpose)'''
+ run("-O 10 -a 330201030313112312 -e 0.0 -N", "overlapa.fa", "overlapa.fa")
+
+def test_overlap_b():
+ '''-O/--overlap with -b'''
+ run("-O 10 -b TTAGACATATCTCCGTCG -N", "overlapb.fa", "overlapb.fa")
+
+def test_qualtrim():
+ '''-q with low qualities'''
+ run("-q 10 -a XXXXXX", "lowqual.fastq", "lowqual.fastq")
+
+def test_qualbase():
+ '''-q with low qualities, using ascii(quality+64) encoding'''
+ run("-q 10 --quality-base 64 -a XXXXXX", "illumina64.fastq", "illumina64.fastq")
+
+def test_quality_trim_only():
+ '''only trim qualities, do not remove adapters'''
+ run("-q 10 --quality-base 64", "illumina64.fastq", "illumina64.fastq")
+
+def test_twoadapters():
+ '''two adapters'''
+ run("-a AATTTCAGGAATT -a GTTCTCTAGTTCT", "twoadapters.fasta", "twoadapters.fasta")
+
+def test_polya():
+ '''poly-A tails'''
+ run("-m 24 -O 10 -a AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "polya.fasta", "polya.fasta")
+
+def test_polya_brace_notation():
+ '''poly-A tails'''
+ run("-m 24 -O 10 -a A{35}", "polya.fasta", "polya.fasta")
+
+def test_mask_adapter():
+ '''mask adapter with N (reads maintain the same length)'''
+ run("-b CAAG -n 3 --mask-adapter", "anywhere_repeat.fastq", "anywhere_repeat.fastq")
+
+def test_gz_multiblock():
+ '''compressed gz file with multiple blocks (created by concatenating two .gz files)'''
+ run("-b TTAGACATATCTCCGTCG", "small.fastq", "multiblock.fastq.gz")
+
+def test_suffix():
+ '''-y/--suffix parameter, combined with _F3'''
+ run("-c -e 0.12 -a 1=330201030313112312 -y _my_suffix_{name} --strip-f3", "suffix.fastq", "solid.csfasta", 'solid.qual')
+
+def test_read_wildcard():
+ '''test wildcards in reads'''
+ run("--match-read-wildcards -b ACGTACGT", "wildcard.fa", "wildcard.fa")
+
+def test_adapter_wildcard():
+ '''wildcards in adapter'''
+ for adapter_type, expected in (
+ ("-a", "wildcard_adapter.fa"),
+ ("-b", "wildcard_adapter_anywhere.fa")):
+ with temporary_path("wildcardtmp.txt") as wildcardtmp:
+ run("--wildcard-file {0} {1} ACGTNNNACGT".format(wildcardtmp, adapter_type),
+ expected, "wildcard_adapter.fa")
+ with open(wildcardtmp) as wct:
+ lines = wct.readlines()
+ lines = [ line.strip() for line in lines ]
+ assert lines == ['AAA 1', 'GGG 2', 'CCC 3b', 'TTT 4b']
+
+def test_wildcard_N():
+ '''test 'N' wildcard matching with no allowed errors'''
+ run("-e 0 -a GGGGGGG --match-read-wildcards", "wildcardN.fa", "wildcardN.fa")
+
+def test_illumina_adapter_wildcard():
+ run("-a VCCGAMCYUCKHRKDCUBBCNUWNSGHCGU", "illumina.fastq", "illumina.fastq.gz")
+
+def test_adapter_front():
+ '''test adapter in front'''
+ run("--front ADAPTER -N", "examplefront.fa", "example.fa")
+
+def test_literal_N():
+ '''test matching literal 'N's'''
+ run("-N -e 0.2 -a NNNNNNNNNNNNNN", "trimN3.fasta", "trimN3.fasta")
+
+def test_literal_N2():
+ run("-N -O 1 -g NNNNNNNNNNNNNN", "trimN5.fasta", "trimN5.fasta")
+
+def test_literal_N_brace_notation():
+ '''test matching literal 'N's'''
+ run("-N -e 0.2 -a N{14}", "trimN3.fasta", "trimN3.fasta")
+
+def test_literal_N2_brace_notation():
+ run("-N -O 1 -g N{14}", "trimN5.fasta", "trimN5.fasta")
+
+def test_anchored_front():
+ run("-g ^FRONTADAPT -N", "anchored.fasta", "anchored.fasta")
+
+def test_anchored_front_ellipsis_notation():
+ run("-a FRONTADAPT... -N", "anchored.fasta", "anchored.fasta")
+
+def test_anchored_back():
+ run("-a BACKADAPTER$ -N", "anchored-back.fasta", "anchored-back.fasta")
+
+def test_anchored_back_no_indels():
+ run("-a BACKADAPTER$ -N --no-indels", "anchored-back.fasta", "anchored-back.fasta")
+
+
+def test_no_indels():
+ run('-a TTAGACATAT -g GAGATTGCCA --no-indels', 'no_indels.fasta', 'no_indels.fasta')
+
+
+def test_issue_46():
+ '''issue 46 - IndexError with --wildcard-file'''
+ with temporary_path("wildcardtmp.txt") as wildcardtmp:
+ run("--anywhere=AACGTN --wildcard-file={0}".format(wildcardtmp), "issue46.fasta", "issue46.fasta")
+
+def test_strip_suffix():
+ run("--strip-suffix _sequence -a XXXXXXX", "stripped.fasta", "simple.fasta")
+
+
+def test_info_file():
+ # The true adapter sequence in the illumina.fastq.gz data set is
+ # GCCTAACTTCTTAGACTGCCTTAAGGACGT (fourth base is different)
+ #
+ with temporary_path("infotmp.txt") as infotmp:
+ run(["--info-file", infotmp, '-a', 'adapt=GCCGAACTTCTTAGACTGCCTTAAGGACGT'], "illumina.fastq", "illumina.fastq.gz")
+ assert files_equal(cutpath('illumina.info.txt'), infotmp)
+
+
+def test_info_file_times():
+ with temporary_path("infotmp.txt") as infotmp:
+ run(["--info-file", infotmp, '--times', '2', '-a', 'adapt=GCCGAACTTCTTA', '-a', 'adapt2=GACTGCCTTAAGGACGT'], "illumina5.fastq", "illumina5.fastq")
+ assert files_equal(cutpath('illumina5.info.txt'), infotmp)
+
+
+def test_info_file_fasta():
+ with temporary_path("infotmp.txt") as infotmp:
+ # Just make sure that it runs
+ run(['--info-file', infotmp, '-a', 'TTAGACATAT', '-g', 'GAGATTGCCA', '--no-indels'], 'no_indels.fasta', 'no_indels.fasta')
+
+
+def test_named_adapter():
+ run("-a MY_ADAPTER=GCCGAACTTCTTAGACTGCCTTAAGGACGT", "illumina.fastq", "illumina.fastq.gz")
+
+
+def test_adapter_with_U():
+ run("-a GCCGAACUUCUUAGACUGCCUUAAGGACGU", "illumina.fastq", "illumina.fastq.gz")
+
+
+def test_no_trim():
+ ''' --no-trim '''
+ run("--no-trim --discard-untrimmed -a CCCTAGTTAAAC", 'no-trim.fastq', 'small.fastq')
+
+
+def test_bzip2():
+ '''test bzip2 support'''
+ run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'small.fastq.bz2')
+
+
+try:
+ import lzma
+
+ def test_xz():
+ '''test xz support'''
+ run('-b TTAGACATATCTCCGTCG', 'small.fastq', 'small.fastq.xz')
+except ImportError:
+ pass
+
+
+@raises(SystemExit)
+def test_qualfile_only():
+ with redirect_stderr():
+ cutadapt.main(['file.qual'])
+
+
+@raises(SystemExit)
+def test_no_args():
+ with redirect_stderr():
+ cutadapt.main([])
+
+
+@raises(SystemExit)
+def test_two_fastqs():
+ with redirect_stderr():
+ cutadapt.main([datapath('paired.1.fastq'), datapath('paired.2.fastq')])
+
+
+def test_anchored_no_indels():
+ '''anchored 5' adapter, mismatches only (no indels)'''
+ run('-g ^TTAGACATAT --no-indels -e 0.1', 'anchored_no_indels.fasta', 'anchored_no_indels.fasta')
+
+
+def test_anchored_no_indels_wildcard_read():
+ '''anchored 5' adapter, mismatches only (no indels), but wildcards in the read count as matches'''
+ run('-g ^TTAGACATAT --match-read-wildcards --no-indels -e 0.1', 'anchored_no_indels_wildcard.fasta', 'anchored_no_indels.fasta')
+
+
+def test_anchored_no_indels_wildcard_adapt():
+ '''anchored 5' adapter, mismatches only (no indels), but wildcards in the adapter count as matches'''
+ run('-g ^TTAGACANAT --no-indels -e 0.1', 'anchored_no_indels.fasta', 'anchored_no_indels.fasta')
+
+
+def test_unconditional_cut_front():
+ run('-u 5', 'unconditional-front.fastq', 'small.fastq')
+
+
+def test_unconditional_cut_back():
+ run('-u -5', 'unconditional-back.fastq', 'small.fastq')
+
+
+def test_unconditional_cut_both():
+ run('-u -5 -u 5', 'unconditional-both.fastq', 'small.fastq')
+
+
+def test_untrimmed_output():
+ with temporary_path('untrimmed.tmp.fastq') as tmp:
+ run(['-a', 'TTAGACATATCTCCGTCG', '--untrimmed-output', tmp], 'small.trimmed.fastq', 'small.fastq')
+ assert files_equal(cutpath('small.untrimmed.fastq'), tmp)
+
+
+def test_adapter_file():
+ run('-a file:' + datapath('adapter.fasta'), 'illumina.fastq', 'illumina.fastq.gz')
+
+def test_adapter_file_5p_anchored():
+ run('-N -g file:' + datapath('prefix-adapter.fasta'), 'anchored.fasta', 'anchored.fasta')
+
+def test_adapter_file_3p_anchored():
+ run('-N -a file:' + datapath('suffix-adapter.fasta'), 'anchored-back.fasta', 'anchored-back.fasta')
+
+
+def test_adapter_file_5p_anchored_no_indels():
+ run('-N --no-indels -g file:' + datapath('prefix-adapter.fasta'), 'anchored.fasta', 'anchored.fasta')
+
+
+def test_adapter_file_3p_anchored_no_indels():
+ run('-N --no-indels -a file:' + datapath('suffix-adapter.fasta'), 'anchored-back.fasta', 'anchored-back.fasta')
+
+
+def test_demultiplex():
+ multiout = os.path.join(os.path.dirname(__file__), 'data', 'tmp-demulti.{name}.fasta')
+ params = ['-a', 'first=AATTTCAGGAATT', '-a', 'second=GTTCTCTAGTTCT', '-o', multiout, datapath('twoadapters.fasta')]
+ assert cutadapt.main(params) is None
+ assert files_equal(cutpath('twoadapters.first.fasta'), multiout.format(name='first'))
+ assert files_equal(cutpath('twoadapters.second.fasta'), multiout.format(name='second'))
+ assert files_equal(cutpath('twoadapters.unknown.fasta'), multiout.format(name='unknown'))
+ os.remove(multiout.format(name='first'))
+ os.remove(multiout.format(name='second'))
+ os.remove(multiout.format(name='unknown'))
+
+
+def test_max_n():
+ run('--max-n 0', 'maxn0.fasta', 'maxn.fasta')
+ run('--max-n 1', 'maxn1.fasta', 'maxn.fasta')
+ run('--max-n 2', 'maxn2.fasta', 'maxn.fasta')
+ run('--max-n 0.2', 'maxn0.2.fasta', 'maxn.fasta')
+ run('--max-n 0.4', 'maxn0.4.fasta', 'maxn.fasta')
+
+
+def test_quiet_is_quiet():
+ captured_standard_output = StringIO()
+ captured_standard_error = StringIO()
+ try:
+ old_stdout = sys.stdout
+ old_stderr = sys.stderr
+ sys.stdout = captured_standard_output
+ sys.stderr = captured_standard_error
+ cutadapt.main(['-o', '/dev/null', '--quiet', '-a', 'XXXX', datapath('illumina.fastq.gz')])
+ finally:
+ sys.stdout = old_stdout
+ sys.stderr = old_stderr
+ assert captured_standard_output.getvalue() == ''
+ assert captured_standard_error.getvalue() == ''
+
+
+def test_nextseq():
+ run('--nextseq-trim 22', 'nextseq.fastq', 'nextseq.fastq')
+
+
+def test_linked():
+ run('-a AAAAAAAAAA...TTTTTTTTTT', 'linked.fasta', 'linked.fasta')
+
+
+def test_fasta():
+ run('-a TTAGACATATCTCCGTCG', 'small.fasta', 'small.fastq')
diff --git a/tests/testseqio.py b/tests/testseqio.py
new file mode 100644
index 0000000..ef8b0b6
--- /dev/null
+++ b/tests/testseqio.py
@@ -0,0 +1,352 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+
+import sys
+import os
+import shutil
+from textwrap import dedent
+from nose.tools import raises
+from tempfile import mkdtemp
+from cutadapt.seqio import (Sequence, ColorspaceSequence, FormatError,
+ FastaReader, FastqReader, FastaQualReader, InterleavedSequenceReader,
+ FastaWriter, FastqWriter, InterleavedSequenceWriter, open as openseq,
+ sequence_names_match)
+from cutadapt.compat import StringIO
+
+
+# files tests/data/simple.fast{q,a}
+simple_fastq = [
+ Sequence("first_sequence", "SEQUENCE1", ":6;;8<=:<"),
+ Sequence("second_sequence", "SEQUENCE2", "83<??:(61")
+ ]
+
+simple_fasta = [ Sequence(x.name, x.sequence, None) for x in simple_fastq ]
+
+
+class TestSequence:
+ @raises(FormatError)
+ def test_too_many_qualities(self):
+ Sequence(name="name", sequence="ACGT", qualities="#####")
+
+ @raises(FormatError)
+ def test_too_many_qualities_colorspace(self):
+ ColorspaceSequence(name="name", sequence="T0123", qualities="#####")
+
+ @raises(FormatError)
+ def test_invalid_primer(self):
+ ColorspaceSequence(name="name", sequence="K0123", qualities="####")
+
+
+class TestFastaReader:
+ def test(self):
+ with FastaReader("tests/data/simple.fasta") as f:
+ reads = list(f)
+ assert reads == simple_fasta
+
+ fasta = StringIO(">first_sequence\nSEQUENCE1\n>second_sequence\nSEQUENCE2\n")
+ reads = list(FastaReader(fasta))
+ assert reads == simple_fasta
+
+ def test_with_comments(self):
+ fasta = StringIO(dedent(
+ """
+ # a comment
+ # another one
+ >first_sequence
+ SEQUENCE1
+ >second_sequence
+ SEQUENCE2
+ """))
+ reads = list(FastaReader(fasta))
+ assert reads == simple_fasta
+
+ @raises(FormatError)
+ def test_wrong_format(self):
+ fasta = StringIO(dedent(
+ """
+ # a comment
+ # another one
+ unexpected
+ >first_sequence
+ SEQUENCE1
+ >second_sequence
+ SEQUENCE2
+ """))
+ reads = list(FastaReader(fasta))
+
+ def test_fastareader_keeplinebreaks(self):
+ with FastaReader("tests/data/simple.fasta", keep_linebreaks=True) as f:
+ reads = list(f)
+ assert reads[0] == simple_fasta[0]
+ assert reads[1].sequence == 'SEQUEN\nCE2'
+
+ def test_context_manager(self):
+ filename = "tests/data/simple.fasta"
+ with open(filename) as f:
+ assert not f.closed
+ reads = list(openseq(f))
+ assert not f.closed
+ assert f.closed
+
+ with FastaReader(filename) as sr:
+ tmp_sr = sr
+ assert not sr._file.closed
+ reads = list(sr)
+ assert not sr._file.closed
+ assert tmp_sr._file is None
+ # Open it a second time
+ with FastaReader(filename) as sr:
+ pass
+
+
+class TestFastqReader:
+ def test_fastqreader(self):
+ with FastqReader("tests/data/simple.fastq") as f:
+ reads = list(f)
+ assert reads == simple_fastq
+
+ def test_fastqreader_dos(self):
+ with FastqReader("tests/data/dos.fastq") as f:
+ dos_reads = list(f)
+ with FastqReader("tests/data/small.fastq") as f:
+ unix_reads = list(f)
+ assert dos_reads == unix_reads
+
+ @raises(FormatError)
+ def test_fastq_wrongformat(self):
+ with FastqReader("tests/data/withplus.fastq") as f:
+ reads = list(f)
+
+ @raises(FormatError)
+ def test_fastq_incomplete(self):
+ fastq = StringIO("@name\nACGT+\n")
+ with FastqReader(fastq) as fq:
+ list(fq)
+
+ def test_context_manager(self):
+ filename = "tests/data/simple.fastq"
+ with open(filename) as f:
+ assert not f.closed
+ reads = list(openseq(f))
+ assert not f.closed
+ assert f.closed
+
+ with FastqReader(filename) as sr:
+ tmp_sr = sr
+ assert not sr._file.closed
+ reads = list(sr)
+ assert not sr._file.closed
+ assert tmp_sr._file is None
+
+
+class TestFastaQualReader:
+ @raises(FormatError)
+ def test_mismatching_read_names(self):
+ fasta = StringIO(">name\nACG")
+ qual = StringIO(">nome\n3 5 7")
+ list(FastaQualReader(fasta, qual))
+
+ @raises(FormatError)
+ def test_invalid_quality_value(self):
+ fasta = StringIO(">name\nACG")
+ qual = StringIO(">name\n3 xx 7")
+ list(FastaQualReader(fasta, qual))
+
+
+class TestSeqioOpen:
+ def setup(self):
+ self._tmpdir = mkdtemp()
+
+ def teardown(self):
+ shutil.rmtree(self._tmpdir)
+
+ def test_sequence_reader(self):
+ # test the autodetection
+ with openseq("tests/data/simple.fastq") as f:
+ reads = list(f)
+ assert reads == simple_fastq
+
+ with openseq("tests/data/simple.fasta") as f:
+ reads = list(f)
+ assert reads == simple_fasta
+
+ with open("tests/data/simple.fastq") as f:
+ reads = list(openseq(f))
+ assert reads == simple_fastq
+
+ # make the name attribute unavailable
+ f = StringIO(open("tests/data/simple.fastq").read())
+ reads = list(openseq(f))
+ assert reads == simple_fastq
+
+ f = StringIO(open("tests/data/simple.fasta").read())
+ reads = list(openseq(f))
+ assert reads == simple_fasta
+
+ def test_autodetect_fasta_format(self):
+ path = os.path.join(self._tmpdir, 'tmp.fasta')
+ with openseq(path, mode='w') as f:
+ assert isinstance(f, FastaWriter)
+ for seq in simple_fastq:
+ f.write(seq)
+ assert list(openseq(path)) == simple_fasta
+
+ def test_write_qualities_to_fasta(self):
+ path = os.path.join(self._tmpdir, 'tmp.fasta')
+ with openseq(path, mode='w', qualities=True) as f:
+ assert isinstance(f, FastaWriter)
+ for seq in simple_fastq:
+ f.write(seq)
+ assert list(openseq(path)) == simple_fasta
+
+ def test_autodetect_fastq_format(self):
+ path = os.path.join(self._tmpdir, 'tmp.fastq')
+ with openseq(path, mode='w') as f:
+ assert isinstance(f, FastqWriter)
+ for seq in simple_fastq:
+ f.write(seq)
+ assert list(openseq(path)) == simple_fastq
+
+ @raises(ValueError)
+ def test_fastq_qualities_missing(self):
+ path = os.path.join(self._tmpdir, 'tmp.fastq')
+ openseq(path, mode='w', qualities=False)
+
+
+class TestInterleavedReader:
+ def test(self):
+ expected = [
+ (Sequence('read1/1 some text', 'TTATTTGTCTCCAGC', '##HHHHHHHHHHHHH'),
+ Sequence('read1/2 other text', 'GCTGGAGACAAATAA', 'HHHHHHHHHHHHHHH')),
+ (Sequence('read3/1', 'CCAACTTGATATTAATAACA', 'HHHHHHHHHHHHHHHHHHHH'),
+ Sequence('read3/2', 'TGTTATTAATATCAAGTTGG', '#HHHHHHHHHHHHHHHHHHH'))
+ ]
+ reads = list(InterleavedSequenceReader("tests/cut/interleaved.fastq"))
+ for (r1, r2), (e1, e2) in zip(reads, expected):
+ print(r1, r2, e1, e2)
+
+ assert reads == expected
+ with openseq("tests/cut/interleaved.fastq", interleaved=True) as f:
+ reads = list(f)
+ assert reads == expected
+
+ @raises(FormatError)
+ def test_missing_partner(self):
+ s = StringIO('@r1\nACG\n+\nHHH')
+ list(InterleavedSequenceReader(s))
+
+ @raises(FormatError)
+ def test_incorrectly_paired(self):
+ s = StringIO('@r1/1\nACG\n+\nHHH\n@wrong_name\nTTT\n+\nHHH')
+ list(InterleavedSequenceReader(s))
+
+
+class TestFastaWriter:
+ def setup(self):
+ self._tmpdir = mkdtemp()
+ self.path = os.path.join(self._tmpdir, 'tmp.fasta')
+
+ def teardown(self):
+ shutil.rmtree(self._tmpdir)
+
+ def test(self):
+ with FastaWriter(self.path) as fw:
+ fw.write("name", "CCATA")
+ fw.write("name2", "HELLO")
+ assert fw._file.closed
+ with open(self.path) as t:
+ assert t.read() == '>name\nCCATA\n>name2\nHELLO\n'
+
+ def test_linelength(self):
+ with FastaWriter(self.path, line_length=3) as fw:
+ fw.write("r1", "ACG")
+ fw.write("r2", "CCAT")
+ fw.write("r3", "TACCAG")
+ assert fw._file.closed
+ with open(self.path) as t:
+ d = t.read()
+ assert d == '>r1\nACG\n>r2\nCCA\nT\n>r3\nTAC\nCAG\n'
+
+ def test_write_sequence_object(self):
+ with FastaWriter(self.path) as fw:
+ fw.write(Sequence("name", "CCATA"))
+ fw.write(Sequence("name2", "HELLO"))
+ assert fw._file.closed
+ with open(self.path) as t:
+ assert t.read() == '>name\nCCATA\n>name2\nHELLO\n'
+
+ def test_write_to_file_like_object(self):
+ sio = StringIO()
+ with FastaWriter(sio) as fw:
+ fw.write(Sequence("name", "CCATA"))
+ fw.write(Sequence("name2", "HELLO"))
+ assert sio.getvalue() == '>name\nCCATA\n>name2\nHELLO\n'
+ assert not fw._file.closed
+
+ def test_write_zero_length_sequence(self):
+ sio = StringIO()
+ with FastaWriter(sio) as fw:
+ fw.write(Sequence("name", ""))
+ assert sio.getvalue() == '>name\n\n', '{0!r}'.format(sio.getvalue())
+
+
+class TestFastqWriter:
+ def setup(self):
+ self._tmpdir = mkdtemp()
+ self.path = os.path.join(self._tmpdir, 'tmp.fastq')
+
+ def teardown(self):
+ shutil.rmtree(self._tmpdir)
+
+ def test(self):
+ with FastqWriter(self.path) as fq:
+ fq.writeseq("name", "CCATA", "!#!#!")
+ fq.writeseq("name2", "HELLO", "&&&!&&")
+ assert fq._file.closed
+ with open(self.path) as t:
+ assert t.read() == '@name\nCCATA\n+\n!#!#!\n@name2\nHELLO\n+\n&&&!&&\n'
+
+ def test_twoheaders(self):
+ with FastqWriter(self.path) as fq:
+ fq.write(Sequence("name", "CCATA", "!#!#!", name2="name"))
+ fq.write(Sequence("name2", "HELLO", "&&&!&", name2="name2"))
+ assert fq._file.closed
+ with open(self.path) as t:
+ assert t.read() == '@name\nCCATA\n+name\n!#!#!\n@name2\nHELLO\n+name2\n&&&!&\n'
+
+ def test_write_to_file_like_object(self):
+ sio = StringIO()
+ with FastqWriter(sio) as fq:
+ fq.writeseq("name", "CCATA", "!#!#!")
+ fq.writeseq("name2", "HELLO", "&&&!&&")
+ assert sio.getvalue() == '@name\nCCATA\n+\n!#!#!\n@name2\nHELLO\n+\n&&&!&&\n'
+
+
+class TestInterleavedWriter:
+ def test(self):
+ reads = [
+ (Sequence('A/1 comment', 'TTA', '##H'),
+ Sequence('A/2 comment', 'GCT', 'HH#')),
+ (Sequence('B/1', 'CC', 'HH'),
+ Sequence('B/2', 'TG', '#H'))
+ ]
+ sio = StringIO()
+ with InterleavedSequenceWriter(sio) as writer:
+ for read1, read2 in reads:
+ writer.write(read1, read2)
+ assert sio.getvalue() == '@A/1 comment\nTTA\n+\n##H\n@A/2 comment\nGCT\n+\nHH#\n@B/1\nCC\n+\nHH\n@B/2\nTG\n+\n#H\n'
+
+
+class TestPairedSequenceReader:
+ def test_sequence_names_match(self):
+ def match(name1, name2):
+ seq1 = Sequence(name1, 'ACGT')
+ seq2 = Sequence(name2, 'AACC')
+ return sequence_names_match(seq1, seq2)
+
+ assert match('abc', 'abc')
+ assert match('abc/1', 'abc/2')
+ assert match('abc.1', 'abc.2')
+ assert match('abc1', 'abc2')
+ assert not match('abc', 'xyz')
+
diff --git a/tests/testtrim.py b/tests/testtrim.py
new file mode 100644
index 0000000..09c3102
--- /dev/null
+++ b/tests/testtrim.py
@@ -0,0 +1,27 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+
+from cutadapt.seqio import ColorspaceSequence, Sequence
+from cutadapt.adapters import Adapter, ColorspaceAdapter, PREFIX, BACK
+from cutadapt.scripts.cutadapt import AdapterCutter
+
+def test_cs_5p():
+    # Smoke test: running an AdapterCutter holding a PREFIX colorspace
+    # adapter over a colorspace read must not raise. The result is
+    # deliberately not inspected.
+    cs_read = ColorspaceSequence("name", "0123", "DEFG", "T")
+    cutter = AdapterCutter([ColorspaceAdapter("CG", PREFIX, 0.1)])
+    cutter(cs_read)
+
+
+def test_statistics():
+    # After trimming, the per-adapter length histograms must not account
+    # for more bases than the read contained in the first place.
+    read = Sequence('name', 'AAAACCCCAAAA')
+    adapters = [Adapter('CCCC', BACK, 0.1)]
+    AdapterCutter(adapters, times=3)(read)
+    # TODO make this a lot simpler
+    # lengths_front / lengths_back map a removed length to how often it
+    # occurred, so length * count summed over both is the total removed.
+    trimmed_bp = sum(
+        seqlen * count
+        for adapter in adapters
+        for histogram in (adapter.lengths_front, adapter.lengths_back)
+        for (seqlen, count) in histogram.items()
+    )
+    assert trimmed_bp <= len(read), trimmed_bp
diff --git a/tests/testxopen.py b/tests/testxopen.py
new file mode 100644
index 0000000..2d714c4
--- /dev/null
+++ b/tests/testxopen.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+import gzip
+import os
+import random
+import sys
+from nose.tools import raises
+from cutadapt.xopen import xopen, lzma
+from .utils import temporary_path
+
+# One sample FASTQ file, stored uncompressed and in each supported
+# compression format. The .xz copy is only used when lzma is importable.
+base = "tests/data/small.fastq"
+extensions = ['', '.gz', '.bz2']
+if lzma is not None:
+    extensions.append('.xz')
+files = [base + ext for ext in extensions]
+
+def test_context_manager():
+    # Py26 compression libraries do not support context manager protocol,
+    # so there is nothing to check on that interpreter.
+    if sys.version_info[0:2] == (2, 6):
+        return
+    for name in files:
+        with xopen(name, 'rt') as f:
+            lines = list(f)
+            assert len(lines) == 12
+            assert lines[5] == 'AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT\n', name
+            # Explicit close kept as in the original test; the with block
+            # closes the file as well.
+            f.close()
+
+def test_append():
+    # Opening the same path twice in 'a' mode must concatenate both writes.
+    for ext in ["", ".gz"]:  # BZ2 does NOT support append
+        # On Py3, need to send BYTES, not unicode, to the compressed writer
+        text = "AB" if ext == "" else "AB".encode("utf-8")
+        reference = text + text
+        print("Trying ext=%s" % ext)
+        with temporary_path('truncated.fastq' + ext) as path:
+            # Start from a clean slate; a missing file is fine.
+            try:
+                os.unlink(path)
+            except OSError:
+                pass
+            for _ in range(2):
+                with xopen(path, 'a') as f:
+                    f.write(text)
+            with xopen(path, 'r') as f:
+                # Keep only the last line read back from the file.
+                for appended in f:
+                    pass
+                # Bring the reference to text if it is still bytes; objects
+                # without .decode() are left untouched.
+                try:
+                    reference = reference.decode("utf-8")
+                except AttributeError:
+                    pass
+                print(appended)
+                print(reference)
+                assert appended == reference
+
+def test_xopen_text():
+    # Every sample file, whatever its compression, must read back as the
+    # same 12 text lines.
+    for name in files:
+        f = xopen(name, 'rt')
+        # try/finally rather than a with block: the objects xopen returns
+        # are not context managers on every supported Python (e.g. Py26).
+        # Closing before asserting means a failing assertion cannot leak
+        # the handle.
+        try:
+            lines = list(f)
+        finally:
+            f.close()
+        assert len(lines) == 12
+        assert lines[5] == 'AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT\n', name
+
+
+def test_xopen_binary():
+    # Every sample file, whatever its compression, must read back as the
+    # same 12 lines of bytes.
+    for name in files:
+        f = xopen(name, 'rb')
+        # try/finally rather than a with block: the objects xopen returns
+        # are not context managers on every supported Python (e.g. Py26).
+        # Closing before asserting means a failing assertion cannot leak
+        # the handle.
+        try:
+            lines = list(f)
+        finally:
+            f.close()
+        assert len(lines) == 12
+        assert lines[5] == b'AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT\n', name
+
+
+def create_truncated_file(path):
+    """Write 200 random uppercase letters to *path* through xopen, then
+    cut the last 10 bytes off the resulting file on disk.
+    """
+    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    letters = [random.choice(alphabet) for _ in range(200)]
+    writer = xopen(path, 'w')
+    writer.write(''.join(letters))
+    writer.close()
+    # Re-open with the builtin open() so that the truncation is applied to
+    # the file exactly as it is stored on disk.
+    raw = open(path, 'a')
+    raw.truncate(os.stat(path).st_size - 10)
+    raw.close()
+
+
+# These tests are not defined on Python 3.2 and 3.3
+if sys.version_info[:2] < (3, 2) or sys.version_info[:2] > (3, 3):
+    @raises(EOFError)
+    def test_truncated_gz():
+        # Reading a gzip file whose tail was cut off in one go must fail
+        # with EOFError.
+        with temporary_path('truncated.gz') as truncated:
+            create_truncated_file(truncated)
+            handle = xopen(truncated, 'r')
+            handle.read()
+            handle.close()
+
+
+    @raises(EOFError)
+    def test_truncated_gz_iter():
+        # Same as above, but consuming the file line by line.
+        with temporary_path('truncated.gz') as truncated:
+            create_truncated_file(truncated)
+            handle = xopen(truncated, 'r')
+            for _line in handle:
+                pass
+            handle.close()
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..473e598
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import print_function, division, absolute_import
+
+import sys, os
+from contextlib import contextmanager
+from cutadapt.scripts import cutadapt
+
+@contextmanager
+def redirect_stderr():
+    """Send stderr to stdout. Nose doesn't capture stderr, yet.
+
+    While the with block runs, sys.stderr refers to whatever sys.stdout
+    was on entry. The previous sys.stderr is put back when the block is
+    left, including when it is left through an exception.
+    """
+    old_stderr = sys.stderr
+    sys.stderr = sys.stdout
+    try:
+        yield
+    finally:
+        # Without the finally, a failing with-body would leave stderr
+        # redirected for everything that runs afterwards.
+        sys.stderr = old_stderr
+
+
+@contextmanager
+def temporary_path(name):
+    """Yield a path called *name* inside the 'testtmp' directory next to
+    this module and delete that file when the with block is left.
+
+    The directory is created on first use. Nothing is created at the
+    yielded path itself; the with-body is expected to do that. Cleanup
+    also happens when the with-body raises, and a file that was never
+    created is silently skipped.
+    """
+    directory = os.path.join(os.path.dirname(__file__), 'testtmp')
+    if not os.path.isdir(directory):
+        os.mkdir(directory)
+    path = os.path.join(directory, name)
+    try:
+        yield path
+    finally:
+        # Guarded so that a missing file cannot raise from the cleanup and
+        # hide the exception that aborted the with-body.
+        if os.path.exists(path):
+            os.remove(path)
+
+
+def datapath(path):
+    """Return *path* placed inside the 'data' directory that sits next to
+    this module.
+    """
+    tests_dir = os.path.dirname(__file__)
+    return os.path.join(tests_dir, 'data', path)
+
+
+def cutpath(path):
+    """Return *path* placed inside the 'cut' directory that sits next to
+    this module.
+    """
+    tests_dir = os.path.dirname(__file__)
+    return os.path.join(tests_dir, 'cut', path)
+
+
+def files_equal(path1, path2):
+    """Return True if ``diff -u path1 path2`` exits with status 0.
+
+    diff's own output is not captured, so a unified diff of any
+    difference ends up on standard output.
+    """
+    # Local import: subprocess is not among this module's top-level imports.
+    import subprocess
+    # Argument list instead of a formatted shell string, so paths that
+    # contain spaces or shell metacharacters are passed through verbatim.
+    return subprocess.call(['diff', '-u', path1, path2]) == 0
+
+
+def run(params, expected, inpath, inpath2=None):
+    """Run cutadapt on test data and compare its output with a reference.
+
+    params   -- command-line options, either as one whitespace-separated
+                string or as a sequence of arguments (left unmodified)
+    expected -- name of the reference file in the 'cut' directory; also
+                used as the name of the temporary output file
+    inpath   -- input file name, relative to the 'data' directory
+    inpath2  -- optional second input file, relative to 'data'
+
+    Fails with AssertionError if cutadapt.main() returns anything other
+    than None or if the output differs from the reference file.
+    """
+    if isinstance(params, str):
+        args = params.split()
+    else:
+        # Work on a copy: appending to the caller's list would leak the
+        # output and input paths of this run into the caller's data.
+        args = list(params)
+    with temporary_path(expected) as tmp_fastaq:
+        args += ['-o', tmp_fastaq]  # TODO not parallelizable
+        args.append(datapath(inpath))
+        if inpath2:
+            args.append(datapath(inpath2))
+        assert cutadapt.main(args) is None
+        # TODO redirect standard output
+        assert files_equal(cutpath(expected), tmp_fastaq)
+    # TODO diff log files
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..43c4de1
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,6 @@
+[tox]
+envlist = py26,py27,py33,py34,py35
+
+[testenv]
+deps = nose
+commands = nosetests -P tests
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/cutadapt.git
More information about the debian-med-commit
mailing list