[med-svn] [fastaq] 01/01: Imported Upstream version 3.2.0
Andreas Tille
tille at debian.org
Mon Mar 2 21:28:15 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch upstream
in repository fastaq.
commit 0dfa38afe333a863165629cc55b2f3d68452992c
Author: Andreas Tille <tille at debian.org>
Date: Mon Mar 2 22:11:13 2015 +0100
Imported Upstream version 3.2.0
---
MANIFEST.in | 2 +-
README.md | 99 ++++++++++++---
fastaq/__init__.py | 2 -
pyfastaq/__init__.py | 11 ++
pyfastaq/caf.py | 79 ++++++++++++
pyfastaq/common.py | 1 +
pyfastaq/genetic_codes.py | 139 +++++++++++++++++++++
{fastaq => pyfastaq}/intervals.py | 0
.../runners/__init__.py | 0
pyfastaq/runners/add_indels.py | 93 ++++++++++++++
pyfastaq/runners/caf_to_fastq.py | 19 +++
pyfastaq/runners/capillary_to_pairs.py | 12 ++
pyfastaq/runners/chunker.py | 30 +++++
pyfastaq/runners/count_sequences.py | 10 ++
pyfastaq/runners/deinterleave.py | 13 ++
pyfastaq/runners/enumerate_names.py | 20 +++
pyfastaq/runners/expand_nucleotides.py | 14 +++
pyfastaq/runners/fasta_to_fastq.py | 12 ++
pyfastaq/runners/filter.py | 32 +++++
pyfastaq/runners/get_ids.py | 11 ++
pyfastaq/runners/get_seq_flanking_gaps.py | 13 ++
pyfastaq/runners/interleave.py | 12 ++
pyfastaq/runners/long_read_simulate.py | 49 ++++++++
pyfastaq/runners/make_random_contigs.py | 24 ++++
pyfastaq/runners/merge.py | 16 +++
pyfastaq/runners/replace_bases.py | 13 ++
pyfastaq/runners/reverse_complement.py | 11 ++
pyfastaq/runners/scaffolds_to_contigs.py | 12 ++
pyfastaq/runners/search_for_seq.py | 12 ++
pyfastaq/runners/sequence_trim.py | 24 ++++
pyfastaq/runners/sort_by_size.py | 16 +++
pyfastaq/runners/split_by_base_count.py | 14 +++
pyfastaq/runners/strip_illumina_suffix.py | 11 ++
pyfastaq/runners/to_fake_qual.py | 17 +++
pyfastaq/runners/to_fasta.py | 20 +++
pyfastaq/runners/to_mira_xml.py | 11 ++
pyfastaq/runners/to_orfs_gff.py | 12 ++
pyfastaq/runners/to_perfect_reads.py | 85 +++++++++++++
pyfastaq/runners/to_random_subset.py | 35 ++++++
pyfastaq/runners/to_tiling_bam.py | 77 ++++++++++++
pyfastaq/runners/to_unique_by_id.py | 11 ++
pyfastaq/runners/translate.py | 12 ++
pyfastaq/runners/trim_Ns_at_end.py | 11 ++
pyfastaq/runners/trim_contigs.py | 12 ++
pyfastaq/runners/trim_ends.py | 13 ++
pyfastaq/runners/version.py | 4 +
{fastaq => pyfastaq}/sequences.py | 77 ++----------
{fastaq => pyfastaq}/tasks.py | 123 ++++++++++++++++--
pyfastaq/tests/caf_test.py | 47 +++++++
pyfastaq/tests/data/caf_test.caf | 48 +++++++
.../data/caf_test.to_fastq.no_trim.min_length_0.fq | 8 ++
.../data/caf_test.to_fastq.trim.min_length_6.fq | 4 +
.../tests/data/sequences_test.embl | 0
.../tests/data/sequences_test.embl.bad | 0
.../tests/data/sequences_test.embl.bad2 | 0
.../tests/data/sequences_test.embl.to_fasta | 0
{fastaq => pyfastaq}/tests/data/sequences_test.fa | 0
.../tests/data/sequences_test.fa.ids | 0
.../tests/data/sequences_test.fa.qual | 0
.../tests/data/sequences_test.fa.qual.bad | 0
.../tests/data/sequences_test.fasta_to_fastq.fq | 0
{fastaq => pyfastaq}/tests/data/sequences_test.gbk | 0
.../tests/data/sequences_test.gbk.to_fasta | 0
.../tests/data/sequences_test.line_length3.fa | 0
.../tests/data/sequences_test_3-per-line.fa | 0
.../tests/data/sequences_test_cap_to_read_pairs.fa | 0
.../sequences_test_cap_to_read_pairs.fa.paired.gz | Bin
...sequences_test_cap_to_read_pairs.fa.unpaired.gz | Bin
.../tests/data/sequences_test_deinterleaved_1.fa | 0
.../tests/data/sequences_test_deinterleaved_2.fa | 0
.../data/sequences_test_deinterleaved_bad2_1.fa | 0
.../data/sequences_test_deinterleaved_bad2_2.fa | 0
.../data/sequences_test_deinterleaved_bad_1.fa | 0
.../data/sequences_test_deinterleaved_bad_2.fa | 0
.../tests/data/sequences_test_empty_file | 0
.../tests/data/sequences_test_enumerate_names.fa | 0
...equences_test_enumerate_names.fa.out.add_suffix | 8 ++
...quences_test_enumerate_names.fa.out.keep_suffix | 0
.../sequences_test_enumerate_names.fa.out.start.1 | 0
...test_enumerate_names.fa.out.start.1.rename_file | 0
.../sequences_test_enumerate_names.fa.out.start.2 | 0
.../tests/data/sequences_test_fai_test.fa | 0
.../tests/data/sequences_test_fai_test.fa.fai | 0
.../tests/data/sequences_test_fail_no_AT.fq | 0
.../tests/data/sequences_test_fail_no_plus.fq | 0
.../tests/data/sequences_test_fail_no_qual.fq | 0
.../tests/data/sequences_test_fail_no_seq.fq | 0
...sequences_test_fastaq_replace_bases.expected.fa | 0
.../data/sequences_test_fastaq_replace_bases.fa | 0
.../data/sequences_test_filter_by_ids_file.fa | 0
.../sequences_test_filter_by_ids_file.fa.filtered | 0
...nces_test_filter_by_ids_file.fa.filtered.invert | 0
.../data/sequences_test_filter_by_ids_file.fa.ids | 0
.../tests/data/sequences_test_filter_by_regex.fa | 0
.../sequences_test_filter_by_regex.first-char-a.fa | 0
...sequences_test_filter_by_regex.first-of-pair.fa | 0
.../data/sequences_test_filter_by_regex.numeric.fa | 0
.../data/sequences_test_get_seqs_flanking_gaps.fa | 0
.../sequences_test_get_seqs_flanking_gaps.fa.out | 0
.../tests/data/sequences_test_gffv3.gff | 0
.../tests/data/sequences_test_gffv3.gff.fasta | 0
.../tests/data/sequences_test_gffv3.gff.to_fasta | 0
.../data/sequences_test_gffv3.no_FASTA_line.gff | 0
...sequences_test_gffv3.no_FASTA_line.gff.to_fasta | 0
.../tests/data/sequences_test_gffv3.no_seq.2.gff | 0
.../tests/data/sequences_test_gffv3.no_seq.gff | 0
.../tests/data/sequences_test_good_file.fq | 0
.../data/sequences_test_good_file.fq.to_fasta | 0
.../tests/data/sequences_test_good_file_mira.xml | 0
.../tests/data/sequences_test_interleaved.fa | 0
.../tests/data/sequences_test_interleaved.fq | 0
.../tests/data/sequences_test_interleaved_bad.fa | 0
.../tests/data/sequences_test_length_filter.fa | 0
.../sequences_test_length_filter.min-0.max-1.fa | 0
.../sequences_test_length_filter.min-0.max-inf.fa | 0
.../sequences_test_length_filter.min-4.max-4.fa | 0
.../sequences_test_make_random_contigs.default.fa | 0
.../sequences_test_make_random_contigs.first-42.fa | 0
...ces_test_make_random_contigs.name-by-letters.fa | 0
.../sequences_test_make_random_contigs.prefix-p.fa | 0
.../tests/data/sequences_test_merge_to_one_seq.fa | 0
.../tests/data/sequences_test_merge_to_one_seq.fq | 0
.../data/sequences_test_merge_to_one_seq.merged.fa | 0
.../data/sequences_test_merge_to_one_seq.merged.fq | 0
.../tests/data/sequences_test_not_a_fastaq_file | 0
.../tests/data/sequences_test_one-per-line.fa | 0
.../tests/data/sequences_test_orfs.fa | 0
.../tests/data/sequences_test_orfs.gff | 0
.../tests/data/sequences_test_phylip.interleaved | 0
.../sequences_test_phylip.interleaved.to_fasta | 0
.../tests/data/sequences_test_phylip.interleaved2 | 0
.../sequences_test_phylip.interleaved2.to_fasta | 0
.../data/sequences_test_phylip.made_by_seaview | 0
.../sequences_test_phylip.made_by_seaview.to_fasta | 0
.../tests/data/sequences_test_phylip.sequential | 0
.../data/sequences_test_phylip.sequential.to_fasta | 0
.../tests/data/sequences_test_revcomp.fa | 0
.../tests/data/sequences_test_search_string.fa | 0
.../data/sequences_test_search_string.fa.hits | 0
.../tests/data/sequences_test_split_fixed_size.fa | 0
.../sequences_test_split_fixed_size.fa.split.1 | 0
.../sequences_test_split_fixed_size.fa.split.2 | 0
.../sequences_test_split_fixed_size.fa.split.3 | 0
.../sequences_test_split_fixed_size.fa.split.4 | 0
.../sequences_test_split_fixed_size.fa.split.5 | 0
.../sequences_test_split_fixed_size.fa.split.6 | 0
...sequences_test_split_fixed_size.fa.split.coords | 0
...test_split_fixed_size.fa.split.skip_if_all_Ns.1 | 0
...test_split_fixed_size.fa.split.skip_if_all_Ns.2 | 0
...test_split_fixed_size.fa.split.skip_if_all_Ns.3 | 0
...test_split_fixed_size.fa.split.skip_if_all_Ns.4 | 0
...split_fixed_size.fa.split.skip_if_all_Ns.coords | 0
.../sequences_test_split_fixed_size_onefile.fa | 18 +++
.../sequences_test_split_fixed_size_onefile.out.fa | 32 +++++
...es_test_split_fixed_size_onefile.skip_Ns.out.fa | 30 +++++
.../tests/data/sequences_test_split_test.fa | 0
.../tests/data/sequences_test_split_test.fa.2.1 | 0
.../tests/data/sequences_test_split_test.fa.2.2 | 0
.../tests/data/sequences_test_split_test.fa.2.3 | 0
.../tests/data/sequences_test_split_test.fa.2.4 | 0
.../tests/data/sequences_test_split_test.fa.3.1 | 0
.../tests/data/sequences_test_split_test.fa.3.2 | 0
.../tests/data/sequences_test_split_test.fa.3.3 | 0
.../tests/data/sequences_test_split_test.fa.4.1 | 0
.../tests/data/sequences_test_split_test.fa.4.2 | 0
.../tests/data/sequences_test_split_test.fa.4.3 | 0
.../tests/data/sequences_test_split_test.fa.6.1 | 0
.../tests/data/sequences_test_split_test.fa.6.2 | 0
.../data/sequences_test_split_test.fa.6.limit2.1 | 0
.../data/sequences_test_split_test.fa.6.limit2.2 | 0
.../data/sequences_test_split_test.fa.6.limit2.3 | 0
.../tests/data/sequences_test_split_test.long.fa | 0
.../data/sequences_test_split_test.long.fa.2.1 | 0
.../data/sequences_test_split_test.long.fa.2.2 | 0
.../data/sequences_test_strip_after_whitespace.fa | 0
...quences_test_strip_after_whitespace.fa.to_fasta | 0
.../data/sequences_test_strip_illumina_suffix.fq | 0
...equences_test_strip_illumina_suffix.fq.stripped | 0
.../tests/data/sequences_test_to_fasta_union.in.fa | 0
.../data/sequences_test_to_fasta_union.out.fa | 0
.../tests/data/sequences_test_to_unique_by_id.fa | 0
.../data/sequences_test_to_unique_by_id.fa.out | 0
.../tests/data/sequences_test_translate.fa | 0
.../tests/data/sequences_test_translate.fa.frame0 | 0
.../tests/data/sequences_test_translate.fa.frame1 | 0
.../tests/data/sequences_test_translate.fa.frame2 | 0
.../tests/data/sequences_test_trim_Ns_at_end.fa | 0
.../data/sequences_test_trim_Ns_at_end.fa.trimmed | 0
.../tests/data/sequences_test_trim_contigs.fa | 0
.../tests/data/sequences_test_trim_contigs.fa.out | 0
.../tests/data/sequences_test_trimmed.fq | 0
.../tests/data/sequences_test_untrimmed.fq | 0
.../tests/data/tasks_test_expend_nucleotides.in.fa | 0
.../tests/data/tasks_test_expend_nucleotides.in.fq | 0
.../data/tasks_test_expend_nucleotides.out.fa | 0
.../data/tasks_test_expend_nucleotides.out.fq | 0
.../tests/data/tasks_test_fasta_to_fake_qual.in.fa | 0
.../tasks_test_fasta_to_fake_qual.out.default.qual | 0
.../tasks_test_fasta_to_fake_qual.out.q42.qual | 0
.../tasks_test_filter_paired_both_pass.in_1.fa | 8 ++
.../tasks_test_filter_paired_both_pass.in_2.fa | 8 ++
.../tasks_test_filter_paired_both_pass.out_1.fa | 2 +
.../tasks_test_filter_paired_both_pass.out_2.fa | 2 +
.../data/tasks_test_filter_paired_one_pass.in_1.fa | 8 ++
.../data/tasks_test_filter_paired_one_pass.in_2.fa | 8 ++
.../tasks_test_filter_paired_one_pass.out_1.fa | 6 +
.../tasks_test_filter_paired_one_pass.out_2.fa | 6 +
.../tests/data/tasks_test_make_long_reads.input.fa | 0
.../data/tasks_test_make_long_reads.output.fa | 0
pyfastaq/tests/data/tasks_test_mean_length.fa | 8 ++
.../tests/data/tasks_test_sequence_trim_1.fa | 0
.../data/tasks_test_sequence_trim_1.trimmed.fa | 0
.../tests/data/tasks_test_sequence_trim_2.fa | 0
.../data/tasks_test_sequence_trim_2.trimmed.fa | 0
.../tests/data/tasks_test_sequences_to_trim.fa | 0
pyfastaq/tests/data/tasks_test_sort_by_size.in.fa | 8 ++
pyfastaq/tests/data/tasks_test_sort_by_size.out.fa | 8 ++
.../tests/data/tasks_test_sort_by_size.out.rev.fa | 8 ++
.../tests/data/utils_test_file_transpose.txt | 0
.../tests/data/utils_test_file_transposed.txt | 0
.../tests/data/utils_test_not_really_zipped.gz | 0
.../tests/data/utils_test_scaffolds.fa | 0
.../data/utils_test_scaffolds.fa.to_contigs.fa | 0
..._test_scaffolds.fa.to_contigs.number_contigs.fa | 0
.../tests/data/utils_test_system_call.txt | 0
{fastaq => pyfastaq}/tests/intervals_test.py | 2 +-
{fastaq => pyfastaq}/tests/sequences_test.py | 16 ++-
{fastaq => pyfastaq}/tests/tasks_test.py | 95 +++++++++++++-
{fastaq => pyfastaq}/tests/utils_test.py | 2 +-
{fastaq => pyfastaq}/utils.py | 0
scripts/fastaq | 70 +++++++++++
scripts/fastaq_capillary_to_pairs | 12 --
scripts/fastaq_chunker | 21 ----
scripts/fastaq_count_sequences | 11 --
scripts/fastaq_deinterleave | 14 ---
scripts/fastaq_enumerate_names | 19 ---
scripts/fastaq_expand_nucleotides | 15 ---
scripts/fastaq_extend_gaps | 13 --
scripts/fastaq_fasta_to_fastq | 13 --
scripts/fastaq_filter | 24 ----
scripts/fastaq_get_ids | 12 --
scripts/fastaq_get_seq_flanking_gaps | 14 ---
scripts/fastaq_insert_or_delete_bases | 94 --------------
scripts/fastaq_interleave | 13 --
scripts/fastaq_long_read_simulate | 50 --------
scripts/fastaq_make_random_contigs | 25 ----
scripts/fastaq_merge | 18 ---
scripts/fastaq_replace_bases | 14 ---
scripts/fastaq_reverse_complement | 12 --
scripts/fastaq_scaffolds_to_contigs | 13 --
scripts/fastaq_search_for_seq | 13 --
scripts/fastaq_sequence_trim | 25 ----
scripts/fastaq_split_by_base_count | 15 ---
scripts/fastaq_strip_illumina_suffix | 12 --
scripts/fastaq_to_fake_qual | 18 ---
scripts/fastaq_to_fasta | 19 ---
scripts/fastaq_to_mira_xml | 12 --
scripts/fastaq_to_orfs_gff | 13 --
scripts/fastaq_to_perfect_reads | 86 -------------
scripts/fastaq_to_random_subset | 36 ------
scripts/fastaq_to_tiling_bam | 79 ------------
scripts/fastaq_to_unique_by_id | 12 --
scripts/fastaq_translate | 13 --
scripts/fastaq_trim_Ns_at_end | 12 --
scripts/fastaq_trim_ends | 14 ---
setup.py | 22 +++-
266 files changed, 1721 insertions(+), 893 deletions(-)
diff --git a/MANIFEST.in b/MANIFEST.in
index f5282c0..81691a1 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1 @@
-recursive-include fastaq/tests *.txt *.sh *.py *_test_*
+recursive-include pyfastaq/tests *.txt *.py *_test_*
diff --git a/README.md b/README.md
index a5c3464..9e939f4 100644
--- a/README.md
+++ b/README.md
@@ -1,50 +1,115 @@
Fastaq
======
-Python3 scripts to manipulate FASTA and FASTQ files, plus API for developers
+Python3 script to manipulate FASTA and FASTQ (and other format) files, plus API for developers
Installation
------------
-Run the tests:
+Dependencies:
+ * numpy (install with `apt-get install python3-numpy`)
+
+Install with pip:
+
+ pip3 install pyfastaq
+
+
+Alternatively, you can download the latest release from this github repository,
+or clone the repository. Then run the tests:
python3 setup.py test
-Install:
+If the tests all pass, install:
python3 setup.py install
-Notes:
- * A few scripts assume that samtools is installed and in your path. This is NOT tested in the tests, because most scripts don't need it.
- * The installation will put all scripts in your path and are named fastaq_*.
-Scripts
--------
+Usage
+-----
+
+The installation will put a single script called `fastaq` in your path.
+The usage is:
+
+ fastaq <command> [options]
+
Key points:
- * Use -h or --help with a script to get its usage.
- * All scripts automatically detect whether the input is a FASTA or FASTQ file.
+ * To list the available commands and brief descriptions, just run `fastaq`
+ * Use `fastaq command -h` or `fastaq command --help` to get a longer description and the usage of that command.
+ * The type of input file is automatically detected. Currently supported:
+ `FASTA`, `FASTQ`, `GFF3`, `EMBL`, `GBK`, `Phylip`.
+ * `fastaq` only manipulates sequences (and
+ quality scores if present), so annotation is ignored where present in the input.
* Input and output files can be gzipped. An input file is assumed to be gzipped if its name ends with .gz. To gzip an output file, just name it with .gz at the end.
- * You can use a minus sign for a filename to use stdin or stdout, so scripts can be piped together. See the following examples.
+ * You can use a minus sign for a filename to use stdin or stdout, so commands can be piped together. See the example below.
-Reverse complement all sequences in a file:
- fastaq_reverse_complement in.fastq out.fastq
+Examples
+--------
+
+Reverse complement all sequences in a file:
-Reverse complement all sequences in a gzipped file, then translate each sequence
+ fastaq reverse_complement in.fastq out.fastq
+
+Reverse complement all sequences in a gzipped file, then translate each sequence:
+
+ fastaq reverse_complement in.fastq.gz - | fastaq translate - out.fasta
+
+
+Available commands
+------------------
+
+| Command | Description |
+|-----------------------|----------------------------------------------------------------------|
+| add_indels | Deletes or inserts bases at given position(s) |
+| caf_to_fastq | Converts a CAF file to FASTQ format |
+| capillary_to_pairs | Converts file of capillary reads to paired and unpaired files |
+| chunker | Splits sequences into equal sized chunks |
+| count_sequences | Counts the sequences in input file |
+| deinterleave | Splits interleaved paired file into two separate files |
+| enumerate_names | Renames sequences in a file, calling them 1,2,3... etc |
+| expand_nucleotides | Makes every combination of degenerate nucleotides |
+| fasta_to_fastq | Convert FASTA and .qual to FASTQ |
+| filter | Filter sequences to get a subset of them |
+| get_ids | Get the ID of each sequence |
+| get_seq_flanking_gaps | Gets the sequences flanking gaps |
+| interleave | Interleaves two files, output is alternating between fwd/rev reads |
+| long_read_simulate | Simulates long reads from reference |
+| make_random_contigs | Make contigs of random sequence |
+| merge | Converts multi sequence file to a single sequence |
+| replace_bases | Replaces all occurrences of one letter with another |
+| reverse_complement | Reverse complement all sequences |
+| scaffolds_to_contigs | Creates a file of contigs from a file of scaffolds |
+| search_for_seq | Find all exact matches to a string (and its reverse complement) |
+| sequence_trim | Trim exact matches to a given string off the start of every sequence |
+| sort_by_size | Sorts sequences in length order |
+| split_by_base_count | Split multi sequence file into separate files |
+| strip_illumina_suffix | Strips /1 or /2 off the end of every read name |
+| to_fake_qual | Make fake quality scores file |
+| to_fasta | Converts a variety of input formats to nicely formatted FASTA format |
+| to_mira_xml | Create an xml file from a file of reads, for use with Mira assembler |
+| to_orfs_gff | Writes a GFF file of open reading frames |
+| to_perfect_reads | Make perfect paired reads from reference |
+| to_random_subset | Make a random sample of sequences (and optionally mates as well) |
+| to_tiling_bam | Make a BAM file of reads uniformly spread across the input reference |
+| to_unique_by_id | Remove duplicate sequences, based on their names. Keep longest seqs |
+| translate | Translate all sequences in input nucleotide sequences |
+| trim_Ns_at_end | Trims all Ns at the start/end of all sequences |
+| trim_contigs | Trims a set number of bases off the end of every contig |
+| trim_ends | Trim fixed number of bases off start and/or end of every sequence |
+| version | Print version number and exit |
- fastaq_reverse_complement in.fastq.gz - | fastaq_translate - out.fasta
For developers
--------------
Here is a template for counting the sequences in a FASTA or FASTQ file:
- from fastaq import sequences
+ from pyfastaq import sequences
seq_reader = sequences.file_reader(infile)
count = 0
for seq in seq_reader:
count += 1
print(count)
-Hopefully you get the idea and there are plenty of examples in tasks.py. Detection of FASTA or FASTQ and gzipped or not input file 'infile' is automatic. See help(sequences) for the various methods already defined in the classes Fasta and Fastq.
+Hopefully you get the idea and there are plenty of examples in tasks.py. Detection of the input file type and whether gzipped or not is automatic. See help(sequences) for the various methods already defined in the classes Fasta and Fastq.
diff --git a/fastaq/__init__.py b/fastaq/__init__.py
deleted file mode 100644
index 52ded75..0000000
--- a/fastaq/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-__all__ = ['utils', 'sequences', 'tasks', 'intervals']
-from fastaq import *
diff --git a/pyfastaq/__init__.py b/pyfastaq/__init__.py
new file mode 100644
index 0000000..8f176ae
--- /dev/null
+++ b/pyfastaq/__init__.py
@@ -0,0 +1,11 @@
+__all__ = [
+ 'caf',
+ 'common',
+ 'genetic_codes',
+ 'utils',
+ 'sequences',
+ 'tasks',
+ 'intervals',
+ 'runners'
+]
+from pyfastaq import *
diff --git a/pyfastaq/caf.py b/pyfastaq/caf.py
new file mode 100644
index 0000000..dac0f87
--- /dev/null
+++ b/pyfastaq/caf.py
@@ -0,0 +1,79 @@
+from pyfastaq import sequences, utils
+
+class Error (Exception): pass
+
+def file_reader(fname):
+ f = utils.open_file_read(fname)
+ c = Caf()
+
+ while c.get_next_from_file(f):
+ yield c
+
+ utils.close(f)
+
+
+class Caf:
+ def __init__(self):
+ self.id = None
+ self.seq = None
+ self.insert_min = None
+ self.insert_max = None
+ self.ligation = None
+ self.clone = None
+ self.clip_start = None
+ self.clip_end = None
+
+
+ def __eq__(self, other):
+ if type(other) is type(self):
+ return self.__dict__ == other.__dict__
+ return False
+
+
+ def get_next_from_file(self, f):
+ self.__init__()
+ line = f.readline()
+ if not line:
+ return None
+ while line == '\n':
+ line = f.readline()
+
+ if not line.startswith('DNA : '):
+ raise Error("Error reading caf file. Expected line starting with 'DNA : ...'")
+
+ self.id = line.rstrip().split()[2]
+
+ line = f.readline()
+ seq = []
+
+ while line != '\n':
+ seq.append(line.rstrip())
+ line = f.readline()
+
+ self.seq = sequences.Fasta(self.id, ''.join(seq))
+
+ line = f.readline()
+ if not line.startswith('BaseQuality : '):
+ raise Error("Error reading caf file. Expected line starting with 'BaseQuality : ...'")
+
+ quals = [int(x) for x in f.readline().rstrip().split()]
+ self.seq = self.seq.to_Fastq(quals)
+
+ line = f.readline()
+ assert line == '\n'
+ line = f.readline()
+
+ while line not in ['', '\n']:
+ a = line.rstrip().split()
+ if a[0] == 'Insert_size':
+ self.insert_min, self.insert_max = int(a[1]), int(a[2])
+ elif a[0] == 'Ligation_no':
+ self.ligation = a[1]
+ elif a[0] == 'Clone':
+ self.clone = a[1]
+ elif a[0] == 'Clipping' and a[1] == 'QUAL':
+ self.clip_start, self.clip_end = int(a[2]) - 1, int(a[3]) - 1
+
+ line = f.readline()
+
+ return True
diff --git a/pyfastaq/common.py b/pyfastaq/common.py
new file mode 100644
index 0000000..42b6ccb
--- /dev/null
+++ b/pyfastaq/common.py
@@ -0,0 +1 @@
+version = '3.2.0'
diff --git a/pyfastaq/genetic_codes.py b/pyfastaq/genetic_codes.py
new file mode 100644
index 0000000..c32c065
--- /dev/null
+++ b/pyfastaq/genetic_codes.py
@@ -0,0 +1,139 @@
+codes = {}
+
+#standard genetic code
+codes[1] = {
+ 'TTT': 'F',
+ 'TTC': 'F',
+ 'TTA': 'L',
+ 'TTG': 'L',
+ 'TCT': 'S',
+ 'TCC': 'S',
+ 'TCA': 'S',
+ 'TCG': 'S',
+ 'TAT': 'Y',
+ 'TAC': 'Y',
+ 'TAA': '*',
+ 'TAG': '*',
+ 'TGT': 'C',
+ 'TGC': 'C',
+ 'TGA': '*',
+ 'TGG': 'W',
+ 'CTT': 'L',
+ 'CTC': 'L',
+ 'CTA': 'L',
+ 'CTG': 'L',
+ 'CCT': 'P',
+ 'CCC': 'P',
+ 'CCA': 'P',
+ 'CCG': 'P',
+ 'CAT': 'H',
+ 'CAC': 'H',
+ 'CAA': 'Q',
+ 'CAG': 'Q',
+ 'CGT': 'R',
+ 'CGC': 'R',
+ 'CGA': 'R',
+ 'CGG': 'R',
+ 'ATT': 'I',
+ 'ATC': 'I',
+ 'ATA': 'I',
+ 'ATG': 'M',
+ 'ACT': 'T',
+ 'ACC': 'T',
+ 'ACA': 'T',
+ 'ACG': 'T',
+ 'AAT': 'N',
+ 'AAC': 'N',
+ 'AAA': 'K',
+ 'AAG': 'K',
+ 'AGT': 'S',
+ 'AGC': 'S',
+ 'AGA': 'R',
+ 'AGG': 'R',
+ 'GTT': 'V',
+ 'GTC': 'V',
+ 'GTA': 'V',
+ 'GTG': 'V',
+ 'GCT': 'A',
+ 'GCC': 'A',
+ 'GCA': 'A',
+ 'GCG': 'A',
+ 'GAT': 'D',
+ 'GAC': 'D',
+ 'GAA': 'E',
+ 'GAG': 'E',
+ 'GGT': 'G',
+ 'GGC': 'G',
+ 'GGA': 'G',
+ 'GGG': 'G',
+}
+
+
+#mycoplasma genetic code
+codes[4] = {
+ 'TTT': 'F',
+ 'TTC': 'F',
+ 'TTA': 'L',
+ 'TTG': 'L',
+ 'TCT': 'S',
+ 'TCC': 'S',
+ 'TCA': 'S',
+ 'TCG': 'S',
+ 'TAT': 'Y',
+ 'TAC': 'Y',
+ 'TAA': '*',
+ 'TAG': '*',
+ 'TGT': 'C',
+ 'TGC': 'C',
+ 'TGA': 'W',
+ 'TGG': 'W',
+ 'CTT': 'L',
+ 'CTC': 'L',
+ 'CTA': 'L',
+ 'CTG': 'L',
+ 'CCT': 'P',
+ 'CCC': 'P',
+ 'CCA': 'P',
+ 'CCG': 'P',
+ 'CAT': 'H',
+ 'CAC': 'H',
+ 'CAA': 'Q',
+ 'CAG': 'Q',
+ 'CGT': 'R',
+ 'CGC': 'R',
+ 'CGA': 'R',
+ 'CGG': 'R',
+ 'ATT': 'I',
+ 'ATC': 'I',
+ 'ATA': 'I',
+ 'ATG': 'M',
+ 'ACT': 'T',
+ 'ACC': 'T',
+ 'ACA': 'T',
+ 'ACG': 'T',
+ 'AAT': 'N',
+ 'AAC': 'N',
+ 'AAA': 'K',
+ 'AAG': 'K',
+ 'AGT': 'S',
+ 'AGC': 'S',
+ 'AGA': 'R',
+ 'AGG': 'R',
+ 'GTT': 'V',
+ 'GTC': 'V',
+ 'GTA': 'V',
+ 'GTG': 'V',
+ 'GCT': 'A',
+ 'GCC': 'A',
+ 'GCA': 'A',
+ 'GCG': 'A',
+ 'GAT': 'D',
+ 'GAC': 'D',
+ 'GAA': 'E',
+ 'GAG': 'E',
+ 'GGT': 'G',
+ 'GGC': 'G',
+ 'GGA': 'G',
+ 'GGG': 'G'
+}
+
diff --git a/fastaq/intervals.py b/pyfastaq/intervals.py
similarity index 100%
rename from fastaq/intervals.py
rename to pyfastaq/intervals.py
diff --git a/fastaq/tests/data/sequences_test_empty_file b/pyfastaq/runners/__init__.py
similarity index 100%
copy from fastaq/tests/data/sequences_test_empty_file
copy to pyfastaq/runners/__init__.py
diff --git a/pyfastaq/runners/add_indels.py b/pyfastaq/runners/add_indels.py
new file mode 100644
index 0000000..088b9d3
--- /dev/null
+++ b/pyfastaq/runners/add_indels.py
@@ -0,0 +1,93 @@
+import argparse
+import sys
+import random
+from pyfastaq import sequences, utils, intervals
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = description,
+ usage = 'fastaq add_indels [options] <infile> <outfile>')
+ parser.add_argument('infile', help='Name of input file')
+ parser.add_argument('outfile', help='Name of output file')
+ parser.add_argument('-d','--delete', action='append', help='Delete the given bases from the given sequence. Format same as samtools view: name:start-end. This option can be used multiple times (once for each region to delete). Overlapping coords will be merged before deleting', metavar='Name:start:bases')
+ parser.add_argument('--delete_range', help='Deletes bases starting at position P in each sequence of the input file. Deletes start + (n-1)*step bases from sequence n.', metavar='P,start,step')
+ parser.add_argument('-i','--insert', action='append', help='Insert a random string of bases at the given position. Format is name:position:number_to_add. Bases are added after the position. This option can be used multiple times', metavar='Name:start:bases')
+ parser.add_argument('--insert_range', help='Inserts random bases starting after position P in each sequence of the input file. Inserts start + (n-1)*step bases into sequence n.', metavar='P,start,step')
+ options = parser.parse_args()
+
+ test_ops = [int(x is not None) for x in [options.delete, options.insert, options.delete_range, options.insert_range]]
+
+ if sum(test_ops) != 1:
+ print('Must use one of --delete, --insert, --delete_range, --insert_range. Cannot continue', file=sys.stderr)
+ sys.exit(1)
+
+
+ def range2dic(range_in):
+ if range_in is None:
+ return {}
+ (pos, start, step) = range_in.split(',')
+ d = {}
+ d['pos'] = int(pos) - 1
+ d['bases'] = int(start)
+ d['step'] = int(step)
+ return d
+
+ delete_range = range2dic(options.delete_range)
+ insert_range = range2dic(options.insert_range)
+
+
+ # convert the -d regions into sequence name, start and end coords
+ to_delete = {}
+ if options.delete:
+ for s in options.delete:
+ id, coords = s.rsplit(':')
+ start, end = [int(x)-1 for x in coords.split('-')]
+ if id not in to_delete:
+ to_delete[id] = []
+ to_delete[id].append(intervals.Interval(start, end))
+
+
+ to_insert = {}
+ if options.insert:
+ for s in options.insert:
+ id, pos, bases = s.rsplit(':',2)
+ pos = int(pos) - 1
+ bases = int(bases)
+ if id not in to_insert:
+ to_insert[id] = []
+ to_insert[id].append((pos, bases))
+
+
+ assert len(to_delete) * len(to_insert) == 0
+
+ # merge overlapping regions to be deleted
+ for l in to_delete.values():
+ intervals.merge_overlapping_in_list(l)
+
+ # sort positions to be inserted
+ for l in to_insert.values():
+ l.sort()
+
+ # read in the fasta/q file and print outfile with deleted sequences
+ seq_reader = sequences.file_reader(options.infile)
+ f = utils.open_file_write(options.outfile)
+
+ for seq in seq_reader:
+ if seq.id in to_delete:
+ # delete regions for this sequence, but start at the end so the
+ # coords don't get messed up after the first deletion
+ for inter in reversed(to_delete[seq.id]):
+ seq.seq = seq.seq[:inter.start] + seq.seq[inter.end + 1:]
+ elif options.delete_range:
+ seq.seq = seq.seq[:delete_range['pos']] + seq.seq[delete_range['pos'] + delete_range['bases']:]
+ delete_range['bases'] += delete_range['step']
+ elif seq.id in to_insert:
+ for pos, bases in reversed(to_insert[seq.id]):
+ seq.seq = seq.seq[:pos + 1] + ''.join([random.choice('ACGT') for x in range(bases)]) + seq.seq[pos + 1:]
+ elif options.insert_range:
+ seq.seq = seq.seq[:insert_range['pos'] + 1] + ''.join([random.choice('ACGT') for x in range(insert_range['bases'])]) + seq.seq[insert_range['pos'] + 1:]
+ insert_range['bases'] += insert_range['step']
+
+ print(seq, file=f)
+
+ utils.close(f)
diff --git a/pyfastaq/runners/caf_to_fastq.py b/pyfastaq/runners/caf_to_fastq.py
new file mode 100644
index 0000000..5d08d79
--- /dev/null
+++ b/pyfastaq/runners/caf_to_fastq.py
@@ -0,0 +1,19 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = 'Converts CAF file to FASTQ format',
+ usage = 'fastaq caf_to_fastq [options] <infile> <outfile>')
+ parser.add_argument('infile', help='Name of input CAF file.')
+ parser.add_argument('outfile', help='Name of output FASTQ file')
+ parser.add_argument('-c', '--clip', action='store_true', help='Use clipping info to clip reads, if present in the input CAF file (as lines of the form "Clipping QUAL start end"). Default is to not clip')
+ parser.add_argument('-l', '--min_length', type=int, help='Minimum length of sequence to output [%(default)s]', default=1, metavar='INT')
+ options = parser.parse_args()
+
+ tasks.caf_to_fastq(
+ options.infile,
+ options.outfile,
+ trim=options.clip,
+ min_length=options.min_length
+ )
diff --git a/pyfastaq/runners/capillary_to_pairs.py b/pyfastaq/runners/capillary_to_pairs.py
new file mode 100644
index 0000000..1bf64dd
--- /dev/null
+++ b/pyfastaq/runners/capillary_to_pairs.py
@@ -0,0 +1,12 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = 'Given a file of capillary reads, makes an interleaved file of read pairs (where more than read from same ligation, takes the longest read) and a file of unpaired reads. Replaces the .p1k/.q1k part of read names to denote fwd/rev reads with /1 and /2',
+ usage = 'fastaq capillary_to_pairs <infile> <outfiles prefix>')
+ parser.add_argument('infile', help='Name of input fasta/q file')
+ parser.add_argument('outprefix', help='Prefix of output files', metavar='outfiles prefix')
+ options = parser.parse_args()
+ tasks.capillary_to_pairs(options.infile, options.outprefix)
+
diff --git a/pyfastaq/runners/chunker.py b/pyfastaq/runners/chunker.py
new file mode 100644
index 0000000..7e5d980
--- /dev/null
+++ b/pyfastaq/runners/chunker.py
@@ -0,0 +1,30 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = 'Splits a multi sequence file into separate files. Splits sequences into chunks of a fixed size. Aims for chunk_size chunks in each file, but allows a little extra, so chunk can be up to (chunk_size + tolerance), to prevent tiny chunks made from the ends of sequences',
+ usage = 'fastaq chunker [options] <infile> <out> <chunk size> <tolerance>')
+ parser.add_argument('infile', help='Name of input file to be split')
+ parser.add_argument('out', help='Prefix of output file. If --onefile used, then name of single output file')
+ parser.add_argument('chunk_size', type=int, help='Size of each chunk')
+ parser.add_argument('tolerance', type=int, help='Tolerance allowed in chunk size')
+ parser.add_argument('--onefile', action='store_true', help='Output all the sequences in one file')
+ parser.add_argument('--skip_all_Ns', action='store_true', help='Do not output any sequence that consists of all Ns')
+ options = parser.parse_args()
+ if options.onefile:
+ tasks.split_by_fixed_size_onefile(
+ options.infile,
+ options.out,
+ options.chunk_size,
+ options.tolerance,
+ skip_if_all_Ns=options.skip_all_Ns
+ )
+ else:
+ tasks.split_by_fixed_size(
+ options.infile,
+ options.out,
+ options.chunk_size,
+ options.tolerance,
+ skip_if_all_Ns=options.skip_all_Ns
+ )
diff --git a/pyfastaq/runners/count_sequences.py b/pyfastaq/runners/count_sequences.py
new file mode 100644
index 0000000..b56fa87
--- /dev/null
+++ b/pyfastaq/runners/count_sequences.py
@@ -0,0 +1,10 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq count_sequences``.
+
+    Reads the input filename from the command line and prints the value
+    returned by ``tasks.count_sequences`` to stdout.  ``description`` is
+    not used; the parser carries its own description text.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq count_sequences <infile>',
+        description='Prints the number of sequences in input file to stdout',
+    )
+    cli.add_argument('infile', help='Name of input file')
+    args = cli.parse_args()
+    total = tasks.count_sequences(args.infile)
+    print(total)
diff --git a/pyfastaq/runners/deinterleave.py b/pyfastaq/runners/deinterleave.py
new file mode 100644
index 0000000..eac212e
--- /dev/null
+++ b/pyfastaq/runners/deinterleave.py
@@ -0,0 +1,13 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq deinterleave``.
+
+    Parses the command line and passes the input file and the two output
+    files to ``tasks.deinterleave``.  ``description`` is not used; the
+    parser carries its own description text.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq deinterleave [options] <infile> <out_fwd> <out_rev>',
+        description='Deinterleaves sequence file, so that reads are written alternately between two output files',
+    )
+    cli.add_argument('--fasta_out', action='store_true', help='Use this to write output as fasta (default is same as input)', default=False)
+    cli.add_argument('infile', help='Name of fasta/q file to be deinterleaved')
+    cli.add_argument('out_fwd', help='Name of output fasta/q file of forwards reads')
+    cli.add_argument('out_rev', help='Name of output fasta/q file of reverse reads')
+    args = cli.parse_args()
+
+    tasks.deinterleave(
+        args.infile,
+        args.out_fwd,
+        args.out_rev,
+        fasta_out=args.fasta_out,
+    )
diff --git a/pyfastaq/runners/enumerate_names.py b/pyfastaq/runners/enumerate_names.py
new file mode 100644
index 0000000..4a9c218
--- /dev/null
+++ b/pyfastaq/runners/enumerate_names.py
@@ -0,0 +1,20 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq enumerate_names``.
+
+    Parses the command line and forwards the file names and renaming
+    options to ``tasks.enumerate_names``.  ``description`` is not used;
+    the parser carries its own description text.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq enumerate_names [options] <infile> <outfile>',
+        description='Renames sequences in a file, calling them 1,2,3... etc',
+    )
+    cli.add_argument('--start_index', type=int, help='Starting number [%(default)s]', default=1)
+    cli.add_argument('--rename_file', help='If used, will write a file of old name to new name')
+    cli.add_argument('--keep_suffix', action='store_true', help='Use this to keep a /1 or /2 suffix at the end of each name')
+    cli.add_argument('--suffix', help='Add the given string to the end of every name', default=None)
+    cli.add_argument('infile', help='Name of fasta/q file to be read')
+    cli.add_argument('outfile', help='Name of output fasta/q file')
+    args = cli.parse_args()
+
+    # The --keep_suffix flag maps onto the task's keep_illumina_suffix keyword.
+    renaming = {
+        'start_index': args.start_index,
+        'keep_illumina_suffix': args.keep_suffix,
+        'rename_file': args.rename_file,
+        'suffix': args.suffix,
+    }
+    tasks.enumerate_names(args.infile, args.outfile, **renaming)
diff --git a/pyfastaq/runners/expand_nucleotides.py b/pyfastaq/runners/expand_nucleotides.py
new file mode 100644
index 0000000..3f240a8
--- /dev/null
+++ b/pyfastaq/runners/expand_nucleotides.py
@@ -0,0 +1,14 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq expand_nucleotides``.
+
+    Parses the input and output file names and passes them to
+    ``tasks.expand_nucleotides``.  ``description`` is not used; the
+    parser carries its own description text.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq expand_nucleotides <infile> <outfile>',
+        description='Makes all combinations of sequences in input file by using all possibilities of redundant bases. e.g. ART could be AAT or AGT. Assumes input is nucleotides, not amino acids',
+    )
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of output file')
+    args = cli.parse_args()
+    tasks.expand_nucleotides(args.infile, args.outfile)
diff --git a/pyfastaq/runners/fasta_to_fastq.py b/pyfastaq/runners/fasta_to_fastq.py
new file mode 100644
index 0000000..cd755a6
--- /dev/null
+++ b/pyfastaq/runners/fasta_to_fastq.py
@@ -0,0 +1,12 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq fasta_to_fastq``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The FASTA file, quality file and output name from the command line
+    are passed, in that order, to ``tasks.fasta_to_fastq``.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq fasta_to_fastq <fasta in> <qual in> <fastq out>',
+        description=description,
+    )
+    cli.add_argument('fasta', help='Name of input FASTA file', metavar='fasta in')
+    cli.add_argument('qual', help='Name of input quality scores file', metavar='qual in')
+    cli.add_argument('outfile', help='Name of output FASTQ file', metavar='fastq out')
+    args = cli.parse_args()
+    tasks.fasta_to_fastq(args.fasta, args.qual, args.outfile)
diff --git a/pyfastaq/runners/filter.py b/pyfastaq/runners/filter.py
new file mode 100644
index 0000000..5e0964f
--- /dev/null
+++ b/pyfastaq/runners/filter.py
@@ -0,0 +1,32 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq filter``.
+
+    Parses the command line and forwards the length, name, ID-file and
+    mate-pair options to ``tasks.filter``.  ``description`` is not used;
+    the parser carries its own description text.
+
+    Exits with an argparse usage error if --mate_in is given without
+    --mate_out, which the --mate_in help text states is required.
+    """
+    parser = argparse.ArgumentParser(
+        description = 'Filters a sequence file by sequence length and/or by name matching a regular expression',
+        usage = 'fastaq filter [options] <infile> <outfile>')
+    parser.add_argument('--min_length', type=int, help='Minimum length of sequence to keep [%(default)s]', default=0, metavar='INT')
+    # type=float so that the "no upper limit" default of infinity is representable
+    parser.add_argument('--max_length', type=float, help='Maximum length of sequence to keep [%(default)s]', default=float('inf'), metavar='INT')
+    parser.add_argument('--regex', help='If given, only reads with a name matching the regular expression will be kept')
+    parser.add_argument('--ids_file', help='If given, only reads whose ID is in the given file will be used. One ID per line of file.', metavar='FILENAME')
+    parser.add_argument('-v', '--invert', action='store_true', help='Only keep sequences that do not match the filters')
+
+    mate_group = parser.add_argument_group('Mate file for read pairs options')
+    mate_group.add_argument('--mate_in', help='Name of mates input file. If used, must also provide --mate_out', metavar='FILENAME')
+    mate_group.add_argument('--mate_out', help='Name of mates output file', metavar='FILENAME')
+    mate_group.add_argument('--both_mates_pass', action='store_true', help='By default, if either mate passes filter, then both reads output. Use this flag to require that both reads of a pair pass the filter')
+
+    parser.add_argument('infile', help='Name of input file to be filtered')
+    parser.add_argument('outfile', help='Name of output file')
+    options = parser.parse_args()
+
+    # Enforce the documented pairing up front so the user gets a usage
+    # message instead of a failure part-way through the run.
+    if options.mate_in is not None and options.mate_out is None:
+        parser.error('--mate_in was given, so --mate_out must also be provided')
+
+    tasks.filter(options.infile,
+        options.outfile,
+        minlength=options.min_length,
+        maxlength=options.max_length,
+        regex=options.regex,
+        ids_file=options.ids_file,
+        invert=options.invert,
+        mate_in=options.mate_in,
+        mate_out=options.mate_out,
+        both_mates_pass=options.both_mates_pass,
+    )
diff --git a/pyfastaq/runners/get_ids.py b/pyfastaq/runners/get_ids.py
new file mode 100644
index 0000000..7a4442b
--- /dev/null
+++ b/pyfastaq/runners/get_ids.py
@@ -0,0 +1,11 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq get_ids``.
+
+    Parses the input and output file names and passes them to
+    ``tasks.get_ids``.  ``description`` is not used; the parser carries
+    its own description text.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq get_ids <infile> <outfile>',
+        description='Gets IDs from each sequence in input file',
+    )
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of output file')
+    args = cli.parse_args()
+    tasks.get_ids(args.infile, args.outfile)
diff --git a/pyfastaq/runners/get_seq_flanking_gaps.py b/pyfastaq/runners/get_seq_flanking_gaps.py
new file mode 100644
index 0000000..b132d96
--- /dev/null
+++ b/pyfastaq/runners/get_seq_flanking_gaps.py
@@ -0,0 +1,13 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq get_seq_flanking_gaps``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The file names and the --left/--right base counts are passed
+    positionally to ``tasks.get_seqs_flanking_gaps``.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq get_seq_flanking_gaps [options] <infile> <outfile>',
+        description=description,
+    )
+    cli.add_argument('--left', type=int, help='Number of bases to get to left of gap [%(default)s]', default=25, metavar='INT')
+    cli.add_argument('--right', type=int, help='Number of bases to get to right of gap [%(default)s]', default=25, metavar='INT')
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of output file')
+    args = cli.parse_args()
+
+    tasks.get_seqs_flanking_gaps(
+        args.infile,
+        args.outfile,
+        args.left,
+        args.right,
+    )
diff --git a/pyfastaq/runners/interleave.py b/pyfastaq/runners/interleave.py
new file mode 100644
index 0000000..60f2782
--- /dev/null
+++ b/pyfastaq/runners/interleave.py
@@ -0,0 +1,12 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq interleave``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The two input file names and the output file name are passed, in that
+    order, to ``tasks.interleave``.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq interleave <infile_1> <infile_2> <outfile>',
+        description=description,
+    )
+    cli.add_argument('infile_1', help='Name of first input file')
+    cli.add_argument('infile_2', help='Name of second input file')
+    cli.add_argument('outfile', help='Name of output file of interleaved reads')
+    args = cli.parse_args()
+    tasks.interleave(args.infile_1, args.infile_2, args.outfile)
diff --git a/pyfastaq/runners/long_read_simulate.py b/pyfastaq/runners/long_read_simulate.py
new file mode 100644
index 0000000..ad8da23
--- /dev/null
+++ b/pyfastaq/runners/long_read_simulate.py
@@ -0,0 +1,49 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq long_read_simulate``.
+
+    Parses the command line, calls ``tasks.make_long_reads`` with the
+    sampling options and, when --qual is given, calls
+    ``tasks.fastaq_to_fake_qual`` to write ``<outfile>.qual``.
+    ``description`` is not used; the parser carries its own description
+    text.
+
+    Exits with an argparse usage error when only one of --ins_skip and
+    --ins_window is supplied, since their help text requires both.
+    """
+    parser = argparse.ArgumentParser(
+        description = 'Simulates long reads from a sequence file. Can optionally make insertions into the reads, like pacbio does. If insertions made, coverage calculation is done before the insertions (so total read length may appear longer than expected).',
+        usage = 'fastaq long_read_simulate [options] <infile> <outfile>')
+
+    parser.add_argument('infile', help='Name of input file')
+    parser.add_argument('outfile', help='Name of output FASTA file')
+
+    parser.add_argument('--method', help='How to sample the read positions and lengths. Choose from 1) "tiling", where reads of fixed length are taken at equal intervals from the reference. 2) "uniform", where reads of fixed length taken at positions sampled uniformly. 3) "gamma", where reads lengths are taken from a gamma distribution, and positions sampled uniformly. [%(default)s]', default='tiling', choices=['tiling', 'uniform', 'gamma'], metavar='tiling|uniform|gamma')
+    parser.add_argument('--seed', type=int, help='Seed for random number generator [default: use python\'s default]', metavar='INT')
+    # type=int matches the -q/--qual option of the to_fake_qual runner, which
+    # feeds the same task function.
+    parser.add_argument('--qual', type=int, help='Write a file of fake quality scores called outfile.qual, giving every base the quality INT [default: do not write the file]', metavar='INT')
+    parser.add_argument('--fixed_read_length', type=int, help='Length of each read. Only applies if method is tiling or uniform. [%(default)s]', default=20000, metavar='INT')
+    parser.add_argument('--coverage', type=float, help='Read coverage. Only applies if method is gamma or uniform. [%(default)s]', default=2, metavar='FLOAT')
+
+    tiling_group = parser.add_argument_group('tiling options')
+    tiling_group.add_argument('--tile_step', type=int, help='Distance between start of each read [%(default)s]', default=10000, metavar='INT')
+
+    gamma_group = parser.add_argument_group('gamma options')
+    gamma_group.add_argument('--gamma_shape', type=float, help='Shape parameter of gamma distribution [%(default)s]', default=1.2, metavar='FLOAT')
+    gamma_group.add_argument('--gamma_scale', type=float, help='Scale parameter of gamma distribution [%(default)s]', default=6000, metavar='FLOAT')
+    gamma_group.add_argument('--gamma_min_length', type=int, help='Minimum read length [%(default)s]', default=20000, metavar='INT')
+
+    ins_group = parser.add_argument_group('options to add insertions to reads')
+    ins_group.add_argument('--ins_skip', type=int, help='Insert a random base every --ins_skip bases plus or minus --ins_window. If this option is used, must also use --ins_window.', metavar='INT')
+    ins_group.add_argument('--ins_window', type=int, help='See --ins_skip. If this option is used, must also use --ins_skip.', metavar='INT')
+
+    options = parser.parse_args()
+
+    # The two insertion options only make sense together; reject a lone one
+    # here so the user sees a usage message.
+    if (options.ins_skip is None) != (options.ins_window is None):
+        parser.error('--ins_skip and --ins_window must be used together')
+
+    tasks.make_long_reads(
+        options.infile,
+        options.outfile,
+        method=options.method,
+        fixed_read_length=options.fixed_read_length,
+        coverage=options.coverage,
+        tile_step=options.tile_step,
+        gamma_shape=options.gamma_shape,
+        gamma_scale=options.gamma_scale,
+        gamma_min_length=options.gamma_min_length,
+        seed=options.seed,
+        ins_skip=options.ins_skip,
+        ins_window=options.ins_window
+    )
+
+    # Compare against None so that an explicit quality of 0 still writes the file.
+    if options.qual is not None:
+        tasks.fastaq_to_fake_qual(options.outfile, options.outfile + '.qual', q=options.qual)
diff --git a/pyfastaq/runners/make_random_contigs.py b/pyfastaq/runners/make_random_contigs.py
new file mode 100644
index 0000000..5337120
--- /dev/null
+++ b/pyfastaq/runners/make_random_contigs.py
@@ -0,0 +1,24 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq make_random_contigs``.
+
+    Parses the number of contigs, their length, the output file and the
+    naming/seed options, then calls ``tasks.make_random_contigs``.
+    ``description`` is not used; the parser carries its own description
+    text.
+    """
+    parser = argparse.ArgumentParser(
+        description = 'Makes a multi-FASTA file of random sequences, all of the same length. Each base has equal chance of being A,C,G or T',
+        usage = 'fastaq make_random_contigs [options] <contigs> <length> <outfile>')
+    parser.add_argument('--first_number', type=int, help='If numbering the sequences, the first sequence gets this number [%(default)s]', default=1)
+    parser.add_argument('--name_by_letters', action='store_true', help='Name the contigs A,B,C,... will start at A again if you get to Z')
+    parser.add_argument('--prefix', help='Prefix to add to start of every sequence name', default='')
+    parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None)
+    parser.add_argument('contigs', type=int, help='Number of contigs to make')
+    parser.add_argument('length', type=int, help='Length of each contig')
+    parser.add_argument('outfile', help='Name of output file')
+    options = parser.parse_args()
+    tasks.make_random_contigs(
+        options.contigs,
+        options.length,
+        options.outfile,
+        name_by_letters=options.name_by_letters,
+        prefix=options.prefix,
+        seed=options.seed,
+        first_number=options.first_number
+    )
diff --git a/pyfastaq/runners/merge.py b/pyfastaq/runners/merge.py
new file mode 100644
index 0000000..c9f0a36
--- /dev/null
+++ b/pyfastaq/runners/merge.py
@@ -0,0 +1,16 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq merge``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The file names and the -n/--name value are passed to
+    ``tasks.merge_to_one_seq`` (the name as its ``seqname`` keyword).
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq merge [options] <infile> <outfile>',
+        description=description,
+    )
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of output file')
+    cli.add_argument('-n', '--name', help='Name of sequence in output file [%(default)s]', default='union')
+    args = cli.parse_args()
+    tasks.merge_to_one_seq(args.infile, args.outfile, seqname=args.name)
diff --git a/pyfastaq/runners/replace_bases.py b/pyfastaq/runners/replace_bases.py
new file mode 100644
index 0000000..1ee64e6
--- /dev/null
+++ b/pyfastaq/runners/replace_bases.py
@@ -0,0 +1,13 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq replace_bases``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The four positional arguments are passed, in command-line order, to
+    ``tasks.replace_bases``.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq replace_bases <infile> <outfile> <old> <new>',
+        description=description,
+    )
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of output file')
+    cli.add_argument('old', help='Base to be replaced')
+    cli.add_argument('new', help='Replace with this letter')
+    args = cli.parse_args()
+
+    tasks.replace_bases(
+        args.infile,
+        args.outfile,
+        args.old,
+        args.new,
+    )
diff --git a/pyfastaq/runners/reverse_complement.py b/pyfastaq/runners/reverse_complement.py
new file mode 100644
index 0000000..f393de8
--- /dev/null
+++ b/pyfastaq/runners/reverse_complement.py
@@ -0,0 +1,11 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq reverse_complement``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The input and output file names are passed to
+    ``tasks.reverse_complement``.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq reverse_complement <infile> <outfile>',
+        description=description,
+    )
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of output file')
+    args = cli.parse_args()
+    tasks.reverse_complement(args.infile, args.outfile)
diff --git a/pyfastaq/runners/scaffolds_to_contigs.py b/pyfastaq/runners/scaffolds_to_contigs.py
new file mode 100644
index 0000000..afab4db
--- /dev/null
+++ b/pyfastaq/runners/scaffolds_to_contigs.py
@@ -0,0 +1,12 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq scaffolds_to_contigs``.
+
+    Parses the file names and the --number_contigs flag and passes them
+    to ``tasks.scaffolds_to_contigs``.  ``description`` is not used; the
+    parser carries its own description text.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq scaffolds_to_contigs [options] <infile> <outfile>',
+        description='Creates a file of contigs from a file of scaffolds - i.e. breaks at every gap in the input',
+    )
+    cli.add_argument('--number_contigs', action='store_true', help='Use this to enumerate contig names 1,2,3,... within each scaffold')
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of output contigs file')
+    args = cli.parse_args()
+
+    tasks.scaffolds_to_contigs(
+        args.infile,
+        args.outfile,
+        number_contigs=args.number_contigs,
+    )
diff --git a/pyfastaq/runners/search_for_seq.py b/pyfastaq/runners/search_for_seq.py
new file mode 100644
index 0000000..7864f1e
--- /dev/null
+++ b/pyfastaq/runners/search_for_seq.py
@@ -0,0 +1,12 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq search_for_seq``.
+
+    Parses the file names and the search string and passes them to
+    ``tasks.search_for_seq``.  ``description`` is not used; the parser
+    carries its own description text.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq search_for_seq [options] <infile> <outfile> <search_string>',
+        description='Searches for an exact match on a given string and its reverse complement, in every sequence of input sequence file. Case insensitive. Guaranteed to find all hits',
+    )
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of outputfile. Tab-delimited output: sequence name, position, strand')
+    cli.add_argument('search_string', help='String to search for in the sequences')
+    args = cli.parse_args()
+    tasks.search_for_seq(args.infile, args.outfile, args.search_string)
diff --git a/pyfastaq/runners/sequence_trim.py b/pyfastaq/runners/sequence_trim.py
new file mode 100644
index 0000000..4542294
--- /dev/null
+++ b/pyfastaq/runners/sequence_trim.py
@@ -0,0 +1,24 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq sequence_trim``.
+
+    Parses the two input files, two output files, the file of sequences
+    to trim and the options, then calls ``tasks.sequence_trim``.
+    ``description`` is not used; the parser carries its own description
+    text.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq sequence_trim [options] <infile_1> <infile_2> <outfile_1> <outfile_2> <trim_seqs>',
+        description='Trims sequences off the start of all sequences in a pair of sequence files, whenever there is a perfect match. Only keeps a read pair if both reads of the pair are at least a minimum length after any trimming',
+    )
+    cli.add_argument('--min_length', type=int, help='Minimum length of output sequences [%(default)s]', default=50, metavar='INT')
+    cli.add_argument('--revcomp', action='store_true', help='Trim the end of each sequence if it matches the reverse complement. This option is intended for PCR primer trimming')
+    cli.add_argument('infile_1', help='Name of forward fasta/q file to be trimmed')
+    cli.add_argument('infile_2', help='Name of reverse fasta/q file to be trimmed')
+    cli.add_argument('outfile_1', help='Name of output forward fasta/q file')
+    cli.add_argument('outfile_2', help='Name of output reverse fasta/q file')
+    cli.add_argument('trim_seqs', help='Name of file of sequences to search for at the start of each input sequence')
+    args = cli.parse_args()
+
+    # Positional order expected by the task: inputs, outputs, trim file.
+    files = (
+        args.infile_1,
+        args.infile_2,
+        args.outfile_1,
+        args.outfile_2,
+        args.trim_seqs,
+    )
+    tasks.sequence_trim(*files, min_length=args.min_length, check_revcomp=args.revcomp)
diff --git a/pyfastaq/runners/sort_by_size.py b/pyfastaq/runners/sort_by_size.py
new file mode 100644
index 0000000..c877806
--- /dev/null
+++ b/pyfastaq/runners/sort_by_size.py
@@ -0,0 +1,16 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq sort_by_size``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The file names are passed to ``tasks.sort_by_size``; the -r/--reverse
+    flag becomes its ``smallest_first`` keyword.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq sort_by_size [options] <infile> <outfile>',
+        description=description,
+    )
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of output file')
+    cli.add_argument('-r', '--reverse', action='store_true', help='Sort by shortest first instead of the default of longest first')
+    args = cli.parse_args()
+    tasks.sort_by_size(args.infile, args.outfile, smallest_first=args.reverse)
diff --git a/pyfastaq/runners/split_by_base_count.py b/pyfastaq/runners/split_by_base_count.py
new file mode 100644
index 0000000..7a0bea0
--- /dev/null
+++ b/pyfastaq/runners/split_by_base_count.py
@@ -0,0 +1,14 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq split_by_base_count``.
+
+    Parses the input file, output prefix, base limit and optional
+    --max_seqs, and passes all four positionally to
+    ``tasks.split_by_base_count``.  ``description`` is not used; the
+    parser carries its own description text.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq split_by_base_count [options] <infile> <outprefix> <max_bases>',
+        description='Splits a multi sequence file into separate files. Does not split sequences. Puts up to max_bases into each split file. The exception is that any sequence longer than max_bases is put into its own file.',
+    )
+    cli.add_argument('infile', help='Name of input file to be split')
+    cli.add_argument('outprefix', help='Name of output file')
+    cli.add_argument('max_bases', type=int, help='Max bases in each output split file', metavar='max_bases')
+    cli.add_argument('--max_seqs', type=int, help='Max number of sequences in each output split file [no limit]', metavar='INT')
+    args = cli.parse_args()
+
+    # --max_seqs has no default, so None is passed through when it is omitted.
+    tasks.split_by_base_count(
+        args.infile,
+        args.outprefix,
+        args.max_bases,
+        args.max_seqs,
+    )
diff --git a/pyfastaq/runners/strip_illumina_suffix.py b/pyfastaq/runners/strip_illumina_suffix.py
new file mode 100644
index 0000000..c42cb5a
--- /dev/null
+++ b/pyfastaq/runners/strip_illumina_suffix.py
@@ -0,0 +1,11 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq strip_illumina_suffix``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The input and output file names are passed to
+    ``tasks.strip_illumina_suffix``.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq strip_illumina_suffix <infile> <outfile>',
+        description=description,
+    )
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of output file')
+    args = cli.parse_args()
+    tasks.strip_illumina_suffix(args.infile, args.outfile)
diff --git a/pyfastaq/runners/to_fake_qual.py b/pyfastaq/runners/to_fake_qual.py
new file mode 100644
index 0000000..1cb4436
--- /dev/null
+++ b/pyfastaq/runners/to_fake_qual.py
@@ -0,0 +1,17 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq to_fake_qual``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The file names are passed to ``tasks.fastaq_to_fake_qual``; the
+    -q/--qual integer becomes its ``q`` keyword.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq to_fake_qual [options] <infile> <outfile>',
+        description=description,
+    )
+    cli.add_argument('infile', help='Name of input file')
+    cli.add_argument('outfile', help='Name of output file')
+    cli.add_argument('-q', '--qual', type=int, help='Quality score to assign to all bases [%(default)s]', default=40)
+    args = cli.parse_args()
+    tasks.fastaq_to_fake_qual(args.infile, args.outfile, q=args.qual)
+
diff --git a/pyfastaq/runners/to_fasta.py b/pyfastaq/runners/to_fasta.py
new file mode 100644
index 0000000..379abc6
--- /dev/null
+++ b/pyfastaq/runners/to_fasta.py
@@ -0,0 +1,20 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq to_fasta``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The file names, line length and whitespace-stripping flag are passed
+    to ``tasks.to_fasta``.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq to_fasta [options] <infile> <outfile>',
+        description=description,
+    )
+    cli.add_argument('infile', help='Name of input file. Can be any of FASTA, FASTQ, GFF3, EMBL, GBK, Phylip')
+    cli.add_argument('outfile', help='Name of output file')
+    cli.add_argument('-l', '--line_length', type=int, help='Number of bases on each sequence line of output file. Set to zero for no linebreaks in sequences [%(default)s]', default=60)
+    cli.add_argument('-s', '--strip_after_whitespace', action='store_true', help='Remove everything after first whitespace in every sequence name')
+    args = cli.parse_args()
+
+    # The CLI flag is --strip_after_whitespace; the task keyword is
+    # strip_after_first_whitespace.
+    formatting = {
+        'line_length': args.line_length,
+        'strip_after_first_whitespace': args.strip_after_whitespace,
+    }
+    tasks.to_fasta(args.infile, args.outfile, **formatting)
+
diff --git a/pyfastaq/runners/to_mira_xml.py b/pyfastaq/runners/to_mira_xml.py
new file mode 100644
index 0000000..548e996
--- /dev/null
+++ b/pyfastaq/runners/to_mira_xml.py
@@ -0,0 +1,11 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq to_mira_xml``.
+
+    ``description`` is used as the argparse description shown in --help.
+    The input file and XML output name are passed to
+    ``tasks.fastaq_to_mira_xml``.
+    """
+    cli = argparse.ArgumentParser(
+        usage='fastaq to_mira_xml <infile> <xml_out>',
+        description=description,
+    )
+    cli.add_argument('infile', help='Name of input fasta/q file')
+    cli.add_argument('xml_out', help='Name of output xml file')
+    args = cli.parse_args()
+    tasks.fastaq_to_mira_xml(args.infile, args.xml_out)
diff --git a/pyfastaq/runners/to_orfs_gff.py b/pyfastaq/runners/to_orfs_gff.py
new file mode 100644
index 0000000..039016c
--- /dev/null
+++ b/pyfastaq/runners/to_orfs_gff.py
@@ -0,0 +1,12 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    """Command-line entry point for ``fastaq to_orfs_gff``.
+
+    Parses the input file, the output GFF file name and --min_length, and
+    passes them to ``tasks.fastaq_to_orfs_gff``.  ``description`` is not
+    used; the parser carries its own description text.
+    """
+    parser = argparse.ArgumentParser(
+        description = 'Writes a GFF file of open reading frames from a sequence file',
+        usage = 'fastaq to_orfs_gff [options] <infile> <outfile>')
+    parser.add_argument('--min_length', type=int, help='Minimum length of ORF, in nucleotides [%(default)s]', default=300, metavar='INT')
+    parser.add_argument('infile', help='Name of input file')
+    parser.add_argument('outfile', help='Name of output GFF file')
+    options = parser.parse_args()
+    # The positional is registered as 'outfile'; the parser defines no
+    # 'gff_out' attribute, so reading options.gff_out raised AttributeError.
+    tasks.fastaq_to_orfs_gff(options.infile, options.outfile, min_length=options.min_length)
diff --git a/pyfastaq/runners/to_perfect_reads.py b/pyfastaq/runners/to_perfect_reads.py
new file mode 100644
index 0000000..eb12d34
--- /dev/null
+++ b/pyfastaq/runners/to_perfect_reads.py
@@ -0,0 +1,85 @@
+import argparse
+import random
+from math import floor, ceil
+import sys
+from pyfastaq import sequences, utils
+
+def run(description):
+    """Command-line entry point for ``fastaq to_perfect_reads``.
+
+    For each sequence in the input file, writes simulated read pairs to
+    the output file as FASTQ records: read 1 (name suffix /1) followed by
+    the reverse-complemented read 2 (suffix /2).  Insert sizes are drawn
+    with ``random.normalvariate`` and fragment positions with
+    ``random.randint``.  Every base gets the quality character 'I'.  With
+    --fragments, the sequence spanned by each pair is also written as
+    FASTA to the given file.  ``description`` is not used; the parser
+    carries its own description text.
+
+    Sequences shorter than (mean insert + 4 * std deviation), or shorter
+    than the read length, are skipped with a warning on stderr.
+    """
+    parser = argparse.ArgumentParser(
+        description = 'Makes perfect paired end fastq reads from a sequence file, with insert sizes sampled from a normal distribution. Read orientation is innies. Output is an interleaved FASTQ file.',
+        usage = 'fastaq to_perfect_reads [options] <infile> <outfile> <mean insert size> <insert std deviation> <mean coverage> <read length>')
+    parser.add_argument('infile', help='Name of input file')
+    parser.add_argument('outfile', help='Name of output file')
+    parser.add_argument('mean_insert', type=int, help='Mean insert size of read pairs', metavar='mean insert size')
+    parser.add_argument('insert_std', type=float, help='Standard deviation of insert size', metavar='insert std deviation')
+    parser.add_argument('coverage', type=float, help='Mean coverage of the reads', metavar='mean coverage')
+    parser.add_argument('readlength', type=int, help='Length of each read', metavar='read length')
+    parser.add_argument('--fragments', help='Write FASTA sequences of fragments (i.e. read pairs plus sequences in between them) to the given filename', metavar='FILENAME')
+    parser.add_argument('--no_n', action='store_true', help='Don\'t allow any N or n characters in the reads')
+    parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None, metavar='INT')
+    options = parser.parse_args()
+
+    random.seed(a=options.seed)
+
+    seq_reader = sequences.file_reader(options.infile)
+    fout = utils.open_file_write(options.outfile)
+    fout_frags = None
+    pair_counter = 1
+    fake_quals = 'I' * options.readlength
+
+    # try/finally so the output handles are closed even if the simulation
+    # raises part-way through.
+    try:
+        if options.fragments:
+            fout_frags = utils.open_file_write(options.fragments)
+
+        for ref in seq_reader:
+            # check if current seq is long enough.  The read-length test is
+            # needed because the insert-size resampling loop below accepts only
+            # readlength <= isize <= len(ref); if readlength > len(ref) no value
+            # can satisfy that and the loop would never terminate.
+            if len(ref) < options.mean_insert + 4 * options.insert_std or len(ref) < options.readlength:
+                print('Warning, sequence ', ref.id, ' too short. Skipping it...', file=sys.stderr)
+                continue
+
+            # work out how many reads to simulate
+            read_pairs = int(0.5 * options.coverage * len(ref) / options.readlength)
+
+            # it's possible that we pick the same fragment twice, in which case the
+            # reads would get the same name. So remember the frag coords
+            used_fragments = {}  # (middle_position, length) => count
+
+            # do the simulation: pick insert size from normal distribution, and
+            # position in genome from uniform distribution
+            x = 0
+            while x < read_pairs:
+                isize = int(random.normalvariate(options.mean_insert, options.insert_std))
+                while isize > len(ref) or isize < options.readlength:
+                    isize = int(random.normalvariate(options.mean_insert, options.insert_std))
+                middle_pos = random.randint(ceil(0.5 * isize), floor(len(ref) - 0.5 * isize))
+                read_start1 = int(middle_pos - ceil(0.5 * isize))
+                read_start2 = read_start1 + isize - options.readlength
+
+                # names carry 1-based start coordinates of both reads
+                readname = ':'.join([ref.id, str(pair_counter), str(read_start1 + 1), str(read_start2 + 1)])
+
+                fragment = (middle_pos, isize)
+                if fragment in used_fragments:
+                    used_fragments[fragment] += 1
+                    readname += '.dup.' + str(used_fragments[fragment])
+                else:
+                    used_fragments[fragment] = 1
+
+                read1 = sequences.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], fake_quals)
+                read2 = sequences.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], fake_quals)
+
+                # A rejected pair does not advance x, so it is resampled.
+                # NOTE: a reference that cannot yield an N-free pair makes this
+                # loop run indefinitely.
+                if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq):
+                    continue
+
+                read2.revcomp()
+
+                print(read1, file=fout)
+                print(read2, file=fout)
+
+                if fout_frags is not None:
+                    frag = sequences.Fasta(readname, ref.seq[read_start1:read_start2 + options.readlength])
+                    print(frag, file=fout_frags)
+
+                pair_counter += 1
+                x += 1
+    finally:
+        utils.close(fout)
+        if fout_frags is not None:
+            utils.close(fout_frags)
diff --git a/pyfastaq/runners/to_random_subset.py b/pyfastaq/runners/to_random_subset.py
new file mode 100644
index 0000000..211f7f2
--- /dev/null
+++ b/pyfastaq/runners/to_random_subset.py
@@ -0,0 +1,35 @@
+import argparse
+import sys
+import random
+from pyfastaq import sequences, utils
+
+def run(description):
+    """Command-line entry point for ``fastaq to_random_subset``.
+
+    Reads the input file record by record and writes each record to the
+    output file with probability ``percent``/100.  When --mate_file is
+    given, one record is read from it for every input record and written
+    straight after the kept input record.  ``description`` is not used;
+    the parser carries its own description text.
+
+    Exits with status 1 if the mates file runs out of records before the
+    input file does.
+    """
+    parser = argparse.ArgumentParser(
+        description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
+            'from a mates file. Output is interleaved if mates file given',
+        usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
+    parser.add_argument('--mate_file', help='Name of mates file')
+    parser.add_argument('infile', help='Name of input file')
+    parser.add_argument('outfile', help='Name of output file')
+    parser.add_argument('percent', type=int, help='Per cent probability of keeping any given read (pair) in [0,100]', metavar='INT')
+    options = parser.parse_args()
+
+    seq_reader = sequences.file_reader(options.infile)
+    fout = utils.open_file_write(options.outfile)
+
+    # try/finally so the output is closed on the sys.exit() path as well.
+    try:
+        if options.mate_file:
+            mate_seq_reader = sequences.file_reader(options.mate_file)
+
+        for seq in seq_reader:
+            if options.mate_file:
+                try:
+                    mate_seq = next(mate_seq_reader)
+                except StopIteration:
+                    print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
+                    sys.exit(1)
+            # randint is inclusive at both ends: drawing from 1..100 gives exactly
+            # percent/100 (0 keeps nothing, 100 keeps everything).  Drawing from
+            # 0..100 kept reads with probability (percent + 1)/101.
+            if random.randint(1, 100) <= options.percent:
+                print(seq, file=fout)
+                if options.mate_file:
+                    print(mate_seq, file=fout)
+    finally:
+        utils.close(fout)
diff --git a/pyfastaq/runners/to_tiling_bam.py b/pyfastaq/runners/to_tiling_bam.py
new file mode 100644
index 0000000..ab37e65
--- /dev/null
+++ b/pyfastaq/runners/to_tiling_bam.py
@@ -0,0 +1,77 @@
+import argparse
+import sys
+import os
+from pyfastaq import sequences, utils
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = 'Takes a sequence file. Makes a BAM file containing perfect (unpaired) reads tiling the whole genome',
+ usage = 'fastaq to_tiling_bam [options] <infile> <read_length> <read_step> <read_prefix> <outfile>',
+ epilog = 'Important: assumes that samtools is in your path')
+ parser.add_argument('infile', help='Name of input fasta/q file')
+ parser.add_argument('read_length', type=int, help='Length of reads')
+ parser.add_argument('read_step', type=int, help='Distance between start of each read')
+ parser.add_argument('read_prefix', help='Prefix of read names')
+ parser.add_argument('outfile', help='Name of output BAM file')
+ parser.add_argument('--read_group', help='Add the given read group ID to all reads [%(default)s]' ,default='42')
+ options = parser.parse_args()
+
+ # make a header first - we need to add the @RG line to the default header made by samtools
+ tmp_empty_file = options.outfile + '.tmp.empty'
+ f = utils.open_file_write(tmp_empty_file)
+ utils.close(f)
+ try:
+ f = os.popen('samtools view -H -T ' + options.infile + ' ' + tmp_empty_file)
+ except IOError:
+ print('Error making tmp header file', file=sys.stderr)
+ sys.exit(1)
+
+ header_lines = f.readlines()
+ header_lines.append('@RG\tID:' + options.read_group + '\tSM:FAKE')
+ f.close()
+ os.unlink(tmp_empty_file)
+
+ seq_reader = sequences.file_reader(options.infile)
+ try:
+ f = os.popen('samtools view -hbS - > ' + options.outfile, 'w')
+ except IOError:
+ print("Error opening for writing BAM file '" + options.outfile + "'", file=sys.stderr)
+ sys.exit(1)
+
+ print(''.join(header_lines), file=f)
+
+ for seq in seq_reader:
+ end_range = len(seq)
+ if len(seq) < options.read_length:
+ end_range = 1
+ for i in range(0, end_range, options.read_step):
+ if len(seq) <= options.read_length:
+ start = 0
+ end = len(seq) - 1
+ else:
+ start = i
+ end = start + options.read_length - 1
+
+ if end > len(seq) - 1:
+ end = len(seq) - 1
+ start = end - options.read_length + 1
+
+ read = sequences.Fastq(options.read_prefix + ':' + seq.id + ':' + str(start + 1) + ':' + str(end + 1), seq[start:end+1], 'I' * (end - start + 1))
+
+ print ('\t'.join([read.id,
+ '0',
+ seq.id,
+ str(start + 1),
+ '60',
+ str(len(read)) + 'M',
+ '*',
+ '*',
+ '*',
+ read.seq,
+ read.qual,
+ 'RG:Z:' + options.read_group]), file=f)
+
+ if end == len(seq) - 1:
+ break
+
+ f.close()
diff --git a/pyfastaq/runners/to_unique_by_id.py b/pyfastaq/runners/to_unique_by_id.py
new file mode 100644
index 0000000..c931cea
--- /dev/null
+++ b/pyfastaq/runners/to_unique_by_id.py
@@ -0,0 +1,11 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = 'Removes duplicate sequences from input file, based on their names. If the same name is found more than once, then the longest sequence is kept. Order of sequences is preserved in output',
+ usage = 'fastaq to_unique_by_id <infile> <outfile>')
+ parser.add_argument('infile', help='Name of input file')
+ parser.add_argument('outfile', help='Name of output file')
+ options = parser.parse_args()
+ tasks.to_unique_by_id(options.infile, options.outfile)
diff --git a/pyfastaq/runners/translate.py b/pyfastaq/runners/translate.py
new file mode 100644
index 0000000..3f24fcb
--- /dev/null
+++ b/pyfastaq/runners/translate.py
@@ -0,0 +1,12 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = 'Translates all sequences in input file. Output is always FASTA format',
+ usage = 'fastaq translate [options] <infile> <outfile>')
+ parser.add_argument('--frame', type=int, choices=[0,1,2], help='Frame to translate [%(default)s]', default=0)
+ parser.add_argument('infile', help='Name of file to be translated')
+ parser.add_argument('outfile', help='Name of output FASTA file')
+ options = parser.parse_args()
+ tasks.translate(options.infile, options.outfile, frame=options.frame)
diff --git a/pyfastaq/runners/trim_Ns_at_end.py b/pyfastaq/runners/trim_Ns_at_end.py
new file mode 100644
index 0000000..753c26e
--- /dev/null
+++ b/pyfastaq/runners/trim_Ns_at_end.py
@@ -0,0 +1,11 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = 'Trims any Ns off each sequence in input file. Does nothing to gaps in the middle, just trims the ends',
+ usage = 'fastaq trim_Ns_at_end <infile> <outfile>')
+ parser.add_argument('infile', help='Name of input file')
+ parser.add_argument('outfile', help='Name of output file')
+ options = parser.parse_args()
+ tasks.trim_Ns_at_end(options.infile, options.outfile)
diff --git a/pyfastaq/runners/trim_contigs.py b/pyfastaq/runners/trim_contigs.py
new file mode 100644
index 0000000..45a5e08
--- /dev/null
+++ b/pyfastaq/runners/trim_contigs.py
@@ -0,0 +1,12 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = 'Trims a set number of bases off the end of every contig, so gaps get bigger and contig ends are removed. Bases are replaced with Ns. Any sequence that ends up as all Ns is lost',
+ usage = 'fastaq trim_contigs [options] <infile> <outfile>')
+ parser.add_argument('--trim_number', type=int, help='Number of bases to trim around each gap, and off ends of each sequence [%(default)s]', default=100)
+ parser.add_argument('infile', help='Name of input file')
+ parser.add_argument('outfile', help='Name of output file')
+ options = parser.parse_args()
+ tasks.trim_contigs(options.infile, options.outfile, options.trim_number)
diff --git a/pyfastaq/runners/trim_ends.py b/pyfastaq/runners/trim_ends.py
new file mode 100644
index 0000000..3d0374d
--- /dev/null
+++ b/pyfastaq/runners/trim_ends.py
@@ -0,0 +1,13 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = description,
+ usage = 'fastaq trim_ends <infile> <bases off start> <bases off end> <outfile>')
+ parser.add_argument('infile', help='Name of input file')
+ parser.add_argument('start_trim', type=int, help='Number of bases to trim off start')
+ parser.add_argument('end_trim', type=int, help='Number of bases to trim off end')
+ parser.add_argument('outfile', help='Name of output file')
+ options = parser.parse_args()
+ tasks.trim(options.infile, options.outfile, options.start_trim, options.end_trim)
diff --git a/pyfastaq/runners/version.py b/pyfastaq/runners/version.py
new file mode 100644
index 0000000..3596510
--- /dev/null
+++ b/pyfastaq/runners/version.py
@@ -0,0 +1,4 @@
+from pyfastaq import common
+
+def run(description):
+ print(common.version)
diff --git a/fastaq/sequences.py b/pyfastaq/sequences.py
similarity index 96%
rename from fastaq/sequences.py
rename to pyfastaq/sequences.py
index f7efb1a..4a3c2a1 100644
--- a/fastaq/sequences.py
+++ b/pyfastaq/sequences.py
@@ -3,7 +3,7 @@ import string
import random
import itertools
-from fastaq import utils, intervals
+from pyfastaq import utils, intervals, genetic_codes
class Error (Exception): pass
@@ -15,72 +15,7 @@ class Error (Exception): pass
previous_lines = {}
-codon2aa = {
-'GCA': 'A',
-'GCC': 'A',
-'GCG': 'A',
-'GCT': 'A',
-'AGA': 'R',
-'AGG': 'R',
-'CGA': 'R',
-'CGC': 'R',
-'CGG': 'R',
-'CGT': 'R',
-'AAC': 'N',
-'AAT': 'N',
-'GAC': 'D',
-'GAT': 'D',
-'TGC': 'C',
-'TGT': 'C',
-'GAA': 'E',
-'GAG': 'E',
-'CAA': 'Q',
-'CAG': 'Q',
-'GGA': 'G',
-'GGC': 'G',
-'GGG': 'G',
-'GGT': 'G',
-'CAC': 'H',
-'CAT': 'H',
-'ATA': 'I',
-'ATC': 'I',
-'ATT': 'I',
-'TTA': 'L',
-'TTG': 'L',
-'CTA': 'L',
-'CTC': 'L',
-'CTG': 'L',
-'CTT': 'L',
-'AAA': 'K',
-'AAG': 'K',
-'ATG': 'M',
-'TTC': 'F',
-'TTT': 'F',
-'CCA': 'P',
-'CCC': 'P',
-'CCG': 'P',
-'CCT': 'P',
-'AGC': 'S',
-'AGT': 'S',
-'TCA': 'S',
-'TCC': 'S',
-'TCG': 'S',
-'TCT': 'S',
-'ACA': 'T',
-'ACC': 'T',
-'ACG': 'T',
-'ACT': 'T',
-'TGG': 'W',
-'TAC': 'Y',
-'TAT': 'Y',
-'GTA': 'V',
-'GTC': 'V',
-'GTG': 'V',
-'GTT': 'V',
-'TAA': '*',
-'TAG': '*',
-'TGA': '*'}
-
+codon2aa = genetic_codes.codes[1]
redundant_nts = {
'R': ('A', 'G'),
@@ -238,6 +173,10 @@ class Fasta:
def __len__(self):
return len(self.seq)
+ def subseq(self, start, end):
+ '''Returns Fasta object with the same name, of the bases from start to end, but not including end'''
+ return Fasta(self.id, self.seq[start:end])
+
def split_capillary_id(self):
'''Gets the prefix and suffix of an name of a capillary read, e.g. xxxxx.p1k or xxxx.q1k. Returns a tuple (prefix, suffx)'''
try:
@@ -544,6 +483,10 @@ class Fastq(Fasta):
def __eq__(self, other):
return type(other) is type(self) and self.__dict__ == other.__dict__
+ def subseq(self, start, end):
+ '''Returns Fastq object with the same name, of the bases from start to end, but not including end'''
+ return Fastq(self.id, self.seq[start:end], self.qual[start:end])
+
def get_next_from_file(self, f, read_quals=False):
if f in previous_lines:
line = previous_lines[f]
diff --git a/fastaq/tasks.py b/pyfastaq/tasks.py
similarity index 85%
rename from fastaq/tasks.py
rename to pyfastaq/tasks.py
index 1a7d378..7527910 100644
--- a/fastaq/tasks.py
+++ b/pyfastaq/tasks.py
@@ -3,10 +3,30 @@ import sys
import copy
import random
import numpy
-from fastaq import sequences, utils
+from pyfastaq import sequences, utils, caf
class Error (Exception): pass
+def caf_to_fastq(infile, outfile, min_length=0, trim=False):
+ '''Convert a CAF file to fastq. Reads shorter than min_length are not output. If clipping information is in the CAF file (with a line Clipping QUAL ...) and trim=True, then trim the reads'''
+ caf_reader = caf.file_reader(infile)
+ fout = utils.open_file_write(outfile)
+
+ for c in caf_reader:
+ if trim:
+ if c.clip_start is not None and c.clip_end is not None:
+ c.seq.seq = c.seq.seq[c.clip_start:c.clip_end + 1]
+ c.seq.qual = c.seq.qual[c.clip_start:c.clip_end + 1]
+ else:
+ print('Warning: no clipping info for sequence', c.id, file=sys.stderr)
+
+
+ if len(c.seq) >= min_length:
+ print(c.seq, file=fout)
+
+ utils.close(fout)
+
+
def capillary_to_pairs(infile, outprefix):
# hash the sequences, only taking longest where an end has been sequenced more than once
seq_reader = sequences.file_reader(infile)
@@ -86,7 +106,7 @@ def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
utils.close(f_2)
-def enumerate_names(infile, outfile, start_index=1, keep_illumina_suffix=False, rename_file=None):
+def enumerate_names(infile, outfile, start_index=1, keep_illumina_suffix=False, rename_file=None, suffix=None):
seq_reader = sequences.file_reader(infile)
fout_seqs = utils.open_file_write(outfile)
counter = start_index
@@ -113,6 +133,9 @@ def enumerate_names(infile, outfile, start_index=1, keep_illumina_suffix=False,
if rename_file is not None:
print(old_id, seq.id, sep='\t', file=fout_rename)
+ if suffix is not None:
+ seq.id += suffix
+
print(seq, file=fout_seqs)
counter += 1
@@ -135,7 +158,7 @@ def expand_nucleotides(infile, outfile):
print(seq, file=fout)
-def extend_gaps(infile, outfile, trim):
+def trim_contigs(infile, outfile, trim):
seq_reader = sequences.file_reader(infile)
fout = utils.open_file_write(outfile)
@@ -242,7 +265,19 @@ def file_to_dict(infile, d):
d[seq.id] = copy.copy(seq)
-def filter(infile, outfile, minlength=0, maxlength=float('inf'), regex=None, ids_file=None, invert=False):
+def filter(
+ infile,
+ outfile,
+ minlength=0,
+ maxlength=float('inf'),
+ regex=None,
+ ids_file=None,
+ invert=False,
+ mate_in=None,
+ mate_out=None,
+ both_mates_pass=True,
+ ):
+
ids_from_file = set()
if ids_file is not None:
f = utils.open_file_read(ids_file)
@@ -250,19 +285,44 @@ def filter(infile, outfile, minlength=0, maxlength=float('inf'), regex=None, ids
ids_from_file.add(line.rstrip())
utils.close(f)
+ if mate_in:
+ if mate_out is None:
+ raise Error('Error in filter! mate_in provided. Must also provide mate_out')
+
+ seq_reader_mate = sequences.file_reader(mate_in)
+ f_out_mate = utils.open_file_write(mate_out)
+
seq_reader = sequences.file_reader(infile)
f_out = utils.open_file_write(outfile)
if regex is not None:
r = re.compile(regex)
- for seq in seq_reader:
- hit = minlength <= len(seq) <= maxlength \
+
+ def passes(seq):
+ return minlength <= len(seq) <= maxlength \
and (regex is None or r.search(seq.id) is not None) \
and (ids_file is None or seq.id in ids_from_file)
- if hit != invert:
+ for seq in seq_reader:
+ seq_passes = passes(seq)
+ if mate_in:
+ try:
+ seq_mate = next(seq_reader_mate)
+ except:
+ utils.close(f_out)
+ raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue')
+
+ mate_passes = passes(seq_mate)
+ want_the_pair = (seq_passes and mate_passes) \
+ or (( seq_passes or mate_passes) and not both_mates_pass)
+ if want_the_pair != invert:
+ print(seq, file=f_out)
+ print(seq_mate, file=f_out_mate)
+ elif seq_passes != invert:
print(seq, file=f_out)
utils.close(f_out)
+ if mate_in:
+ utils.close(f_out_mate)
def get_ids(infile, outfile):
@@ -400,6 +460,21 @@ def make_long_reads(infile, outfile, method='tiling', fixed_read_length=20000, t
utils.close(f)
+def mean_length(infile, limit=None):
+ '''Returns the mean length of the sequences in the input file. By default uses all sequences. To limit to the first N sequences, use limit=N'''
+ total = 0
+ count = 0
+ seq_reader = sequences.file_reader(infile)
+ for seq in seq_reader:
+ total += len(seq)
+ count += 1
+ if limit is not None and count >= limit:
+ break
+
+ assert count > 0
+ return total / count
+
+
def merge_to_one_seq(infile, outfile, seqname='union'):
'''Takes a multi fasta or fastq file and writes a new file that contains just one sequence, with the original sequences catted together, preserving their order'''
seq_reader = sequences.file_reader(infile)
@@ -510,6 +585,17 @@ def sequence_trim(infile_1, infile_2, outfile_1, outfile_2, to_trim_file, min_le
utils.close(f_out_2)
+def sort_by_size(infile, outfile, smallest_first=False):
+ '''Sorts input sequence file by biggest sequence first, writes sorted output file. Set smallest_first=True to have smallest first'''
+ seqs = {}
+ file_to_dict(infile, seqs)
+ seqs = list(seqs.values())
+ seqs.sort(key=lambda x: len(x), reverse=not smallest_first)
+ fout = utils.open_file_write(outfile)
+ for seq in seqs:
+ print(seq, file=fout)
+ utils.close(fout)
+
def translate(infile, outfile, frame=0):
seq_reader = sequences.file_reader(infile)
@@ -649,6 +735,29 @@ def split_by_fixed_size(infile, outfiles_prefix, chunk_size, tolerance, skip_if_
utils.close(f)
+def split_by_fixed_size_onefile(infile, outfile, chunk_size, tolerance, skip_if_all_Ns=False):
+ '''Splits each sequence in infile into chunks of fixed size, last chunk can be up to
+ (chunk_size + tolerance) in length'''
+ seq_reader = sequences.file_reader(infile)
+ f_out = utils.open_file_write(outfile)
+ for seq in seq_reader:
+ for i in range(0, len(seq), chunk_size):
+ if i + chunk_size + tolerance >= len(seq):
+ end = len(seq)
+ else:
+ end = i + chunk_size
+
+ subseq = seq.subseq(i, end)
+ if not (skip_if_all_Ns and subseq.is_all_Ns()):
+ subseq.id += '.' + str(i+1) + '_' + str(end)
+ print(subseq, file=f_out)
+
+ if end == len(seq):
+ break
+
+ utils.close(f_out)
+
+
def replace_bases(infile, outfile, old, new):
seq_reader = sequences.file_reader(infile)
f_out = utils.open_file_write(outfile)
diff --git a/pyfastaq/tests/caf_test.py b/pyfastaq/tests/caf_test.py
new file mode 100644
index 0000000..61d6154
--- /dev/null
+++ b/pyfastaq/tests/caf_test.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+
+import os
+import unittest
+from pyfastaq import caf, utils, sequences
+
+modules_dir = os.path.dirname(os.path.abspath(caf.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+class TestCaf(unittest.TestCase):
+ def test_get_next_from_file(self):
+ '''Test get_next_from_file()'''
+
+ f_in = utils.open_file_read(os.path.join(data_dir, 'caf_test.caf'))
+
+ c = caf.Caf()
+ c.get_next_from_file(f_in)
+ read = caf.Caf()
+ read.id = 'read1.p1k'
+ read.seq = sequences.Fasta(read.id, 'NACGTAN')
+ read.seq = read.seq.to_Fastq([4, 24, 42, 43, 40, 30, 8])
+ read.insert_min = 2000
+ read.insert_max = 4000
+ read.ligation = '12345'
+ read.clone = 'clone1'
+ read.clip_start = 1
+ read.clip_end = 5
+ self.assertEqual(c, read)
+
+ c.get_next_from_file(f_in)
+ read = caf.Caf()
+ read.id = 'read2.p1k'
+ read.seq = sequences.Fasta(read.id, 'CGACGTT')
+ read.seq = read.seq.to_Fastq([9, 9, 40, 41, 42, 42, 4])
+ read.insert_min = 2000
+ read.insert_max = 4000
+ read.ligation = '23456'
+ read.clone = 'clone2'
+ read.clip_start = None
+ read.clip_end = None
+ self.assertEqual(c, read)
+
+ utils.close(f_in)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/pyfastaq/tests/data/caf_test.caf b/pyfastaq/tests/data/caf_test.caf
new file mode 100644
index 0000000..f1ad7ff
--- /dev/null
+++ b/pyfastaq/tests/data/caf_test.caf
@@ -0,0 +1,48 @@
+
+DNA : read1.p1k
+NACG
+TAN
+
+BaseQuality : read1.p1k
+4 24 42 43 40 30 8
+
+Sequence : read1.p1k
+Is_read
+SCF_File read1.p1kSCF
+Template read1
+Insert_size 2000 4000
+Ligation_no 12345
+Primer Universal_primer
+Strand Forward
+Dye Dye_terminator
+Clone clone1
+Seq_vec SVEC 1 15 puc19
+Sequencing_vector "puc19"
+Clipping QUAL 2 6
+ProcessStatus PASS
+Asped 2006-7-5
+Unpadded
+Align_to_SCF 1 1272 1 1272
+
+DNA : read2.p1k
+CG
+ACGTT
+
+BaseQuality : read2.p1k
+9 9 40 41 42 42 4
+
+Sequence : read2.p1k
+Is_read
+SCF_File read2.p1kSCF
+Template read2
+Insert_size 2000 4000
+Ligation_no 23456
+Primer Universal_primer
+Strand Forward
+Dye Dye_terminator
+Clone clone2
+Seq_vec SVEC 1 32 puc19
+Sequencing_vector "puc19"
+ProcessStatus PASS
+Unpadded
+Align_to_SCF 1 1347 1 1347
diff --git a/pyfastaq/tests/data/caf_test.to_fastq.no_trim.min_length_0.fq b/pyfastaq/tests/data/caf_test.to_fastq.no_trim.min_length_0.fq
new file mode 100644
index 0000000..5519aad
--- /dev/null
+++ b/pyfastaq/tests/data/caf_test.to_fastq.no_trim.min_length_0.fq
@@ -0,0 +1,8 @@
+@read1.p1k
+NACGTAN
++
+%9KLI?)
+@read2.p1k
+CGACGTT
++
+**IJKK%
diff --git a/pyfastaq/tests/data/caf_test.to_fastq.trim.min_length_6.fq b/pyfastaq/tests/data/caf_test.to_fastq.trim.min_length_6.fq
new file mode 100644
index 0000000..cc6d7c0
--- /dev/null
+++ b/pyfastaq/tests/data/caf_test.to_fastq.trim.min_length_6.fq
@@ -0,0 +1,4 @@
+@read2.p1k
+CGACGTT
++
+**IJKK%
diff --git a/fastaq/tests/data/sequences_test.embl b/pyfastaq/tests/data/sequences_test.embl
similarity index 100%
rename from fastaq/tests/data/sequences_test.embl
rename to pyfastaq/tests/data/sequences_test.embl
diff --git a/fastaq/tests/data/sequences_test.embl.bad b/pyfastaq/tests/data/sequences_test.embl.bad
similarity index 100%
rename from fastaq/tests/data/sequences_test.embl.bad
rename to pyfastaq/tests/data/sequences_test.embl.bad
diff --git a/fastaq/tests/data/sequences_test.embl.bad2 b/pyfastaq/tests/data/sequences_test.embl.bad2
similarity index 100%
rename from fastaq/tests/data/sequences_test.embl.bad2
rename to pyfastaq/tests/data/sequences_test.embl.bad2
diff --git a/fastaq/tests/data/sequences_test.embl.to_fasta b/pyfastaq/tests/data/sequences_test.embl.to_fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test.embl.to_fasta
rename to pyfastaq/tests/data/sequences_test.embl.to_fasta
diff --git a/fastaq/tests/data/sequences_test.fa b/pyfastaq/tests/data/sequences_test.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test.fa
rename to pyfastaq/tests/data/sequences_test.fa
diff --git a/fastaq/tests/data/sequences_test.fa.ids b/pyfastaq/tests/data/sequences_test.fa.ids
similarity index 100%
rename from fastaq/tests/data/sequences_test.fa.ids
rename to pyfastaq/tests/data/sequences_test.fa.ids
diff --git a/fastaq/tests/data/sequences_test.fa.qual b/pyfastaq/tests/data/sequences_test.fa.qual
similarity index 100%
rename from fastaq/tests/data/sequences_test.fa.qual
rename to pyfastaq/tests/data/sequences_test.fa.qual
diff --git a/fastaq/tests/data/sequences_test.fa.qual.bad b/pyfastaq/tests/data/sequences_test.fa.qual.bad
similarity index 100%
rename from fastaq/tests/data/sequences_test.fa.qual.bad
rename to pyfastaq/tests/data/sequences_test.fa.qual.bad
diff --git a/fastaq/tests/data/sequences_test.fasta_to_fastq.fq b/pyfastaq/tests/data/sequences_test.fasta_to_fastq.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test.fasta_to_fastq.fq
rename to pyfastaq/tests/data/sequences_test.fasta_to_fastq.fq
diff --git a/fastaq/tests/data/sequences_test.gbk b/pyfastaq/tests/data/sequences_test.gbk
similarity index 100%
rename from fastaq/tests/data/sequences_test.gbk
rename to pyfastaq/tests/data/sequences_test.gbk
diff --git a/fastaq/tests/data/sequences_test.gbk.to_fasta b/pyfastaq/tests/data/sequences_test.gbk.to_fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test.gbk.to_fasta
rename to pyfastaq/tests/data/sequences_test.gbk.to_fasta
diff --git a/fastaq/tests/data/sequences_test.line_length3.fa b/pyfastaq/tests/data/sequences_test.line_length3.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test.line_length3.fa
rename to pyfastaq/tests/data/sequences_test.line_length3.fa
diff --git a/fastaq/tests/data/sequences_test_3-per-line.fa b/pyfastaq/tests/data/sequences_test_3-per-line.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_3-per-line.fa
rename to pyfastaq/tests/data/sequences_test_3-per-line.fa
diff --git a/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa b/pyfastaq/tests/data/sequences_test_cap_to_read_pairs.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_cap_to_read_pairs.fa
rename to pyfastaq/tests/data/sequences_test_cap_to_read_pairs.fa
diff --git a/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa.paired.gz b/pyfastaq/tests/data/sequences_test_cap_to_read_pairs.fa.paired.gz
similarity index 100%
rename from fastaq/tests/data/sequences_test_cap_to_read_pairs.fa.paired.gz
rename to pyfastaq/tests/data/sequences_test_cap_to_read_pairs.fa.paired.gz
diff --git a/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa.unpaired.gz b/pyfastaq/tests/data/sequences_test_cap_to_read_pairs.fa.unpaired.gz
similarity index 100%
rename from fastaq/tests/data/sequences_test_cap_to_read_pairs.fa.unpaired.gz
rename to pyfastaq/tests/data/sequences_test_cap_to_read_pairs.fa.unpaired.gz
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_1.fa b/pyfastaq/tests/data/sequences_test_deinterleaved_1.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_deinterleaved_1.fa
rename to pyfastaq/tests/data/sequences_test_deinterleaved_1.fa
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_2.fa b/pyfastaq/tests/data/sequences_test_deinterleaved_2.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_deinterleaved_2.fa
rename to pyfastaq/tests/data/sequences_test_deinterleaved_2.fa
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_bad2_1.fa b/pyfastaq/tests/data/sequences_test_deinterleaved_bad2_1.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_deinterleaved_bad2_1.fa
rename to pyfastaq/tests/data/sequences_test_deinterleaved_bad2_1.fa
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_bad2_2.fa b/pyfastaq/tests/data/sequences_test_deinterleaved_bad2_2.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_deinterleaved_bad2_2.fa
rename to pyfastaq/tests/data/sequences_test_deinterleaved_bad2_2.fa
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_bad_1.fa b/pyfastaq/tests/data/sequences_test_deinterleaved_bad_1.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_deinterleaved_bad_1.fa
rename to pyfastaq/tests/data/sequences_test_deinterleaved_bad_1.fa
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_bad_2.fa b/pyfastaq/tests/data/sequences_test_deinterleaved_bad_2.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_deinterleaved_bad_2.fa
rename to pyfastaq/tests/data/sequences_test_deinterleaved_bad_2.fa
diff --git a/fastaq/tests/data/sequences_test_empty_file b/pyfastaq/tests/data/sequences_test_empty_file
similarity index 100%
rename from fastaq/tests/data/sequences_test_empty_file
rename to pyfastaq/tests/data/sequences_test_empty_file
diff --git a/fastaq/tests/data/sequences_test_enumerate_names.fa b/pyfastaq/tests/data/sequences_test_enumerate_names.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_enumerate_names.fa
rename to pyfastaq/tests/data/sequences_test_enumerate_names.fa
diff --git a/pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.add_suffix b/pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.add_suffix
new file mode 100644
index 0000000..ed89faa
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.add_suffix
@@ -0,0 +1,8 @@
+>1.SUFFIX
+A
+>2.SUFFIX
+C
+>3.SUFFIX
+G
+>4.SUFFIX
+T
diff --git a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.keep_suffix b/pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.keep_suffix
similarity index 100%
rename from fastaq/tests/data/sequences_test_enumerate_names.fa.out.keep_suffix
rename to pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.keep_suffix
diff --git a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1 b/pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1
similarity index 100%
rename from fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1
rename to pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1
diff --git a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1.rename_file b/pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1.rename_file
similarity index 100%
rename from fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1.rename_file
rename to pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1.rename_file
diff --git a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.2 b/pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.start.2
similarity index 100%
rename from fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.2
rename to pyfastaq/tests/data/sequences_test_enumerate_names.fa.out.start.2
diff --git a/fastaq/tests/data/sequences_test_fai_test.fa b/pyfastaq/tests/data/sequences_test_fai_test.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_fai_test.fa
rename to pyfastaq/tests/data/sequences_test_fai_test.fa
diff --git a/fastaq/tests/data/sequences_test_fai_test.fa.fai b/pyfastaq/tests/data/sequences_test_fai_test.fa.fai
similarity index 100%
rename from fastaq/tests/data/sequences_test_fai_test.fa.fai
rename to pyfastaq/tests/data/sequences_test_fai_test.fa.fai
diff --git a/fastaq/tests/data/sequences_test_fail_no_AT.fq b/pyfastaq/tests/data/sequences_test_fail_no_AT.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_fail_no_AT.fq
rename to pyfastaq/tests/data/sequences_test_fail_no_AT.fq
diff --git a/fastaq/tests/data/sequences_test_fail_no_plus.fq b/pyfastaq/tests/data/sequences_test_fail_no_plus.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_fail_no_plus.fq
rename to pyfastaq/tests/data/sequences_test_fail_no_plus.fq
diff --git a/fastaq/tests/data/sequences_test_fail_no_qual.fq b/pyfastaq/tests/data/sequences_test_fail_no_qual.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_fail_no_qual.fq
rename to pyfastaq/tests/data/sequences_test_fail_no_qual.fq
diff --git a/fastaq/tests/data/sequences_test_fail_no_seq.fq b/pyfastaq/tests/data/sequences_test_fail_no_seq.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_fail_no_seq.fq
rename to pyfastaq/tests/data/sequences_test_fail_no_seq.fq
diff --git a/fastaq/tests/data/sequences_test_fastaq_replace_bases.expected.fa b/pyfastaq/tests/data/sequences_test_fastaq_replace_bases.expected.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_fastaq_replace_bases.expected.fa
rename to pyfastaq/tests/data/sequences_test_fastaq_replace_bases.expected.fa
diff --git a/fastaq/tests/data/sequences_test_fastaq_replace_bases.fa b/pyfastaq/tests/data/sequences_test_fastaq_replace_bases.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_fastaq_replace_bases.fa
rename to pyfastaq/tests/data/sequences_test_fastaq_replace_bases.fa
diff --git a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa b/pyfastaq/tests/data/sequences_test_filter_by_ids_file.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_filter_by_ids_file.fa
rename to pyfastaq/tests/data/sequences_test_filter_by_ids_file.fa
diff --git a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered b/pyfastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered
similarity index 100%
rename from fastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered
rename to pyfastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered
diff --git a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered.invert b/pyfastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered.invert
similarity index 100%
rename from fastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered.invert
rename to pyfastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered.invert
diff --git a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.ids b/pyfastaq/tests/data/sequences_test_filter_by_ids_file.fa.ids
similarity index 100%
rename from fastaq/tests/data/sequences_test_filter_by_ids_file.fa.ids
rename to pyfastaq/tests/data/sequences_test_filter_by_ids_file.fa.ids
diff --git a/fastaq/tests/data/sequences_test_filter_by_regex.fa b/pyfastaq/tests/data/sequences_test_filter_by_regex.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_filter_by_regex.fa
rename to pyfastaq/tests/data/sequences_test_filter_by_regex.fa
diff --git a/fastaq/tests/data/sequences_test_filter_by_regex.first-char-a.fa b/pyfastaq/tests/data/sequences_test_filter_by_regex.first-char-a.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_filter_by_regex.first-char-a.fa
rename to pyfastaq/tests/data/sequences_test_filter_by_regex.first-char-a.fa
diff --git a/fastaq/tests/data/sequences_test_filter_by_regex.first-of-pair.fa b/pyfastaq/tests/data/sequences_test_filter_by_regex.first-of-pair.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_filter_by_regex.first-of-pair.fa
rename to pyfastaq/tests/data/sequences_test_filter_by_regex.first-of-pair.fa
diff --git a/fastaq/tests/data/sequences_test_filter_by_regex.numeric.fa b/pyfastaq/tests/data/sequences_test_filter_by_regex.numeric.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_filter_by_regex.numeric.fa
rename to pyfastaq/tests/data/sequences_test_filter_by_regex.numeric.fa
diff --git a/fastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa b/pyfastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa
rename to pyfastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa
diff --git a/fastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa.out b/pyfastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa.out
similarity index 100%
rename from fastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa.out
rename to pyfastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa.out
diff --git a/fastaq/tests/data/sequences_test_gffv3.gff b/pyfastaq/tests/data/sequences_test_gffv3.gff
similarity index 100%
rename from fastaq/tests/data/sequences_test_gffv3.gff
rename to pyfastaq/tests/data/sequences_test_gffv3.gff
diff --git a/fastaq/tests/data/sequences_test_gffv3.gff.fasta b/pyfastaq/tests/data/sequences_test_gffv3.gff.fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test_gffv3.gff.fasta
rename to pyfastaq/tests/data/sequences_test_gffv3.gff.fasta
diff --git a/fastaq/tests/data/sequences_test_gffv3.gff.to_fasta b/pyfastaq/tests/data/sequences_test_gffv3.gff.to_fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test_gffv3.gff.to_fasta
rename to pyfastaq/tests/data/sequences_test_gffv3.gff.to_fasta
diff --git a/fastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff b/pyfastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff
similarity index 100%
rename from fastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff
rename to pyfastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff
diff --git a/fastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff.to_fasta b/pyfastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff.to_fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff.to_fasta
rename to pyfastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff.to_fasta
diff --git a/fastaq/tests/data/sequences_test_gffv3.no_seq.2.gff b/pyfastaq/tests/data/sequences_test_gffv3.no_seq.2.gff
similarity index 100%
rename from fastaq/tests/data/sequences_test_gffv3.no_seq.2.gff
rename to pyfastaq/tests/data/sequences_test_gffv3.no_seq.2.gff
diff --git a/fastaq/tests/data/sequences_test_gffv3.no_seq.gff b/pyfastaq/tests/data/sequences_test_gffv3.no_seq.gff
similarity index 100%
rename from fastaq/tests/data/sequences_test_gffv3.no_seq.gff
rename to pyfastaq/tests/data/sequences_test_gffv3.no_seq.gff
diff --git a/fastaq/tests/data/sequences_test_good_file.fq b/pyfastaq/tests/data/sequences_test_good_file.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_good_file.fq
rename to pyfastaq/tests/data/sequences_test_good_file.fq
diff --git a/fastaq/tests/data/sequences_test_good_file.fq.to_fasta b/pyfastaq/tests/data/sequences_test_good_file.fq.to_fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test_good_file.fq.to_fasta
rename to pyfastaq/tests/data/sequences_test_good_file.fq.to_fasta
diff --git a/fastaq/tests/data/sequences_test_good_file_mira.xml b/pyfastaq/tests/data/sequences_test_good_file_mira.xml
similarity index 100%
rename from fastaq/tests/data/sequences_test_good_file_mira.xml
rename to pyfastaq/tests/data/sequences_test_good_file_mira.xml
diff --git a/fastaq/tests/data/sequences_test_interleaved.fa b/pyfastaq/tests/data/sequences_test_interleaved.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_interleaved.fa
rename to pyfastaq/tests/data/sequences_test_interleaved.fa
diff --git a/fastaq/tests/data/sequences_test_interleaved.fq b/pyfastaq/tests/data/sequences_test_interleaved.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_interleaved.fq
rename to pyfastaq/tests/data/sequences_test_interleaved.fq
diff --git a/fastaq/tests/data/sequences_test_interleaved_bad.fa b/pyfastaq/tests/data/sequences_test_interleaved_bad.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_interleaved_bad.fa
rename to pyfastaq/tests/data/sequences_test_interleaved_bad.fa
diff --git a/fastaq/tests/data/sequences_test_length_filter.fa b/pyfastaq/tests/data/sequences_test_length_filter.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_length_filter.fa
rename to pyfastaq/tests/data/sequences_test_length_filter.fa
diff --git a/fastaq/tests/data/sequences_test_length_filter.min-0.max-1.fa b/pyfastaq/tests/data/sequences_test_length_filter.min-0.max-1.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_length_filter.min-0.max-1.fa
rename to pyfastaq/tests/data/sequences_test_length_filter.min-0.max-1.fa
diff --git a/fastaq/tests/data/sequences_test_length_filter.min-0.max-inf.fa b/pyfastaq/tests/data/sequences_test_length_filter.min-0.max-inf.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_length_filter.min-0.max-inf.fa
rename to pyfastaq/tests/data/sequences_test_length_filter.min-0.max-inf.fa
diff --git a/fastaq/tests/data/sequences_test_length_filter.min-4.max-4.fa b/pyfastaq/tests/data/sequences_test_length_filter.min-4.max-4.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_length_filter.min-4.max-4.fa
rename to pyfastaq/tests/data/sequences_test_length_filter.min-4.max-4.fa
diff --git a/fastaq/tests/data/sequences_test_make_random_contigs.default.fa b/pyfastaq/tests/data/sequences_test_make_random_contigs.default.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_make_random_contigs.default.fa
rename to pyfastaq/tests/data/sequences_test_make_random_contigs.default.fa
diff --git a/fastaq/tests/data/sequences_test_make_random_contigs.first-42.fa b/pyfastaq/tests/data/sequences_test_make_random_contigs.first-42.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_make_random_contigs.first-42.fa
rename to pyfastaq/tests/data/sequences_test_make_random_contigs.first-42.fa
diff --git a/fastaq/tests/data/sequences_test_make_random_contigs.name-by-letters.fa b/pyfastaq/tests/data/sequences_test_make_random_contigs.name-by-letters.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_make_random_contigs.name-by-letters.fa
rename to pyfastaq/tests/data/sequences_test_make_random_contigs.name-by-letters.fa
diff --git a/fastaq/tests/data/sequences_test_make_random_contigs.prefix-p.fa b/pyfastaq/tests/data/sequences_test_make_random_contigs.prefix-p.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_make_random_contigs.prefix-p.fa
rename to pyfastaq/tests/data/sequences_test_make_random_contigs.prefix-p.fa
diff --git a/fastaq/tests/data/sequences_test_merge_to_one_seq.fa b/pyfastaq/tests/data/sequences_test_merge_to_one_seq.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_merge_to_one_seq.fa
rename to pyfastaq/tests/data/sequences_test_merge_to_one_seq.fa
diff --git a/fastaq/tests/data/sequences_test_merge_to_one_seq.fq b/pyfastaq/tests/data/sequences_test_merge_to_one_seq.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_merge_to_one_seq.fq
rename to pyfastaq/tests/data/sequences_test_merge_to_one_seq.fq
diff --git a/fastaq/tests/data/sequences_test_merge_to_one_seq.merged.fa b/pyfastaq/tests/data/sequences_test_merge_to_one_seq.merged.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_merge_to_one_seq.merged.fa
rename to pyfastaq/tests/data/sequences_test_merge_to_one_seq.merged.fa
diff --git a/fastaq/tests/data/sequences_test_merge_to_one_seq.merged.fq b/pyfastaq/tests/data/sequences_test_merge_to_one_seq.merged.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_merge_to_one_seq.merged.fq
rename to pyfastaq/tests/data/sequences_test_merge_to_one_seq.merged.fq
diff --git a/fastaq/tests/data/sequences_test_not_a_fastaq_file b/pyfastaq/tests/data/sequences_test_not_a_fastaq_file
similarity index 100%
rename from fastaq/tests/data/sequences_test_not_a_fastaq_file
rename to pyfastaq/tests/data/sequences_test_not_a_fastaq_file
diff --git a/fastaq/tests/data/sequences_test_one-per-line.fa b/pyfastaq/tests/data/sequences_test_one-per-line.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_one-per-line.fa
rename to pyfastaq/tests/data/sequences_test_one-per-line.fa
diff --git a/fastaq/tests/data/sequences_test_orfs.fa b/pyfastaq/tests/data/sequences_test_orfs.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_orfs.fa
rename to pyfastaq/tests/data/sequences_test_orfs.fa
diff --git a/fastaq/tests/data/sequences_test_orfs.gff b/pyfastaq/tests/data/sequences_test_orfs.gff
similarity index 100%
rename from fastaq/tests/data/sequences_test_orfs.gff
rename to pyfastaq/tests/data/sequences_test_orfs.gff
diff --git a/fastaq/tests/data/sequences_test_phylip.interleaved b/pyfastaq/tests/data/sequences_test_phylip.interleaved
similarity index 100%
rename from fastaq/tests/data/sequences_test_phylip.interleaved
rename to pyfastaq/tests/data/sequences_test_phylip.interleaved
diff --git a/fastaq/tests/data/sequences_test_phylip.interleaved.to_fasta b/pyfastaq/tests/data/sequences_test_phylip.interleaved.to_fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test_phylip.interleaved.to_fasta
rename to pyfastaq/tests/data/sequences_test_phylip.interleaved.to_fasta
diff --git a/fastaq/tests/data/sequences_test_phylip.interleaved2 b/pyfastaq/tests/data/sequences_test_phylip.interleaved2
similarity index 100%
rename from fastaq/tests/data/sequences_test_phylip.interleaved2
rename to pyfastaq/tests/data/sequences_test_phylip.interleaved2
diff --git a/fastaq/tests/data/sequences_test_phylip.interleaved2.to_fasta b/pyfastaq/tests/data/sequences_test_phylip.interleaved2.to_fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test_phylip.interleaved2.to_fasta
rename to pyfastaq/tests/data/sequences_test_phylip.interleaved2.to_fasta
diff --git a/fastaq/tests/data/sequences_test_phylip.made_by_seaview b/pyfastaq/tests/data/sequences_test_phylip.made_by_seaview
similarity index 100%
rename from fastaq/tests/data/sequences_test_phylip.made_by_seaview
rename to pyfastaq/tests/data/sequences_test_phylip.made_by_seaview
diff --git a/fastaq/tests/data/sequences_test_phylip.made_by_seaview.to_fasta b/pyfastaq/tests/data/sequences_test_phylip.made_by_seaview.to_fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test_phylip.made_by_seaview.to_fasta
rename to pyfastaq/tests/data/sequences_test_phylip.made_by_seaview.to_fasta
diff --git a/fastaq/tests/data/sequences_test_phylip.sequential b/pyfastaq/tests/data/sequences_test_phylip.sequential
similarity index 100%
rename from fastaq/tests/data/sequences_test_phylip.sequential
rename to pyfastaq/tests/data/sequences_test_phylip.sequential
diff --git a/fastaq/tests/data/sequences_test_phylip.sequential.to_fasta b/pyfastaq/tests/data/sequences_test_phylip.sequential.to_fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test_phylip.sequential.to_fasta
rename to pyfastaq/tests/data/sequences_test_phylip.sequential.to_fasta
diff --git a/fastaq/tests/data/sequences_test_revcomp.fa b/pyfastaq/tests/data/sequences_test_revcomp.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_revcomp.fa
rename to pyfastaq/tests/data/sequences_test_revcomp.fa
diff --git a/fastaq/tests/data/sequences_test_search_string.fa b/pyfastaq/tests/data/sequences_test_search_string.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_search_string.fa
rename to pyfastaq/tests/data/sequences_test_search_string.fa
diff --git a/fastaq/tests/data/sequences_test_search_string.fa.hits b/pyfastaq/tests/data/sequences_test_search_string.fa.hits
similarity index 100%
rename from fastaq/tests/data/sequences_test_search_string.fa.hits
rename to pyfastaq/tests/data/sequences_test_search_string.fa.hits
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.1 b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.1
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.1
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.1
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.2 b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.2
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.2
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.2
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.3 b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.3
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.3
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.3
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.4 b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.4
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.4
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.4
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.5 b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.5
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.5
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.5
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.6 b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.6
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.6
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.6
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.coords b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.coords
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.coords
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.coords
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.1 b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.1
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.1
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.1
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.2 b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.2
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.2
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.2
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.3 b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.3
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.3
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.3
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.4 b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.4
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.4
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.4
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords b/pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords
rename to pyfastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords
diff --git a/pyfastaq/tests/data/sequences_test_split_fixed_size_onefile.fa b/pyfastaq/tests/data/sequences_test_split_fixed_size_onefile.fa
new file mode 100644
index 0000000..6502b16
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test_split_fixed_size_onefile.fa
@@ -0,0 +1,18 @@
+>seq1
+A
+>seq2
+NC
+>seq3
+ACG
+>seq4
+ACGT
+>seq5
+NNNTA
+>seq6
+ACGTAC
+>seq7
+ACGTACG
+>seq8
+ACGTACGT
+>seq9
+ACGTACGTA
diff --git a/pyfastaq/tests/data/sequences_test_split_fixed_size_onefile.out.fa b/pyfastaq/tests/data/sequences_test_split_fixed_size_onefile.out.fa
new file mode 100644
index 0000000..dee22da
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test_split_fixed_size_onefile.out.fa
@@ -0,0 +1,32 @@
+>seq1.1_1
+A
+>seq2.1_2
+NC
+>seq3.1_3
+ACG
+>seq4.1_4
+ACGT
+>seq5.1_3
+NNN
+>seq5.4_5
+TA
+>seq6.1_3
+ACG
+>seq6.4_6
+TAC
+>seq7.1_3
+ACG
+>seq7.4_7
+TACG
+>seq8.1_3
+ACG
+>seq8.4_6
+TAC
+>seq8.7_8
+GT
+>seq9.1_3
+ACG
+>seq9.4_6
+TAC
+>seq9.7_9
+GTA
diff --git a/pyfastaq/tests/data/sequences_test_split_fixed_size_onefile.skip_Ns.out.fa b/pyfastaq/tests/data/sequences_test_split_fixed_size_onefile.skip_Ns.out.fa
new file mode 100644
index 0000000..0c4a7b0
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test_split_fixed_size_onefile.skip_Ns.out.fa
@@ -0,0 +1,30 @@
+>seq1.1_1
+A
+>seq2.1_2
+NC
+>seq3.1_3
+ACG
+>seq4.1_4
+ACGT
+>seq5.4_5
+TA
+>seq6.1_3
+ACG
+>seq6.4_6
+TAC
+>seq7.1_3
+ACG
+>seq7.4_7
+TACG
+>seq8.1_3
+ACG
+>seq8.4_6
+TAC
+>seq8.7_8
+GT
+>seq9.1_3
+ACG
+>seq9.4_6
+TAC
+>seq9.7_9
+GTA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa b/pyfastaq/tests/data/sequences_test_split_test.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa
rename to pyfastaq/tests/data/sequences_test_split_test.fa
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.2.1 b/pyfastaq/tests/data/sequences_test_split_test.fa.2.1
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.2.1
rename to pyfastaq/tests/data/sequences_test_split_test.fa.2.1
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.2.2 b/pyfastaq/tests/data/sequences_test_split_test.fa.2.2
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.2.2
rename to pyfastaq/tests/data/sequences_test_split_test.fa.2.2
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.2.3 b/pyfastaq/tests/data/sequences_test_split_test.fa.2.3
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.2.3
rename to pyfastaq/tests/data/sequences_test_split_test.fa.2.3
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.2.4 b/pyfastaq/tests/data/sequences_test_split_test.fa.2.4
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.2.4
rename to pyfastaq/tests/data/sequences_test_split_test.fa.2.4
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.3.1 b/pyfastaq/tests/data/sequences_test_split_test.fa.3.1
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.3.1
rename to pyfastaq/tests/data/sequences_test_split_test.fa.3.1
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.3.2 b/pyfastaq/tests/data/sequences_test_split_test.fa.3.2
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.3.2
rename to pyfastaq/tests/data/sequences_test_split_test.fa.3.2
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.3.3 b/pyfastaq/tests/data/sequences_test_split_test.fa.3.3
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.3.3
rename to pyfastaq/tests/data/sequences_test_split_test.fa.3.3
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.4.1 b/pyfastaq/tests/data/sequences_test_split_test.fa.4.1
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.4.1
rename to pyfastaq/tests/data/sequences_test_split_test.fa.4.1
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.4.2 b/pyfastaq/tests/data/sequences_test_split_test.fa.4.2
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.4.2
rename to pyfastaq/tests/data/sequences_test_split_test.fa.4.2
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.4.3 b/pyfastaq/tests/data/sequences_test_split_test.fa.4.3
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.4.3
rename to pyfastaq/tests/data/sequences_test_split_test.fa.4.3
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.6.1 b/pyfastaq/tests/data/sequences_test_split_test.fa.6.1
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.6.1
rename to pyfastaq/tests/data/sequences_test_split_test.fa.6.1
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.6.2 b/pyfastaq/tests/data/sequences_test_split_test.fa.6.2
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.6.2
rename to pyfastaq/tests/data/sequences_test_split_test.fa.6.2
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.1 b/pyfastaq/tests/data/sequences_test_split_test.fa.6.limit2.1
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.6.limit2.1
rename to pyfastaq/tests/data/sequences_test_split_test.fa.6.limit2.1
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.2 b/pyfastaq/tests/data/sequences_test_split_test.fa.6.limit2.2
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.6.limit2.2
rename to pyfastaq/tests/data/sequences_test_split_test.fa.6.limit2.2
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.3 b/pyfastaq/tests/data/sequences_test_split_test.fa.6.limit2.3
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.fa.6.limit2.3
rename to pyfastaq/tests/data/sequences_test_split_test.fa.6.limit2.3
diff --git a/fastaq/tests/data/sequences_test_split_test.long.fa b/pyfastaq/tests/data/sequences_test_split_test.long.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.long.fa
rename to pyfastaq/tests/data/sequences_test_split_test.long.fa
diff --git a/fastaq/tests/data/sequences_test_split_test.long.fa.2.1 b/pyfastaq/tests/data/sequences_test_split_test.long.fa.2.1
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.long.fa.2.1
rename to pyfastaq/tests/data/sequences_test_split_test.long.fa.2.1
diff --git a/fastaq/tests/data/sequences_test_split_test.long.fa.2.2 b/pyfastaq/tests/data/sequences_test_split_test.long.fa.2.2
similarity index 100%
rename from fastaq/tests/data/sequences_test_split_test.long.fa.2.2
rename to pyfastaq/tests/data/sequences_test_split_test.long.fa.2.2
diff --git a/fastaq/tests/data/sequences_test_strip_after_whitespace.fa b/pyfastaq/tests/data/sequences_test_strip_after_whitespace.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_strip_after_whitespace.fa
rename to pyfastaq/tests/data/sequences_test_strip_after_whitespace.fa
diff --git a/fastaq/tests/data/sequences_test_strip_after_whitespace.fa.to_fasta b/pyfastaq/tests/data/sequences_test_strip_after_whitespace.fa.to_fasta
similarity index 100%
rename from fastaq/tests/data/sequences_test_strip_after_whitespace.fa.to_fasta
rename to pyfastaq/tests/data/sequences_test_strip_after_whitespace.fa.to_fasta
diff --git a/fastaq/tests/data/sequences_test_strip_illumina_suffix.fq b/pyfastaq/tests/data/sequences_test_strip_illumina_suffix.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_strip_illumina_suffix.fq
rename to pyfastaq/tests/data/sequences_test_strip_illumina_suffix.fq
diff --git a/fastaq/tests/data/sequences_test_strip_illumina_suffix.fq.stripped b/pyfastaq/tests/data/sequences_test_strip_illumina_suffix.fq.stripped
similarity index 100%
rename from fastaq/tests/data/sequences_test_strip_illumina_suffix.fq.stripped
rename to pyfastaq/tests/data/sequences_test_strip_illumina_suffix.fq.stripped
diff --git a/fastaq/tests/data/sequences_test_to_fasta_union.in.fa b/pyfastaq/tests/data/sequences_test_to_fasta_union.in.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_to_fasta_union.in.fa
rename to pyfastaq/tests/data/sequences_test_to_fasta_union.in.fa
diff --git a/fastaq/tests/data/sequences_test_to_fasta_union.out.fa b/pyfastaq/tests/data/sequences_test_to_fasta_union.out.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_to_fasta_union.out.fa
rename to pyfastaq/tests/data/sequences_test_to_fasta_union.out.fa
diff --git a/fastaq/tests/data/sequences_test_to_unique_by_id.fa b/pyfastaq/tests/data/sequences_test_to_unique_by_id.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_to_unique_by_id.fa
rename to pyfastaq/tests/data/sequences_test_to_unique_by_id.fa
diff --git a/fastaq/tests/data/sequences_test_to_unique_by_id.fa.out b/pyfastaq/tests/data/sequences_test_to_unique_by_id.fa.out
similarity index 100%
rename from fastaq/tests/data/sequences_test_to_unique_by_id.fa.out
rename to pyfastaq/tests/data/sequences_test_to_unique_by_id.fa.out
diff --git a/fastaq/tests/data/sequences_test_translate.fa b/pyfastaq/tests/data/sequences_test_translate.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_translate.fa
rename to pyfastaq/tests/data/sequences_test_translate.fa
diff --git a/fastaq/tests/data/sequences_test_translate.fa.frame0 b/pyfastaq/tests/data/sequences_test_translate.fa.frame0
similarity index 100%
rename from fastaq/tests/data/sequences_test_translate.fa.frame0
rename to pyfastaq/tests/data/sequences_test_translate.fa.frame0
diff --git a/fastaq/tests/data/sequences_test_translate.fa.frame1 b/pyfastaq/tests/data/sequences_test_translate.fa.frame1
similarity index 100%
rename from fastaq/tests/data/sequences_test_translate.fa.frame1
rename to pyfastaq/tests/data/sequences_test_translate.fa.frame1
diff --git a/fastaq/tests/data/sequences_test_translate.fa.frame2 b/pyfastaq/tests/data/sequences_test_translate.fa.frame2
similarity index 100%
rename from fastaq/tests/data/sequences_test_translate.fa.frame2
rename to pyfastaq/tests/data/sequences_test_translate.fa.frame2
diff --git a/fastaq/tests/data/sequences_test_trim_Ns_at_end.fa b/pyfastaq/tests/data/sequences_test_trim_Ns_at_end.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_trim_Ns_at_end.fa
rename to pyfastaq/tests/data/sequences_test_trim_Ns_at_end.fa
diff --git a/fastaq/tests/data/sequences_test_trim_Ns_at_end.fa.trimmed b/pyfastaq/tests/data/sequences_test_trim_Ns_at_end.fa.trimmed
similarity index 100%
rename from fastaq/tests/data/sequences_test_trim_Ns_at_end.fa.trimmed
rename to pyfastaq/tests/data/sequences_test_trim_Ns_at_end.fa.trimmed
diff --git a/fastaq/tests/data/sequences_test_extend_gaps.fa b/pyfastaq/tests/data/sequences_test_trim_contigs.fa
similarity index 100%
rename from fastaq/tests/data/sequences_test_extend_gaps.fa
rename to pyfastaq/tests/data/sequences_test_trim_contigs.fa
diff --git a/fastaq/tests/data/sequences_test_extend_gaps.fa.out b/pyfastaq/tests/data/sequences_test_trim_contigs.fa.out
similarity index 100%
rename from fastaq/tests/data/sequences_test_extend_gaps.fa.out
rename to pyfastaq/tests/data/sequences_test_trim_contigs.fa.out
diff --git a/fastaq/tests/data/sequences_test_trimmed.fq b/pyfastaq/tests/data/sequences_test_trimmed.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_trimmed.fq
rename to pyfastaq/tests/data/sequences_test_trimmed.fq
diff --git a/fastaq/tests/data/sequences_test_untrimmed.fq b/pyfastaq/tests/data/sequences_test_untrimmed.fq
similarity index 100%
rename from fastaq/tests/data/sequences_test_untrimmed.fq
rename to pyfastaq/tests/data/sequences_test_untrimmed.fq
diff --git a/fastaq/tests/data/tasks_test_expend_nucleotides.in.fa b/pyfastaq/tests/data/tasks_test_expend_nucleotides.in.fa
similarity index 100%
rename from fastaq/tests/data/tasks_test_expend_nucleotides.in.fa
rename to pyfastaq/tests/data/tasks_test_expend_nucleotides.in.fa
diff --git a/fastaq/tests/data/tasks_test_expend_nucleotides.in.fq b/pyfastaq/tests/data/tasks_test_expend_nucleotides.in.fq
similarity index 100%
rename from fastaq/tests/data/tasks_test_expend_nucleotides.in.fq
rename to pyfastaq/tests/data/tasks_test_expend_nucleotides.in.fq
diff --git a/fastaq/tests/data/tasks_test_expend_nucleotides.out.fa b/pyfastaq/tests/data/tasks_test_expend_nucleotides.out.fa
similarity index 100%
rename from fastaq/tests/data/tasks_test_expend_nucleotides.out.fa
rename to pyfastaq/tests/data/tasks_test_expend_nucleotides.out.fa
diff --git a/fastaq/tests/data/tasks_test_expend_nucleotides.out.fq b/pyfastaq/tests/data/tasks_test_expend_nucleotides.out.fq
similarity index 100%
rename from fastaq/tests/data/tasks_test_expend_nucleotides.out.fq
rename to pyfastaq/tests/data/tasks_test_expend_nucleotides.out.fq
diff --git a/fastaq/tests/data/tasks_test_fasta_to_fake_qual.in.fa b/pyfastaq/tests/data/tasks_test_fasta_to_fake_qual.in.fa
similarity index 100%
rename from fastaq/tests/data/tasks_test_fasta_to_fake_qual.in.fa
rename to pyfastaq/tests/data/tasks_test_fasta_to_fake_qual.in.fa
diff --git a/fastaq/tests/data/tasks_test_fasta_to_fake_qual.out.default.qual b/pyfastaq/tests/data/tasks_test_fasta_to_fake_qual.out.default.qual
similarity index 100%
rename from fastaq/tests/data/tasks_test_fasta_to_fake_qual.out.default.qual
rename to pyfastaq/tests/data/tasks_test_fasta_to_fake_qual.out.default.qual
diff --git a/fastaq/tests/data/tasks_test_fasta_to_fake_qual.out.q42.qual b/pyfastaq/tests/data/tasks_test_fasta_to_fake_qual.out.q42.qual
similarity index 100%
rename from fastaq/tests/data/tasks_test_fasta_to_fake_qual.out.q42.qual
rename to pyfastaq/tests/data/tasks_test_fasta_to_fake_qual.out.q42.qual
diff --git a/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.in_1.fa b/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.in_1.fa
new file mode 100644
index 0000000..360c67d
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.in_1.fa
@@ -0,0 +1,8 @@
+>both_fail/1
+A
+>read1_pass/1
+ACGT
+>read2_pass/1
+A
+>both_pass/1
+ACGT
diff --git a/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.in_2.fa b/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.in_2.fa
new file mode 100644
index 0000000..2fe4be3
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.in_2.fa
@@ -0,0 +1,8 @@
+>both_fail/2
+A
+>read1_pass/2
+A
+>read2_pass/2
+ACGT
+>both_pass/2
+ACGT
diff --git a/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.out_1.fa b/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.out_1.fa
new file mode 100644
index 0000000..253d5d4
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.out_1.fa
@@ -0,0 +1,2 @@
+>both_pass/1
+ACGT
diff --git a/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.out_2.fa b/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.out_2.fa
new file mode 100644
index 0000000..89443ae
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_filter_paired_both_pass.out_2.fa
@@ -0,0 +1,2 @@
+>both_pass/2
+ACGT
diff --git a/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.in_1.fa b/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.in_1.fa
new file mode 100644
index 0000000..360c67d
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.in_1.fa
@@ -0,0 +1,8 @@
+>both_fail/1
+A
+>read1_pass/1
+ACGT
+>read2_pass/1
+A
+>both_pass/1
+ACGT
diff --git a/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.in_2.fa b/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.in_2.fa
new file mode 100644
index 0000000..2fe4be3
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.in_2.fa
@@ -0,0 +1,8 @@
+>both_fail/2
+A
+>read1_pass/2
+A
+>read2_pass/2
+ACGT
+>both_pass/2
+ACGT
diff --git a/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.out_1.fa b/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.out_1.fa
new file mode 100644
index 0000000..f992333
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.out_1.fa
@@ -0,0 +1,6 @@
+>read1_pass/1
+ACGT
+>read2_pass/1
+A
+>both_pass/1
+ACGT
diff --git a/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.out_2.fa b/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.out_2.fa
new file mode 100644
index 0000000..75dbd9f
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_filter_paired_one_pass.out_2.fa
@@ -0,0 +1,6 @@
+>read1_pass/2
+A
+>read2_pass/2
+ACGT
+>both_pass/2
+ACGT
diff --git a/fastaq/tests/data/tasks_test_make_long_reads.input.fa b/pyfastaq/tests/data/tasks_test_make_long_reads.input.fa
similarity index 100%
rename from fastaq/tests/data/tasks_test_make_long_reads.input.fa
rename to pyfastaq/tests/data/tasks_test_make_long_reads.input.fa
diff --git a/fastaq/tests/data/tasks_test_make_long_reads.output.fa b/pyfastaq/tests/data/tasks_test_make_long_reads.output.fa
similarity index 100%
rename from fastaq/tests/data/tasks_test_make_long_reads.output.fa
rename to pyfastaq/tests/data/tasks_test_make_long_reads.output.fa
diff --git a/pyfastaq/tests/data/tasks_test_mean_length.fa b/pyfastaq/tests/data/tasks_test_mean_length.fa
new file mode 100644
index 0000000..d298e03
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_mean_length.fa
@@ -0,0 +1,8 @@
+>1
+AGT
+>2
+A
+>3
+AGACG
+>4
+AAGTAGT
diff --git a/fastaq/tests/data/tasks_test_sequence_trim_1.fa b/pyfastaq/tests/data/tasks_test_sequence_trim_1.fa
similarity index 100%
rename from fastaq/tests/data/tasks_test_sequence_trim_1.fa
rename to pyfastaq/tests/data/tasks_test_sequence_trim_1.fa
diff --git a/fastaq/tests/data/tasks_test_sequence_trim_1.trimmed.fa b/pyfastaq/tests/data/tasks_test_sequence_trim_1.trimmed.fa
similarity index 100%
rename from fastaq/tests/data/tasks_test_sequence_trim_1.trimmed.fa
rename to pyfastaq/tests/data/tasks_test_sequence_trim_1.trimmed.fa
diff --git a/fastaq/tests/data/tasks_test_sequence_trim_2.fa b/pyfastaq/tests/data/tasks_test_sequence_trim_2.fa
similarity index 100%
rename from fastaq/tests/data/tasks_test_sequence_trim_2.fa
rename to pyfastaq/tests/data/tasks_test_sequence_trim_2.fa
diff --git a/fastaq/tests/data/tasks_test_sequence_trim_2.trimmed.fa b/pyfastaq/tests/data/tasks_test_sequence_trim_2.trimmed.fa
similarity index 100%
rename from fastaq/tests/data/tasks_test_sequence_trim_2.trimmed.fa
rename to pyfastaq/tests/data/tasks_test_sequence_trim_2.trimmed.fa
diff --git a/fastaq/tests/data/tasks_test_sequences_to_trim.fa b/pyfastaq/tests/data/tasks_test_sequences_to_trim.fa
similarity index 100%
rename from fastaq/tests/data/tasks_test_sequences_to_trim.fa
rename to pyfastaq/tests/data/tasks_test_sequences_to_trim.fa
diff --git a/pyfastaq/tests/data/tasks_test_sort_by_size.in.fa b/pyfastaq/tests/data/tasks_test_sort_by_size.in.fa
new file mode 100644
index 0000000..38833cb
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_sort_by_size.in.fa
@@ -0,0 +1,8 @@
+>1
+AGTCA
+>2
+ACGTTT
+>3
+A
+>4
+ACG
diff --git a/pyfastaq/tests/data/tasks_test_sort_by_size.out.fa b/pyfastaq/tests/data/tasks_test_sort_by_size.out.fa
new file mode 100644
index 0000000..b65543a
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_sort_by_size.out.fa
@@ -0,0 +1,8 @@
+>2
+ACGTTT
+>1
+AGTCA
+>4
+ACG
+>3
+A
diff --git a/pyfastaq/tests/data/tasks_test_sort_by_size.out.rev.fa b/pyfastaq/tests/data/tasks_test_sort_by_size.out.rev.fa
new file mode 100644
index 0000000..3819030
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_sort_by_size.out.rev.fa
@@ -0,0 +1,8 @@
+>3
+A
+>4
+ACG
+>1
+AGTCA
+>2
+ACGTTT
diff --git a/fastaq/tests/data/utils_test_file_transpose.txt b/pyfastaq/tests/data/utils_test_file_transpose.txt
similarity index 100%
rename from fastaq/tests/data/utils_test_file_transpose.txt
rename to pyfastaq/tests/data/utils_test_file_transpose.txt
diff --git a/fastaq/tests/data/utils_test_file_transposed.txt b/pyfastaq/tests/data/utils_test_file_transposed.txt
similarity index 100%
rename from fastaq/tests/data/utils_test_file_transposed.txt
rename to pyfastaq/tests/data/utils_test_file_transposed.txt
diff --git a/fastaq/tests/data/utils_test_not_really_zipped.gz b/pyfastaq/tests/data/utils_test_not_really_zipped.gz
similarity index 100%
rename from fastaq/tests/data/utils_test_not_really_zipped.gz
rename to pyfastaq/tests/data/utils_test_not_really_zipped.gz
diff --git a/fastaq/tests/data/utils_test_scaffolds.fa b/pyfastaq/tests/data/utils_test_scaffolds.fa
similarity index 100%
rename from fastaq/tests/data/utils_test_scaffolds.fa
rename to pyfastaq/tests/data/utils_test_scaffolds.fa
diff --git a/fastaq/tests/data/utils_test_scaffolds.fa.to_contigs.fa b/pyfastaq/tests/data/utils_test_scaffolds.fa.to_contigs.fa
similarity index 100%
rename from fastaq/tests/data/utils_test_scaffolds.fa.to_contigs.fa
rename to pyfastaq/tests/data/utils_test_scaffolds.fa.to_contigs.fa
diff --git a/fastaq/tests/data/utils_test_scaffolds.fa.to_contigs.number_contigs.fa b/pyfastaq/tests/data/utils_test_scaffolds.fa.to_contigs.number_contigs.fa
similarity index 100%
rename from fastaq/tests/data/utils_test_scaffolds.fa.to_contigs.number_contigs.fa
rename to pyfastaq/tests/data/utils_test_scaffolds.fa.to_contigs.number_contigs.fa
diff --git a/fastaq/tests/data/utils_test_system_call.txt b/pyfastaq/tests/data/utils_test_system_call.txt
similarity index 100%
rename from fastaq/tests/data/utils_test_system_call.txt
rename to pyfastaq/tests/data/utils_test_system_call.txt
diff --git a/fastaq/tests/intervals_test.py b/pyfastaq/tests/intervals_test.py
similarity index 99%
rename from fastaq/tests/intervals_test.py
rename to pyfastaq/tests/intervals_test.py
index c6282a0..e899a63 100644
--- a/fastaq/tests/intervals_test.py
+++ b/pyfastaq/tests/intervals_test.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
import unittest
-from fastaq import intervals
+from pyfastaq import intervals
class TestIntervals(unittest.TestCase):
def test_init(self):
diff --git a/fastaq/tests/sequences_test.py b/pyfastaq/tests/sequences_test.py
similarity index 97%
rename from fastaq/tests/sequences_test.py
rename to pyfastaq/tests/sequences_test.py
index fad6098..cc22c8d 100644
--- a/fastaq/tests/sequences_test.py
+++ b/pyfastaq/tests/sequences_test.py
@@ -4,7 +4,7 @@ import sys
import filecmp
import os
import unittest
-from fastaq import sequences, utils, intervals, tasks
+from pyfastaq import sequences, utils, intervals, tasks
modules_dir = os.path.dirname(os.path.abspath(sequences.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
@@ -62,6 +62,13 @@ class TestFasta(unittest.TestCase):
'''len() should return the length of the sequence'''
self.assertEqual(5, len(self.fasta))
+ def test_subseq(self):
+ '''Test subseq'''
+ fa = sequences.Fasta('name', 'ACGTA')
+ self.assertEqual(fa.subseq(1,4), sequences.Fasta('name', 'CGT'))
+ self.assertEqual(fa.subseq(None,4), sequences.Fasta('name', 'ACGT'))
+ self.assertEqual(fa.subseq(1,None), sequences.Fasta('name', 'CGTA'))
+
def test_print_line_length(self):
'''__str__ should be formatted correctly with the right number of chars per line of sequence'''
line_lengths = [0, 3]
@@ -466,6 +473,13 @@ class TestFastq(unittest.TestCase):
self.assertEqual(fq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
utils.close(f_in)
+ def test_subseq(self):
+ '''Test subseq'''
+ fq = sequences.Fastq('name', 'ACGTA', 'FGHIJ')
+ self.assertEqual(fq.subseq(1,4), sequences.Fastq('name', 'CGT', 'GHI'))
+ self.assertEqual(fq.subseq(None,4), sequences.Fastq('name', 'ACGT', 'FGHI'))
+ self.assertEqual(fq.subseq(1,None), sequences.Fastq('name', 'CGTA', 'GHIJ'))
+
def test_revcomp(self):
'''revcomp() should correctly reverse complement a sequence'''
fq = sequences.Fastq('ID', 'ACGTNacgtn', '1234567890')
diff --git a/fastaq/tests/tasks_test.py b/pyfastaq/tests/tasks_test.py
similarity index 83%
rename from fastaq/tests/tasks_test.py
rename to pyfastaq/tests/tasks_test.py
index 7528815..6d14ef6 100644
--- a/fastaq/tests/tasks_test.py
+++ b/pyfastaq/tests/tasks_test.py
@@ -4,13 +4,28 @@ import sys
import filecmp
import os
import unittest
-from fastaq import tasks, sequences
+from pyfastaq import tasks, sequences
modules_dir = os.path.dirname(os.path.abspath(sequences.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
class Error (Exception): pass
+class TestCafToFastq(unittest.TestCase):
+ def test_caf_to_fastq_default(self):
+ '''Test caf_to_fastq with no filtering'''
+ tmpfile = 'tmp.fq'
+ tasks.caf_to_fastq(os.path.join(data_dir, 'caf_test.caf'), tmpfile)
+ self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'caf_test.to_fastq.no_trim.min_length_0.fq'), tmpfile, shallow=False))
+ os.unlink(tmpfile)
+
+ def test_caf_to_fastq_trim_and_min_length(self):
+ '''Test caf_to_fastq with trimming and min_length'''
+ tmpfile = 'tmp.fq'
+ tasks.caf_to_fastq(os.path.join(data_dir, 'caf_test.caf'), tmpfile, trim=True, min_length=6)
+ self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'caf_test.to_fastq.trim.min_length_6.fq'), tmpfile, shallow=False))
+ os.unlink(tmpfile)
+
class TestCapillaryToPairs(unittest.TestCase):
def test_capillary_to_pairs(self):
@@ -66,6 +81,9 @@ class TestEnumerateNames(unittest.TestCase):
self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.2'), outfile))
tasks.enumerate_names(os.path.join(data_dir, 'sequences_test_enumerate_names.fa'), outfile, keep_illumina_suffix=True)
self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.keep_suffix'), outfile))
+
+ tasks.enumerate_names(os.path.join(data_dir, 'sequences_test_enumerate_names.fa'), outfile, suffix='.SUFFIX')
+ self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.add_suffix'), outfile, shallow=False))
os.unlink(outfile)
os.unlink(rename_out)
@@ -87,11 +105,11 @@ class TestExpandNucleotides(unittest.TestCase):
class TestExtendGaps(unittest.TestCase):
- def test_extend_gaps(self):
+ def test_trim_contigs(self):
'''Test that gap extension works'''
outfile = 'tmp.gap_extend.fa'
- tasks.extend_gaps(os.path.join(data_dir, 'sequences_test_extend_gaps.fa'), outfile, trim=2)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_extend_gaps.fa.out'), outfile))
+ tasks.trim_contigs(os.path.join(data_dir, 'sequences_test_trim_contigs.fa'), outfile, trim=2)
+ self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_trim_contigs.fa.out'), outfile))
os.unlink(outfile)
@@ -159,6 +177,36 @@ class TestFilter(unittest.TestCase):
os.unlink(outfile)
+ def test_paired_both_pass(self):
+ '''Test filter with paired file both pass'''
+ infile1 = os.path.join(data_dir, 'tasks_test_filter_paired_both_pass.in_1.fa')
+ infile2 = os.path.join(data_dir, 'tasks_test_filter_paired_both_pass.in_2.fa')
+ outfile1 = 'tmp.filter_both_pass_1.fa'
+ outfile2 = 'tmp.filter_both_pass_2.fa'
+ expected1 = os.path.join(data_dir, 'tasks_test_filter_paired_both_pass.out_1.fa')
+ expected2 = os.path.join(data_dir, 'tasks_test_filter_paired_both_pass.out_2.fa')
+ tasks.filter(infile1, outfile1, mate_in=infile2, mate_out=outfile2, minlength=3)
+ self.assertTrue(filecmp.cmp(outfile1, expected1, shallow=False))
+ self.assertTrue(filecmp.cmp(outfile2, expected2, shallow=False))
+ os.unlink(outfile1)
+ os.unlink(outfile2)
+
+
+ def test_paired_one_pass(self):
+ '''Test filter with paired file one pass'''
+ infile1 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.in_1.fa')
+ infile2 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.in_2.fa')
+ outfile1 = 'tmp.filter_one_pass_1.fa'
+ outfile2 = 'tmp.filter_one_pass_2.fa'
+ expected1 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.out_1.fa')
+ expected2 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.out_2.fa')
+ tasks.filter(infile1, outfile1, mate_in=infile2, mate_out=outfile2, both_mates_pass=False, minlength=3)
+ self.assertTrue(filecmp.cmp(outfile1, expected1, shallow=False))
+ self.assertTrue(filecmp.cmp(outfile2, expected2, shallow=False))
+ os.unlink(outfile1)
+ os.unlink(outfile2)
+
+
class TestGetSeqsFlankingGaps(unittest.TestCase):
def test_get_seqs_flanking_gaps(self):
outfile = 'tmp.seqs_flanking_gaps'
@@ -232,6 +280,17 @@ class TestMakeLongReads(unittest.TestCase):
os.unlink(tmp)
+class TestMeanLength(unittest.TestCase):
+ def test_mean_length(self):
+ '''Test mean_length'''
+ expected = [3, 2, 3, 4, 4]
+ limits = [1, 2, 3, 4, None]
+ assert len(expected) == len(limits)
+ for i in range(len(expected)):
+ mean = tasks.mean_length(os.path.join(data_dir, 'tasks_test_mean_length.fa'), limit=limits[i])
+ self.assertEqual(expected[i], mean)
+
+
class TestMergeToOneSeq(unittest.TestCase):
def test_merge_to_one_seq_fa(self):
'''Test merge_to_one_seq with fasta'''
@@ -415,6 +474,22 @@ class TestSplit(unittest.TestCase):
self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords'), test_coords))
os.unlink(test_coords)
+ def test_split_by_fixed_size_onefile(self):
+ infile = os.path.join(data_dir, 'sequences_test_split_fixed_size_onefile.fa')
+ tmp_out = 'tmp.sequences_test_split_fixed_size_onefile.fa'
+ expected = os.path.join(data_dir, 'sequences_test_split_fixed_size_onefile.out.fa')
+ tasks.split_by_fixed_size_onefile(infile, tmp_out, chunk_size=3, tolerance=1)
+ self.assertTrue(filecmp.cmp(expected, tmp_out))
+ os.unlink(tmp_out)
+
+ def test_split_by_fixed_size_onefile_exclude_Ns(self):
+ infile = os.path.join(data_dir, 'sequences_test_split_fixed_size_onefile.fa')
+ tmp_out = 'tmp.sequences_test_split_fixed_size_onefile.skip_Ns.fa'
+ expected = os.path.join(data_dir, 'sequences_test_split_fixed_size_onefile.skip_Ns.out.fa')
+ tasks.split_by_fixed_size_onefile(infile, tmp_out, chunk_size=3, tolerance=1, skip_if_all_Ns=True)
+ self.assertTrue(filecmp.cmp(expected, tmp_out))
+ os.unlink(tmp_out)
+
class TestCountSequences(unittest.TestCase):
def test_count_sequences(self):
'''Check that count_sequences does as expected'''
@@ -469,6 +544,18 @@ class TestReplaceBases(unittest.TestCase):
os.unlink(tmpfile)
+class TestSortBySize(unittest.TestCase):
+ def test_sort_by_size(self):
+ '''Test sort_by_size'''
+ infile = os.path.join(data_dir, 'tasks_test_sort_by_size.in.fa')
+ tmpfile = 'tmp.sorted.fa'
+ tasks.sort_by_size(infile, tmpfile)
+ self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_sort_by_size.out.fa'), tmpfile, shallow=False))
+ tasks.sort_by_size(infile, tmpfile, smallest_first=True)
+ self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_sort_by_size.out.rev.fa'), tmpfile, shallow=False))
+ os.unlink(tmpfile)
+
+
class TestStripIlluminaSuffix(unittest.TestCase):
def test_strip_illumina_suffix(self):
'''Check illumina suffixes stripped correctly off read names'''
diff --git a/fastaq/tests/utils_test.py b/pyfastaq/tests/utils_test.py
similarity index 99%
rename from fastaq/tests/utils_test.py
rename to pyfastaq/tests/utils_test.py
index 731c944..8fab6ab 100644
--- a/fastaq/tests/utils_test.py
+++ b/pyfastaq/tests/utils_test.py
@@ -4,7 +4,7 @@ import sys
import os
import filecmp
import unittest
-from fastaq import utils
+from pyfastaq import utils
modules_dir = os.path.dirname(os.path.abspath(utils.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
diff --git a/fastaq/utils.py b/pyfastaq/utils.py
similarity index 100%
rename from fastaq/utils.py
rename to pyfastaq/utils.py
diff --git a/scripts/fastaq b/scripts/fastaq
new file mode 100755
index 0000000..9537f75
--- /dev/null
+++ b/scripts/fastaq
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+
+import argparse
+import sys
+
+tasks = {
+ 'add_indels': 'Deletes or inserts bases at given position(s)',
+ 'caf_to_fastq': 'Converts a CAF file to FASTQ format',
+ 'capillary_to_pairs': 'Converts file of capillary reads to paired and unpaired files',
+ 'chunker': 'Splits sequences into equal sized chunks',
+ 'count_sequences': 'Counts the sequences in input file',
+ 'deinterleave': 'Splits interleaved paired file into two separate files',
+ 'enumerate_names': 'Renames sequences in a file, calling them 1,2,3... etc',
+ 'expand_nucleotides': 'Makes every combination of degenerate nucleotides',
+ 'fasta_to_fastq': 'Convert FASTA and .qual to FASTQ',
+ 'filter': 'Filter sequences to get a subset of them',
+ 'get_ids': 'Get the ID of each sequence',
+ 'get_seq_flanking_gaps': 'Gets the sequences flanking gaps',
+ 'interleave': 'Interleaves two files, output is alternating between fwd/rev reads',
+ 'long_read_simulate': 'Simulates long reads from reference',
+ 'make_random_contigs': 'Make contigs of random sequence',
+ 'merge': 'Converts multi sequence file to a single sequence',
+ 'replace_bases': 'Replaces all occurences of one letter with another',
+ 'reverse_complement': 'Reverse complement all sequences',
+ 'scaffolds_to_contigs': 'Creates a file of contigs from a file of scaffolds',
+ 'search_for_seq': 'Find all exact matches to a string (and its reverse complement)',
+ 'sequence_trim': 'Trim exact matches to a given string off the start of every sequence',
+ 'split_by_base_count': 'Split multi sequence file into separate files',
+ 'sort_by_size': 'Sorts sequences in length order',
+ 'strip_illumina_suffix': 'Strips /1 or /2 off the end of every read name',
+ 'to_fasta': 'Converts a variety of input formats to nicely formatted FASTA format',
+ 'to_fake_qual': 'Make fake quality scores file',
+ 'to_mira_xml': 'Create an xml file from a file of reads, for use with Mira assembler',
+ 'to_orfs_gff': 'Writes a GFF file of open reading frames',
+ 'to_perfect_reads': 'Make perfect paired reads from reference',
+ 'to_random_subset': 'Make a random sample of sequences (and optionally mates as well)',
+ 'to_tiling_bam': 'Make a BAM file of reads uniformly spread across the input reference',
+ 'to_unique_by_id': 'Remove duplicate sequences, based on their names. Keep longest seqs',
+ 'translate': 'Translate all sequences in input nucleotide sequences',
+ 'trim_contigs': 'Trims a set number of bases off the end of every contig',
+ 'trim_ends': 'Trim fixed number of bases of start and/or end of every sequence',
+ 'trim_Ns_at_end': 'Trims all Ns at the start/end of all sequences',
+ 'version': 'Print version number and exit',
+}
+
+
+def print_usage_and_exit():
+ print('Usage: fastaq <command> [options]', file=sys.stderr)
+ print('\nTo get minimal usage for a command use:\nfastaq command', file=sys.stderr)
+ print('\nTo get full help for a command use one of:\nfastaq command -h\nfastaq command --help\n', file=sys.stderr)
+ print('\nAvailable commands:\n', file=sys.stderr)
+ max_task_length = max([len(x) for x in list(tasks.keys())])
+ for task in sorted(tasks):
+ print('{{0: <{}}}'.format(max_task_length).format(task), tasks[task], sep=' ', file=sys.stderr)
+ sys.exit(1)
+
+
+if len(sys.argv) == 1 or sys.argv[1] in ['-h', '-help', '--help']:
+ print_usage_and_exit()
+
+task = sys.argv.pop(1)
+
+if task not in tasks:
+ print('Task "' + task + '" not recognised. Cannot continue.\n', file=sys.stderr)
+ print_usage_and_exit()
+
+
+exec('import pyfastaq.runners.' + task)
+exec('pyfastaq.runners.' + task + '.run("' + tasks[task] + '")')
+
diff --git a/scripts/fastaq_capillary_to_pairs b/scripts/fastaq_capillary_to_pairs
deleted file mode 100755
index 0d4a48f..0000000
--- a/scripts/fastaq_capillary_to_pairs
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Given a fasta/q file of capillary reads, makes an interleaved file of read pairs (where more than read from same ligation, takes the longest read) and a file of unpaired reads. Replaces the .p1k/.q1k part of read names to denote fwd/rev reads with /1 and /2',
- usage = '%(prog)s <infile> <outfiles prefix>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outprefix', help='Prefix of output files', metavar='outfiles prefix')
-options = parser.parse_args()
-tasks.capillary_to_pairs(options.infile, options.outprefix)
diff --git a/scripts/fastaq_chunker b/scripts/fastaq_chunker
deleted file mode 100755
index d1aeb68..0000000
--- a/scripts/fastaq_chunker
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Splits a multi fasta/q file into separate files. Splits sequences into chunks of a fixed size. Aims for chunk_size chunks in each file, but allows a little extra, so chunk can be up to (chunk_size + tolerance), to prevent tiny chunks made from the ends of sequences',
- usage = '%(prog)s [options] <fasta/q in> <prefix of output files> <chunk size> <tolerance>')
-parser.add_argument('infile', help='Name of input fasta/q file to be split')
-parser.add_argument('outprefix', help='Name of output fasta/q file')
-parser.add_argument('chunk_size', type=int, help='Size of each chunk')
-parser.add_argument('tolerance', type=int, help='Tolerance allowed in chunk size')
-parser.add_argument('--skip_all_Ns', action='store_true', help='Do not output any sequence that consists of all Ns')
-options = parser.parse_args()
-tasks.split_by_fixed_size(
- options.infile,
- options.outprefix,
- options.chunk_size,
- options.tolerance,
- skip_if_all_Ns=options.skip_all_Ns
-)
diff --git a/scripts/fastaq_count_sequences b/scripts/fastaq_count_sequences
deleted file mode 100755
index fcb7911..0000000
--- a/scripts/fastaq_count_sequences
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Counts the number of sequences in a fasta/q file',
- usage = '%(prog)s <fasta/q in>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-options = parser.parse_args()
-print(tasks.count_sequences(options.infile))
diff --git a/scripts/fastaq_deinterleave b/scripts/fastaq_deinterleave
deleted file mode 100755
index a28c505..0000000
--- a/scripts/fastaq_deinterleave
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Deinterleaves fasta/q file, so that reads are written alternately between two output files',
- usage = '%(prog)s [options] <fasta/q in> <out_fwd> <out_rev>')
-parser.add_argument('--fasta_out', action='store_true', help='Use this to write output as fasta (default is same as input)', default=False)
-parser.add_argument('infile', help='Name of fasta/q file to be deinterleaved')
-parser.add_argument('out_fwd', help='Name of output fasta/q file of forwards reads')
-parser.add_argument('out_rev', help='Name of output fasta/q file of reverse reads')
-options = parser.parse_args()
-tasks.deinterleave(options.infile, options.out_fwd, options.out_rev, fasta_out=options.fasta_out)
diff --git a/scripts/fastaq_enumerate_names b/scripts/fastaq_enumerate_names
deleted file mode 100755
index 89831cb..0000000
--- a/scripts/fastaq_enumerate_names
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Renames sequences in a file, calling them 1,2,3... etc',
- usage = '%(prog)s [options] <fasta/q in> <fasta/q out>')
-parser.add_argument('--start_index', type=int, help='Starting number [%(default)s]', default=1)
-parser.add_argument('--rename_file', help='If used, will write a file of old name to new name')
-parser.add_argument('--keep_suffix', action='store_true', help='Use this to keep a /1 or /2 suffix at the end of each name')
-parser.add_argument('infile', help='Name of fasta/q file to be read')
-parser.add_argument('outfile', help='Name of output fasta/q file')
-options = parser.parse_args()
-tasks.enumerate_names(options.infile,
- options.outfile,
- start_index=options.start_index,
- keep_illumina_suffix=options.keep_suffix,
- rename_file=options.rename_file)
diff --git a/scripts/fastaq_expand_nucleotides b/scripts/fastaq_expand_nucleotides
deleted file mode 100755
index 2dbde36..0000000
--- a/scripts/fastaq_expand_nucleotides
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Makes all combinations of sequences in input file by using all possibilities of redundant bases. e.g. ART could be AAT or AGT. Assumes input is nucleotides, not amino acids',
- usage = '%(prog)s <infile> <outfile>')
-parser.add_argument('infile', help='Name of input file. Can be any of FASTA, FASTQ, GFF3, EMBL, GBK, Phylip')
-parser.add_argument('outfile', help='Name of output file')
-options = parser.parse_args()
-tasks.expand_nucleotides(
- options.infile,
- options.outfile,
-)
diff --git a/scripts/fastaq_extend_gaps b/scripts/fastaq_extend_gaps
deleted file mode 100755
index e8622c3..0000000
--- a/scripts/fastaq_extend_gaps
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Extends the length of all gaps (and trims the start/end of sequences) in a fasta/q file. Does this by replacing a set number of bases either side of each gap with Ns. Any sequence that ends up as all Ns is lost',
- usage = '%(prog)s [options] <fasta/q in> <fasta/q out>')
-parser.add_argument('--trim_number', type=int, help='Number of bases to trim around each gap, and off ends of each sequence [%(default)s]', default=100)
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output fasta/q file')
-options = parser.parse_args()
-tasks.extend_gaps(options.infile, options.outfile, options.trim_number)
diff --git a/scripts/fastaq_fasta_to_fastq b/scripts/fastaq_fasta_to_fastq
deleted file mode 100755
index 18b6edb..0000000
--- a/scripts/fastaq_fasta_to_fastq
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Given a fasta and qual file, makes a fastq file',
- usage = '%(prog)s <fasta in> <qual in> <fastq out>')
-parser.add_argument('fasta', help='Name of input fasta file', metavar='fasta in')
-parser.add_argument('qual', help='Name of input quality scores file', metavar='qual in')
-parser.add_argument('outfile', help='Name of output fastq file', metavar='fastq out')
-options = parser.parse_args()
-tasks.fasta_to_fastq(options.fasta, options.qual, options.outfile)
diff --git a/scripts/fastaq_filter b/scripts/fastaq_filter
deleted file mode 100755
index cb260e6..0000000
--- a/scripts/fastaq_filter
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Filters a fasta/q file by sequence length and/or by name matching a regular expression',
- usage = '%(prog)s [options] <infile> <outfile>')
-parser.add_argument('--min_length', type=int, help='Minimum length of sequence to keep [%(default)s]', default=0, metavar='INT')
-parser.add_argument('--max_length', type=float, help='Maximum length of sequence to keep [%(default)s]', default=float('inf'), metavar='INT')
-parser.add_argument('--regex', help='If given, only reads with a name matching the regular expression will be kept')
-parser.add_argument('--ids_file', help='If given, only reads whose ID is in th given file will be used. One ID per line of file.')
-parser.add_argument('-v', '--invert', action='store_true', help='Keep sequences that do not match the filters')
-parser.add_argument('infile', help='Name of fasta/q file to be filtered')
-parser.add_argument('outfile', help='Name of output fasta/q file')
-options = parser.parse_args()
-tasks.filter(options.infile,
- options.outfile,
- minlength=options.min_length,
- maxlength=options.max_length,
- regex=options.regex,
- ids_file=options.ids_file,
- invert=options.invert
-)
diff --git a/scripts/fastaq_get_ids b/scripts/fastaq_get_ids
deleted file mode 100755
index 59b9e0e..0000000
--- a/scripts/fastaq_get_ids
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Gets IDs from each sequence in a fasta or fastq file',
- usage = '%(prog)s <infile> <outfile>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output file')
-options = parser.parse_args()
-tasks.get_ids(options.infile, options.outfile)
diff --git a/scripts/fastaq_get_seq_flanking_gaps b/scripts/fastaq_get_seq_flanking_gaps
deleted file mode 100755
index 0c54154..0000000
--- a/scripts/fastaq_get_seq_flanking_gaps
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Gets the sequences either side of gaps in a fasta/q file',
- usage = '%(prog)s [options] <fasta/q in> <fasta/q out>')
-parser.add_argument('--left', type=int, help='Number of bases to get to left of gap [%(default)s]', default=25, metavar='INT')
-parser.add_argument('--right', type=int, help='Number of bases to get to right of gap [%(default)s]', default=25, metavar='INT')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output fasta/q file')
-options = parser.parse_args()
-tasks.get_seqs_flanking_gaps(options.infile, options.outfile, options.left, options.right)
diff --git a/scripts/fastaq_insert_or_delete_bases b/scripts/fastaq_insert_or_delete_bases
deleted file mode 100755
index 61e1e80..0000000
--- a/scripts/fastaq_insert_or_delete_bases
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import sys
-import random
-from fastaq import sequences, utils, intervals
-
-parser = argparse.ArgumentParser(
- description = 'Deletes or inserts bases at given position(s) from a fasta/q file',
- usage = '%(prog)s <fasta/q in> <outfile>')
-parser.add_argument('infile', help='Name of fasta/q file to be read')
-parser.add_argument('outfile', help='Name of output file')
-parser.add_argument('-d','--delete', action='append', help='Delete the given bases from the given sequence. Format same as samtools view: name:start-end. This option can be used multiple times (once for each region to delete). Overlapping coords will be merged before deleting', metavar='Name:start:bases')
-parser.add_argument('--delete_range', help='Deletes bases starting at position P in each sequence of the input file. Deletes start + (n-1)*step bases from sequence n.', metavar='P,start,step')
-parser.add_argument('-i','--insert', action='append', help='Insert a random string of bases at the given position. Format is name:position:number_to_add. Bases are added after the position. This option can be used multiple times', metavar='Name:start:bases')
-parser.add_argument('--insert_range', help='Inserts random bases starting after position P in each sequence of the input file. Inserts start + (n-1)*step bases into sequence n.', metavar='P,start,step')
-options = parser.parse_args()
-
-test_ops = [int(x is not None) for x in [options.delete, options.insert, options.delete_range, options.insert_range]]
-
-if sum(test_ops) != 1:
- print('Must use one of --delete, --insert, --delete_range, --insert_range. Cannot continue', file=sys.stderr)
- sys.exit(1)
-
-
-def range2dic(range_in):
- if range_in is None:
- return {}
- (pos, start, step) = range_in.split(',')
- d = {}
- d['pos'] = int(pos) - 1
- d['bases'] = int(start)
- d['step'] = int(step)
- return d
-
-delete_range = range2dic(options.delete_range)
-insert_range = range2dic(options.insert_range)
-
-
-# convert the -d regions into sequence name, start and end coords
-to_delete = {}
-if options.delete:
- for s in options.delete:
- id, coords = s.rsplit(':')
- start, end = [int(x)-1 for x in coords.split('-')]
- if id not in to_delete:
- to_delete[id] = []
- to_delete[id].append(intervals.Interval(start, end))
-
-
-to_insert = {}
-if options.insert:
- for s in options.insert:
- id, pos, bases = s.rsplit(':',2)
- pos = int(pos) - 1
- bases = int(bases)
- if id not in to_insert:
- to_insert[id] = []
- to_insert[id].append((pos, bases))
-
-
-assert len(to_delete) * len(to_insert) == 0
-
-# merge overlapping regions to be deleted
-for l in to_delete.values():
- intervals.merge_overlapping_in_list(l)
-
-# sort positions to be inserted
-for l in to_insert.values():
- l.sort()
-
-# read in the fasta/q file and print outfile with deleted sequences
-seq_reader = sequences.file_reader(options.infile)
-f = utils.open_file_write(options.outfile)
-
-for seq in seq_reader:
- if seq.id in to_delete:
- # delete regions for this sequence, but start at the end so the
- # coords don't get messed up after the first deletion
- for inter in reversed(to_delete[seq.id]):
- seq.seq = seq.seq[:inter.start] + seq.seq[inter.end + 1:]
- elif options.delete_range:
- seq.seq = seq.seq[:delete_range['pos']] + seq.seq[delete_range['pos'] + delete_range['bases']:]
- delete_range['bases'] += delete_range['step']
- elif seq.id in to_insert:
- for pos, bases in reversed(to_insert[seq.id]):
- seq.seq = seq.seq[:pos + 1] + ''.join([random.choice('ACGT') for x in range(bases)]) + seq.seq[pos + 1:]
- elif options.insert_range:
- seq.seq = seq.seq[:insert_range['pos'] + 1] + ''.join([random.choice('ACGT') for x in range(insert_range['bases'])]) + seq.seq[insert_range['pos'] + 1:]
- insert_range['bases'] += insert_range['step']
-
- print(seq, file=f)
-
-utils.close(f)
diff --git a/scripts/fastaq_interleave b/scripts/fastaq_interleave
deleted file mode 100755
index 4b39a3e..0000000
--- a/scripts/fastaq_interleave
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Interleaves two fasta/q files, so that reads are written alternately first/second in output file',
- usage = '%(prog)s [options] <fasta/q 1> <fasta/q 2> <outfile>')
-parser.add_argument('infile_1', help='Name of first input fasta/q file')
-parser.add_argument('infile_2', help='Name of second input fasta/q file')
-parser.add_argument('outfile', help='Name of output fasta/q file of interleaved reads')
-options = parser.parse_args()
-tasks.interleave(options.infile_1, options.infile_2, options.outfile)
diff --git a/scripts/fastaq_long_read_simulate b/scripts/fastaq_long_read_simulate
deleted file mode 100755
index 23106f3..0000000
--- a/scripts/fastaq_long_read_simulate
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Simulates long reads from a fasta/q file. Can optionally make insertions into the reads, like pacbio does. If insertions made, coverage calculation is done before the insertions (so total read length may appear longer then expected).',
- usage = '%(prog)s [options] <infile> <outfile>')
-
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output fasta file')
-
-parser.add_argument('--method', help='How to sample the read positions and lengths. Choose from 1) "tiling", where reads of fixed length are taken at equal intervals from the reference. 2) "unfiform", where reads of fixed length taken at positions sampled uniformly. 3) "gamma", where reads lengths are taken from a gamma distribution, and positions sampled uniformly. [%(default)s]', default='tiling', choices=['tiling', 'uniform', 'gamma'], metavar='tiling|uniform|gamma')
-parser.add_argument('--seed', type=int, help='Seed for random number generator [default: use python\'s default]', metavar='INT')
-parser.add_argument('--qual', help='Write a file of fake quality scores called outfile.qual, all bases same quality [%(default)s]', metavar='INT')
-parser.add_argument('--fixed_read_length', type=int, help='Length of each read. Only applies if method is tile or uniform. [%(default)s]', default=20000, metavar='INT')
-parser.add_argument('--coverage', type=float, help='Read coverage. Only applies if method is gamma or uniform. [%(default)s]', default=2, metavar='FLOAT')
-
-
-tiling_group = parser.add_argument_group('tiling options')
-tiling_group.add_argument('--tile_step', type=int, help='Distance between start of each read [%(default)s]', default=10000, metavar='INT')
-
-gamma_group = parser.add_argument_group('gamma options')
-gamma_group.add_argument('--gamma_shape', type=float, help='Shape parameter of gamma distribution [%(default)s]', default=1.2, metavar='FLOAT')
-gamma_group.add_argument('--gamma_scale', type=float, help='Scale parameter of gamma distribution [%(default)s]', default=6000, metavar='FLOAT')
-gamma_group.add_argument('--gamma_min_length', type=int, help='Minimum read length [%(default)s]', default=20000, metavar='INT')
-
-ins_group = parser.add_argument_group('options to add insertions to reads')
-ins_group.add_argument('--ins_skip', type=int, help='Insert a random base every --skip bases plus or minus --ins_window. If this option is used, must also use --ins_window.', metavar='INT')
-ins_group.add_argument('--ins_window', type=int, help='See --ins_skip. If this option is used, must also use --ins_skip.', metavar='INT')
-
-
-options = parser.parse_args()
-tasks.make_long_reads(
- options.infile,
- options.outfile,
- method=options.method,
- fixed_read_length=options.fixed_read_length,
- coverage=options.coverage,
- tile_step=options.tile_step,
- gamma_shape=options.gamma_shape,
- gamma_scale=options.gamma_scale,
- gamma_min_length=options.gamma_min_length,
- seed=options.seed,
- ins_skip=options.ins_skip,
- ins_window=options.ins_window
-)
-
-if options.qual:
- tasks.fastaq_to_fake_qual(options.outfile, options.outfile + '.qual', q=options.qual)
diff --git a/scripts/fastaq_make_random_contigs b/scripts/fastaq_make_random_contigs
deleted file mode 100755
index c6774fe..0000000
--- a/scripts/fastaq_make_random_contigs
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Makes a multi-fasta file of random sequences, all of the same length. Each base has equal chance of being A,C,G or T',
- usage = '%(prog)s [options] <number of sequences> <length of each sequence> <fasta out>')
-parser.add_argument('--first_number', type=int, help='If numbering the sequences, the first sequence gets this number [%(default)s]', default=1)
-parser.add_argument('--name_by_letters', action='store_true', help='Name the contigs A,B,C,... will start at A again if you get to Z')
-parser.add_argument('--prefix', help='Prefix to add to start of every sequence name', default='')
-parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None)
-parser.add_argument('contigs', type=int, help='Nunber of contigs to make')
-parser.add_argument('length', type=int, help='Length of each contig')
-parser.add_argument('outfile', help='Name of output file')
-options = parser.parse_args()
-tasks.make_random_contigs(
- options.contigs,
- options.length,
- options.outfile,
- name_by_letters=options.name_by_letters,
- prefix=options.prefix,
- seed=options.seed,
- first_number=options.first_number
-)
diff --git a/scripts/fastaq_merge b/scripts/fastaq_merge
deleted file mode 100755
index d919323..0000000
--- a/scripts/fastaq_merge
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Converts multi fasta/q file to single sequence file, preserving original order of sequences',
- usage = '%(prog)s <infile> <outfile>')
-parser.add_argument('infile', help='Name of input file. Can be any of FASTA, FASTQ, GFF3, EMBL, GBK, Phylip')
-parser.add_argument('outfile', help='Name of output file')
-parser.add_argument('-n', '--name', help='Name of sequence in output file [%(default)s]', default='union')
-options = parser.parse_args()
-tasks.merge_to_one_seq(
- options.infile,
- options.outfile,
- seqname=options.name
-)
-
diff --git a/scripts/fastaq_replace_bases b/scripts/fastaq_replace_bases
deleted file mode 100755
index 6ce2fc0..0000000
--- a/scripts/fastaq_replace_bases
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Replaces all occurences of one letter with another in a fasta/q file',
- usage = '%(prog)s <fasta/q in> <outfile> <old> <new>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output file')
-parser.add_argument('old', help='Base to be replaced')
-parser.add_argument('new', help='Replace with this letter')
-options = parser.parse_args()
-tasks.replace_bases(options.infile, options.outfile, options.old, options.new)
diff --git a/scripts/fastaq_reverse_complement b/scripts/fastaq_reverse_complement
deleted file mode 100755
index 147e01f..0000000
--- a/scripts/fastaq_reverse_complement
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Reverse complements all sequences in a fasta/q file',
- usage = '%(prog)s [options] <fasta/q in> <fasta/q out>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output fasta/q file')
-options = parser.parse_args()
-tasks.reverse_complement(options.infile, options.outfile)
diff --git a/scripts/fastaq_scaffolds_to_contigs b/scripts/fastaq_scaffolds_to_contigs
deleted file mode 100755
index 46d4861..0000000
--- a/scripts/fastaq_scaffolds_to_contigs
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Creates a file of contigs from a file of scaffolds - i.e. breaks at every gap in the input',
- usage = '%(prog)s [options] <infile> <outfile>')
-parser.add_argument('--number_contigs', action='store_true', help='Use this to enumerate contig names 1,2,3,... within each scaffold')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output contigs file')
-options = parser.parse_args()
-tasks.scaffolds_to_contigs(options.infile, options.outfile, number_contigs=options.number_contigs)
diff --git a/scripts/fastaq_search_for_seq b/scripts/fastaq_search_for_seq
deleted file mode 100755
index c00ed7a..0000000
--- a/scripts/fastaq_search_for_seq
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Searches for an exact match on a given string and its reverese complement, in every sequences of a fasta/q file. Case insensitive. Guaranteed to find all hits',
- usage = '%(prog)s [options] <fasta/q in> <outfile> <search_string>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of outputfile. Tab-delimited output: sequence name, position, strand')
-parser.add_argument('search_string', help='String to search for in the sequences')
-options = parser.parse_args()
-tasks.search_for_seq(options.infile, options.outfile, options.search_string)
diff --git a/scripts/fastaq_sequence_trim b/scripts/fastaq_sequence_trim
deleted file mode 100755
index 7021c6c..0000000
--- a/scripts/fastaq_sequence_trim
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Trims sequences off the start of all sequences in a pair of fasta/q files, whenever there is a perfect match. Only keeps a read pair if both reads of the pair are at least a minimum length after any trimming',
- usage = '%(prog)s [options] <fasta/q 1 in> <fastaq/2 in> <out 1> <out 2> <trim_seqs>')
-parser.add_argument('--min_length', type=int, help='Minimum length of output sequences [%(default)s]', default=50, metavar='INT')
-parser.add_argument('--revcomp', action='store_true', help='Trim the end of each sequence if it matches the reverse complement. This option is intended for PCR primer trimming')
-parser.add_argument('infile_1', help='Name of forward fasta/q file to be trimmed', metavar='fasta/q 1 in')
-parser.add_argument('infile_2', help='Name of reverse fasta/q file to be trimmed', metavar='fasta/q 2 in')
-parser.add_argument('outfile_1', help='Name of output forward fasta/q file', metavar='out_1')
-parser.add_argument('outfile_2', help='Name of output reverse fasta/q file', metavar='out_2')
-parser.add_argument('trim_seqs', help='Name of fasta/q file of sequences to search for at the start of each input sequence', metavar='trim_seqs')
-options = parser.parse_args()
-tasks.sequence_trim(
- options.infile_1,
- options.infile_2,
- options.outfile_1,
- options.outfile_2,
- options.trim_seqs,
- min_length=options.min_length,
- check_revcomp=options.revcomp
-)
diff --git a/scripts/fastaq_split_by_base_count b/scripts/fastaq_split_by_base_count
deleted file mode 100755
index dd7b43d..0000000
--- a/scripts/fastaq_split_by_base_count
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Splits a multi fasta/q file into separate files. Does not split sequences. Puts up to max_bases into each split file. The exception is that any sequence longer than max_bases is put into its own file.',
- usage = '%(prog)s [options] <fasta/q in> <prefix of output files> <max_bases>')
-parser.add_argument('infile', help='Name of input fasta/q file to be split')
-parser.add_argument('outprefix', help='Name of output fasta/q file')
-parser.add_argument('max_bases', type=int, help='Max bases in each output split file', metavar='max_bases')
-parser.add_argument('--max_seqs', type=int, help='Max number of sequences in each output split file [no limit]', metavar='INT')
-
-options = parser.parse_args()
-tasks.split_by_base_count(options.infile, options.outprefix, options.max_bases, options.max_seqs)
diff --git a/scripts/fastaq_strip_illumina_suffix b/scripts/fastaq_strip_illumina_suffix
deleted file mode 100755
index 6a29a42..0000000
--- a/scripts/fastaq_strip_illumina_suffix
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Strips /1 or /2 off the end of every read name in a fasta/q file',
- usage = '%(prog)s [options] <fasta/q in> <fasta/q out>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output fasta/q file')
-options = parser.parse_args()
-tasks.strip_illumina_suffix(options.infile, options.outfile)
diff --git a/scripts/fastaq_to_fake_qual b/scripts/fastaq_to_fake_qual
deleted file mode 100755
index 272f7a3..0000000
--- a/scripts/fastaq_to_fake_qual
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Makes fake quality scores file from a fasta/q file',
- usage = '%(prog)s <infile> <outfile>')
-parser.add_argument('infile', help='Name of input file')
-parser.add_argument('outfile', help='Name of output file')
-parser.add_argument('-q', '--qual', type=int, help='Quality score to assign to all bases [%(default)s]', default=40)
-options = parser.parse_args()
-tasks.fastaq_to_fake_qual(
- options.infile,
- options.outfile,
- q=options.qual
-)
-
diff --git a/scripts/fastaq_to_fasta b/scripts/fastaq_to_fasta
deleted file mode 100755
index 742e95f..0000000
--- a/scripts/fastaq_to_fasta
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Converts sequence file to FASTA format',
- usage = '%(prog)s <infile> <outfile>')
-parser.add_argument('infile', help='Name of input file. Can be any of FASTA, FASTQ, GFF3, EMBL, GBK, Phylip')
-parser.add_argument('outfile', help='Name of output file')
-parser.add_argument('-l', '--line_length', type=int, help='Number of bases on each sequence line of output file [%(default)s]', default=60)
-parser.add_argument('-s', '--strip_after_whitespace', action='store_true', help='Remove everything after first whitesapce in every sequence name')
-options = parser.parse_args()
-tasks.to_fasta(
- options.infile,
- options.outfile,
- line_length=options.line_length,
- strip_after_first_whitespace=options.strip_after_whitespace
-)
diff --git a/scripts/fastaq_to_mira_xml b/scripts/fastaq_to_mira_xml
deleted file mode 100755
index 582d669..0000000
--- a/scripts/fastaq_to_mira_xml
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Creates an xml file from a fasta/q file of reads, for use with Mira assembler',
- usage = '%(prog)s [options] <fastq_in> <xml_out>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('xml_out', help='Name of output xml file')
-options = parser.parse_args()
-tasks.fastaq_to_mira_xml(options.infile, options.xml_out)
diff --git a/scripts/fastaq_to_orfs_gff b/scripts/fastaq_to_orfs_gff
deleted file mode 100755
index 0098023..0000000
--- a/scripts/fastaq_to_orfs_gff
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Writes a GFF file of open reading frames from a fasta/q file',
- usage = '%(prog)s [options] <fasta/q in> <gff_out>')
-parser.add_argument('--min_length', type=int, help='Minimum length of ORF, in nucleotides [%(default)s]', default=300, metavar='INT')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('gff_out', help='Name of output gff file')
-options = parser.parse_args()
-tasks.fastaq_to_orfs_gff(options.infile, options.gff_out, min_length=options.min_length)
diff --git a/scripts/fastaq_to_perfect_reads b/scripts/fastaq_to_perfect_reads
deleted file mode 100755
index 6f3ca10..0000000
--- a/scripts/fastaq_to_perfect_reads
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import random
-from math import floor, ceil
-from fastaq import sequences, utils
-import sys
-
-parser = argparse.ArgumentParser(
- description = 'Makes perfect paired end fastq reads from a fasta/q file, with insert sizes sampled from a normal distribution. Read orientation is innies. Output is an interleaved fastq file.',
- usage = '%(prog)s <fasta/q in> <out.fastq> <mean insert size> <insert std deviation> <mean coverage> <read length>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output fastq file')
-parser.add_argument('mean_insert', type=int, help='Mean insert size of read pairs', metavar='mean insert size')
-parser.add_argument('insert_std', type=float, help='Standard devation of insert size', metavar='insert std deviation')
-parser.add_argument('coverage', type=float, help='Mean coverage of the reads', metavar='mean coverage')
-parser.add_argument('readlength', type=int, help='Length of each read', metavar='read length')
-parser.add_argument('--fragments', help='Write FASTA sequences of fragments (i.e. read pairs plus sequences in between them) to the given filename', metavar='FILENAME')
-parser.add_argument('--no_n', action='store_true', help='Don\'t allow any N or n characters in the reads')
-parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None, metavar='INT')
-options = parser.parse_args()
-
-random.seed(a=options.seed)
-
-seq_reader = sequences.file_reader(options.infile)
-fout = utils.open_file_write(options.outfile)
-pair_counter = 1
-
-if options.fragments:
- fout_frags = utils.open_file_write(options.fragments)
-
-for ref in seq_reader:
- # check if current seq is long enough
- if len(ref) < options.mean_insert + 4 * options.insert_std:
- print('Warning, sequence ', ref.id, ' too short. Skipping it...', file=sys.stderr)
- continue
-
- # work out how many reads to simulate
- read_pairs = int(0.5 * options.coverage * len(ref) / options.readlength)
-
- # it's possible that we pick the same fragment twice, in which case the
- # reads would get the same name. So remember the frag coords
- used_fragments = {} # (middle_position, length) => count
-
- # do the simulation: pick insert size from normal distribution, and
- # position in genome from uniform distribution
- x = 0
- while x < read_pairs:
- isize = int(random.normalvariate(options.mean_insert, options.insert_std))
- while isize > len(ref) or isize < options.readlength:
- isize = int(random.normalvariate(options.mean_insert, options.insert_std))
- middle_pos = random.randint(ceil(0.5 *isize), floor(len(ref) - 0.5 * isize))
- read_start1 = int(middle_pos - ceil(0.5 * isize))
- read_start2 = read_start1 + isize - options.readlength
-
- readname = ':'.join([ref.id, str(pair_counter), str(read_start1+1), str(read_start2+1)])
-
- fragment = (middle_pos, isize)
- if fragment in used_fragments:
- used_fragments[fragment] += 1
- readname += '.dup.' + str(used_fragments[fragment])
- else:
- used_fragments[fragment] = 1
-
- read1 = sequences.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], 'I' * options.readlength)
- read2 = sequences.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], 'I' * options.readlength)
-
-
- if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq):
- continue
-
- read2.revcomp()
-
- print(read1, file=fout)
- print(read2, file=fout)
-
- if options.fragments:
- frag = sequences.Fasta(readname, ref.seq[read_start1:read_start2 + options.readlength])
- print(frag, file=fout_frags)
-
- pair_counter += 1
- x += 1
-
-utils.close(fout)
-if options.fragments:
- utils.close(fout_frags)
diff --git a/scripts/fastaq_to_random_subset b/scripts/fastaq_to_random_subset
deleted file mode 100755
index b4f11c5..0000000
--- a/scripts/fastaq_to_random_subset
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import random
-from fastaq import sequences, utils
-
-parser = argparse.ArgumentParser(
- description = 'Takes a random subset of reads from a fasta/q file and optionally the corresponding read ' +
- 'from a mates file. Ouptut is interleaved if mates file given',
- usage = '%(prog)s [options] <fasta/q in> <outfile> <probablilty of keeping read (pair) in [0,100]>')
-parser.add_argument('--mate_file', help='Name of fasta/q mates file')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of fasta/q output file')
-parser.add_argument('probability', type=int, help='Probability of keeping any given read (pair) in [0,100]', metavar='INT')
-options = parser.parse_args()
-
-seq_reader = sequences.file_reader(options.infile)
-fout = utils.open_file_write(options.outfile)
-
-if options.mate_file:
- mate_seq_reader = sequences.file_reader(options.mate_file)
-
-for seq in seq_reader:
- if options.mate_file:
- try:
- mate_seq = next(mate_seq_reader)
- except StopIteration:
- print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
- sys.exit(1)
- if random.randint(0, 100) <= options.probability:
- print(seq, file=fout)
- if options.mate_file:
- print(mate_seq, file=fout)
-
-utils.close(fout)
diff --git a/scripts/fastaq_to_tiling_bam b/scripts/fastaq_to_tiling_bam
deleted file mode 100755
index 9b9738d..0000000
--- a/scripts/fastaq_to_tiling_bam
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import sys
-import os
-from fastaq import sequences, utils
-
-parser = argparse.ArgumentParser(
- description = 'Takes a fasta/q file. Makes a BAM file containing perfect (unpaired) reads tiling the whole genome',
- usage = '%(prog)s [options] <fasta/q in> <read length> <read step> <read prefix> <out.bam>',
- epilog = 'Important: assumes that samtools is in your path')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('read_length', type=int, help='Length of reads')
-parser.add_argument('read_step', type=int, help='Distance between start of each read')
-parser.add_argument('read_prefix', help='Prefix of read names')
-parser.add_argument('outfile', help='Name of output BAM file')
-parser.add_argument('--read_group', help='Add the given read group ID to all reads [%(default)s]' ,default='42')
-options = parser.parse_args()
-
-# make a header first - we need to add the @RG line to the default header made by samtools
-tmp_empty_file = options.outfile + '.tmp.empty'
-f = utils.open_file_write(tmp_empty_file)
-utils.close(f)
-try:
- f = os.popen('samtools view -H -T ' + options.infile + ' ' + tmp_empty_file)
-except IOError:
- print('Error making tmp header file', file=sys.stderr)
- sys.exit(1)
-
-header_lines = f.readlines()
-header_lines.append('@RG\tID:' + options.read_group + '\tSM:FAKE')
-f.close()
-os.unlink(tmp_empty_file)
-
-seq_reader = sequences.file_reader(options.infile)
-try:
- f = os.popen('samtools view -hbS - > ' + options.outfile, 'w')
-except IOError:
- print("Error opening for writing BAM file '" + options.outfile + "'", file=sys.stderr)
- sys.exit(1)
-
-print(''.join(header_lines), file=f)
-
-for seq in seq_reader:
- end_range = len(seq)
- if len(seq) < options.read_length:
- end_range = 1
- for i in range(0, end_range, options.read_step):
- if len(seq) <= options.read_length:
- start = 0
- end = len(seq) - 1
- else:
- start = i
- end = start + options.read_length - 1
-
- if end > len(seq) - 1:
- end = len(seq) - 1
- start = end - options.read_length + 1
-
- read = sequences.Fastq(options.read_prefix + ':' + seq.id + ':' + str(start + 1) + ':' + str(end + 1), seq[start:end+1], 'I' * (end - start + 1))
-
- print ('\t'.join([read.id,
- '0',
- seq.id,
- str(start + 1),
- '60',
- str(len(read)) + 'M',
- '*',
- '*',
- '*',
- read.seq,
- read.qual,
- 'RG:Z:' + options.read_group]), file=f)
-
- if end == len(seq) - 1:
- break
-
-f.close()
-
diff --git a/scripts/fastaq_to_unique_by_id b/scripts/fastaq_to_unique_by_id
deleted file mode 100755
index e743a92..0000000
--- a/scripts/fastaq_to_unique_by_id
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Removes duplicate sequences from a fasta/q file, based on their names. If the same name is found more than once, then the longest sequence is kept. Order of sequences is preserved in output',
- usage = '%(prog)s <infile> <outfile>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output fasta/q file')
-options = parser.parse_args()
-tasks.to_unique_by_id(options.infile, options.outfile)
diff --git a/scripts/fastaq_translate b/scripts/fastaq_translate
deleted file mode 100755
index 9ec04c7..0000000
--- a/scripts/fastaq_translate
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Translates all sequences in a fasta or fastq file. Output is always fasta format',
- usage = '%(prog)s <in.fasta/q> <out.fasta>')
-parser.add_argument('--frame', type=int, choices=[0,1,2], help='Frame to translate [%(default)s]', default=0)
-parser.add_argument('infile', help='Name of fasta/q file to be translated', metavar='in.fasta/q')
-parser.add_argument('outfile', help='Name of output fasta file', metavar='out.fasta')
-options = parser.parse_args()
-tasks.translate(options.infile, options.outfile, frame=options.frame)
diff --git a/scripts/fastaq_trim_Ns_at_end b/scripts/fastaq_trim_Ns_at_end
deleted file mode 100755
index 200d71f..0000000
--- a/scripts/fastaq_trim_Ns_at_end
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Trims any Ns off each sequence in a fasta/q file. Does nothing to gaps in the middle, just trims the ends',
- usage = '%(prog)s [options] <fasta/q in> <fasta/q out>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('outfile', help='Name of output fasta/q file')
-options = parser.parse_args()
-tasks.trim_Ns_at_end(options.infile, options.outfile)
diff --git a/scripts/fastaq_trim_ends b/scripts/fastaq_trim_ends
deleted file mode 100755
index ffc662d..0000000
--- a/scripts/fastaq_trim_ends
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-from fastaq import tasks
-
-parser = argparse.ArgumentParser(
- description = 'Trims set number of bases off each sequence in a fasta/q file',
- usage = '%(prog)s [options] <fasta/q in> <bases off start> <bases off end> <fasta/q out>')
-parser.add_argument('infile', help='Name of input fasta/q file')
-parser.add_argument('start_trim', type=int, help='Number of bases to trim off start')
-parser.add_argument('end_trim', type=int, help='Number of bases to trim off end')
-parser.add_argument('outfile', help='Name of output fasta/q file')
-options = parser.parse_args()
-tasks.trim(options.infile, options.outfile, options.start_trim, options.end_trim)
diff --git a/setup.py b/setup.py
index 5506ba9..ec726f7 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,19 @@
import os
import glob
+import sys
from setuptools import setup, find_packages
-def read(fname):
-    return open(os.path.join(os.path.dirname(__file__), fname)).read()
+
+try:
+    import numpy
+except ImportError:
+    print("Error! numpy for Python3 not found.\nPlease install it (e.g. apt-get install python3-numpy)", file=sys.stderr)
+    sys.exit(1)
setup(
- name='Fastaq',
- version='1.6.0',
- description='Scripts to manipulate FASTA and FASTQ files, plus API for developers',
- long_description=read('README.md'),
+ name='pyfastaq',
+ version='3.2.0',
+ description='Script to manipulate FASTA and FASTQ files, plus API for developers',
packages = find_packages(),
author='Martin Hunt',
author_email='mh12 at sanger.ac.uk',
@@ -18,4 +22,10 @@ setup(
test_suite='nose.collector',
install_requires=['nose >= 1.3'],
license='GPLv3',
+ classifiers=[
+ 'Development Status :: 4 - Beta',
+ 'Topic :: Scientific/Engineering :: Bio-Informatics',
+ 'Programming Language :: Python :: 3 :: Only',
+ 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
+ ],
)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fastaq.git
More information about the debian-med-commit mailing list