[med-svn] [fastaq] 01/01: Removed all the source code
Jorge Soares
jssoares-guest at moszumanska.debian.org
Wed Oct 8 10:54:01 UTC 2014
This is an automated email from the git hooks/post-receive script.
jssoares-guest pushed a commit to branch master
in repository fastaq.
commit 7b7443623efe3eccef7e2d8b613eb4273d68fe4e
Author: Jorge Soares <j.s.soares at gmail.com>
Date: Wed Oct 8 11:58:24 2014 +0100
Removed all the source code
---
debian/changelog | 4 +-
debian/watch | 4 +-
fastaq/__init__.py | 2 -
fastaq/intervals.py | 117 -----
fastaq/sequences.py | 549 --------------------
fastaq/tasks.py | 556 ---------------------
fastaq/tests/data/sequences_test.embl | 203 --------
fastaq/tests/data/sequences_test.embl.bad | 202 --------
fastaq/tests/data/sequences_test.embl.bad2 | 202 --------
fastaq/tests/data/sequences_test.embl.to_fasta | 64 ---
fastaq/tests/data/sequences_test.fa | 19 -
fastaq/tests/data/sequences_test.fa.ids | 4 -
fastaq/tests/data/sequences_test.fa.qual | 17 -
fastaq/tests/data/sequences_test.fa.qual.bad | 17 -
fastaq/tests/data/sequences_test.fasta_to_fastq.fq | 16 -
fastaq/tests/data/sequences_test.gbk | 170 -------
fastaq/tests/data/sequences_test.gbk.to_fasta | 10 -
fastaq/tests/data/sequences_test.line_length3.fa | 12 -
fastaq/tests/data/sequences_test_3-per-line.fa | 19 -
.../tests/data/sequences_test_cap_to_read_pairs.fa | 16 -
.../sequences_test_cap_to_read_pairs.fa.paired.gz | Bin 92 -> 0 bytes
...sequences_test_cap_to_read_pairs.fa.unpaired.gz | Bin 92 -> 0 bytes
.../tests/data/sequences_test_deinterleaved_1.fa | 4 -
.../tests/data/sequences_test_deinterleaved_2.fa | 4 -
.../data/sequences_test_deinterleaved_bad2_1.fa | 2 -
.../data/sequences_test_deinterleaved_bad2_2.fa | 4 -
.../data/sequences_test_deinterleaved_bad_1.fa | 4 -
.../data/sequences_test_deinterleaved_bad_2.fa | 2 -
fastaq/tests/data/sequences_test_empty_file | 0
.../tests/data/sequences_test_enumerate_names.fa | 8 -
...quences_test_enumerate_names.fa.out.keep_suffix | 8 -
.../sequences_test_enumerate_names.fa.out.start.1 | 8 -
...test_enumerate_names.fa.out.start.1.rename_file | 5 -
.../sequences_test_enumerate_names.fa.out.start.2 | 8 -
fastaq/tests/data/sequences_test_extend_gaps.fa | 8 -
.../tests/data/sequences_test_extend_gaps.fa.out | 4 -
fastaq/tests/data/sequences_test_fai_test.fa | 8 -
fastaq/tests/data/sequences_test_fai_test.fa.fai | 4 -
fastaq/tests/data/sequences_test_fail_no_AT.fq | 5 -
fastaq/tests/data/sequences_test_fail_no_plus.fq | 4 -
fastaq/tests/data/sequences_test_fail_no_qual.fq | 3 -
fastaq/tests/data/sequences_test_fail_no_seq.fq | 5 -
...sequences_test_fastaq_replace_bases.expected.fa | 2 -
.../data/sequences_test_fastaq_replace_bases.fa | 2 -
...sequences_test_fastaq_to_quasr_primers.expected | 2 -
.../data/sequences_test_fastaq_to_quasr_primers.fa | 4 -
.../data/sequences_test_filter_by_ids_file.fa | 8 -
.../sequences_test_filter_by_ids_file.fa.filtered | 4 -
...nces_test_filter_by_ids_file.fa.filtered.invert | 4 -
.../data/sequences_test_filter_by_ids_file.fa.ids | 2 -
.../tests/data/sequences_test_filter_by_regex.fa | 10 -
.../sequences_test_filter_by_regex.first-char-a.fa | 6 -
...sequences_test_filter_by_regex.first-of-pair.fa | 4 -
.../data/sequences_test_filter_by_regex.numeric.fa | 2 -
.../data/sequences_test_get_seqs_flanking_gaps.fa | 4 -
.../sequences_test_get_seqs_flanking_gaps.fa.out | 3 -
fastaq/tests/data/sequences_test_gffv3.gff | 9 -
fastaq/tests/data/sequences_test_gffv3.gff.fasta | 4 -
.../tests/data/sequences_test_gffv3.gff.to_fasta | 4 -
.../data/sequences_test_gffv3.no_FASTA_line.gff | 8 -
...sequences_test_gffv3.no_FASTA_line.gff.to_fasta | 4 -
.../tests/data/sequences_test_gffv3.no_seq.2.gff | 6 -
fastaq/tests/data/sequences_test_gffv3.no_seq.gff | 4 -
fastaq/tests/data/sequences_test_good_file.fq | 11 -
.../data/sequences_test_good_file.fq.to_fasta | 4 -
.../tests/data/sequences_test_good_file_mira.xml | 13 -
fastaq/tests/data/sequences_test_interleaved.fa | 8 -
fastaq/tests/data/sequences_test_interleaved.fq | 16 -
.../tests/data/sequences_test_interleaved_bad.fa | 6 -
fastaq/tests/data/sequences_test_length_filter.fa | 6 -
.../sequences_test_length_filter.min-0.max-1.fa | 0
.../sequences_test_length_filter.min-0.max-inf.fa | 6 -
.../sequences_test_length_filter.min-4.max-4.fa | 2 -
.../sequences_test_make_random_contigs.default.fa | 4 -
.../sequences_test_make_random_contigs.first-42.fa | 4 -
...ces_test_make_random_contigs.name-by-letters.fa | 56 ---
.../sequences_test_make_random_contigs.prefix-p.fa | 4 -
fastaq/tests/data/sequences_test_not_a_fastaq_file | 1 -
fastaq/tests/data/sequences_test_one-per-line.fa | 14 -
.../tests/data/sequences_test_phylip.interleaved | 8 -
.../sequences_test_phylip.interleaved.to_fasta | 6 -
.../tests/data/sequences_test_phylip.interleaved2 | 7 -
.../sequences_test_phylip.interleaved2.to_fasta | 6 -
.../data/sequences_test_phylip.made_by_seaview | 6 -
.../sequences_test_phylip.made_by_seaview.to_fasta | 6 -
fastaq/tests/data/sequences_test_phylip.sequential | 7 -
.../data/sequences_test_phylip.sequential.to_fasta | 6 -
fastaq/tests/data/sequences_test_revcomp.fa | 8 -
fastaq/tests/data/sequences_test_search_string.fa | 2 -
.../data/sequences_test_search_string.fa.hits | 4 -
.../tests/data/sequences_test_split_fixed_size.fa | 12 -
.../sequences_test_split_fixed_size.fa.split.1 | 2 -
.../sequences_test_split_fixed_size.fa.split.2 | 2 -
.../sequences_test_split_fixed_size.fa.split.3 | 2 -
.../sequences_test_split_fixed_size.fa.split.4 | 2 -
.../sequences_test_split_fixed_size.fa.split.5 | 4 -
.../sequences_test_split_fixed_size.fa.split.6 | 2 -
...sequences_test_split_fixed_size.fa.split.coords | 2 -
...test_split_fixed_size.fa.split.skip_if_all_Ns.1 | 2 -
...test_split_fixed_size.fa.split.skip_if_all_Ns.2 | 2 -
...test_split_fixed_size.fa.split.skip_if_all_Ns.3 | 4 -
...test_split_fixed_size.fa.split.skip_if_all_Ns.4 | 2 -
...split_fixed_size.fa.split.skip_if_all_Ns.coords | 1 -
fastaq/tests/data/sequences_test_split_test.fa | 8 -
fastaq/tests/data/sequences_test_split_test.fa.2.1 | 2 -
fastaq/tests/data/sequences_test_split_test.fa.2.2 | 2 -
fastaq/tests/data/sequences_test_split_test.fa.2.3 | 2 -
fastaq/tests/data/sequences_test_split_test.fa.2.4 | 2 -
fastaq/tests/data/sequences_test_split_test.fa.3.1 | 4 -
fastaq/tests/data/sequences_test_split_test.fa.3.2 | 2 -
fastaq/tests/data/sequences_test_split_test.fa.3.3 | 2 -
fastaq/tests/data/sequences_test_split_test.fa.4.1 | 4 -
fastaq/tests/data/sequences_test_split_test.fa.4.2 | 2 -
fastaq/tests/data/sequences_test_split_test.fa.4.3 | 2 -
fastaq/tests/data/sequences_test_split_test.fa.6.1 | 6 -
fastaq/tests/data/sequences_test_split_test.fa.6.2 | 2 -
.../data/sequences_test_split_test.fa.6.limit2.1 | 4 -
.../data/sequences_test_split_test.fa.6.limit2.2 | 2 -
.../data/sequences_test_split_test.fa.6.limit2.3 | 2 -
.../tests/data/sequences_test_split_test.long.fa | 4 -
.../data/sequences_test_split_test.long.fa.2.1 | 2 -
.../data/sequences_test_split_test.long.fa.2.2 | 2 -
.../data/sequences_test_strip_after_whitespace.fa | 6 -
...quences_test_strip_after_whitespace.fa.to_fasta | 6 -
.../data/sequences_test_strip_illumina_suffix.fq | 12 -
...equences_test_strip_illumina_suffix.fq.stripped | 12 -
.../tests/data/sequences_test_to_unique_by_id.fa | 11 -
.../data/sequences_test_to_unique_by_id.fa.out | 6 -
fastaq/tests/data/sequences_test_translate.fa | 2 -
.../tests/data/sequences_test_translate.fa.frame0 | 3 -
.../tests/data/sequences_test_translate.fa.frame1 | 3 -
.../tests/data/sequences_test_translate.fa.frame2 | 3 -
fastaq/tests/data/sequences_test_trim_Ns_at_end.fa | 10 -
.../data/sequences_test_trim_Ns_at_end.fa.trimmed | 8 -
fastaq/tests/data/sequences_test_trimmed.fq | 8 -
fastaq/tests/data/sequences_test_untrimmed.fq | 16 -
fastaq/tests/data/utils_test_file_transpose.txt | 5 -
fastaq/tests/data/utils_test_file_transposed.txt | 3 -
fastaq/tests/data/utils_test_not_really_zipped.gz | 1 -
fastaq/tests/data/utils_test_scaffolds.fa | 8 -
.../data/utils_test_scaffolds.fa.to_contigs.fa | 10 -
..._test_scaffolds.fa.to_contigs.number_contigs.fa | 10 -
fastaq/tests/data/utils_test_system_call.txt | 1 -
fastaq/tests/intervals_test.py | 212 --------
fastaq/tests/sequences_test.py | 535 --------------------
fastaq/tests/tasks_test.py | 449 -----------------
fastaq/tests/utils_test.py | 80 ---
fastaq/utils.py | 86 ----
{scripts => src}/fastaq_capillary_to_pairs | 0
{scripts => src}/fastaq_chunker | 0
{scripts => src}/fastaq_count_sequences | 0
{scripts => src}/fastaq_deinterleave | 0
{scripts => src}/fastaq_enumerate_names | 0
{scripts => src}/fastaq_extend_gaps | 0
{scripts => src}/fastaq_fasta_to_fastq | 0
{scripts => src}/fastaq_filter | 0
{scripts => src}/fastaq_get_ids | 0
{scripts => src}/fastaq_get_seq_flanking_gaps | 0
{scripts => src}/fastaq_insert_or_delete_bases | 0
{scripts => src}/fastaq_interleave | 0
{scripts => src}/fastaq_make_random_contigs | 0
{scripts => src}/fastaq_replace_bases | 0
{scripts => src}/fastaq_reverse_complement | 0
{scripts => src}/fastaq_scaffolds_to_contigs | 0
{scripts => src}/fastaq_search_for_seq | 0
{scripts => src}/fastaq_split_by_base_count | 0
{scripts => src}/fastaq_strip_illumina_suffix | 0
{scripts => src}/fastaq_to_fasta | 0
{scripts => src}/fastaq_to_mira_xml | 0
{scripts => src}/fastaq_to_perfect_reads | 0
{scripts => src}/fastaq_to_quasr_primers_file | 0
{scripts => src}/fastaq_to_random_subset | 0
{scripts => src}/fastaq_to_tiling_bam | 0
{scripts => src}/fastaq_to_unique_by_id | 0
{scripts => src}/fastaq_translate | 0
{scripts => src}/fastaq_trim_Ns_at_end | 0
{scripts => src}/fastaq_trim_ends | 0
177 files changed, 4 insertions(+), 4215 deletions(-)
diff --git a/debian/changelog b/debian/changelog
index a5388a4..f58bb6f 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,5 +1,5 @@
-Fastaq (0.1-1) UNRELEASED; urgency=low
+Fastaq (1.5.0-1) UNRELEASED; urgency=low
- * Initial release (Closes: #<bug>)
+ * Initial release (Closes: #1234)
-- DMPT <debian-med-packaging at lists.alioth.debian.org> Thu, 24 May 2012 14:30:13 +0200
diff --git a/debian/watch b/debian/watch
index 5317993..dba8b5a 100644
--- a/debian/watch
+++ b/debian/watch
@@ -1,4 +1,4 @@
version=3
-https://github.com/js21/Fastaq/tags \
- /js21/Fastaq/archive/([.\d]+)\.tar\.gz
+https://github.com/sanger-pathogens/Fastaq/tags \
+ /sanger-pathogens/Fastaq/archive/([.\d]+)\.tar\.gz
diff --git a/fastaq/__init__.py b/fastaq/__init__.py
deleted file mode 100644
index 52ded75..0000000
--- a/fastaq/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-__all__ = ['utils', 'sequences', 'tasks', 'intervals']
-from fastaq import *
diff --git a/fastaq/intervals.py b/fastaq/intervals.py
deleted file mode 100644
index b320c63..0000000
--- a/fastaq/intervals.py
+++ /dev/null
@@ -1,117 +0,0 @@
-class Error (Exception): pass
-
-
-class Interval:
- '''A class to deal with intervals in a genome. Can do things like intersections, unions etc'''
- def __init__(self, start, end):
- try:
- self.start = int(start)
- self.end = int(end)
- except ValueError:
- raise Error('Error making interval from :"' + str(start) + '" and "' + str(end) + '"')
-
- if self.end < self.start:
- raise Error('Error making interval ' + str(self) + '. end < start.')
-
- def __len__(self):
- return self.end - self.start + 1
-
- def __eq__(self, other):
- return type(other) is type(self) and self.__dict__ == other.__dict__
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __str__(self):
- return '(' + str(self.start) + ',' + str(self.end) + ')'
-
- def __lt__(self, i):
- return self.start < i.start or (self.start == i.start and self.end < i.end)
-
- def __le__(self, i):
- return self.start < i.start or (self.start == i.start and self.end <= i.end)
-
- def intersects(self, i):
- '''Returns true iff this interval intersects the interval i'''
- return self.start <= i.end and i.start <= self.end
-
- def contains(self, i):
- '''Returns true iff this interval contains the interval i'''
- return self.start <= i.start and i.end <= self.end
-
- def union(self, i):
- '''If intervals intersect, returns their union, otherwise returns None'''
- if self.intersects(i) or self.end + 1 == i.start or i.end + 1 == self.start:
- return Interval(min(self.start, i.start), max(self.end, i.end))
- else:
- return None
-
- def union_fill_gap(self, i):
- '''Like union, but ignores whether the two intervals intersect or not'''
- return Interval(min(self.start, i.start), max(self.end, i.end))
-
- def intersection(self, i):
- '''If intervals intersect, returns their intersection, otherwise returns None'''
- if self.intersects(i):
- return Interval(max(self.start, i.start), min(self.end, i.end))
- else:
- return None
-
-
-def intersection(l1, l2):
- '''Returns intersection of two lists. Assumes the lists are sorted by start positions'''
- if len(l1) == 0 or len(l2) == 0:
- return []
-
- out = []
- l2_pos = 0
-
- for l in l1:
- while l2_pos < len(l2) and l2[l2_pos].end < l.start:
- l2_pos += 1
-
- if l2_pos == len(l2):
- break
-
- while l2_pos < len(l2) and l.intersects(l2[l2_pos]):
- out.append(l.intersection(l2[l2_pos]))
- l2_pos += 1
-
- l2_pos = max(0, l2_pos - 1)
-
- return out
-
-
-def merge_overlapping_in_list(l):
- '''Sorts list, merges any overlapping intervals, and also adjacent intervals. e.g.
- [0,1], [1,2] would be merge to [0,.2].'''
- i = 0
- l.sort()
-
- while i < len(l) - 1:
- u = l[i].union(l[i+1])
- if u is not None:
- l[i] = u
- l.pop(i+1)
- else:
- i += 1
-
-
-def remove_contained_in_list(l):
- '''Sorts list in place, then removes any intervals that are completely
- contained inside another interval'''
- i = 0
- l.sort()
-
- while i < len(l) - 1:
- if l[i+1].contains(l[i]):
- l.pop(i)
- elif l[i].contains(l[i+1]):
- l.pop(i+1)
- else:
- i += 1
-
-
-def length_sum_from_list(l):
- '''Returns total length of intervals from a list'''
- return sum([len(x) for x in l])
diff --git a/fastaq/sequences.py b/fastaq/sequences.py
deleted file mode 100644
index 0ce03f8..0000000
--- a/fastaq/sequences.py
+++ /dev/null
@@ -1,549 +0,0 @@
-import re
-import string
-
-from fastaq import utils, intervals
-
-class Error (Exception): pass
-
-
-# python 3's seek is glacially slow. When we read a fasta file, we know
-# we've reached the end of a sequence when we get a new line starting with
-# '>'. Instead of using seek and tell, we just remember the previous line
-# of the file, for any given filehandle
-previous_lines = {}
-
-
-codon2aa = {
-'GCA': 'A',
-'GCC': 'A',
-'GCG': 'A',
-'GCT': 'A',
-'AGA': 'R',
-'AGG': 'R',
-'CGA': 'R',
-'CGC': 'R',
-'CGG': 'R',
-'CGT': 'R',
-'AAC': 'N',
-'AAT': 'N',
-'GAC': 'D',
-'GAT': 'D',
-'TGC': 'C',
-'TGT': 'C',
-'GAA': 'E',
-'GAG': 'E',
-'CAA': 'Q',
-'CAG': 'Q',
-'GGA': 'G',
-'GGC': 'G',
-'GGG': 'G',
-'GGT': 'G',
-'CAC': 'H',
-'CAT': 'H',
-'ATA': 'I',
-'ATC': 'I',
-'ATT': 'I',
-'TTA': 'L',
-'TTG': 'L',
-'CTA': 'L',
-'CTC': 'L',
-'CTG': 'L',
-'CTT': 'L',
-'AAA': 'K',
-'AAG': 'K',
-'ATG': 'M',
-'TTC': 'F',
-'TTT': 'F',
-'CCA': 'P',
-'CCC': 'P',
-'CCG': 'P',
-'CCT': 'P',
-'AGC': 'S',
-'AGT': 'S',
-'TCA': 'S',
-'TCC': 'S',
-'TCG': 'S',
-'TCT': 'S',
-'ACA': 'T',
-'ACC': 'T',
-'ACG': 'T',
-'ACT': 'T',
-'TGG': 'W',
-'TAC': 'Y',
-'TAT': 'Y',
-'GTA': 'V',
-'GTC': 'V',
-'GTG': 'V',
-'GTT': 'V',
-'TAA': '*',
-'TAG': '*',
-'TGA': '*'}
-
-def file_reader(fname, read_quals=False):
- '''Iterates over a FASTA or FASTQ file, yielding the next sequence in the file until there are no more sequences'''
- f = utils.open_file_read(fname)
- line = f.readline()
- phylip_regex = re.compile('^\s*[0-9]+\s+[0-9]+$')
- gbk_regex = re.compile('^LOCUS\s+\S')
-
- if line.startswith('>'):
- seq = Fasta()
- previous_lines[f] = line
- elif line.startswith('##gff-version 3'):
- seq = Fasta()
- # if a GFF file, need to skip past all the annotation
- # and get to the fasta sequences at the end of the file
- while not line.startswith('>'):
- line = f.readline()
- if not line:
- utils.close(f)
- raise Error('No sequences found in GFF file "' + fname + '"')
-
- seq = Fasta()
- previous_lines[f] = line
- elif line.startswith('ID ') and line[5] != ' ':
- seq = Embl()
- previous_lines[f] = line
- elif gbk_regex.search(line):
- seq = Embl()
- previous_lines[f] = line
- elif line.startswith('@'):
- seq = Fastq()
- previous_lines[f] = line
- elif phylip_regex.search(line):
- # phylip format could be interleaved or not, need to look at next
- # couple of lines to figure that out. Don't expect these files to
- # be too huge, so just store all the sequences in memory
- number_of_seqs, bases_per_seq = line.strip().split()
- number_of_seqs = int(number_of_seqs)
- bases_per_seq = int(bases_per_seq)
- got_blank_line = False
-
- first_line = line
- seq_lines = []
- while 1:
- line = f.readline()
- if line == '':
- break
- elif line == '\n':
- got_blank_line = True
- else:
- seq_lines.append(line.rstrip())
- utils.close(f)
-
- if len(seq_lines) == 1 or len(seq_lines) == number_of_seqs:
- sequential = True
- elif seq_lines[0][10] != ' ' and seq_lines[1][10] == ' ':
- sequential = True
- else:
- sequential = False
-
- # if the 11th char of second sequence line is a space, then the file is sequential, e.g.:
- # GAGCCCGGGC AATACAGGGT AT
- # as opposed to:
- # Salmo gairAAGCCTTGGC AGTGCAGGGT
- if sequential:
- current_id = None
- current_seq = ''
- for line in seq_lines:
- if len(current_seq) == bases_per_seq or len(current_seq) == 0:
- if current_id is not None:
- yield Fasta(current_id, current_seq.replace('-', ''))
- current_seq = ''
- current_id, new_bases = line[0:10].rstrip(), line.rstrip()[10:]
- else:
- new_bases = line.rstrip()
-
- current_seq += new_bases.replace(' ','')
-
- yield Fasta(current_id, current_seq.replace('-', ''))
- else:
- # seaview files start all seqs at pos >=12. Other files start
- # their sequence at the start of the line
- if seq_lines[number_of_seqs + 1][0] == ' ':
- first_gap_pos = seq_lines[0].find(' ')
- end_of_gap = first_gap_pos
- while seq_lines[0][end_of_gap] == ' ':
- end_of_gap += 1
- first_seq_base = end_of_gap
- else:
- first_seq_base = 10
-
- seqs = []
- for i in range(number_of_seqs):
- name, bases = seq_lines[i][0:first_seq_base].rstrip(), seq_lines[i][first_seq_base:]
- seqs.append(Fasta(name, bases))
-
- for i in range(number_of_seqs, len(seq_lines)):
- seqs[i%number_of_seqs].seq += seq_lines[i]
-
- for fa in seqs:
- fa.seq = fa.seq.replace(' ','').replace('-','')
- yield fa
-
- return
- elif line == '':
- utils.close(f)
- return
- else:
- utils.close(f)
- raise Error('Error determining file type from file "' + fname + '". First line is:\n' + line.rstrip())
-
- try:
- while seq.get_next_from_file(f, read_quals):
- yield seq
- finally:
- utils.close(f)
-
-
-class Fasta:
- '''Class to store and manipulate FASTA sequences. They have two things: a name and a sequence'''
- # this defines the line length when printing sequences
- line_length = 60
-
- def _get_id_from_header_line(self, line):
- if line.startswith('>'):
- return line.rstrip()[1:]
- else:
- raise Error('Error! expected line starting with ">", but got this:\n', line)
-
-
- def __init__(self, id_in=None, seq_in=None):
- self.id = id_in
- self.seq = seq_in
-
- def __eq__(self, other):
- return type(other) is type(self) and self.__dict__ == other.__dict__
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __len__(self):
- return len(self.seq)
-
- def split_capillary_id(self):
- '''Gets the prefix and suffix of an name of a capillary read, e.g. xxxxx.p1k or xxxx.q1k. Returns a tuple (prefix, suffx)'''
- try:
- a = self.id.rsplit('.', 1)
- if a[1].startswith('p'):
- dir = 'fwd'
- elif a[1].startswith('q'):
- dir = 'rev'
- else:
- dir = 'unk'
-
- return {'prefix': a[0], 'dir': dir, 'suffix':a[1]}
- except:
- raise Error('Error in split_capillary_id() on ID', self.id)
-
- def strip_after_first_whitespace(self):
- '''Removes everything in the name after the first whitespace character'''
- self.id = self.id.split()[0]
-
- def strip_illumina_suffix(self):
- '''Removes any trailing /1 or /2 off the end of the name'''
- if self.id.endswith('/1') or self.id.endswith('/2'):
- self.id = self.id[:-2]
-
- def revcomp(self):
- '''Reverse complements the sequence'''
- self.seq = self.seq.translate(str.maketrans("ATCGatcg", "TAGCtagc"))[::-1]
-
- def is_all_Ns(self, start=0, end=None):
- '''Returns true if the sequence is all Ns (upper or lower case)'''
- if end is not None:
- if start > end:
- raise Error('Error in is_all_Ns. Start coord must be <= end coord')
- end += 1
- else:
- end = len(self)
-
- if len(self) == 0:
- return False
- else:
- return re.search('[^Nn]', self.seq[start:end]) is None
-
- def trim_Ns(self):
- '''Removes any leading or trailing N or n characters from the sequence'''
- self.seq = self.seq.strip('Nn')
-
- def replace_bases(self, old, new):
- '''Replaces all occurences of 'old' with 'new' '''
- self.seq = self.seq.replace(old, new)
-
- def replace_interval(self, start, end, new):
- '''Replaces the sequence from start to end with the sequence "new"'''
- if start > end or start > len(self) - 1 or end > len(self) - 1:
- raise Error('Error replacing bases ' + str(start) + '-' + str(end) + ' in sequence ' + self.id)
-
- self.seq = self.seq[0:start] + new + self.seq[end + 1:]
-
- def gaps(self, min_length = 1):
- '''Finds the positions of all gaps in the sequence that are at least min_length long. Returns a list of Intervals. Coords are zero-based'''
- gaps = []
- regex = re.compile('N+', re.IGNORECASE)
- for m in regex.finditer(self.seq):
- if m.span()[1] - m.span()[0] + 1 >= min_length:
- gaps.append(intervals.Interval(m.span()[0], m.span()[1] - 1))
- return gaps
-
- def contig_coords(self):
- '''Finds coords of contigs, i.e. everything that's not a gap (N or n). Returns a list of Intervals. Coords are zero-based'''
- # contigs are the opposite of gaps, so work out the coords from the gap coords
- gaps = self.gaps()
-
- if len(gaps) == 0:
- return [intervals.Interval(0, len(self) - 1)]
-
- coords = [0]
- for g in gaps:
- if g.start == 0:
- coords = [g.end + 1]
- else:
- coords += [g.start - 1, g.end + 1]
-
- if coords[-1] < len(self):
- coords.append(len(self) - 1)
-
- return [intervals.Interval(coords[i], coords[i+1]) for i in range(0, len(coords)-1,2)]
-
-
-
- # Fills the object with the next sequence in the file. Returns
- # True if this was successful, False if no more sequences in the file.
- # If reading a file of quality scores, set read_quals = True
- def get_next_from_file(self, f, read_quals=False):
- if f in previous_lines:
- if previous_lines[f] == None:
- self.id = self.seq = None
- return False
- else:
- self.id = self._get_id_from_header_line(previous_lines[f])
- else:
- line = '\n'
- while line == '\n':
- line = f.readline()
- self.id = self._get_id_from_header_line(line)
-
- self.seq = ''
- seq_lines = [] # much faster to store the seq lines in an array,
- # then join at the end
-
- while 1:
- line = f.readline()
-
- if line.startswith('>'):
- previous_lines[f] = line.rstrip()
- break
- elif line == '':
- previous_lines[f] = None
- break
- else:
- seq_lines.append(line.rstrip())
-
- if read_quals:
- self.seq = ' '.join(seq_lines)
- else:
- self.seq = ''.join(seq_lines)
- return True
-
- def __str__(self):
- if Fasta.line_length == 0:
- return '>' + self.id + '\n' + self.seq
- else:
- return '>' + self.id + '\n' + '\n'.join(self.seq[i:i+Fasta.line_length] for i in range(0, len(self), Fasta.line_length))
-
- def __getitem__(self, index):
- return self.seq[index]
-
- def trim(self, start, end):
- '''Removes first 'start'/'end' bases off the start/end of the sequence'''
- self.seq = self.seq[start:len(self.seq) - end]
-
- # qual_scores should be a list of quality scores
- def to_Fastq(self, qual_scores):
- '''Returns a Fastq object. qual_scores expected to be a list of numbers, like you would get in a .qual file'''
- if len(self) != len(qual_scores):
- raise Error('Error making Fastq from Fasta, lengths differ.', self.id)
- return Fastq(self.id, self.seq, ''.join([chr(max(0, min(x, 93)) + 33) for x in qual_scores]))
-
- def search(self, search_string):
- '''Finds every occurence (including overlapping ones) of the search_string, including on the reverse strand. Returns a list where each element is a tuple (position, strand) where strand is in ['-', '+']. Positions are zero-based'''
- seq = self.seq.upper()
- search_string = search_string.upper()
- pos = 0
- found = seq.find(search_string, pos)
- hits = []
-
- while found != -1:
- hits.append((found, '+'))
- pos = found + 1
- found = seq.find(search_string, pos)
-
-
- pos = 0
- search_string = Fasta('x', search_string)
- search_string.revcomp()
- search_string = search_string.seq
- found = seq.find(search_string, pos)
-
- while found != -1:
- hits.append((found, '-'))
- pos = found + 1
- found = seq.find(search_string, pos)
-
- return hits
-
- def translate(self, frame=0):
- '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2'''
- return Fasta(self.id, ''.join([codon2aa.get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)]))
-
-
-class Embl(Fasta):
- '''Exactly the same as Fasta, but reading seqs from a file works differently'''
- def __eq__(self, other):
- return type(other) in [Fasta, Embl] and type(self) in [Fasta, Embl] and self.__dict__ == other.__dict__
-
- def _get_id_from_header_line(self, line):
- if line.startswith('ID ') and line[5] != ' ':
- return line.split()[1].rstrip(';')
- elif line.startswith('LOCUS'):
- return line.split()[1]
- else:
- raise Error('Error! expected line starting with "ID" or "LOCUS", but got this:\n', line)
-
- def get_next_from_file(self, f, read_quals=False):
- if f in previous_lines:
- line = ''
- if previous_lines[f] == None:
- self.id = self.seq = None
- return False
- else:
- self.id = self._get_id_from_header_line(previous_lines[f])
- else:
- line = '\n'
- while line == '\n':
- line = f.readline()
- self.id = self._get_id_from_header_line(line)
-
- self.seq = ''
- seq_lines = []
-
- while not (line.startswith('SQ') or line.rstrip() == 'ORIGIN'):
- line = f.readline()
- if line == '':
- raise Error('Error! No SQ or ORIGIN line found for sequence ' + self.id)
-
- line = f.readline()
-
- while not line.startswith('//'):
- if line == '' or line[0] != ' ':
- raise Error('Error! Did not find end of sequence ' + self.id)
- seq_lines.append(''.join(line.rstrip().strip(' 0123456789').split()))
- line = f.readline()
-
-
- while 1:
- if line.startswith('ID') or line.startswith('LOCUS'):
- previous_lines[f] = line.rstrip()
- break
- elif line == '':
- previous_lines[f] = None
- break
-
- line = f.readline()
-
- self.seq = ''.join(seq_lines)
- return True
-
-class Fastq(Fasta):
- '''Class to store and manipulate FASTQ sequences. They have three things: a name, sequence and string of quality scores'''
- def __init__(self, id_in=None, seq_in=None, qual_in=None):
- super().__init__(id_in, seq_in)
- self.qual = qual_in
- if (not self.seq == self.qual == None) and len(self.qual) != len(self.seq):
- raise Error('Error constructing Fastq. Mismatch in sequence and quality length\n' + str(self))
-
- def __str__(self):
- return '@' + self.id + '\n' + self.seq + '\n+\n' + self.qual
-
- def __eq__(self, other):
- return type(other) is type(self) and self.__dict__ == other.__dict__
-
- def get_next_from_file(self, f, read_quals=False):
- if f in previous_lines:
- line = previous_lines[f]
- del previous_lines[f]
- else:
- line = f.readline()
-
- while line == '\n':
- line = f.readline()
-
- if not line:
- self = Fastq('', '', '')
- return False
-
- if not line.startswith('@'):
- raise Error('Error getting next sequence from fastq file. Got line:\n' + line)
-
- self.id = line.rstrip()[1:]
- line = f.readline()
- if not line:
- raise Error('Error getting next sequence from fastq file, sequence has ID ' + self.id)
-
- self.seq = line.strip()
-
- line = f.readline()
- if not (line and line.startswith('+')):
- raise Error('Error getting next sequence from fastq file, no line starting with +, sequence has ID ' + self.id)
-
- line = f.readline()
- if not line:
- raise Error('Error getting next sequence from fastq file, sequence has ID ' + self.id)
-
- self.qual = line.rstrip()
- return True
-
- def revcomp(self):
- '''Reverse complements the sequence'''
- super().revcomp()
- self.qual = self.qual[::-1]
-
- def trim(self, start, end):
- '''Removes first 'start'/'end' bases off the start/end of the sequence'''
- super().trim(start, end)
- self.qual = self.qual[start:len(self.qual) - end]
-
- def to_Fasta_and_qual(self):
- quals = [ord(x) - 33 for x in self.qual]
- return (Fasta(self.id, self.seq), quals)
-
-
- def trim_Ns(self):
- '''Removes any leading or trailing N or n characters from the sequence'''
- # get index of first base that is not an N
- i = 0
- while i < len(self) and self.seq[i] in 'nN':
- i += 1
-
- # strip off start of sequence and quality
- self.seq = self.seq[i:]
- self.qual = self.qual[i:]
-
- # strip the ends
- self.seq = self.seq.rstrip('Nn')
- self.qual = self.qual[:len(self.seq)]
-
- def replace_interval(self, start, end, new, qual_string):
- '''Replaces the sequence from start to end with the sequence "new"'''
- if len(new) != len(qual_string):
- raise Error('Length of new seq and qual string in replace_interval() must be equal. Cannot continue')
- super().replace_interval(start, end, new)
- self.qual = self.qual[0:start] + qual_string + self.qual[end + 1:]
-
- def translate(self):
- '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2'''
- fa = super().translate()
- return Fastq(fa.id, fa.seq, 'I'*len(fa.seq))
-
diff --git a/fastaq/tasks.py b/fastaq/tasks.py
deleted file mode 100644
index ad10b06..0000000
--- a/fastaq/tasks.py
+++ /dev/null
@@ -1,556 +0,0 @@
-import re
-import copy
-import random
-from fastaq import sequences, utils
-
-class Error (Exception): pass
-
-def capillary_to_pairs(infile, outprefix):
- # hash the sequences, only taking longest where an end has been sequenced more than once
- seq_reader = sequences.file_reader(infile)
- fwd_seqs = {}
- rev_seqs = {}
- unpaired_seqs = {}
-
- for seq in seq_reader:
- id_info = seq.split_capillary_id()
- if id_info['dir'] == 'fwd':
- seq.id = id_info['prefix'] + '/1'
- h = fwd_seqs
- elif id_info['dir'] == 'rev':
- seq.id = id_info['prefix'] + '/2'
- h = rev_seqs
- else:
- seq.id = id_info['prefix']
- h = unpaired_seqs
-
- key = id_info['prefix']
-
- if key not in h or len(h[key]) < len(seq):
- h[key] = copy.copy(seq)
-
- # write the output files
- f_pe = utils.open_file_write(outprefix + '.paired.gz')
- f_up = utils.open_file_write(outprefix + '.unpaired.gz')
-
- for id in fwd_seqs:
- if id in rev_seqs:
- print(fwd_seqs[id], file=f_pe)
- print(rev_seqs[id], file=f_pe)
- del rev_seqs[id]
- else:
- print(fwd_seqs[id], file=f_up)
-
- for seq in rev_seqs.values():
- print(seq, file=f_up)
-
- for seq in unpaired_seqs.values():
- print(seq, file=f_up)
-
- utils.close(f_pe)
- utils.close(f_up)
-
-
-def count_sequences(infile):
- '''Returns the number of sequences in a file'''
- seq_reader = sequences.file_reader(infile)
- n = 0
- for seq in seq_reader:
- n += 1
- return n
-
-
-def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
- seq_reader = sequences.file_reader(infile)
- f_1 = utils.open_file_write(outfile_1)
- f_2 = utils.open_file_write(outfile_2)
- for seq in seq_reader:
- if fasta_out:
- print(sequences.Fasta(seq.id, seq.seq), file=f_1)
- else:
- print(seq, file=f_1)
- try:
- next(seq_reader)
- except StopIteration:
- utils.close(f_1)
- utils.close(f_2)
- raise Error('Error getting mate for sequence. Cannot continue')
- if fasta_out:
- print(sequences.Fasta(seq.id, seq.seq), file=f_2)
- else:
- print(seq, file=f_2)
-
- utils.close(f_1)
- utils.close(f_2)
-
-
-def enumerate_names(infile, outfile, start_index=1, keep_illumina_suffix=False, rename_file=None):
- seq_reader = sequences.file_reader(infile)
- fout_seqs = utils.open_file_write(outfile)
- counter = start_index
-
- if keep_illumina_suffix:
- sequence_suffixes = ['/1', '/2']
- else:
- sequence_suffixes = []
-
-
- if rename_file is not None:
- fout_rename = utils.open_file_write(rename_file)
- print('#old\tnew', file=fout_rename)
-
- for seq in seq_reader:
- old_id = seq.id
- seq.id = str(counter)
-
- for suff in sequence_suffixes:
- if old_id.endswith(suff):
- seq.id += suff
- break
-
- if rename_file is not None:
- print(old_id, seq.id, sep='\t', file=fout_rename)
-
- print(seq, file=fout_seqs)
- counter += 1
-
- utils.close(fout_seqs)
-
- if rename_file is not None:
- utils.close(fout_rename)
-
-
-def extend_gaps(infile, outfile, trim):
- seq_reader = sequences.file_reader(infile)
- fout = utils.open_file_write(outfile)
-
- for seq in seq_reader:
- if len(seq) < 2 * trim:
- continue
-
- gaps = seq.gaps()
- bases = list(seq.seq)
-
- # extend the length of each gap
- for gap in gaps:
- left_start = max(gap.start - trim, 0)
- right_end = min(gap.end + trim + 1, len(seq))
-
- for i in range(left_start, gap.start):
- bases[i] = 'N'
-
- for i in range(gap.end, right_end):
- bases[i] = 'N'
-
- seq.seq = ''.join(bases)
-
- # trim start/end bases and tidy up any resulting Ns at either end of the trimmed seq
- seq.trim(trim, trim)
- seq.trim_Ns()
-
- # check that there is some non-N sequence left over
- regex = re.compile('[^nN]')
- if regex.search(seq.seq) is not None:
- print(seq, file=fout)
-
- utils.close(fout)
-
-
-def fasta_to_fastq(fasta_in, qual_in, outfile):
- fa_reader = sequences.file_reader(fasta_in)
- qual_reader = sequences.file_reader(qual_in, read_quals=True)
- f_out = utils.open_file_write(outfile)
-
- for seq in fa_reader:
- qual = next(qual_reader)
- if seq.id != qual.id:
- utils.close(f_out)
- raise Error('Mismatch in names from fasta and qual file', seq.id, qual.id)
-
- qual.seq = [int(x) for x in qual.seq.split()]
- print(seq.to_Fastq(qual.seq), file=f_out)
-
- utils.close(f_out)
-
-
-def fastaq_to_mira_xml(infile, outfile):
- seq_reader = sequences.file_reader(infile)
- fout = utils.open_file_write(outfile)
- print('<?xml version="1.0"?>', '<trace_volume>', sep='\n', file=fout)
-
- for seq in seq_reader:
- print(' <trace>',
- ' <trace_name>' + seq.id + '</trace_name>',
- ' <clip_quality_right>' + str(len(seq)) + '</clip_quality_right>',
- ' <clip_vector_left>1</clip_vector_left>',
- ' </trace>', sep='\n', file=fout)
-
-
- print('</trace_volume>', file=fout)
- utils.close(fout)
-
-
-def file_to_dict(infile, d):
- seq_reader = sequences.file_reader(infile)
- for seq in seq_reader:
- d[seq.id] = copy.copy(seq)
-
-
-def filter(infile, outfile, minlength=0, maxlength=float('inf'), regex=None, ids_file=None, invert=False):
- ids_from_file = set()
- if ids_file is not None:
- f = utils.open_file_read(ids_file)
- for line in f:
- ids_from_file.add(line.rstrip())
- utils.close(f)
-
- seq_reader = sequences.file_reader(infile)
- f_out = utils.open_file_write(outfile)
- if regex is not None:
- r = re.compile(regex)
-
- for seq in seq_reader:
- hit = minlength <= len(seq) <= maxlength \
- and (regex is None or r.search(seq.id) is not None) \
- and (ids_file is None or seq.id in ids_from_file)
-
- if hit != invert:
- print(seq, file=f_out)
- utils.close(f_out)
-
-
-def get_ids(infile, outfile):
- seq_reader = sequences.file_reader(infile)
- f_out = utils.open_file_write(outfile)
- for seq in seq_reader:
- print(seq.id, file=f_out)
- utils.close(f_out)
-
-
-def get_seqs_flanking_gaps(infile, outfile, left, right):
- seq_reader = sequences.file_reader(infile)
- fout = utils.open_file_write(outfile)
-
- print('#id', 'gap_start', 'gap_end', 'left_bases', 'right_bases', sep='\t', file=fout)
-
- for seq in seq_reader:
- gaps = seq.gaps()
-
- for gap in gaps:
- left_start = max(gap.start - left, 0)
- right_end = min(gap.end + right + 1, len(seq))
- print(seq.id,
- gap.start + 1,
- gap.end + 1,
- seq.seq[left_start:gap.start],
- seq.seq[gap.end + 1:right_end],
- sep='\t', file=fout)
-
- utils.close(fout)
-
-
-def interleave(infile_1, infile_2, outfile):
- seq_reader_1 = sequences.file_reader(infile_1)
- seq_reader_2 = sequences.file_reader(infile_2)
- f_out = utils.open_file_write(outfile)
-
- for seq_1 in seq_reader_1:
- try:
- seq_2 = next(seq_reader_2)
- except:
- utils.close(f_out)
- raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')
-
- print(seq_1, file=f_out)
- print(seq_2, file=f_out)
-
- try:
- seq_2 = next(seq_reader_2)
- except:
- seq_2 = None
-
- if seq_2 is not None:
- utils.close(f_out)
- raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')
-
- utils.close(f_out)
-
-
-def make_random_contigs(contigs, length, outfile, name_by_letters=False, prefix='', seed=None, first_number=1):
- '''Makes a multi fasta file of random sequences, all the same length'''
- random.seed(a=seed)
- fout = utils.open_file_write(outfile)
- letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
- letters_index = 0
-
- for i in range(contigs):
- if name_by_letters:
- name = letters[letters_index]
- letters_index += 1
- if letters_index == len(letters):
- letters_index = 0
- else:
- name = str(i + first_number)
-
- fa = sequences.Fasta(prefix + name, ''.join([random.choice('ACGT') for x in range(length)]))
- print(fa, file=fout)
-
- utils.close(fout)
-
-
-def reverse_complement(infile, outfile):
- seq_reader = sequences.file_reader(infile)
- fout = utils.open_file_write(outfile)
-
- for seq in seq_reader:
- seq.revcomp()
- print(seq, file=fout)
-
- utils.close(fout)
-
-
-def scaffolds_to_contigs(infile, outfile, number_contigs=False):
- '''Makes a file of contigs from scaffolds by splitting at every N.
- Use number_contigs=True to add .1, .2, etc onto end of each
- contig, instead of default to append coordinates.'''
- seq_reader = sequences.file_reader(infile)
- fout = utils.open_file_write(outfile)
-
- for seq in seq_reader:
- contigs = seq.contig_coords()
- counter = 1
- for contig in contigs:
- if number_contigs:
- name = seq.id + '.' + str(counter)
- counter += 1
- else:
- name = '.'.join([seq.id, str(contig.start + 1), str(contig.end + 1)])
- print(sequences.Fasta(name, seq[contig.start:contig.end+1]), file=fout)
-
- utils.close(fout)
-
-
-def search_for_seq(infile, outfile, search_string):
- seq_reader = sequences.file_reader(infile)
- fout = utils.open_file_write(outfile)
-
- for seq in seq_reader:
- hits = seq.search(search_string)
- for hit in hits:
- print(seq.id, hit[0]+1, hit[1], sep='\t', file=fout)
-
- utils.close(fout)
-
-
-def translate(infile, outfile, frame=0):
- seq_reader = sequences.file_reader(infile)
- fout = utils.open_file_write(outfile)
-
- for seq in seq_reader:
- print(seq.translate(frame=frame), file=fout)
-
- utils.close(fout)
-
-
-def trim(infile, outfile, start, end):
- seq_reader = sequences.file_reader(infile)
- fout = utils.open_file_write(outfile)
-
- for seq in seq_reader:
- seq.trim(start, end)
- if len(seq):
- print(seq, file=fout)
-
- utils.close(fout)
-
-
-def trim_Ns_at_end(infile, outfile):
- seq_reader = sequences.file_reader(infile)
- fout = utils.open_file_write(outfile)
-
- for seq in seq_reader:
- seq.trim_Ns()
- if len(seq):
- print(seq, file=fout)
-
- utils.close(fout)
-
-
-def lengths_from_fai(fai_file, d):
- f = utils.open_file_read(fai_file)
- for line in f:
- (id, length) = line.rstrip().split()[:2]
- d[id] = int(length)
- utils.close(f)
-
-
-def split_by_base_count(infile, outfiles_prefix, max_bases, max_seqs=None):
- '''Splits a fasta/q file into separate files, file size determined by number of bases.
-
- Puts <= max_bases in each split file The exception is a single sequence >=max_bases
- is put in its own file. This does not split sequences.
- '''
- seq_reader = sequences.file_reader(infile)
- base_count = 0
- file_count = 1
- seq_count = 0
- fout = None
- if max_seqs is None:
- max_seqs = float('inf')
-
- for seq in seq_reader:
- if base_count == 0:
- fout = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
- file_count += 1
-
- if base_count + len(seq) > max_bases or seq_count >= max_seqs:
- if base_count == 0:
- print(seq, file=fout)
- utils.close(fout)
- else:
- utils.close(fout)
- fout = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
- print(seq, file=fout)
- base_count = len(seq)
- file_count += 1
- seq_count = 1
- else:
- base_count += len(seq)
- seq_count += 1
- print(seq, file=fout)
-
- utils.close(fout)
-
-
-def split_by_fixed_size(infile, outfiles_prefix, chunk_size, tolerance, skip_if_all_Ns=False):
- '''Splits fasta/q file into separate files, with up to (chunk_size + tolerance) bases in each file'''
- file_count = 1
- coords = []
- small_sequences = [] # sequences shorter than chunk_size
- seq_reader = sequences.file_reader(infile)
- f_coords = utils.open_file_write(outfiles_prefix + '.coords')
-
- for seq in seq_reader:
- if skip_if_all_Ns and seq.is_all_Ns():
- continue
- if len(seq) < chunk_size:
- small_sequences.append(copy.copy(seq))
- elif len(seq) <= chunk_size + tolerance:
- f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
- print(seq, file=f)
- utils.close(f)
- file_count += 1
- else:
- # make list of chunk coords
- chunks = [(x,x+chunk_size) for x in range(0, len(seq), chunk_size)]
- if chunks[-1][1] - 1 > len(seq):
- chunks[-1] = (chunks[-1][0], len(seq))
- if len(chunks) > 1 and (chunks[-1][1] - chunks[-1][0]) <= tolerance:
- chunks[-2] = (chunks[-2][0], chunks[-1][1])
- chunks.pop()
-
- # write one output file per chunk
- offset = 0
- for chunk in chunks:
- if not(skip_if_all_Ns and seq.is_all_Ns(start=chunk[0], end=chunk[1]-1)):
- f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
- chunk_id = seq.id + ':' + str(chunk[0]+1) + '-' + str(chunk[1])
- print(sequences.Fasta(chunk_id, seq[chunk[0]:chunk[1]]), file=f)
- print(chunk_id, seq.id, offset, sep='\t', file=f_coords)
- utils.close(f)
- file_count += 1
-
- offset += chunk[1] - chunk[0]
-
- # write files of small sequences
- if len(small_sequences):
- f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
- file_count += 1
- base_count = 0
- for seq in small_sequences:
- if base_count > 0 and base_count + len(seq) > chunk_size + tolerance:
- utils.close(f)
- f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
- file_count += 1
- base_count = 0
-
- print(seq, file=f)
- base_count += len(seq)
-
- utils.close(f)
-
-
-def replace_bases(infile, outfile, old, new):
- seq_reader = sequences.file_reader(infile)
- f_out = utils.open_file_write(outfile)
-
- for seq in seq_reader:
- seq.replace_bases(old, new)
- print(seq, file=f_out)
-
- utils.close(f_out)
-
-
-def strip_illumina_suffix(infile, outfile):
- seq_reader = sequences.file_reader(infile)
- f_out = utils.open_file_write(outfile)
-
- for seq in seq_reader:
- seq.strip_illumina_suffix()
- print(seq, file=f_out)
-
- utils.close(f_out)
-
-
-def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False):
- seq_reader = sequences.file_reader(infile)
- f_out = utils.open_file_write(outfile)
- original_line_length = sequences.Fasta.line_length
- sequences.Fasta.line_length = line_length
-
- for seq in seq_reader:
- if strip_after_first_whitespace:
- seq.strip_after_first_whitespace()
-
- if type(seq) == sequences.Fastq:
- print(sequences.Fasta(seq.id, seq.seq), file=f_out)
- else:
- print(seq, file=f_out)
-
- utils.close(f_out)
- sequences.Fasta.line_length = original_line_length
-
-
-def to_quasr_primers(infile, outfile):
- seq_reader = sequences.file_reader(infile)
- f_out = utils.open_file_write(outfile)
-
- for seq in seq_reader:
- seq2 = copy.copy(seq)
- seq2.revcomp()
- print(seq.seq, seq2.seq, sep='\t', file=f_out)
-
- utils.close(f_out)
-
-
-def to_unique_by_id(infile, outfile):
- seq_reader = sequences.file_reader(infile)
- seqs = {}
- ids_in_order = []
-
- # has the reads, keeping the longest one when we get the same
- # name more than once
- for seq in seq_reader:
- if len(seq) == 0:
- continue
- if seq.id not in seqs:
- seqs[seq.id] = copy.copy(seq)
- ids_in_order.append(seq.id)
- elif len(seqs[seq.id]) < len(seq):
- seqs[seq.id] = copy.copy(seq)
-
- # write the output
- f_out = utils.open_file_write(outfile)
- for id in ids_in_order:
- print(seqs[id], file=f_out)
- utils.close(f_out)
diff --git a/fastaq/tests/data/sequences_test.embl b/fastaq/tests/data/sequences_test.embl
deleted file mode 100644
index b40c185..0000000
--- a/fastaq/tests/data/sequences_test.embl
+++ /dev/null
@@ -1,203 +0,0 @@
-ID seq1; SV 1; linear; mRNA; STD; PLN; 1859 BP.
-XX
-AC X56734; S46826;
-XX
-DT 12-SEP-1991 (Rel. 29, Created)
-DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
-XX
-DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
-XX
-KW beta-glucosidase.
-XX
-OS Trifolium repens (white clover)
-OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
-OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
-OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
-XX
-RN [5]
-RP 1-1859
-RX DOI; 10.1007/BF00039495.
-RX PUBMED; 1907511.
-RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
-RT "Nucleotide and derived amino acid sequence of the cyanogenic
-RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
-RL Plant Mol. Biol. 17(2):209-219(1991).
-XX
-RN [6]
-RP 1-1859
-RA Hughes M.A.;
-RT ;
-RL Submitted (19-NOV-1990) to the INSDC.
-RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
-RL Upon Tyne, NE2 4HH, UK
-XX
-DR EuropePMC; PMC99098; 11752244.
-XX
-FH Key Location/Qualifiers
-FH
-FT source 1..1859
-FT /organism="Trifolium repens"
-FT /mol_type="mRNA"
-FT /clone_lib="lambda gt10"
-FT /clone="TRE361"
-FT /tissue_type="leaves"
-FT /db_xref="taxon:3899"
-FT mRNA 1..1859
-FT /experiment="experimental evidence, no additional details
-FT recorded"
-FT CDS 14..1495
-FT /product="beta-glucosidase"
-FT /EC_number="3.2.1.21"
-FT /note="non-cyanogenic"
-FT /db_xref="GOA:P26204"
-FT /db_xref="InterPro:IPR001360"
-FT /db_xref="InterPro:IPR013781"
-FT /db_xref="InterPro:IPR017853"
-FT /db_xref="InterPro:IPR018120"
-FT /db_xref="UniProtKB/Swiss-Prot:P26204"
-FT /protein_id="CAA40058.1"
-FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
-FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
-FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
-FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
-FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
-FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
-FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
-FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
-FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
-XX
-SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
- aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
- cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
- tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
- aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
- tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
- caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
- ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
- atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
- ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
- tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
- gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
- aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
- aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
- taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
- gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
- cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
- gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
- ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
- acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
- acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
- gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
- gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
- agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
- ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
- taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
- tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
- ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
- tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
- aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
- agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
- tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
-//
-ID seq2; SV 1; linear; mRNA; STD; PLN; 1859 BP.
-XX
-AC X56734; S46826;
-XX
-DT 12-SEP-1991 (Rel. 29, Created)
-DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
-XX
-DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
-XX
-KW beta-glucosidase.
-XX
-OS Trifolium repens (white clover)
-OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
-OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
-OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
-XX
-RN [5]
-RP 1-1859
-RX DOI; 10.1007/BF00039495.
-RX PUBMED; 1907511.
-RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
-RT "Nucleotide and derived amino acid sequence of the cyanogenic
-RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
-RL Plant Mol. Biol. 17(2):209-219(1991).
-XX
-RN [6]
-RP 1-1859
-RA Hughes M.A.;
-RT ;
-RL Submitted (19-NOV-1990) to the INSDC.
-RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
-RL Upon Tyne, NE2 4HH, UK
-XX
-DR EuropePMC; PMC99098; 11752244.
-XX
-FH Key Location/Qualifiers
-FH
-FT source 1..1859
-FT /organism="Trifolium repens"
-FT /mol_type="mRNA"
-FT /clone_lib="lambda gt10"
-FT /clone="TRE361"
-FT /tissue_type="leaves"
-FT /db_xref="taxon:3899"
-FT mRNA 1..1859
-FT /experiment="experimental evidence, no additional details
-FT recorded"
-FT CDS 14..1495
-FT /product="beta-glucosidase"
-FT /EC_number="3.2.1.21"
-FT /note="non-cyanogenic"
-FT /db_xref="GOA:P26204"
-FT /db_xref="InterPro:IPR001360"
-FT /db_xref="InterPro:IPR013781"
-FT /db_xref="InterPro:IPR017853"
-FT /db_xref="InterPro:IPR018120"
-FT /db_xref="UniProtKB/Swiss-Prot:P26204"
-FT /protein_id="CAA40058.1"
-FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
-FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
-FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
-FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
-FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
-FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
-FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
-FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
-FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
-XX
-SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
- aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
- cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
- tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
- aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
- tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
- caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
- ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
- atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
- ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
- tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
- gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
- aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
- aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
- taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
- gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
- cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
- gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
- ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
- acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
- acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
- gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
- gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
- agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
- ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
- taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
- tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
- ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
- tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
- aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
- agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
- tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa ccccccccc 1859
-//
-
diff --git a/fastaq/tests/data/sequences_test.embl.bad b/fastaq/tests/data/sequences_test.embl.bad
deleted file mode 100644
index 10ca1ab..0000000
--- a/fastaq/tests/data/sequences_test.embl.bad
+++ /dev/null
@@ -1,202 +0,0 @@
-ID seq1; SV 1; linear; mRNA; STD; PLN; 1859 BP.
-XX
-AC X56734; S46826;
-XX
-DT 12-SEP-1991 (Rel. 29, Created)
-DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
-XX
-DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
-XX
-KW beta-glucosidase.
-XX
-OS Trifolium repens (white clover)
-OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
-OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
-OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
-XX
-RN [5]
-RP 1-1859
-RX DOI; 10.1007/BF00039495.
-RX PUBMED; 1907511.
-RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
-RT "Nucleotide and derived amino acid sequence of the cyanogenic
-RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
-RL Plant Mol. Biol. 17(2):209-219(1991).
-XX
-RN [6]
-RP 1-1859
-RA Hughes M.A.;
-RT ;
-RL Submitted (19-NOV-1990) to the INSDC.
-RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
-RL Upon Tyne, NE2 4HH, UK
-XX
-DR EuropePMC; PMC99098; 11752244.
-XX
-FH Key Location/Qualifiers
-FH
-FT source 1..1859
-FT /organism="Trifolium repens"
-FT /mol_type="mRNA"
-FT /clone_lib="lambda gt10"
-FT /clone="TRE361"
-FT /tissue_type="leaves"
-FT /db_xref="taxon:3899"
-FT mRNA 1..1859
-FT /experiment="experimental evidence, no additional details
-FT recorded"
-FT CDS 14..1495
-FT /product="beta-glucosidase"
-FT /EC_number="3.2.1.21"
-FT /note="non-cyanogenic"
-FT /db_xref="GOA:P26204"
-FT /db_xref="InterPro:IPR001360"
-FT /db_xref="InterPro:IPR013781"
-FT /db_xref="InterPro:IPR017853"
-FT /db_xref="InterPro:IPR018120"
-FT /db_xref="UniProtKB/Swiss-Prot:P26204"
-FT /protein_id="CAA40058.1"
-FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
-FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
-FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
-FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
-FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
-FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
-FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
-FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
-FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
-XX
-SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
- aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
- cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
- tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
- aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
- tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
- caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
- ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
- atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
- ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
- tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
- gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
- aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
- aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
- taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
- gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
- cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
- gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
- ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
- acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
- acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
- gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
- gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
- agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
- ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
- taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
- tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
- ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
- tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
- aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
- agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
- tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
-//
-ID seq2; SV 1; linear; mRNA; STD; PLN; 1859 BP.
-XX
-AC X56734; S46826;
-XX
-DT 12-SEP-1991 (Rel. 29, Created)
-DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
-XX
-DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
-XX
-KW beta-glucosidase.
-XX
-OS Trifolium repens (white clover)
-OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
-OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
-OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
-XX
-RN [5]
-RP 1-1859
-RX DOI; 10.1007/BF00039495.
-RX PUBMED; 1907511.
-RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
-RT "Nucleotide and derived amino acid sequence of the cyanogenic
-RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
-RL Plant Mol. Biol. 17(2):209-219(1991).
-XX
-RN [6]
-RP 1-1859
-RA Hughes M.A.;
-RT ;
-RL Submitted (19-NOV-1990) to the INSDC.
-RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
-RL Upon Tyne, NE2 4HH, UK
-XX
-DR EuropePMC; PMC99098; 11752244.
-XX
-FH Key Location/Qualifiers
-FH
-FT source 1..1859
-FT /organism="Trifolium repens"
-FT /mol_type="mRNA"
-FT /clone_lib="lambda gt10"
-FT /clone="TRE361"
-FT /tissue_type="leaves"
-FT /db_xref="taxon:3899"
-FT mRNA 1..1859
-FT /experiment="experimental evidence, no additional details
-FT recorded"
-FT CDS 14..1495
-FT /product="beta-glucosidase"
-FT /EC_number="3.2.1.21"
-FT /note="non-cyanogenic"
-FT /db_xref="GOA:P26204"
-FT /db_xref="InterPro:IPR001360"
-FT /db_xref="InterPro:IPR013781"
-FT /db_xref="InterPro:IPR017853"
-FT /db_xref="InterPro:IPR018120"
-FT /db_xref="UniProtKB/Swiss-Prot:P26204"
-FT /protein_id="CAA40058.1"
-FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
-FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
-FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
-FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
-FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
-FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
-FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
-FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
-FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
-XX
- aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
- cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
- tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
- aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
- tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
- caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
- ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
- atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
- ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
- tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
- gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
- aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
- aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
- taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
- gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
- cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
- gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
- ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
- acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
- acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
- gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
- gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
- agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
- ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
- taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
- tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
- ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
- tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
- aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
- agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
- tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa ccccccccc 1859
-//
-
diff --git a/fastaq/tests/data/sequences_test.embl.bad2 b/fastaq/tests/data/sequences_test.embl.bad2
deleted file mode 100644
index 1dd59b1..0000000
--- a/fastaq/tests/data/sequences_test.embl.bad2
+++ /dev/null
@@ -1,202 +0,0 @@
-ID seq1; SV 1; linear; mRNA; STD; PLN; 1859 BP.
-XX
-AC X56734; S46826;
-XX
-DT 12-SEP-1991 (Rel. 29, Created)
-DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
-XX
-DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
-XX
-KW beta-glucosidase.
-XX
-OS Trifolium repens (white clover)
-OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
-OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
-OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
-XX
-RN [5]
-RP 1-1859
-RX DOI; 10.1007/BF00039495.
-RX PUBMED; 1907511.
-RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
-RT "Nucleotide and derived amino acid sequence of the cyanogenic
-RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
-RL Plant Mol. Biol. 17(2):209-219(1991).
-XX
-RN [6]
-RP 1-1859
-RA Hughes M.A.;
-RT ;
-RL Submitted (19-NOV-1990) to the INSDC.
-RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
-RL Upon Tyne, NE2 4HH, UK
-XX
-DR EuropePMC; PMC99098; 11752244.
-XX
-FH Key Location/Qualifiers
-FH
-FT source 1..1859
-FT /organism="Trifolium repens"
-FT /mol_type="mRNA"
-FT /clone_lib="lambda gt10"
-FT /clone="TRE361"
-FT /tissue_type="leaves"
-FT /db_xref="taxon:3899"
-FT mRNA 1..1859
-FT /experiment="experimental evidence, no additional details
-FT recorded"
-FT CDS 14..1495
-FT /product="beta-glucosidase"
-FT /EC_number="3.2.1.21"
-FT /note="non-cyanogenic"
-FT /db_xref="GOA:P26204"
-FT /db_xref="InterPro:IPR001360"
-FT /db_xref="InterPro:IPR013781"
-FT /db_xref="InterPro:IPR017853"
-FT /db_xref="InterPro:IPR018120"
-FT /db_xref="UniProtKB/Swiss-Prot:P26204"
-FT /protein_id="CAA40058.1"
-FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
-FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
-FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
-FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
-FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
-FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
-FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
-FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
-FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
-XX
-SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
- aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
- cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
- tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
- aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
- tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
- caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
- ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
- atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
- ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
- tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
- gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
- aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
- aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
- taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
- gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
- cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
- gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
- ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
- acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
- acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
- gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
- gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
- agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
- ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
- taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
- tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
- ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
- tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
- aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
- agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
- tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
-ID seq2; SV 1; linear; mRNA; STD; PLN; 1859 BP.
-XX
-AC X56734; S46826;
-XX
-DT 12-SEP-1991 (Rel. 29, Created)
-DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
-XX
-DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
-XX
-KW beta-glucosidase.
-XX
-OS Trifolium repens (white clover)
-OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
-OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
-OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
-XX
-RN [5]
-RP 1-1859
-RX DOI; 10.1007/BF00039495.
-RX PUBMED; 1907511.
-RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
-RT "Nucleotide and derived amino acid sequence of the cyanogenic
-RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
-RL Plant Mol. Biol. 17(2):209-219(1991).
-XX
-RN [6]
-RP 1-1859
-RA Hughes M.A.;
-RT ;
-RL Submitted (19-NOV-1990) to the INSDC.
-RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
-RL Upon Tyne, NE2 4HH, UK
-XX
-DR EuropePMC; PMC99098; 11752244.
-XX
-FH Key Location/Qualifiers
-FH
-FT source 1..1859
-FT /organism="Trifolium repens"
-FT /mol_type="mRNA"
-FT /clone_lib="lambda gt10"
-FT /clone="TRE361"
-FT /tissue_type="leaves"
-FT /db_xref="taxon:3899"
-FT mRNA 1..1859
-FT /experiment="experimental evidence, no additional details
-FT recorded"
-FT CDS 14..1495
-FT /product="beta-glucosidase"
-FT /EC_number="3.2.1.21"
-FT /note="non-cyanogenic"
-FT /db_xref="GOA:P26204"
-FT /db_xref="InterPro:IPR001360"
-FT /db_xref="InterPro:IPR013781"
-FT /db_xref="InterPro:IPR017853"
-FT /db_xref="InterPro:IPR018120"
-FT /db_xref="UniProtKB/Swiss-Prot:P26204"
-FT /protein_id="CAA40058.1"
-FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
-FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
-FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
-FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
-FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
-FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
-FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
-FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
-FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
-XX
-SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
- aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
- cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
- tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
- aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
- tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
- caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
- ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
- atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
- ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
- tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
- gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
- aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
- aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
- taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
- gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
- cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
- gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
- ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
- acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
- acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
- gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
- gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
- agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
- ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
- taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
- tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
- ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
- tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
- aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
- agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
- tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa ccccccccc 1859
-//
-
diff --git a/fastaq/tests/data/sequences_test.embl.to_fasta b/fastaq/tests/data/sequences_test.embl.to_fasta
deleted file mode 100644
index 89e2230..0000000
--- a/fastaq/tests/data/sequences_test.embl.to_fasta
+++ /dev/null
@@ -1,64 +0,0 @@
->seq1
-aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcatt
-cacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgag
-tcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttga
-aggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaata
-tccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgcta
-caaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctc
-ttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaa
-atattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactct
-ttttcattgggatcttccccaagtcttagaagatgagtatggtggtttcttaaactccgg
-tgtaataaatgattttcgagactatacggatctttgcttcaaggaatttggagatagagt
-gaggtattggagtactctaaatgagccatgggtgtttagcaattctggatatgcactagg
-aacaaatgcaccaggtcgatgttcggcctccaacgtggccaagcctggtgattctggaac
-aggaccttatatagttacacacaatcaaattcttgctcatgcagaagctgtacatgtgta
-taagactaaataccaggcatatcaaaagggaaagataggcataacgttggtatctaactg
-gttaatgccacttgatgataatagcataccagatataaaggctgccgagagatcacttga
-cttccaatttggattgtttatggaacaattaacaacaggagattattctaagagcatgcg
-gcgtatagttaaaaaccgattacctaagttctcaaaattcgaatcaagcctagtgaatgg
-ttcatttgattttattggtataaactattactcttctagttatattagcaatgccccttc
-acatggcaatgccaaacccagttactcaacaaatcctatgaccaatatttcatttgaaaa
-acatgggatacccttaggtccaagggctgcttcaatttggatatatgtttatccatatat
-gtttatccaagaggacttcgagatcttttgttacatattaaaaataaatataacaatcct
-gcaattttcaatcactgaaaatggtatgaatgaattcaacgatgcaacacttccagtaga
-agaagctcttttgaatacttacagaattgattactattaccgtcacttatactacattcg
-ttctgcaatcagggctggctcaaatgtgaagggtttttacgcatggtcatttttggactg
-taatgaatggtttgcaggctttactgttcgttttggattaaactttgtagattagaaaga
-tggattaaaaaggtaccctaagctttctgcccaatggtacaagaactttctcaaaagaaa
-ctagctagtattattaaaagaactttgtagtagattacagtacatcgtttgaagttgagt
-tggtgcacctaattaaataaaagaggttactcttaacatatttttaggccattcgttgtg
-aagttgttaggctgttatttctattatactatgttgtagtaataagtgcattgttgtacc
-agaagctatgatcataactataggttgatccttcatgtatcagtttgatgttgagaatac
-tttgaattaaaagtctttttttatttttttaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
->seq2
-aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcatt
-cacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgag
-tcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttga
-aggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaata
-tccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgcta
-caaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctc
-ttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaa
-atattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactct
-ttttcattgggatcttccccaagtcttagaagatgagtatggtggtttcttaaactccgg
-tgtaataaatgattttcgagactatacggatctttgcttcaaggaatttggagatagagt
-gaggtattggagtactctaaatgagccatgggtgtttagcaattctggatatgcactagg
-aacaaatgcaccaggtcgatgttcggcctccaacgtggccaagcctggtgattctggaac
-aggaccttatatagttacacacaatcaaattcttgctcatgcagaagctgtacatgtgta
-taagactaaataccaggcatatcaaaagggaaagataggcataacgttggtatctaactg
-gttaatgccacttgatgataatagcataccagatataaaggctgccgagagatcacttga
-cttccaatttggattgtttatggaacaattaacaacaggagattattctaagagcatgcg
-gcgtatagttaaaaaccgattacctaagttctcaaaattcgaatcaagcctagtgaatgg
-ttcatttgattttattggtataaactattactcttctagttatattagcaatgccccttc
-acatggcaatgccaaacccagttactcaacaaatcctatgaccaatatttcatttgaaaa
-acatgggatacccttaggtccaagggctgcttcaatttggatatatgtttatccatatat
-gtttatccaagaggacttcgagatcttttgttacatattaaaaataaatataacaatcct
-gcaattttcaatcactgaaaatggtatgaatgaattcaacgatgcaacacttccagtaga
-agaagctcttttgaatacttacagaattgattactattaccgtcacttatactacattcg
-ttctgcaatcagggctggctcaaatgtgaagggtttttacgcatggtcatttttggactg
-taatgaatggtttgcaggctttactgttcgttttggattaaactttgtagattagaaaga
-tggattaaaaaggtaccctaagctttctgcccaatggtacaagaactttctcaaaagaaa
-ctagctagtattattaaaagaactttgtagtagattacagtacatcgtttgaagttgagt
-tggtgcacctaattaaataaaagaggttactcttaacatatttttaggccattcgttgtg
-aagttgttaggctgttatttctattatactatgttgtagtaataagtgcattgttgtacc
-agaagctatgatcataactataggttgatccttcatgtatcagtttgatgttgagaatac
-tttgaattaaaagtctttttttatttttttaaaaaaaaaaaaaaaaaaaaccccccccc
diff --git a/fastaq/tests/data/sequences_test.fa b/fastaq/tests/data/sequences_test.fa
deleted file mode 100644
index 22da7a3..0000000
--- a/fastaq/tests/data/sequences_test.fa
+++ /dev/null
@@ -1,19 +0,0 @@
->1
-ACGTA
->2
-A
-
-C
-GT
-
-A
-
->3
-
-
-ACGTA
->4
-ACGTA
-
-
-
diff --git a/fastaq/tests/data/sequences_test.fa.ids b/fastaq/tests/data/sequences_test.fa.ids
deleted file mode 100644
index 94ebaf9..0000000
--- a/fastaq/tests/data/sequences_test.fa.ids
+++ /dev/null
@@ -1,4 +0,0 @@
-1
-2
-3
-4
diff --git a/fastaq/tests/data/sequences_test.fa.qual b/fastaq/tests/data/sequences_test.fa.qual
deleted file mode 100644
index 435d562..0000000
--- a/fastaq/tests/data/sequences_test.fa.qual
+++ /dev/null
@@ -1,17 +0,0 @@
->1
-40 40 40
-40 40
-
->2
-40
-40
-
-40
-40 40
->3
-
-40 40 40 40 40
-
->4
-40 40 40 40 40
-
diff --git a/fastaq/tests/data/sequences_test.fa.qual.bad b/fastaq/tests/data/sequences_test.fa.qual.bad
deleted file mode 100644
index 92c8d8d..0000000
--- a/fastaq/tests/data/sequences_test.fa.qual.bad
+++ /dev/null
@@ -1,17 +0,0 @@
->1
-40 40 40
-40 40
-
->3
-40
-40
-
-40
-40 40
->3
-
-40 40 40 40 40
-
->4
-40 40 40 40 40
-
diff --git a/fastaq/tests/data/sequences_test.fasta_to_fastq.fq b/fastaq/tests/data/sequences_test.fasta_to_fastq.fq
deleted file mode 100644
index 48f7282..0000000
--- a/fastaq/tests/data/sequences_test.fasta_to_fastq.fq
+++ /dev/null
@@ -1,16 +0,0 @@
- at 1
-ACGTA
-+
-IIIII
- at 2
-ACGTA
-+
-IIIII
- at 3
-ACGTA
-+
-IIIII
- at 4
-ACGTA
-+
-IIIII
diff --git a/fastaq/tests/data/sequences_test.gbk b/fastaq/tests/data/sequences_test.gbk
deleted file mode 100644
index 40f1afb..0000000
--- a/fastaq/tests/data/sequences_test.gbk
+++ /dev/null
@@ -1,170 +0,0 @@
-LOCUS NAME1 5028 bp DNA PLN 21-JUN-1999
-DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
- (AXL2) and Rev7p (REV7) genes, complete cds.
-ACCESSION U49845
-VERSION U49845.1 GI:1293613
-KEYWORDS .
-SOURCE Saccharomyces cerevisiae (baker's yeast)
- ORGANISM Saccharomyces cerevisiae
- Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
- Saccharomycetales; Saccharomycetaceae; Saccharomyces.
-REFERENCE 1 (bases 1 to 5028)
- AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
- TITLE Cloning and sequence of REV7, a gene whose function is required for
- DNA damage-induced mutagenesis in Saccharomyces cerevisiae
- JOURNAL Yeast 10 (11), 1503-1509 (1994)
- PUBMED 7871890
-REFERENCE 2 (bases 1 to 5028)
- AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
- TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
- plasma membrane glycoprotein
- JOURNAL Genes Dev. 10 (7), 777-793 (1996)
- PUBMED 8846915
-REFERENCE 3 (bases 1 to 5028)
- AUTHORS Roemer,T.
- TITLE Direct Submission
- JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
- Haven, CT, USA
-FEATURES Location/Qualifiers
- source 1..5028
- /organism="Saccharomyces cerevisiae"
- /db_xref="taxon:4932"
- /chromosome="IX"
- /map="9"
- CDS <1..206
- /codon_start=3
- /product="TCP1-beta"
- /protein_id="AAA98665.1"
- /db_xref="GI:1293614"
- /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
- AEVLLRVDNIIRARPRTANRQHM"
- gene 687..3158
- /gene="AXL2"
- CDS 687..3158
- /gene="AXL2"
- /note="plasma membrane glycoprotein"
- /codon_start=1
- /function="required for axial budding pattern of S.
- cerevisiae"
- /product="Axl2p"
- /protein_id="AAA98666.1"
- /db_xref="GI:1293615"
- /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
- TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
- VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
- VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
- TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
- YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
- DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
- DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
- NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
- CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
- NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
- SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
- YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
- HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
- VDFSNKSNVNVGQVKDIHGRIPEML"
- gene complement(3300..4037)
- /gene="REV7"
- CDS complement(3300..4037)
- /gene="REV7"
- /codon_start=1
- /product="Rev7p"
- /protein_id="AAA98667.1"
- /db_xref="GI:1293616"
- /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
- FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
- KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
- RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
- LISGDDKILNGVYSQYEEGESIFGSLF"
-ORIGIN
- 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
- 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
- 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
- 181 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
-//
-LOCUS NAME2 5028 bp DNA PLN 21-JUN-1999
-DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
- (AXL2) and Rev7p (REV7) genes, complete cds.
-ACCESSION U49845
-VERSION U49845.1 GI:1293613
-KEYWORDS .
-SOURCE Saccharomyces cerevisiae (baker's yeast)
- ORGANISM Saccharomyces cerevisiae
- Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
- Saccharomycetales; Saccharomycetaceae; Saccharomyces.
-REFERENCE 1 (bases 1 to 5028)
- AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
- TITLE Cloning and sequence of REV7, a gene whose function is required for
- DNA damage-induced mutagenesis in Saccharomyces cerevisiae
- JOURNAL Yeast 10 (11), 1503-1509 (1994)
- PUBMED 7871890
-REFERENCE 2 (bases 1 to 5028)
- AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
- TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
- plasma membrane glycoprotein
- JOURNAL Genes Dev. 10 (7), 777-793 (1996)
- PUBMED 8846915
-REFERENCE 3 (bases 1 to 5028)
- AUTHORS Roemer,T.
- TITLE Direct Submission
- JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
- Haven, CT, USA
-FEATURES Location/Qualifiers
- source 1..5028
- /organism="Saccharomyces cerevisiae"
- /db_xref="taxon:4932"
- /chromosome="IX"
- /map="9"
- CDS <1..206
- /codon_start=3
- /product="TCP1-beta"
- /protein_id="AAA98665.1"
- /db_xref="GI:1293614"
- /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
- AEVLLRVDNIIRARPRTANRQHM"
- gene 687..3158
- /gene="AXL2"
- CDS 687..3158
- /gene="AXL2"
- /note="plasma membrane glycoprotein"
- /codon_start=1
- /function="required for axial budding pattern of S.
- cerevisiae"
- /product="Axl2p"
- /protein_id="AAA98666.1"
- /db_xref="GI:1293615"
- /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
- TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
- VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
- VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
- TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
- YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
- DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
- DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
- NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
- CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
- NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
- SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
- YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
- HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
- VDFSNKSNVNVGQVKDIHGRIPEML"
- gene complement(3300..4037)
- /gene="REV7"
- CDS complement(3300..4037)
- /gene="REV7"
- /codon_start=1
- /product="Rev7p"
- /protein_id="AAA98667.1"
- /db_xref="GI:1293616"
- /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
- FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
- KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
- RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
- LISGDDKILNGVYSQYEEGESIFGSLF"
-ORIGIN
- 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
- 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
- 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
- 181 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgaaa
-//
diff --git a/fastaq/tests/data/sequences_test.gbk.to_fasta b/fastaq/tests/data/sequences_test.gbk.to_fasta
deleted file mode 100644
index 270d9ec..0000000
--- a/fastaq/tests/data/sequences_test.gbk.to_fasta
+++ /dev/null
@@ -1,10 +0,0 @@
->NAME1
-gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattg
-ccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagct
-ctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaa
-tgccatgactcagattctaattttaagctattcaatttctctttgatc
->NAME2
-gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattg
-ccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagct
-ctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaa
-tgccatgactcagattctaattttaagctattcaatttctctttgaaa
diff --git a/fastaq/tests/data/sequences_test.line_length3.fa b/fastaq/tests/data/sequences_test.line_length3.fa
deleted file mode 100644
index a77df6d..0000000
--- a/fastaq/tests/data/sequences_test.line_length3.fa
+++ /dev/null
@@ -1,12 +0,0 @@
->1
-ACG
-TA
->2
-ACG
-TA
->3
-ACG
-TA
->4
-ACG
-TA
diff --git a/fastaq/tests/data/sequences_test_3-per-line.fa b/fastaq/tests/data/sequences_test_3-per-line.fa
deleted file mode 100644
index 01ec932..0000000
--- a/fastaq/tests/data/sequences_test_3-per-line.fa
+++ /dev/null
@@ -1,19 +0,0 @@
->ID
-A
->ID
-AA
->ID
-AAA
->ID
-AAA
-A
->ID
-AAA
-AA
->ID
-AAA
-AAA
->ID
-AAA
-AAA
-A
diff --git a/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa b/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa
deleted file mode 100644
index dffde8b..0000000
--- a/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa
+++ /dev/null
@@ -1,16 +0,0 @@
->one.p1k
-ACGT
->one.q1k
-CCCC
->two.p1k
-A
->two.q1k
-C
->one.p1k
-TTTTTTTTTT
->three.q1k
-A
->four.x
-T
->five.p1k
-G
diff --git a/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa.paired.gz b/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa.paired.gz
deleted file mode 100644
index 5f98494..0000000
Binary files a/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa.paired.gz and /dev/null differ
diff --git a/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa.unpaired.gz b/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa.unpaired.gz
deleted file mode 100644
index 2e8d705..0000000
Binary files a/fastaq/tests/data/sequences_test_cap_to_read_pairs.fa.unpaired.gz and /dev/null differ
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_1.fa b/fastaq/tests/data/sequences_test_deinterleaved_1.fa
deleted file mode 100644
index cb095ce..0000000
--- a/fastaq/tests/data/sequences_test_deinterleaved_1.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->1/1
-ACGTA
->2/1
-A
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_2.fa b/fastaq/tests/data/sequences_test_deinterleaved_2.fa
deleted file mode 100644
index d0017c9..0000000
--- a/fastaq/tests/data/sequences_test_deinterleaved_2.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->1/2
-ACGTA
->2/2
-C
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_bad2_1.fa b/fastaq/tests/data/sequences_test_deinterleaved_bad2_1.fa
deleted file mode 100644
index 0f656b5..0000000
--- a/fastaq/tests/data/sequences_test_deinterleaved_bad2_1.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->1/1
-ACGTA
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_bad2_2.fa b/fastaq/tests/data/sequences_test_deinterleaved_bad2_2.fa
deleted file mode 100644
index ad68ff1..0000000
--- a/fastaq/tests/data/sequences_test_deinterleaved_bad2_2.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->1/2
-ACGTA
->2/2
-A
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_bad_1.fa b/fastaq/tests/data/sequences_test_deinterleaved_bad_1.fa
deleted file mode 100644
index cb095ce..0000000
--- a/fastaq/tests/data/sequences_test_deinterleaved_bad_1.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->1/1
-ACGTA
->2/1
-A
diff --git a/fastaq/tests/data/sequences_test_deinterleaved_bad_2.fa b/fastaq/tests/data/sequences_test_deinterleaved_bad_2.fa
deleted file mode 100644
index baf20b4..0000000
--- a/fastaq/tests/data/sequences_test_deinterleaved_bad_2.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->1/2
-ACGTA
diff --git a/fastaq/tests/data/sequences_test_empty_file b/fastaq/tests/data/sequences_test_empty_file
deleted file mode 100644
index e69de29..0000000
diff --git a/fastaq/tests/data/sequences_test_enumerate_names.fa b/fastaq/tests/data/sequences_test_enumerate_names.fa
deleted file mode 100644
index d2dce18..0000000
--- a/fastaq/tests/data/sequences_test_enumerate_names.fa
+++ /dev/null
@@ -1,8 +0,0 @@
->one/1
-A
->one/2
-C
->two/1
-G
->two/2
-T
diff --git a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.keep_suffix b/fastaq/tests/data/sequences_test_enumerate_names.fa.out.keep_suffix
deleted file mode 100644
index dba3ca0..0000000
--- a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.keep_suffix
+++ /dev/null
@@ -1,8 +0,0 @@
->1/1
-A
->2/2
-C
->3/1
-G
->4/2
-T
diff --git a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1 b/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1
deleted file mode 100644
index 2c8d196..0000000
--- a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1
+++ /dev/null
@@ -1,8 +0,0 @@
->1
-A
->2
-C
->3
-G
->4
-T
diff --git a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1.rename_file b/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1.rename_file
deleted file mode 100644
index 8de27ac..0000000
--- a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.1.rename_file
+++ /dev/null
@@ -1,5 +0,0 @@
-#old new
-one/1 1
-one/2 2
-two/1 3
-two/2 4
diff --git a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.2 b/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.2
deleted file mode 100644
index fdb1d1f..0000000
--- a/fastaq/tests/data/sequences_test_enumerate_names.fa.out.start.2
+++ /dev/null
@@ -1,8 +0,0 @@
->2
-A
->3
-C
->4
-G
->5
-T
diff --git a/fastaq/tests/data/sequences_test_extend_gaps.fa b/fastaq/tests/data/sequences_test_extend_gaps.fa
deleted file mode 100644
index b05b56c..0000000
--- a/fastaq/tests/data/sequences_test_extend_gaps.fa
+++ /dev/null
@@ -1,8 +0,0 @@
->1
-AC
->2
-ACGTACGT
->3
-ACGTNACGT
->4
-ACGTACGTNACGTACGT
diff --git a/fastaq/tests/data/sequences_test_extend_gaps.fa.out b/fastaq/tests/data/sequences_test_extend_gaps.fa.out
deleted file mode 100644
index e1ec718..0000000
--- a/fastaq/tests/data/sequences_test_extend_gaps.fa.out
+++ /dev/null
@@ -1,4 +0,0 @@
->2
-GTAC
->4
-GTACNNNNNGTAC
diff --git a/fastaq/tests/data/sequences_test_fai_test.fa b/fastaq/tests/data/sequences_test_fai_test.fa
deleted file mode 100644
index 7c02b44..0000000
--- a/fastaq/tests/data/sequences_test_fai_test.fa
+++ /dev/null
@@ -1,8 +0,0 @@
->1
-A
->2
-AA
->3
-AAA
->4
-AAAA
diff --git a/fastaq/tests/data/sequences_test_fai_test.fa.fai b/fastaq/tests/data/sequences_test_fai_test.fa.fai
deleted file mode 100644
index 154a9d6..0000000
--- a/fastaq/tests/data/sequences_test_fai_test.fa.fai
+++ /dev/null
@@ -1,4 +0,0 @@
-1 1 3 1 2
-2 2 8 2 3
-3 3 14 3 4
-4 4 21 4 5
diff --git a/fastaq/tests/data/sequences_test_fail_no_AT.fq b/fastaq/tests/data/sequences_test_fail_no_AT.fq
deleted file mode 100644
index d1472fc..0000000
--- a/fastaq/tests/data/sequences_test_fail_no_AT.fq
+++ /dev/null
@@ -1,5 +0,0 @@
- at 1
-A
-+
-I
-NOT_AN_ at _LINE
diff --git a/fastaq/tests/data/sequences_test_fail_no_plus.fq b/fastaq/tests/data/sequences_test_fail_no_plus.fq
deleted file mode 100644
index 52b5b7c..0000000
--- a/fastaq/tests/data/sequences_test_fail_no_plus.fq
+++ /dev/null
@@ -1,4 +0,0 @@
- at A
-A
-NOT_A_+
-I
diff --git a/fastaq/tests/data/sequences_test_fail_no_qual.fq b/fastaq/tests/data/sequences_test_fail_no_qual.fq
deleted file mode 100644
index 16ca520..0000000
--- a/fastaq/tests/data/sequences_test_fail_no_qual.fq
+++ /dev/null
@@ -1,3 +0,0 @@
- at A
-A
-+
diff --git a/fastaq/tests/data/sequences_test_fail_no_seq.fq b/fastaq/tests/data/sequences_test_fail_no_seq.fq
deleted file mode 100644
index de51137..0000000
--- a/fastaq/tests/data/sequences_test_fail_no_seq.fq
+++ /dev/null
@@ -1,5 +0,0 @@
- at A
-A
-+
-I
- at B
diff --git a/fastaq/tests/data/sequences_test_fastaq_replace_bases.expected.fa b/fastaq/tests/data/sequences_test_fastaq_replace_bases.expected.fa
deleted file mode 100644
index 98e1577..0000000
--- a/fastaq/tests/data/sequences_test_fastaq_replace_bases.expected.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->1
-ACGXXXAXA
diff --git a/fastaq/tests/data/sequences_test_fastaq_replace_bases.fa b/fastaq/tests/data/sequences_test_fastaq_replace_bases.fa
deleted file mode 100644
index c33edf7..0000000
--- a/fastaq/tests/data/sequences_test_fastaq_replace_bases.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->1
-ACGTTTATA
diff --git a/fastaq/tests/data/sequences_test_fastaq_to_quasr_primers.expected b/fastaq/tests/data/sequences_test_fastaq_to_quasr_primers.expected
deleted file mode 100644
index 88ce837..0000000
--- a/fastaq/tests/data/sequences_test_fastaq_to_quasr_primers.expected
+++ /dev/null
@@ -1,2 +0,0 @@
-ACGT ACGT
-AG CT
diff --git a/fastaq/tests/data/sequences_test_fastaq_to_quasr_primers.fa b/fastaq/tests/data/sequences_test_fastaq_to_quasr_primers.fa
deleted file mode 100644
index be7c130..0000000
--- a/fastaq/tests/data/sequences_test_fastaq_to_quasr_primers.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->1
-ACGT
->2
-AG
diff --git a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa b/fastaq/tests/data/sequences_test_filter_by_ids_file.fa
deleted file mode 100644
index 35845b6..0000000
--- a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa
+++ /dev/null
@@ -1,8 +0,0 @@
->seq1
-A
->seq2
-C
->seq3
-G
->seq4
-T
diff --git a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered b/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered
deleted file mode 100644
index 3519900..0000000
--- a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered
+++ /dev/null
@@ -1,4 +0,0 @@
->seq2
-C
->seq4
-T
diff --git a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered.invert b/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered.invert
deleted file mode 100644
index af15a5f..0000000
--- a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.filtered.invert
+++ /dev/null
@@ -1,4 +0,0 @@
->seq1
-A
->seq3
-G
diff --git a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.ids b/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.ids
deleted file mode 100644
index 486529d..0000000
--- a/fastaq/tests/data/sequences_test_filter_by_ids_file.fa.ids
+++ /dev/null
@@ -1,2 +0,0 @@
-seq4
-seq2
diff --git a/fastaq/tests/data/sequences_test_filter_by_regex.fa b/fastaq/tests/data/sequences_test_filter_by_regex.fa
deleted file mode 100644
index d2cc8eb..0000000
--- a/fastaq/tests/data/sequences_test_filter_by_regex.fa
+++ /dev/null
@@ -1,10 +0,0 @@
->1
-AAA
->a
-AAA
->a/1
-AAA
->a/2
-AAA
->b/1
-AAA
diff --git a/fastaq/tests/data/sequences_test_filter_by_regex.first-char-a.fa b/fastaq/tests/data/sequences_test_filter_by_regex.first-char-a.fa
deleted file mode 100644
index e874092..0000000
--- a/fastaq/tests/data/sequences_test_filter_by_regex.first-char-a.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->a
-AAA
->a/1
-AAA
->a/2
-AAA
diff --git a/fastaq/tests/data/sequences_test_filter_by_regex.first-of-pair.fa b/fastaq/tests/data/sequences_test_filter_by_regex.first-of-pair.fa
deleted file mode 100644
index 6dd5e50..0000000
--- a/fastaq/tests/data/sequences_test_filter_by_regex.first-of-pair.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->a/1
-AAA
->b/1
-AAA
diff --git a/fastaq/tests/data/sequences_test_filter_by_regex.numeric.fa b/fastaq/tests/data/sequences_test_filter_by_regex.numeric.fa
deleted file mode 100644
index b2d5b58..0000000
--- a/fastaq/tests/data/sequences_test_filter_by_regex.numeric.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->1
-AAA
diff --git a/fastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa b/fastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa
deleted file mode 100644
index 18a368a..0000000
--- a/fastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->a
-ACGTCNGTCNNNGT
->b
-ACGTGTGTTG
diff --git a/fastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa.out b/fastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa.out
deleted file mode 100644
index ecd2305..0000000
--- a/fastaq/tests/data/sequences_test_get_seqs_flanking_gaps.fa.out
+++ /dev/null
@@ -1,3 +0,0 @@
-#id gap_start gap_end left_bases right_bases
-a 6 6 GTC GTC
-a 10 12 GTC GT
diff --git a/fastaq/tests/data/sequences_test_gffv3.gff b/fastaq/tests/data/sequences_test_gffv3.gff
deleted file mode 100644
index 5dab817..0000000
--- a/fastaq/tests/data/sequences_test_gffv3.gff
+++ /dev/null
@@ -1,9 +0,0 @@
-##gff-version 3
-# comment
-##sequence-region seq1 1 10
-seq1 . gene 3 7 . + . ID=gene1;name=name1
-##FASTA
->seq1
-ACGTACGTAC
->seq2
-ACGTACGTAC
diff --git a/fastaq/tests/data/sequences_test_gffv3.gff.fasta b/fastaq/tests/data/sequences_test_gffv3.gff.fasta
deleted file mode 100644
index 3b2f48f..0000000
--- a/fastaq/tests/data/sequences_test_gffv3.gff.fasta
+++ /dev/null
@@ -1,4 +0,0 @@
->seq1
-ACACGTGACG
->seq2
-AGTACCGTAA
diff --git a/fastaq/tests/data/sequences_test_gffv3.gff.to_fasta b/fastaq/tests/data/sequences_test_gffv3.gff.to_fasta
deleted file mode 100644
index 89cfed0..0000000
--- a/fastaq/tests/data/sequences_test_gffv3.gff.to_fasta
+++ /dev/null
@@ -1,4 +0,0 @@
->seq1
-ACGTACGTAC
->seq2
-ACGTACGTAC
diff --git a/fastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff b/fastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff
deleted file mode 100644
index 8e580fa..0000000
--- a/fastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff
+++ /dev/null
@@ -1,8 +0,0 @@
-##gff-version 3
-# comment
-##sequence-region seq1 1 10
-seq1 . gene 3 7 . + . ID=gene1;name=name1
->seq1
-ACGTACGTAC
->seq2
-ACGTACGTAC
diff --git a/fastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff.to_fasta b/fastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff.to_fasta
deleted file mode 100644
index 89cfed0..0000000
--- a/fastaq/tests/data/sequences_test_gffv3.no_FASTA_line.gff.to_fasta
+++ /dev/null
@@ -1,4 +0,0 @@
->seq1
-ACGTACGTAC
->seq2
-ACGTACGTAC
diff --git a/fastaq/tests/data/sequences_test_gffv3.no_seq.2.gff b/fastaq/tests/data/sequences_test_gffv3.no_seq.2.gff
deleted file mode 100644
index d9fda5c..0000000
--- a/fastaq/tests/data/sequences_test_gffv3.no_seq.2.gff
+++ /dev/null
@@ -1,6 +0,0 @@
-##gff-version 3
-# comment
-##sequence-region seq1 1 10
-seq1 . gene 3 7 . + . ID=gene1;name=name1
-##FASTA
-oops
diff --git a/fastaq/tests/data/sequences_test_gffv3.no_seq.gff b/fastaq/tests/data/sequences_test_gffv3.no_seq.gff
deleted file mode 100644
index dbe2b52..0000000
--- a/fastaq/tests/data/sequences_test_gffv3.no_seq.gff
+++ /dev/null
@@ -1,4 +0,0 @@
-##gff-version 3
-# comment
-##sequence-region seq1 1 10
-seq1 . gene 3 7 . + . ID=gene1;name=name1
diff --git a/fastaq/tests/data/sequences_test_good_file.fq b/fastaq/tests/data/sequences_test_good_file.fq
deleted file mode 100644
index 12a42bc..0000000
--- a/fastaq/tests/data/sequences_test_good_file.fq
+++ /dev/null
@@ -1,11 +0,0 @@
- at ID
-ACGTA
-+
-IIIII
-
-
-
- at ID
-ACGTA
-+blah
-IIIII
diff --git a/fastaq/tests/data/sequences_test_good_file.fq.to_fasta b/fastaq/tests/data/sequences_test_good_file.fq.to_fasta
deleted file mode 100644
index c11bdfd..0000000
--- a/fastaq/tests/data/sequences_test_good_file.fq.to_fasta
+++ /dev/null
@@ -1,4 +0,0 @@
->ID
-ACGTA
->ID
-ACGTA
diff --git a/fastaq/tests/data/sequences_test_good_file_mira.xml b/fastaq/tests/data/sequences_test_good_file_mira.xml
deleted file mode 100644
index a9fe6a2..0000000
--- a/fastaq/tests/data/sequences_test_good_file_mira.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-<?xml version="1.0"?>
-<trace_volume>
- <trace>
- <trace_name>ID</trace_name>
- <clip_quality_right>5</clip_quality_right>
- <clip_vector_left>1</clip_vector_left>
- </trace>
- <trace>
- <trace_name>ID</trace_name>
- <clip_quality_right>5</clip_quality_right>
- <clip_vector_left>1</clip_vector_left>
- </trace>
-</trace_volume>
diff --git a/fastaq/tests/data/sequences_test_interleaved.fa b/fastaq/tests/data/sequences_test_interleaved.fa
deleted file mode 100644
index 3692716..0000000
--- a/fastaq/tests/data/sequences_test_interleaved.fa
+++ /dev/null
@@ -1,8 +0,0 @@
->1/1
-ACGTA
->1/2
-ACGTA
->2/1
-A
->2/2
-C
diff --git a/fastaq/tests/data/sequences_test_interleaved.fq b/fastaq/tests/data/sequences_test_interleaved.fq
deleted file mode 100644
index 951d5a5..0000000
--- a/fastaq/tests/data/sequences_test_interleaved.fq
+++ /dev/null
@@ -1,16 +0,0 @@
- at 1/1
-ACGTA
-+
-IIIII
- at 1/2
-ACGTA
-+
-IIIII
- at 2/1
-A
-+
-I
- at 2/2
-C
-+
-I
diff --git a/fastaq/tests/data/sequences_test_interleaved_bad.fa b/fastaq/tests/data/sequences_test_interleaved_bad.fa
deleted file mode 100644
index fef6d47..0000000
--- a/fastaq/tests/data/sequences_test_interleaved_bad.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->1/1
-ACGTA
->1/2
-ACGTA
->2/1
-A
diff --git a/fastaq/tests/data/sequences_test_length_filter.fa b/fastaq/tests/data/sequences_test_length_filter.fa
deleted file mode 100644
index 7507f2e..0000000
--- a/fastaq/tests/data/sequences_test_length_filter.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->3
-AAA
->4
-AAAA
->5
-AAAAA
diff --git a/fastaq/tests/data/sequences_test_length_filter.min-0.max-1.fa b/fastaq/tests/data/sequences_test_length_filter.min-0.max-1.fa
deleted file mode 100644
index e69de29..0000000
diff --git a/fastaq/tests/data/sequences_test_length_filter.min-0.max-inf.fa b/fastaq/tests/data/sequences_test_length_filter.min-0.max-inf.fa
deleted file mode 100644
index 7507f2e..0000000
--- a/fastaq/tests/data/sequences_test_length_filter.min-0.max-inf.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->3
-AAA
->4
-AAAA
->5
-AAAAA
diff --git a/fastaq/tests/data/sequences_test_length_filter.min-4.max-4.fa b/fastaq/tests/data/sequences_test_length_filter.min-4.max-4.fa
deleted file mode 100644
index 15e79c4..0000000
--- a/fastaq/tests/data/sequences_test_length_filter.min-4.max-4.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->4
-AAAA
diff --git a/fastaq/tests/data/sequences_test_make_random_contigs.default.fa b/fastaq/tests/data/sequences_test_make_random_contigs.default.fa
deleted file mode 100644
index 8efafc7..0000000
--- a/fastaq/tests/data/sequences_test_make_random_contigs.default.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->1
-ACG
->2
-ACG
diff --git a/fastaq/tests/data/sequences_test_make_random_contigs.first-42.fa b/fastaq/tests/data/sequences_test_make_random_contigs.first-42.fa
deleted file mode 100644
index 67ee20a..0000000
--- a/fastaq/tests/data/sequences_test_make_random_contigs.first-42.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->42
-ACG
->43
-ACG
diff --git a/fastaq/tests/data/sequences_test_make_random_contigs.name-by-letters.fa b/fastaq/tests/data/sequences_test_make_random_contigs.name-by-letters.fa
deleted file mode 100644
index 447d3e0..0000000
--- a/fastaq/tests/data/sequences_test_make_random_contigs.name-by-letters.fa
+++ /dev/null
@@ -1,56 +0,0 @@
->A
-ACG
->B
-ACG
->C
-ACG
->D
-ACG
->E
-ACG
->F
-ACG
->G
-ACG
->H
-ACG
->I
-ACG
->J
-ACG
->K
-ACG
->L
-ACG
->M
-ACG
->N
-ACG
->O
-ACG
->P
-ACG
->Q
-ACG
->R
-ACG
->S
-ACG
->T
-ACG
->U
-ACG
->V
-ACG
->W
-ACG
->X
-ACG
->Y
-ACG
->Z
-ACG
->A
-ACG
->B
-ACG
diff --git a/fastaq/tests/data/sequences_test_make_random_contigs.prefix-p.fa b/fastaq/tests/data/sequences_test_make_random_contigs.prefix-p.fa
deleted file mode 100644
index dc68695..0000000
--- a/fastaq/tests/data/sequences_test_make_random_contigs.prefix-p.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->p1
-ACG
->p2
-ACG
diff --git a/fastaq/tests/data/sequences_test_not_a_fastaq_file b/fastaq/tests/data/sequences_test_not_a_fastaq_file
deleted file mode 100644
index da4a76b..0000000
--- a/fastaq/tests/data/sequences_test_not_a_fastaq_file
+++ /dev/null
@@ -1 +0,0 @@
-i am not a fasta or fastq file
diff --git a/fastaq/tests/data/sequences_test_one-per-line.fa b/fastaq/tests/data/sequences_test_one-per-line.fa
deleted file mode 100644
index b6e4b2f..0000000
--- a/fastaq/tests/data/sequences_test_one-per-line.fa
+++ /dev/null
@@ -1,14 +0,0 @@
->ID
-A
->ID
-AA
->ID
-AAA
->ID
-AAAA
->ID
-AAAAA
->ID
-AAAAAA
->ID
-AAAAAAA
diff --git a/fastaq/tests/data/sequences_test_phylip.interleaved b/fastaq/tests/data/sequences_test_phylip.interleaved
deleted file mode 100644
index e9a42db..0000000
--- a/fastaq/tests/data/sequences_test_phylip.interleaved
+++ /dev/null
@@ -1,8 +0,0 @@
- 3 42
-Turkey AA-CTNGGGC ATTTCAGGGT
-Salmo_gairAAGCCTTGGC AGTGCAGGGT
-H. SapiensACCGGTTGGC CGTTCAGGGT
-
-GAGCCCGGGC AATACAGGGT AT
-GAGCCGTGGC CGGGCACGGT AT
-ACAGGTTGGC CGTTCAGGGT AA
diff --git a/fastaq/tests/data/sequences_test_phylip.interleaved.to_fasta b/fastaq/tests/data/sequences_test_phylip.interleaved.to_fasta
deleted file mode 100644
index 22dfb80..0000000
--- a/fastaq/tests/data/sequences_test_phylip.interleaved.to_fasta
+++ /dev/null
@@ -1,6 +0,0 @@
->Turkey
-AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT
->Salmo_gair
-AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT
->H. Sapiens
-ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA
diff --git a/fastaq/tests/data/sequences_test_phylip.interleaved2 b/fastaq/tests/data/sequences_test_phylip.interleaved2
deleted file mode 100644
index 18e8795..0000000
--- a/fastaq/tests/data/sequences_test_phylip.interleaved2
+++ /dev/null
@@ -1,7 +0,0 @@
- 3 42
-Turkey AA-CTNGGGC ATTTCAGGGT
-Salmo_gairAAGCCTTGGC AGTGCAGGGT
-H. SapiensACCGGTTGGC CGTTCAGGGT
-GAGCCCGGGC AATACAGGGT AT
-GAGCCGTGGC CGGGCACGGT AT
-ACAGGTTGGC CGTTCAGGGT AA
diff --git a/fastaq/tests/data/sequences_test_phylip.interleaved2.to_fasta b/fastaq/tests/data/sequences_test_phylip.interleaved2.to_fasta
deleted file mode 100644
index 22dfb80..0000000
--- a/fastaq/tests/data/sequences_test_phylip.interleaved2.to_fasta
+++ /dev/null
@@ -1,6 +0,0 @@
->Turkey
-AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT
->Salmo_gair
-AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT
->H. Sapiens
-ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA
diff --git a/fastaq/tests/data/sequences_test_phylip.made_by_seaview b/fastaq/tests/data/sequences_test_phylip.made_by_seaview
deleted file mode 100644
index 3f7b0cf..0000000
--- a/fastaq/tests/data/sequences_test_phylip.made_by_seaview
+++ /dev/null
@@ -1,6 +0,0 @@
-2 97
-seq1 GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG
-seq2 AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA
-
- GGGGGGGGGG GGGGGGGGGG GGGGGGGGGG GGGGGGT
- AAAAAAAAAA AAAAAAAAAA AAAAAAAAA- -AAAAAG
diff --git a/fastaq/tests/data/sequences_test_phylip.made_by_seaview.to_fasta b/fastaq/tests/data/sequences_test_phylip.made_by_seaview.to_fasta
deleted file mode 100644
index 10d8264..0000000
--- a/fastaq/tests/data/sequences_test_phylip.made_by_seaview.to_fasta
+++ /dev/null
@@ -1,6 +0,0 @@
->seq1
-GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
-GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGT
->seq2
-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG
diff --git a/fastaq/tests/data/sequences_test_phylip.sequential b/fastaq/tests/data/sequences_test_phylip.sequential
deleted file mode 100644
index e9ce346..0000000
--- a/fastaq/tests/data/sequences_test_phylip.sequential
+++ /dev/null
@@ -1,7 +0,0 @@
- 3 42
-Turkey AA-CTNGGGC ATTTCAGGGT
-GAGCCCGGGC AATACAGGGT AT
-Salmo_gairAAGCCTTGGC AGTGCAGGGT
-GAGCCGTGGC CGGGCACGGT AT
-H. SapiensACCGGTTGGC CGTTCAGGGT
-ACAGGTTGGC CGTTCAGGGT AA
diff --git a/fastaq/tests/data/sequences_test_phylip.sequential.to_fasta b/fastaq/tests/data/sequences_test_phylip.sequential.to_fasta
deleted file mode 100644
index 22dfb80..0000000
--- a/fastaq/tests/data/sequences_test_phylip.sequential.to_fasta
+++ /dev/null
@@ -1,6 +0,0 @@
->Turkey
-AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT
->Salmo_gair
-AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT
->H. Sapiens
-ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA
diff --git a/fastaq/tests/data/sequences_test_revcomp.fa b/fastaq/tests/data/sequences_test_revcomp.fa
deleted file mode 100644
index 4d9922f..0000000
--- a/fastaq/tests/data/sequences_test_revcomp.fa
+++ /dev/null
@@ -1,8 +0,0 @@
->1
-TACGT
->2
-TACGT
->3
-TACGT
->4
-TACGT
diff --git a/fastaq/tests/data/sequences_test_search_string.fa b/fastaq/tests/data/sequences_test_search_string.fa
deleted file mode 100644
index 5dc3d2c..0000000
--- a/fastaq/tests/data/sequences_test_search_string.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->seq
-AAGATCTAGAGATC
diff --git a/fastaq/tests/data/sequences_test_search_string.fa.hits b/fastaq/tests/data/sequences_test_search_string.fa.hits
deleted file mode 100644
index cde92ef..0000000
--- a/fastaq/tests/data/sequences_test_search_string.fa.hits
+++ /dev/null
@@ -1,4 +0,0 @@
-seq 2 +
-seq 8 +
-seq 10 +
-seq 5 -
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa b/fastaq/tests/data/sequences_test_split_fixed_size.fa
deleted file mode 100644
index 8b2f4c5..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa
+++ /dev/null
@@ -1,12 +0,0 @@
->seq1
-ACGTNNNNN
->seq2
-ACGTA
->seq3
-NNNN
->seq4
-AC
->seq5
-ACG
->seq6
-A
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.1 b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.1
deleted file mode 100644
index a72c34d..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.1
+++ /dev/null
@@ -1,2 +0,0 @@
->seq1:1-4
-ACGT
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.2 b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.2
deleted file mode 100644
index 6a1a218..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.2
+++ /dev/null
@@ -1,2 +0,0 @@
->seq1:5-9
-NNNNN
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.3 b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.3
deleted file mode 100644
index 5ff6016..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.3
+++ /dev/null
@@ -1,2 +0,0 @@
->seq2
-ACGTA
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.4 b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.4
deleted file mode 100644
index 05a52a9..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.4
+++ /dev/null
@@ -1,2 +0,0 @@
->seq3
-NNNN
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.5 b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.5
deleted file mode 100644
index bee7218..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.5
+++ /dev/null
@@ -1,4 +0,0 @@
->seq4
-AC
->seq5
-ACG
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.6 b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.6
deleted file mode 100644
index e17b9b7..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.6
+++ /dev/null
@@ -1,2 +0,0 @@
->seq6
-A
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.coords b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.coords
deleted file mode 100644
index 3ed2ead..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.coords
+++ /dev/null
@@ -1,2 +0,0 @@
-seq1:1-4 seq1 0
-seq1:5-9 seq1 4
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.1 b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.1
deleted file mode 100644
index a72c34d..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.1
+++ /dev/null
@@ -1,2 +0,0 @@
->seq1:1-4
-ACGT
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.2 b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.2
deleted file mode 100644
index 5ff6016..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.2
+++ /dev/null
@@ -1,2 +0,0 @@
->seq2
-ACGTA
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.3 b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.3
deleted file mode 100644
index bee7218..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.3
+++ /dev/null
@@ -1,4 +0,0 @@
->seq4
-AC
->seq5
-ACG
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.4 b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.4
deleted file mode 100644
index e17b9b7..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.4
+++ /dev/null
@@ -1,2 +0,0 @@
->seq6
-A
diff --git a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords b/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords
deleted file mode 100644
index 3934ef1..0000000
--- a/fastaq/tests/data/sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords
+++ /dev/null
@@ -1 +0,0 @@
-seq1:1-4 seq1 0
diff --git a/fastaq/tests/data/sequences_test_split_test.fa b/fastaq/tests/data/sequences_test_split_test.fa
deleted file mode 100644
index 7c02b44..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa
+++ /dev/null
@@ -1,8 +0,0 @@
->1
-A
->2
-AA
->3
-AAA
->4
-AAAA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.2.1 b/fastaq/tests/data/sequences_test_split_test.fa.2.1
deleted file mode 100644
index 5e9a7fe..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.2.1
+++ /dev/null
@@ -1,2 +0,0 @@
->1
-A
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.2.2 b/fastaq/tests/data/sequences_test_split_test.fa.2.2
deleted file mode 100644
index 7e79b53..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.2.2
+++ /dev/null
@@ -1,2 +0,0 @@
->2
-AA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.2.3 b/fastaq/tests/data/sequences_test_split_test.fa.2.3
deleted file mode 100644
index 22603c9..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.2.3
+++ /dev/null
@@ -1,2 +0,0 @@
->3
-AAA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.2.4 b/fastaq/tests/data/sequences_test_split_test.fa.2.4
deleted file mode 100644
index 15e79c4..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.2.4
+++ /dev/null
@@ -1,2 +0,0 @@
->4
-AAAA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.3.1 b/fastaq/tests/data/sequences_test_split_test.fa.3.1
deleted file mode 100644
index 5a17cab..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.3.1
+++ /dev/null
@@ -1,4 +0,0 @@
->1
-A
->2
-AA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.3.2 b/fastaq/tests/data/sequences_test_split_test.fa.3.2
deleted file mode 100644
index 22603c9..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.3.2
+++ /dev/null
@@ -1,2 +0,0 @@
->3
-AAA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.3.3 b/fastaq/tests/data/sequences_test_split_test.fa.3.3
deleted file mode 100644
index 15e79c4..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.3.3
+++ /dev/null
@@ -1,2 +0,0 @@
->4
-AAAA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.4.1 b/fastaq/tests/data/sequences_test_split_test.fa.4.1
deleted file mode 100644
index 5a17cab..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.4.1
+++ /dev/null
@@ -1,4 +0,0 @@
->1
-A
->2
-AA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.4.2 b/fastaq/tests/data/sequences_test_split_test.fa.4.2
deleted file mode 100644
index 22603c9..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.4.2
+++ /dev/null
@@ -1,2 +0,0 @@
->3
-AAA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.4.3 b/fastaq/tests/data/sequences_test_split_test.fa.4.3
deleted file mode 100644
index 15e79c4..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.4.3
+++ /dev/null
@@ -1,2 +0,0 @@
->4
-AAAA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.6.1 b/fastaq/tests/data/sequences_test_split_test.fa.6.1
deleted file mode 100644
index a7fcecf..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.6.1
+++ /dev/null
@@ -1,6 +0,0 @@
->1
-A
->2
-AA
->3
-AAA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.6.2 b/fastaq/tests/data/sequences_test_split_test.fa.6.2
deleted file mode 100644
index 15e79c4..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.6.2
+++ /dev/null
@@ -1,2 +0,0 @@
->4
-AAAA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.1 b/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.1
deleted file mode 100644
index 5a17cab..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.1
+++ /dev/null
@@ -1,4 +0,0 @@
->1
-A
->2
-AA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.2 b/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.2
deleted file mode 100644
index 22603c9..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.2
+++ /dev/null
@@ -1,2 +0,0 @@
->3
-AAA
diff --git a/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.3 b/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.3
deleted file mode 100644
index 15e79c4..0000000
--- a/fastaq/tests/data/sequences_test_split_test.fa.6.limit2.3
+++ /dev/null
@@ -1,2 +0,0 @@
->4
-AAAA
diff --git a/fastaq/tests/data/sequences_test_split_test.long.fa b/fastaq/tests/data/sequences_test_split_test.long.fa
deleted file mode 100644
index 23dad6a..0000000
--- a/fastaq/tests/data/sequences_test_split_test.long.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->1
-AAAAAAAA
->2
-AAAAAAAA
diff --git a/fastaq/tests/data/sequences_test_split_test.long.fa.2.1 b/fastaq/tests/data/sequences_test_split_test.long.fa.2.1
deleted file mode 100644
index ecc99e6..0000000
--- a/fastaq/tests/data/sequences_test_split_test.long.fa.2.1
+++ /dev/null
@@ -1,2 +0,0 @@
->1
-AAAAAAAA
diff --git a/fastaq/tests/data/sequences_test_split_test.long.fa.2.2 b/fastaq/tests/data/sequences_test_split_test.long.fa.2.2
deleted file mode 100644
index 2a2c6a7..0000000
--- a/fastaq/tests/data/sequences_test_split_test.long.fa.2.2
+++ /dev/null
@@ -1,2 +0,0 @@
->2
-AAAAAAAA
diff --git a/fastaq/tests/data/sequences_test_strip_after_whitespace.fa b/fastaq/tests/data/sequences_test_strip_after_whitespace.fa
deleted file mode 100644
index d394bf9..0000000
--- a/fastaq/tests/data/sequences_test_strip_after_whitespace.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->seq 1
-ACGT
->seq 1 2
-ACGT
->seq
-ACGT
diff --git a/fastaq/tests/data/sequences_test_strip_after_whitespace.fa.to_fasta b/fastaq/tests/data/sequences_test_strip_after_whitespace.fa.to_fasta
deleted file mode 100644
index cde7200..0000000
--- a/fastaq/tests/data/sequences_test_strip_after_whitespace.fa.to_fasta
+++ /dev/null
@@ -1,6 +0,0 @@
->seq
-ACGT
->seq
-ACGT
->seq
-ACGT
diff --git a/fastaq/tests/data/sequences_test_strip_illumina_suffix.fq b/fastaq/tests/data/sequences_test_strip_illumina_suffix.fq
deleted file mode 100644
index 05a65a4..0000000
--- a/fastaq/tests/data/sequences_test_strip_illumina_suffix.fq
+++ /dev/null
@@ -1,12 +0,0 @@
- at one/1
-A
-+
-I
- at one/2
-A
-+
-I
- at two/3
-A
-+
-I
diff --git a/fastaq/tests/data/sequences_test_strip_illumina_suffix.fq.stripped b/fastaq/tests/data/sequences_test_strip_illumina_suffix.fq.stripped
deleted file mode 100644
index 4425cc6..0000000
--- a/fastaq/tests/data/sequences_test_strip_illumina_suffix.fq.stripped
+++ /dev/null
@@ -1,12 +0,0 @@
- at one
-A
-+
-I
- at one
-A
-+
-I
- at two/3
-A
-+
-I
diff --git a/fastaq/tests/data/sequences_test_to_unique_by_id.fa b/fastaq/tests/data/sequences_test_to_unique_by_id.fa
deleted file mode 100644
index 5b486ee..0000000
--- a/fastaq/tests/data/sequences_test_to_unique_by_id.fa
+++ /dev/null
@@ -1,11 +0,0 @@
->seq1
-AA
->seq2
-A
->seq3
-A
->seq1
-A
->seq4
->seq1
-AAA
diff --git a/fastaq/tests/data/sequences_test_to_unique_by_id.fa.out b/fastaq/tests/data/sequences_test_to_unique_by_id.fa.out
deleted file mode 100644
index 8c40ed1..0000000
--- a/fastaq/tests/data/sequences_test_to_unique_by_id.fa.out
+++ /dev/null
@@ -1,6 +0,0 @@
->seq1
-AAA
->seq2
-A
->seq3
-A
diff --git a/fastaq/tests/data/sequences_test_translate.fa b/fastaq/tests/data/sequences_test_translate.fa
deleted file mode 100644
index 62c5afc..0000000
--- a/fastaq/tests/data/sequences_test_translate.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->seq
-GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA
diff --git a/fastaq/tests/data/sequences_test_translate.fa.frame0 b/fastaq/tests/data/sequences_test_translate.fa.frame0
deleted file mode 100644
index 0cdd1cf..0000000
--- a/fastaq/tests/data/sequences_test_translate.fa.frame0
+++ /dev/null
@@ -1,3 +0,0 @@
->seq
-AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVV
-V***
diff --git a/fastaq/tests/data/sequences_test_translate.fa.frame1 b/fastaq/tests/data/sequences_test_translate.fa.frame1
deleted file mode 100644
index 80a5aaa..0000000
--- a/fastaq/tests/data/sequences_test_translate.fa.frame1
+++ /dev/null
@@ -1,3 +0,0 @@
->seq
-QPRLEGDAGVTMTIAVKSNREAGVTI*SFYCYSCLKRCSFHPRLAVHPRLQPRLGTM*SW
-FNS
diff --git a/fastaq/tests/data/sequences_test_translate.fa.frame2 b/fastaq/tests/data/sequences_test_translate.fa.frame2
deleted file mode 100644
index 874a17d..0000000
--- a/fastaq/tests/data/sequences_test_translate.fa.frame2
+++ /dev/null
@@ -1,3 +0,0 @@
->seq
-SRG*KATPA*Q*RLL*RATGRRGSPYNHFIATPA*KDVLSTPA*QFILVYNHDLVLCSRG
-LIV
diff --git a/fastaq/tests/data/sequences_test_trim_Ns_at_end.fa b/fastaq/tests/data/sequences_test_trim_Ns_at_end.fa
deleted file mode 100644
index 752e880..0000000
--- a/fastaq/tests/data/sequences_test_trim_Ns_at_end.fa
+++ /dev/null
@@ -1,10 +0,0 @@
->1
-A
->2
-nNNNNNNCNNNANNNN
->3
-NNnA
->4
-AnnnNn
->5
-NNnnnNNn
diff --git a/fastaq/tests/data/sequences_test_trim_Ns_at_end.fa.trimmed b/fastaq/tests/data/sequences_test_trim_Ns_at_end.fa.trimmed
deleted file mode 100644
index f9ca62c..0000000
--- a/fastaq/tests/data/sequences_test_trim_Ns_at_end.fa.trimmed
+++ /dev/null
@@ -1,8 +0,0 @@
->1
-A
->2
-CNNNA
->3
-A
->4
-A
diff --git a/fastaq/tests/data/sequences_test_trimmed.fq b/fastaq/tests/data/sequences_test_trimmed.fq
deleted file mode 100644
index ba91557..0000000
--- a/fastaq/tests/data/sequences_test_trimmed.fq
+++ /dev/null
@@ -1,8 +0,0 @@
- at ID
-GT
-+
-II
- at ID
-GT
-+
-II
diff --git a/fastaq/tests/data/sequences_test_untrimmed.fq b/fastaq/tests/data/sequences_test_untrimmed.fq
deleted file mode 100644
index 349de37..0000000
--- a/fastaq/tests/data/sequences_test_untrimmed.fq
+++ /dev/null
@@ -1,16 +0,0 @@
- at ID
-ACGTA
-+
-IIIII
- at ID
-ACGTA
-+blah
-IIIII
-@
-NNN
-+
-III
-@
-N
-+
-I
diff --git a/fastaq/tests/data/utils_test_file_transpose.txt b/fastaq/tests/data/utils_test_file_transpose.txt
deleted file mode 100644
index 1661a3c..0000000
--- a/fastaq/tests/data/utils_test_file_transpose.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-11 12 13
-21 22 23
-31 32 33
-41 42
-51 52 53
diff --git a/fastaq/tests/data/utils_test_file_transposed.txt b/fastaq/tests/data/utils_test_file_transposed.txt
deleted file mode 100644
index 8af3080..0000000
--- a/fastaq/tests/data/utils_test_file_transposed.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-11 21 31 41 51
-12 22 32 42 52
-13 23 33 . 53
diff --git a/fastaq/tests/data/utils_test_not_really_zipped.gz b/fastaq/tests/data/utils_test_not_really_zipped.gz
deleted file mode 100644
index d81cc07..0000000
--- a/fastaq/tests/data/utils_test_not_really_zipped.gz
+++ /dev/null
@@ -1 +0,0 @@
-42
diff --git a/fastaq/tests/data/utils_test_scaffolds.fa b/fastaq/tests/data/utils_test_scaffolds.fa
deleted file mode 100644
index 5c1b14b..0000000
--- a/fastaq/tests/data/utils_test_scaffolds.fa
+++ /dev/null
@@ -1,8 +0,0 @@
->scaf1
-ACGT
->scaf2
-ACNNNGTNA
->scaf3
-NNAAAANNN
->scaf4
-NNNNNNN
diff --git a/fastaq/tests/data/utils_test_scaffolds.fa.to_contigs.fa b/fastaq/tests/data/utils_test_scaffolds.fa.to_contigs.fa
deleted file mode 100644
index a599ad2..0000000
--- a/fastaq/tests/data/utils_test_scaffolds.fa.to_contigs.fa
+++ /dev/null
@@ -1,10 +0,0 @@
->scaf1.1.4
-ACGT
->scaf2.1.2
-AC
->scaf2.6.7
-GT
->scaf2.9.9
-A
->scaf3.3.6
-AAAA
diff --git a/fastaq/tests/data/utils_test_scaffolds.fa.to_contigs.number_contigs.fa b/fastaq/tests/data/utils_test_scaffolds.fa.to_contigs.number_contigs.fa
deleted file mode 100644
index 412e4bd..0000000
--- a/fastaq/tests/data/utils_test_scaffolds.fa.to_contigs.number_contigs.fa
+++ /dev/null
@@ -1,10 +0,0 @@
->scaf1.1
-ACGT
->scaf2.1
-AC
->scaf2.2
-GT
->scaf2.3
-A
->scaf3.1
-AAAA
diff --git a/fastaq/tests/data/utils_test_system_call.txt b/fastaq/tests/data/utils_test_system_call.txt
deleted file mode 100644
index f5b2b3b..0000000
--- a/fastaq/tests/data/utils_test_system_call.txt
+++ /dev/null
@@ -1 +0,0 @@
-this is the contents of system call test file
diff --git a/fastaq/tests/intervals_test.py b/fastaq/tests/intervals_test.py
deleted file mode 100644
index c6282a0..0000000
--- a/fastaq/tests/intervals_test.py
+++ /dev/null
@@ -1,212 +0,0 @@
-#!/usr/bin/env python3
-
-import unittest
-from fastaq import intervals
-
-class TestIntervals(unittest.TestCase):
- def test_init(self):
- '''Throw error if try to construct genome_interval from a non-int, or end<start'''
- with self.assertRaises(intervals.Error):
- intervals.Interval('a', 1)
- with self.assertRaises(intervals.Error):
- intervals.Interval(1, 'a')
- with self.assertRaises(intervals.Error):
- intervals.Interval('a', 'a')
- with self.assertRaises(intervals.Error):
- intervals.Interval(3, 2)
-
- def test_comparisons(self):
- '''<, <=, == should work as expected'''
- self.assertTrue(intervals.Interval(1,2) < intervals.Interval(2,2))
- self.assertTrue(intervals.Interval(1,2) <= intervals.Interval(2,2))
- self.assertFalse(intervals.Interval(2,2) <= intervals.Interval(1,2))
- self.assertFalse(intervals.Interval(2,2) < intervals.Interval(1,2))
- self.assertFalse(intervals.Interval(2,2) < intervals.Interval(2,2))
- self.assertTrue(intervals.Interval(1,2) == intervals.Interval(1,2))
- self.assertFalse(intervals.Interval(1,2) == intervals.Interval(1,3))
- self.assertTrue(intervals.Interval(1,2) != intervals.Interval(1,3))
- self.assertFalse(intervals.Interval(1,2) != intervals.Interval(1,2))
-
- def test_len(self):
- self.assertEqual(len(intervals.Interval(1,2)), 2)
- self.assertEqual(len(intervals.Interval(1,1)), 1)
- self.assertEqual(len(intervals.Interval(10,20)), 11)
-
- def test_intersects(self):
- '''Intersection of two intervals should do the right thing'''
- a = intervals.Interval(5, 10)
- no_intersect = [intervals.Interval(3, 4),
- intervals.Interval(11,20)]
- intersect = [intervals.Interval(3,5),
- intervals.Interval(3,6),
- intervals.Interval(9,12),
- intervals.Interval(10,12),
- intervals.Interval(6,7),
- intervals.Interval(1,20)]
-
- for i in no_intersect:
- self.assertFalse(a.intersects(i), 'shouldn\'t intersect: ' + str(a) + ', ' + str(i))
-
- for i in intersect:
- self.assertTrue(a.intersects(i), 'should intersect: ' + str(a) + ', ' + str(i))
-
- def test_contains(self):
- '''Check that contains() works as expected'''
- a = intervals.Interval(5, 10)
- not_contained = [intervals.Interval(1,2),
- intervals.Interval(4,5),
- intervals.Interval(4,10),
- intervals.Interval(4,11),
- intervals.Interval(5,11),
- intervals.Interval(1,2),
- intervals.Interval(9,11),
- intervals.Interval(10,11),
- intervals.Interval(11,20)]
-
-
- contained = [intervals.Interval(5,5),
- intervals.Interval(5,10),
- intervals.Interval(6,7),
- intervals.Interval(6,10),
- intervals.Interval(10,10)]
-
- for i in not_contained:
- self.assertFalse(a.contains(i), 'shouldn\'t contain: ' + str(a) + ', ' + str(i))
-
- for i in contained:
- self.assertTrue(a.contains(i), 'should contain: ' + str(a) + ', ' + str(i))
-
- def test_union(self):
- '''Union should either return None or the correct union'''
- a = intervals.Interval(5, 10)
- b = intervals.Interval(8, 15)
- c = intervals.Interval(12, 20)
- d = intervals.Interval(21,22)
- self.assertEqual(a.union(c), None)
- self.assertEqual(c.union(a), None)
- self.assertEqual(a.union(b), intervals.Interval(5,15))
- self.assertEqual(b.union(a), intervals.Interval(5,15))
- self.assertEqual(c.union(d), intervals.Interval(12,22))
- self.assertEqual(d.union(c), intervals.Interval(12,22))
-
- def test_union_flll_gap(self):
- '''union_fill_gap() should ignore intersections and return the maximum range of coords'''
- a = intervals.Interval(5, 10)
- b = intervals.Interval(8, 15)
- c = intervals.Interval(12, 20)
- d = intervals.Interval(21,22)
- self.assertEqual(a.union_fill_gap(c), intervals.Interval(5,20))
- self.assertEqual(c.union_fill_gap(a), intervals.Interval(5,20))
- self.assertEqual(a.union_fill_gap(b), intervals.Interval(5,15))
- self.assertEqual(b.union_fill_gap(a), intervals.Interval(5,15))
- self.assertEqual(c.union_fill_gap(d), intervals.Interval(12,22))
- self.assertEqual(d.union_fill_gap(c), intervals.Interval(12,22))
-
-
- def test_intersection(self):
- '''Intersection should either return None or the correct intersection'''
- a = intervals.Interval(5, 10)
- b = intervals.Interval(8, 15)
- c = intervals.Interval(12, 20)
- self.assertEqual(a.intersection(c), None)
- self.assertEqual(a.intersection(b), intervals.Interval(8,10))
-
-class Test_intersection(unittest.TestCase):
- def test_intersection(self):
- '''intersection() should correctly intersect two lists of intervals'''
- a = [intervals.Interval(1,2),
- intervals.Interval(10,20),
- intervals.Interval(51,52),
- intervals.Interval(54,55),
- intervals.Interval(57,58)]
-
- b = [intervals.Interval(5,6),
- intervals.Interval(9,11),
- intervals.Interval(13,14),
- intervals.Interval(17,18),
- intervals.Interval(20,25),
- intervals.Interval(50,60)]
-
- c = [intervals.Interval(100,200)]
-
- i = [intervals.Interval(10,11),
- intervals.Interval(13,14),
- intervals.Interval(17,18),
- intervals.Interval(20,20),
- intervals.Interval(51,52),
- intervals.Interval(54,55),
- intervals.Interval(57,58)]
-
- self.assertSequenceEqual(intervals.intersection(a,b), i)
- self.assertSequenceEqual(intervals.intersection(b,a), i)
- self.assertSequenceEqual(intervals.intersection(c,a), [])
- self.assertEqual(intervals.intersection([],a), [])
- self.assertEqual(intervals.intersection(a,[]), [])
-
-class Test_merge_overlapping_in_list(unittest.TestCase):
- def test_merge_overlapping_in_list(self):
- '''merge_overlapping_in_list() merges correctly'''
- a = [intervals.Interval(1,2),
- intervals.Interval(51,60),
- intervals.Interval(10,20),
- intervals.Interval(20,30),
- intervals.Interval(20,30),
- intervals.Interval(29,50),
- intervals.Interval(65,70)]
-
- b = [intervals.Interval(1,2),
- intervals.Interval(10,60),
- intervals.Interval(65,70)]
-
- intervals.merge_overlapping_in_list(a)
- self.assertSequenceEqual(a, b)
-
-class Test_remove_contained_in_list(unittest.TestCase):
- def test_remove_contained_in_list(self):
- '''test_remove_contained_in_list removes the right elements of list'''
- a = [intervals.Interval(1,2),
- intervals.Interval(4,4),
- intervals.Interval(4,5),
- intervals.Interval(5,6),
- intervals.Interval(7,9),
- intervals.Interval(8,10),
- intervals.Interval(9,11),
- intervals.Interval(20,25),
- intervals.Interval(20,24),
- intervals.Interval(20,26),
- intervals.Interval(30,38),
- intervals.Interval(30,37),
- intervals.Interval(30,36),
- intervals.Interval(30,35),
- intervals.Interval(30,35),
- intervals.Interval(32,33),
- intervals.Interval(38,50),
- intervals.Interval(65,70),
- intervals.Interval(67,70)]
-
- b = [intervals.Interval(1,2),
- intervals.Interval(4,5),
- intervals.Interval(5,6),
- intervals.Interval(7,9),
- intervals.Interval(8,10),
- intervals.Interval(9,11),
- intervals.Interval(20,26),
- intervals.Interval(30,38),
- intervals.Interval(38,50),
- intervals.Interval(65,70)]
-
- intervals.remove_contained_in_list(a)
- self.assertSequenceEqual(a, b)
-
-class Test_length_sum_from_list(unittest.TestCase):
- def test_length_sum_from_list(self):
- '''Test that total length of intervals is summed correctly'''
- a = [intervals.Interval(1,2),
- intervals.Interval(4,5),
- intervals.Interval(10,19)]
-
- self.assertEqual(14, intervals.length_sum_from_list(a))
-
-
-if __name__ == '__main__':
- unittest.main()
diff --git a/fastaq/tests/sequences_test.py b/fastaq/tests/sequences_test.py
deleted file mode 100644
index 4bafb66..0000000
--- a/fastaq/tests/sequences_test.py
+++ /dev/null
@@ -1,535 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import filecmp
-import os
-import unittest
-from fastaq import sequences, utils, intervals
-
-modules_dir = os.path.dirname(os.path.abspath(sequences.__file__))
-data_dir = os.path.join(modules_dir, 'tests', 'data')
-
-class Error (Exception): pass
-
-expected_embl = [
- 'aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcattcacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgagtcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttgaaggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaatatccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgctacaaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctcttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaaatattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactctttttcat [...]
- 'aaacaaaccaaatatggattttattgtagccatatttgctctgtttgttattagctcattcacaattacttccacaaatgcagttgaagcttctactcttcttgacataggtaacctgagtcggagcagttttcctcgtggcttcatctttggtgctggatcttcagcataccaatttgaaggtgcagtaaacgaaggcggtagaggaccaagtatttgggataccttcacccataaatatccagaaaaaataagggatggaagcaatgcagacatcacggttgaccaatatcaccgctacaaggaagatgttgggattatgaaggatcaaaatatggattcgtatagattctcaatctcttggccaagaatactcccaaagggaaagttgagcggaggcataaatcacgaaggaatcaaatattacaacaaccttatcaacgaactattggctaacggtatacaaccatttgtaactctttttcat [...]
-]
-class TestFasta(unittest.TestCase):
- def setUp(self):
- self.fasta = sequences.Fasta('ID', 'ACGTA')
-
- def test_equality(self):
- self.assertTrue(self.fasta == sequences.Fasta('ID', 'ACGTA'))
- self.assertFalse(self.fasta == sequences.Fasta('I', 'ACGTA'))
- self.assertFalse(self.fasta == sequences.Fasta('ID', 'ACGT'))
- self.assertFalse(self.fasta != sequences.Fasta('ID', 'ACGTA'))
- self.assertTrue(self.fasta != sequences.Fasta('I', 'ACGTA'))
- self.assertTrue(self.fasta != sequences.Fasta('ID', 'ACGT'))
-
- def test_init(self):
- '''__init__ should get the ID and sequence correctly'''
- self.assertEqual(self.fasta.id, 'ID')
- self.assertEqual(self.fasta.seq, 'ACGTA')
-
- def test_get_next_from_file(self):
- '''get_next_from_file() should read seqs from OK, including weirdness in file'''
- f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.fa'))
- fa = sequences.Fasta()
- counter = 1
-
- while fa.get_next_from_file(f_in):
- self.assertEqual(fa, sequences.Fasta(str(counter), 'ACGTA'))
- counter += 1
-
- utils.close(f_in)
-
- def test_get_id_from_header_line(self):
- '''Check that can get ID from header line or die properly'''
- self.assertEqual(sequences.Fasta._get_id_from_header_line(self.fasta, '>X'), 'X')
- with self.assertRaises(sequences.Error):
- self.assertEqual(sequences.Fasta._get_id_from_header_line(self.fasta, 'X'), 'X')
-
- def test_getitem(self):
- '''getitem() should return the right subsequence'''
- seq = 'AACGTGTCA'
- fa = sequences.Fasta('x', seq)
- self.assertEqual(seq[1], fa[1])
- self.assertEqual(seq[0:2], fa[0:2])
- self.assertEqual(seq[1:], fa[1:])
-
- def test_len(self):
- '''len() should return the length of the sequence'''
- self.assertEqual(5, len(self.fasta))
-
- def test_print_line_length(self):
- '''__str__ should be formatted correctly with the right number of chars per line of sequence'''
- line_lengths = [0, 3]
- correct_files = [os.path.join(data_dir, x) for x in ['sequences_test_one-per-line.fa', 'sequences_test_3-per-line.fa']]
-
- for i in range(len(line_lengths)):
- seq_reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_one-per-line.fa'))
- sequences.Fasta.line_length = line_lengths[i]
- tmp_out = 'tmp.line_length_test.fa'
- f = utils.open_file_write(tmp_out)
- for s in seq_reader:
- print(s, file=f)
- utils.close(f)
- self.assertTrue(filecmp.cmp(correct_files[i], tmp_out))
- os.unlink(tmp_out)
-
- sequences.Fasta.line_length = 60
-
- def test_strip_after_first_whitespace(self):
- '''Test strip_after_first_whitespace()'''
- seqs = [
- sequences.Fasta('name', 'A'),
- sequences.Fasta('name foo', 'A'),
- sequences.Fasta('name foo bar', 'A'),
- sequences.Fasta('name\tfoo', 'A'),
- ]
-
- for seq in seqs:
- seq.strip_after_first_whitespace()
-
- for seq in seqs:
- self.assertEqual(seq.id, 'name')
-
- def test_strip_illumina_suffix(self):
- '''Check that /1 and /2 removed correctly from IDs'''
- seqs = [sequences.Fasta('name/1', 'A'),
- sequences.Fasta('name/2', 'A'),
- sequences.Fasta('name', 'A'),
- sequences.Fasta('name/1/2', 'A'),
- sequences.Fasta('name/2/1', 'A'),
- sequences.Fasta('name/3', 'A')]
-
- correct_names = ['name', 'name', 'name', 'name/1', 'name/2', 'name/3']
-
- for seq in seqs:
- seq.strip_illumina_suffix()
-
- for i in range(len(seqs)):
- self.assertEqual(seqs[i].id, correct_names[i])
-
- def test_revcomp(self):
- '''revcomp() should correctly reverse complement a sequence'''
- fa = sequences.Fasta('ID', 'ACGTNacgtn')
- fa.revcomp()
- self.assertEqual(fa, sequences.Fasta('ID', 'nacgtNACGT'))
-
- def test_gaps(self):
- '''gaps() should find the gaps in a sequence correctly'''
- test_seqs = [sequences.Fasta('ID', 'ACGT'),
- sequences.Fasta('ID', 'NACGT'),
- sequences.Fasta('ID', 'NACGTN'),
- sequences.Fasta('ID', 'ANNCGT'),
- sequences.Fasta('ID', 'NANNCGTNN')]
-
- correct_gaps = [[],
- [intervals.Interval(0, 0)],
- [intervals.Interval(0, 0), intervals.Interval(5, 5)],
- [intervals.Interval(1, 2)],
- [intervals.Interval(0, 0), intervals.Interval(2, 3), intervals.Interval(7, 8)]]
-
- for i in range(len(test_seqs)):
- gaps = test_seqs[i].gaps()
- self.assertListEqual(correct_gaps[i], gaps)
-
- def test_contig_coords(self):
- '''contig_coords() should get the coords of all contigs in a sequence correctly'''
- test_seqs = [sequences.Fasta('ID', 'ACGT'),
- sequences.Fasta('ID', 'NACGT'),
- sequences.Fasta('ID', 'NNACGT'),
- sequences.Fasta('ID', 'ACGTN'),
- sequences.Fasta('ID', 'ACGTNN'),
- sequences.Fasta('ID', 'NANNCGT'),
- sequences.Fasta('ID', 'ACNNNGTNA'),
- sequences.Fasta('ID', 'ANNCGTNNAAAAA')]
-
- correct_coords = [[intervals.Interval(0,3)],
- [intervals.Interval(1, 4)],
- [intervals.Interval(2, 5)],
- [intervals.Interval(0, 3)],
- [intervals.Interval(0, 3)],
- [intervals.Interval(1, 1), intervals.Interval(4,6)],
- [intervals.Interval(0, 1), intervals.Interval(5, 6), intervals.Interval(8, 8)],
- [intervals.Interval(0, 0), intervals.Interval(3, 5), intervals.Interval(8, 12)]]
-
- for i in range(len(test_seqs)):
- gaps = test_seqs[i].contig_coords()
- self.assertListEqual(correct_coords[i], gaps)
-
- def test_is_all_Ns(self):
- '''Test is_all_Ns()'''
- self.assertTrue(sequences.Fasta('ID', 'n').is_all_Ns())
- self.assertTrue(sequences.Fasta('ID', 'N').is_all_Ns())
- self.assertTrue(sequences.Fasta('ID', 'nNn').is_all_Ns())
- self.assertFalse(sequences.Fasta('ID', 'a').is_all_Ns())
- self.assertFalse(sequences.Fasta('ID', '').is_all_Ns())
- self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns())
- self.assertFalse(sequences.Fasta('ID', 'naN').is_all_Ns())
- self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(start=0, end=0))
- self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(start=0, end=1))
- self.assertTrue(sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=1))
- self.assertTrue(sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=2))
- self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(start=1))
- self.assertTrue(sequences.Fasta('ID', 'anN').is_all_Ns(start=1))
- self.assertFalse(sequences.Fasta('ID', 'anNg').is_all_Ns(end=1))
- self.assertTrue(sequences.Fasta('ID', 'nNA').is_all_Ns(end=1))
-
- with self.assertRaises(sequences.Error):
- sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=0)
-
- def test_trim_Ns(self):
- '''trim_Ns() should do the right trimming of a sequence'''
- fa = sequences.Fasta('ID', 'ANNANA')
- test_seqs = [sequences.Fasta('ID', 'ANNANA'),
- sequences.Fasta('ID', 'NANNANA'),
- sequences.Fasta('ID', 'NANNANAN'),
- sequences.Fasta('ID', 'ANNANAN'),
- sequences.Fasta('ID', 'NNNNNNANNANAN'),
- sequences.Fasta('ID', 'NNANNANANn')]
-
- for s in test_seqs:
- s.trim_Ns()
- self.assertEqual(fa, s)
-
- def test_replace_bases(self):
- '''Check that bases get replaced correctly'''
- fa = sequences.Fasta('X', 'AUCGTUUACT')
- fa.replace_bases('U', 'T')
- self.assertEqual(fa, sequences.Fasta('X', 'ATCGTTTACT'))
-
- def test_replace_interval(self):
- '''Test replace_interval()'''
- fa = sequences.Fasta('ID', 'ACGTA')
- fa.replace_interval(0, 0, 'NEW')
- self.assertEqual(fa, sequences.Fasta('ID', 'NEWCGTA'))
-
- fa = sequences.Fasta('ID', 'ACGTA')
- fa.replace_interval(4, 4, 'NEW')
- self.assertEqual(fa, sequences.Fasta('ID', 'ACGTNEW'))
-
- fa = sequences.Fasta('ID', 'ACGTA')
- fa.replace_interval(2, 3, 'NEW')
- self.assertEqual(fa, sequences.Fasta('ID', 'ACNEWA'))
-
- fa = sequences.Fasta('ID', 'ACGTA')
- with self.assertRaises(sequences.Error):
- fa.replace_interval(3,2,'x')
- with self.assertRaises(sequences.Error):
- fa.replace_interval(1,5,'x')
- with self.assertRaises(sequences.Error):
- fa.replace_interval(5,10,'x')
-
- fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
- fq.replace_interval(0, 0, 'NEW', 'III')
- self.assertEqual(fq, sequences.Fastq('ID', 'NEWCGTA', 'IIIBCDE'))
-
- fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
- fq.replace_interval(4, 4, 'NEW', 'III')
- self.assertEqual(fq, sequences.Fastq('ID', 'ACGTNEW', 'ABCDIII'))
-
- fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
- fq.replace_interval(2, 3, 'NEW', 'III')
- self.assertEqual(fq, sequences.Fastq('ID', 'ACNEWA', 'ABIIIE'))
-
- with self.assertRaises(sequences.Error):
- fq.replace_interval(1,1,'x', 'xx')
-
- def test_search_string(self):
- '''Check that search_string() finds all the hits'''
- fa = sequences.Fasta('X', 'AAA')
- hits = fa.search('G')
- self.assertTrue(len(hits) == 0)
- hits = fa.search('AAA')
- self.assertListEqual(hits, [(0, '+')])
- hits = fa.search('AA')
- self.assertListEqual(hits, [(0, '+'), (1, '+')])
- hits = fa.search('TTT')
- self.assertListEqual(hits, [(0, '-')])
-
- def test_to_Fastq(self):
- '''Check to_Fastq converts OK, including out of range quality scores'''
- fa = sequences.Fasta('X', 'AAAAA')
- quals = [-1, 0, 40, 93, 94]
- self.assertEqual(sequences.Fastq('X', 'AAAAA', '!!I~~'), fa.to_Fastq(quals))
- with self.assertRaises(sequences.Error):
- fa.to_Fastq('AAAAAAAAAAAAA')
-
-
- def test_translate(self):
- '''Test nucleotide -> amino acid conversion works on Fasta'''
- fa = sequences.Fasta('ID', 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA')
- self.assertEqual(sequences.Fasta('ID', 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***'), fa.translate())
- self.assertEqual(sequences.Fasta('ID', 'QPRLEGDAGVTMTIAVKSNREAGVTI*SFYCYSCLKRCSFHPRLAVHPRLQPRLGTM*SWFNS'), fa.translate(frame=1))
- print(fa.translate(frame=1))
- self.assertEqual(sequences.Fasta('ID', 'SRG*KATPA*Q*RLL*RATGRRGSPYNHFIATPA*KDVLSTPA*QFILVYNHDLVLCSRGLIV'), fa.translate(frame=2))
-
-
- def test_split_capillary_id(self):
- '''Tests that we get information from a sanger capillary read name OK'''
- ids = ['abcde.p1k', 'abcde.x.p1k', 'abcde.p1ka', 'abcde.q1k', 'abcde.w2k']
- expected = [{'prefix': 'abcde', 'dir': 'fwd', 'suffix': 'p1k'},
- {'prefix': 'abcde.x', 'dir': 'fwd', 'suffix': 'p1k'},
- {'prefix': 'abcde', 'dir': 'fwd', 'suffix': 'p1ka'},
- {'prefix': 'abcde', 'dir': 'rev', 'suffix': 'q1k'},
- {'prefix': 'abcde', 'dir': 'unk', 'suffix': 'w2k'}]
-
- for i in range(len(ids)):
- fa = sequences.Fasta(ids[i], 'A')
- self.assertEqual(fa.split_capillary_id(), expected[i])
-
- with self.assertRaises(sequences.Error):
- fa = sequences.Fasta('name', 'A')
- fa.split_capillary_id()
-
-
-class TestEmbl(unittest.TestCase):
- def test_get_id_from_header_line(self):
- '''Test get id from header line of EMBL'''
- embl = sequences.Embl('ID', 'ACGT')
- self.assertEqual(embl._get_id_from_header_line('ID X; blah'), 'X')
- self.assertEqual(embl._get_id_from_header_line('LOCUS X foo'), 'X')
- with self.assertRaises(sequences.Error):
- self.assertEqual(embl._get_id_from_header_line('ID X;'), 'X')
- with self.assertRaises(sequences.Error):
- self.assertEqual(embl._get_id_from_header_line('XX X;'), 'X')
-
-
- def test_get_next_from_embl_file(self):
- f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.embl'))
- embl = sequences.Embl()
- counter = 1
-
- while embl.get_next_from_file(f_in):
- self.assertEqual(embl, sequences.Fasta('seq' + str(counter), expected_embl[counter-1]))
- counter += 1
-
- utils.close(f_in)
-
-
- def test_get_next_from_gbk_file(self):
- f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.gbk'))
- embl = sequences.Embl()
- counter = 1
- expected = [
- 'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
- 'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa']
-
- while embl.get_next_from_file(f_in):
- self.assertEqual(embl, sequences.Fasta('NAME' + str(counter), expected[counter-1]))
- counter += 1
-
- utils.close(f_in)
-
-
-class TestFastq(unittest.TestCase):
- def setUp(self):
- self.fastq = sequences.Fastq('ID', 'ACGTA', 'IIIII')
-
- def test_init(self):
- '''__init__ should get the ID, sequence and quality correctly'''
- self.assertEqual(self.fastq.id, 'ID')
- self.assertEqual(self.fastq.seq, 'ACGTA')
- self.assertEqual(self.fastq.qual, 'IIIII')
-
- def test_init_length_mismatch(self):
- '''__init__ should raise an error when length of seq and quality not the same'''
- with self.assertRaises(sequences.Error):
- sequences.Fastq('X', 'A', 'II')
-
- def test_get_next_from_file(self):
- '''get_next_from_file() should read seqs from OK, and raise error at badly formatted file'''
- bad_files = ['sequences_test_fail_no_AT.fq',
- 'sequences_test_fail_no_seq.fq',
- 'sequences_test_fail_no_plus.fq',
- 'sequences_test_fail_no_qual.fq']
-
- bad_files = [os.path.join(data_dir, x) for x in bad_files]
-
- for fname in bad_files:
- f_in = utils.open_file_read(fname)
- fq = sequences.Fastq()
- with self.assertRaises(sequences.Error):
- while fq.get_next_from_file(f_in):
- pass
-
- utils.close(f_in)
-
- fname = os.path.join(data_dir, 'sequences_test_good_file.fq')
- try:
- f_in = open(fname)
- except IOError:
- print("Error opening '" + fname + "'", file=sys.stderr)
- sys.exit(1)
-
- fq = sequences.Fastq()
- while fq.get_next_from_file(f_in):
- self.assertEqual(fq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
- utils.close(f_in)
-
- def test_revcomp(self):
- '''revcomp() should correctly reverse complement a sequence'''
- fq = sequences.Fastq('ID', 'ACGTNacgtn', '1234567890')
- fq.revcomp()
- self.assertEqual(fq, sequences.Fastq('ID', 'nacgtNACGT', '0987654321'))
-
- def test_trim_Ns(self):
- '''trim_Ns() should do the right trimming of a fastq sequence'''
- fq = sequences.Fastq('ID', 'ANNANA', '111111')
- test_seqs = [sequences.Fastq('ID', 'ANNANA', '111111'),
- sequences.Fastq('ID', 'NANNANA', '1111111'),
- sequences.Fastq('ID', 'NANNANAN', '11111111'),
- sequences.Fastq('ID', 'ANNANAN', '1111111'),
- sequences.Fastq('ID', 'NNNNNNANNANAN', '1111111111111'),
- sequences.Fastq('ID', 'NNANNANANn', '1111111111')]
-
- for s in test_seqs:
- s.trim_Ns()
- self.assertEqual(fq, s)
-
- def test_trim(self):
- '''trim() should trim the right number of bases off start and end'''
- fq = sequences.Fastq('ID', '1234567890', '1234567890')
- fq.trim(0, 0)
- self.assertEqual(fq, sequences.Fastq('ID', '1234567890', '1234567890'))
-
- fq = sequences.Fastq('ID', '1234567890', '1234567890')
- fq.trim(1, 0)
- self.assertEqual(fq, sequences.Fastq('ID', '234567890', '234567890'))
-
- fq = sequences.Fastq('ID', '1234567890', '1234567890')
- fq.trim(0, 1)
- self.assertEqual(fq, sequences.Fastq('ID', '123456789', '123456789'))
-
- fq = sequences.Fastq('ID', '1234567890', '1234567890')
- fq.trim(2, 2)
- self.assertEqual(fq, sequences.Fastq('ID', '345678', '345678'))
-
- def test_to_Fasta_and_qual(self):
- '''Check to_Fasta_and_qual converts quality scores correctly'''
- fq = sequences.Fastq('ID', 'ACGT', '>ADI')
- (fa, qual) = fq.to_Fasta_and_qual()
- self.assertEqual(fa, sequences.Fasta('ID', 'ACGT'))
- self.assertListEqual(qual, [29, 32, 35, 40])
-
-
- def test_translate(self):
- '''Test nucleatide -> amino acid conversion works on Fasta'''
- fq = sequences.Fastq('ID', 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII')
-
- self.assertEqual(sequences.Fastq('ID', 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII'), fq.translate())
-
-class TestFileReader(unittest.TestCase):
- def test_file_reader_fasta(self):
- '''file_reader should iterate through a fasta file correctly'''
- reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test.fa'))
- counter = 1
- for seq in reader:
- self.assertEqual(seq, sequences.Fasta(str(counter), 'ACGTA'))
- counter += 1
-
- def test_file_reader_fastq(self):
- '''file_reader should iterate through a fastq file correctly'''
- reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_good_file.fq'))
- for seq in reader:
- self.assertEqual(seq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
-
- def test_file_reader_bad_format(self):
- '''file_reader should die properly when not given fasta or fastq file'''
- with self.assertRaises(sequences.Error):
- reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_not_a_fastaq_file'))
- for seq in reader:
- pass
-
- def test_file_reader_gff(self):
- '''Test read gff file'''
- good_files = [
- 'sequences_test_gffv3.gff',
- 'sequences_test_gffv3.no_FASTA_line.gff'
- ]
- good_files = [os.path.join(data_dir, x) for x in good_files]
-
- for f in good_files:
- reader = sequences.file_reader(f)
- counter = 1
- for seq in reader:
- self.assertEqual(seq, sequences.Fasta('seq' + str(counter), 'ACGTACGTAC'))
- counter += 1
-
- bad_files = [
- 'sequences_test_gffv3.no_seq.gff',
- 'sequences_test_gffv3.no_seq.2.gff'
- ]
- bad_files = [os.path.join(data_dir, x) for x in bad_files]
-
- for filename in bad_files:
- with self.assertRaises(sequences.Error):
- reader = sequences.file_reader(filename)
- for seq in reader:
- pass
-
- def test_file_reader_embl(self):
- '''Test read embl file'''
- reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test.embl'))
-
- counter = 1
- for seq in reader:
- self.assertEqual(seq, sequences.Fasta('seq' + str(counter), expected_embl[counter-1]))
- counter += 1
-
- bad_files = [
- 'sequences_test.embl.bad',
- 'sequences_test.embl.bad2',
- ]
- bad_files = [os.path.join(data_dir, x) for x in bad_files]
-
- for filename in bad_files:
- with self.assertRaises(sequences.Error):
- reader = sequences.file_reader(filename)
- for seq in reader:
- pass
-
- def test_file_reader_phylip(self):
- '''Test read phylip file'''
- test_files = [
- 'sequences_test_phylip.interleaved',
- 'sequences_test_phylip.interleaved2',
- 'sequences_test_phylip.sequential'
- ]
-
- test_files = [os.path.join(data_dir, f) for f in test_files]
-
- expected_seqs = [
- sequences.Fasta('Turkey', 'AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT'),
- sequences.Fasta('Salmo_gair', 'AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT'),
- sequences.Fasta('H. Sapiens', 'ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA')
- ]
-
- for fname in test_files:
- reader = sequences.file_reader(fname)
- i = 0
- for seq in reader:
- self.assertEqual(expected_seqs[i], seq)
- i += 1
-
- # files made by seaview are a little different in the first line.
- # Test one of these
- expected_seqs = [
- sequences.Fasta('seq1', 96 * 'G' + 'T'),
- sequences.Fasta('seq2', 94 * 'A' + 'G')
- ]
-
- reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_phylip.made_by_seaview'))
- i = 0
- for seq in reader:
- print(seq)
- self.assertEqual(expected_seqs[i], seq)
- i += 1
-
-
-if __name__ == '__main__':
- unittest.main()
-
diff --git a/fastaq/tests/tasks_test.py b/fastaq/tests/tasks_test.py
deleted file mode 100644
index 084425e..0000000
--- a/fastaq/tests/tasks_test.py
+++ /dev/null
@@ -1,449 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import filecmp
-import os
-import unittest
-from fastaq import tasks, sequences
-
-modules_dir = os.path.dirname(os.path.abspath(sequences.__file__))
-data_dir = os.path.join(modules_dir, 'tests', 'data')
-
-class Error (Exception): pass
-
-
-class TestCapillaryToPairs(unittest.TestCase):
- def test_capillary_to_pairs(self):
- '''Check that capillary reads file converted to paired and unpaired'''
- tmp_prefix = 'tmp.cap_to_pairs'
- tasks.capillary_to_pairs(os.path.join(data_dir, 'sequences_test_cap_to_read_pairs.fa'), tmp_prefix)
- # sequences have been hashed, so could be in any order in
- # output files. So need to check contents of files are OK
- d_correct_paired = {}
- d_correct_unpaired = {}
- tasks.file_to_dict(os.path.join(data_dir, 'sequences_test_cap_to_read_pairs.fa.paired.gz'), d_correct_paired)
- tasks.file_to_dict(os.path.join(data_dir, 'sequences_test_cap_to_read_pairs.fa.unpaired.gz'), d_correct_unpaired)
- d_test_paired = {}
- d_test_unpaired = {}
- tasks.file_to_dict(tmp_prefix + '.paired.gz', d_test_paired)
- tasks.file_to_dict(tmp_prefix + '.unpaired.gz', d_test_unpaired)
- self.assertDictEqual(d_test_paired, d_correct_paired)
- self.assertDictEqual(d_test_unpaired, d_correct_unpaired)
- os.unlink(tmp_prefix + '.paired.gz')
- os.unlink(tmp_prefix + '.unpaired.gz')
-
-
-class TestDeinterleave(unittest.TestCase):
- def test_deinterleave(self):
- '''deinterleave should deal with an interleaved file correctly'''
- tmp_1 = 'tmp.deinterleaved_1.fa'
- tmp_2 = 'tmp.deinterleaved_2.fa'
- tasks.deinterleave(os.path.join(data_dir, 'sequences_test_interleaved.fa'), tmp_1, tmp_2)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_deinterleaved_1.fa'), tmp_1))
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_deinterleaved_2.fa'), tmp_2))
-
- tasks.deinterleave(os.path.join(data_dir, 'sequences_test_interleaved.fq'), tmp_1, tmp_2, fasta_out=True)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_deinterleaved_1.fa'), tmp_1))
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_deinterleaved_2.fa'), tmp_2))
-
- with self.assertRaises(tasks.Error):
- tasks.deinterleave(os.path.join(data_dir, 'sequences_test_interleaved_bad.fa'), tmp_1, tmp_2)
- os.unlink(tmp_1)
- os.unlink(tmp_2)
-
-
-class TestEnumerateNames(unittest.TestCase):
- def test_enumerate_names(self):
- '''Test enomereate_names works with all options'''
- outfile = 'tmp.enumerate_seqs.fa'
- rename_out = outfile + '.rename'
- tasks.enumerate_names(os.path.join(data_dir, 'sequences_test_enumerate_names.fa'), outfile)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.1'), outfile))
- tasks.enumerate_names(os.path.join(data_dir, 'sequences_test_enumerate_names.fa'), outfile, rename_file=rename_out)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.1'), outfile))
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.1.rename_file'), rename_out))
- tasks.enumerate_names(os.path.join(data_dir, 'sequences_test_enumerate_names.fa'), outfile, start_index=2)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.start.2'), outfile))
- tasks.enumerate_names(os.path.join(data_dir, 'sequences_test_enumerate_names.fa'), outfile, keep_illumina_suffix=True)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_enumerate_names.fa.out.keep_suffix'), outfile))
- os.unlink(outfile)
- os.unlink(rename_out)
-
-
-class TestExtendGaps(unittest.TestCase):
- def test_extend_gaps(self):
- '''Test that gap extension works'''
- outfile = 'tmp.gap_extend.fa'
- tasks.extend_gaps(os.path.join(data_dir, 'sequences_test_extend_gaps.fa'), outfile, trim=2)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_extend_gaps.fa.out'), outfile))
- os.unlink(outfile)
-
-
-class TestFastqToMiraXml(unittest.TestCase):
- def test_fastaq_to_mira_xml(self):
- '''check that fastaq_to_mira_xml makes the correct xml file from a fastq file'''
- tmp = 'tmp.mira.xml'
- tasks.fastaq_to_mira_xml(os.path.join(data_dir, 'sequences_test_good_file.fq'), tmp)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_good_file_mira.xml'), tmp))
- os.unlink(tmp)
-
-
-class TestFilter(unittest.TestCase):
- def test_length_filter(self):
- '''Check that filtering by length works as expected'''
- infile = os.path.join(data_dir, 'sequences_test_length_filter.fa')
- correct_files = [os.path.join(data_dir, 'sequences_test_length_filter.min-0.max-1.fa'),
- os.path.join(data_dir, 'sequences_test_length_filter.min-0.max-inf.fa'),
- os.path.join(data_dir, 'sequences_test_length_filter.min-4.max-4.fa')]
- cutoffs = [(0, 1), (0, float('inf')), (4, 4)]
-
- for i in range(len(cutoffs)):
- outfile = 'tmp.length_filter.fa'
- tasks.filter(infile, outfile, minlength=cutoffs[i][0], maxlength=cutoffs[i][1])
- self.assertTrue(filecmp.cmp(correct_files[i], outfile))
- os.unlink(outfile)
-
- def test_regex_filter(self):
- '''Check that filtering by name regex works as expected'''
- infile = os.path.join(data_dir, 'sequences_test_filter_by_regex.fa')
- correct_files = [os.path.join(data_dir, 'sequences_test_filter_by_regex.numeric.fa'),
- os.path.join(data_dir, 'sequences_test_filter_by_regex.first-of-pair.fa'),
- os.path.join(data_dir, 'sequences_test_filter_by_regex.first-char-a.fa')]
- regexes = ['^[0-9]+$', '/1$', '^a']
-
- for i in range(len(regexes)):
- outfile = 'tmp.regex_filter.fa'
- tasks.filter(infile, outfile, regex=regexes[i])
- self.assertTrue(filecmp.cmp(correct_files[i], outfile))
- os.unlink(outfile)
-
- def test_ids_from_file_filter(self):
- '''Test that can extract reads from a file of read names'''
- infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
- outfile = 'tmp.ids_file_filter.fa'
- tasks.filter(infile, outfile, ids_file=infile + '.ids')
- self.assertTrue(filecmp.cmp(infile + '.filtered', outfile))
- os.unlink(outfile)
-
- def test_invert_filter(self):
- '''Test that inverting filtering works'''
- infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
- outfile = 'tmp.ids_file_filter.fa'
- tasks.filter(infile, outfile, ids_file=infile + '.ids', invert=True)
- self.assertTrue(filecmp.cmp(infile + '.filtered.invert', outfile))
- os.unlink(outfile)
-
-
-class TestGetSeqsFlankingGaps(unittest.TestCase):
- def test_get_seqs_flanking_gaps(self):
- outfile = 'tmp.seqs_flanking_gaps'
- tasks.get_seqs_flanking_gaps(os.path.join(data_dir, 'sequences_test_get_seqs_flanking_gaps.fa'), outfile, 3, 3)
- self.assertTrue(filecmp.cmp(outfile, os.path.join(data_dir, 'sequences_test_get_seqs_flanking_gaps.fa.out')))
- os.unlink(outfile)
-
-
-class TestInterleave(unittest.TestCase):
- def test_interleave(self):
- '''Check that interleave works as expected'''
- tmp = 'tmp.interleaved.fa'
- tasks.interleave(os.path.join(data_dir, 'sequences_test_deinterleaved_1.fa'),
- os.path.join(data_dir, 'sequences_test_deinterleaved_2.fa'),
- tmp)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_interleaved.fa'), tmp))
-
- with self.assertRaises(tasks.Error):
- tasks.interleave(os.path.join(data_dir, 'sequences_test_deinterleaved_bad_1.fa'),
- os.path.join(data_dir, 'sequences_test_deinterleaved_bad_2.fa'),
- tmp)
-
- with self.assertRaises(tasks.Error):
- tasks.interleave(os.path.join(data_dir, 'sequences_test_deinterleaved_bad2_1.fa'),
- os.path.join(data_dir, 'sequences_test_deinterleaved_bad2_2.fa'),
- tmp)
- os.unlink(tmp)
-
-
-class TestMakeRandomContigs(unittest.TestCase):
- def test_make_random_contigs(self):
- '''Test make_random_contigs()'''
- # Can't guarantee same results from random (even using same seed), so
- # just check sequence names and lengths
- def files_are_equal(file1, file2):
- seqs1 = {}
- seqs2 = {}
- tasks.file_to_dict(file1, seqs1)
- tasks.file_to_dict(file2, seqs2)
- if len(seqs1) != len(seqs2):
- return False
-
- for name in seqs1:
- seq1 = seqs1[name]
- seq2 = seqs2[name]
- if seq1.id != seq2.id:
- return False
- if len(seq1) != len(seq2):
- return False
-
- return True
-
- tmp = 'tmp.random_contigs.fa'
- tasks.make_random_contigs(2, 3, tmp)
- self.assertTrue(files_are_equal(os.path.join(data_dir, 'sequences_test_make_random_contigs.default.fa'), tmp))
- tasks.make_random_contigs(2, 3, tmp, prefix='p')
- self.assertTrue(files_are_equal(os.path.join(data_dir, 'sequences_test_make_random_contigs.prefix-p.fa'), tmp))
- tasks.make_random_contigs(2, 3, tmp, first_number=42)
- self.assertTrue(files_are_equal(os.path.join(data_dir, 'sequences_test_make_random_contigs.first-42.fa'), tmp))
- tasks.make_random_contigs(28, 3, tmp, name_by_letters=True)
- self.assertTrue(files_are_equal(os.path.join(data_dir, 'sequences_test_make_random_contigs.name-by-letters.fa'), tmp))
- os.unlink(tmp)
-
-
-class TestReverseComplement(unittest.TestCase):
- def test_reverse_complement(self):
- '''reverse_complement should correctly reverse complement each seq in a file'''
- tmp = 'tmp.revcomp.fa'
- tasks.reverse_complement(os.path.join(data_dir, 'sequences_test.fa'), tmp)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_revcomp.fa'), tmp))
- os.unlink(tmp)
-
-
-class TestScaffoldsToContigs(unittest.TestCase):
- def test_scaffolds_to_contigs(self):
- '''Test scaffolds_to_contigs'''
- tmp = 'tmp.contigs.fa'
- tasks.scaffolds_to_contigs(os.path.join(data_dir, 'utils_test_scaffolds.fa'), tmp)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'utils_test_scaffolds.fa.to_contigs.fa'), tmp))
- os.unlink(tmp)
-
- def test_scaffolds_to_contigs_number_contigs(self):
- '''Test scaffolds_to_contigs with contig numbering'''
- tmp = 'tmp.contigs.fa'
- tasks.scaffolds_to_contigs(os.path.join(data_dir, 'utils_test_scaffolds.fa'), tmp, number_contigs=True)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'utils_test_scaffolds.fa.to_contigs.number_contigs.fa'), tmp))
- os.unlink(tmp)
-
-
-class TestSearchForSeq(unittest.TestCase):
- def test_search_for_seq(self):
- '''Test that sequence search finds all hits'''
- tmp = 'tmp.search.fa'
- tasks.search_for_seq(os.path.join(data_dir, 'sequences_test_search_string.fa'), tmp, 'AGA')
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_search_string.fa.hits'), tmp))
- os.unlink(tmp)
-
-
-class TestTranslate(unittest.TestCase):
- def test_translate(self):
- '''Test translate works in each frame'''
- tmp = 'tmp.translated.fa'
- for i in range(3):
- tasks.translate(os.path.join(data_dir, 'sequences_test_translate.fa'), tmp, frame=i)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_translate.fa.frame' + str(i)), tmp))
-
- os.unlink(tmp)
-
-
-class TestTrim(unittest.TestCase):
- def test_trim(self):
- '''trim should correctly trim each seq in a file'''
- tmp = 'tmp.trim.fq'
- tasks.trim(os.path.join(data_dir, 'sequences_test_untrimmed.fq'), tmp, 2, 1)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_trimmed.fq'), tmp))
- os.unlink(tmp)
-
-
- def test_trim_Ns_at_end(self):
- '''Test Ns at ends of sequences trimmed OK'''
- tmp = 'tmp.trim.fa'
- tasks.trim_Ns_at_end(os.path.join(data_dir, 'sequences_test_trim_Ns_at_end.fa'), tmp)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_trim_Ns_at_end.fa.trimmed'), tmp))
- os.unlink(tmp)
-
-
-class TestFileToDict(unittest.TestCase):
- def test_file_to_dict(self):
- '''check file_to_dict fills dictionary correctly'''
- d_test = {}
- d = {}
- tasks.file_to_dict(os.path.join(data_dir, 'sequences_test.fa'), d_test)
- for i in range(1,5):
- d[str(i)] = sequences.Fasta(str(i),'ACGTA')
-
- self.assertSequenceEqual(d_test.keys(),d.keys())
- for i in range(1,5):
- key = str(i)
- self.assertEqual(d_test[key].id, d[key].id)
- self.assertEqual(d_test[key].seq, d[key].seq)
-
-
-class TestLengthsFromFai(unittest.TestCase):
- def test_lengths_from_fai(self):
- '''Check lengths_from_fai gets the length of each seq OK'''
- d = {}
- lengths = {str(x):x for x in range(1,5)}
- tasks.lengths_from_fai(os.path.join(data_dir, 'sequences_test_fai_test.fa.fai'), d)
- self.assertSequenceEqual(d.keys(), lengths.keys())
- for i in d:
- self.assertEqual(int(i), d[i])
-
-
-class TestSplit(unittest.TestCase):
- def test_split_by_base_count(self):
- '''Check that fasta/q files get split by base count correctly'''
- infile = os.path.join(data_dir, 'sequences_test_split_test.fa')
- outprefix = 'tmp.sequences_test_split_test.fa.test'
- length2files = {2: ['1','2','3','4'],
- 3: ['1','2','3'],
- 4: ['1', '2', '3'],
- 6: ['1', '2']}
- for l in length2files:
- tasks.split_by_base_count(infile, outprefix, l)
- for x in range(len(length2files[l])):
- file_index = str(length2files[l][x])
- fname = outprefix + '.' + file_index
- self.assertTrue(filecmp.cmp(fname, infile + '.' + str(l) + '.' + file_index))
- os.unlink(fname)
-
- # check that limiting the number of files works
- tasks.split_by_base_count(infile, outprefix, 6, 2)
- for i in range(1,4):
- test_file = outprefix + '.' + str(i)
- self.assertTrue(filecmp.cmp(test_file, os.path.join(data_dir, 'sequences_test_split_test.fa.6.limit2.') + str(i)))
- os.unlink(test_file)
-
- # check big sequence not broken
- tasks.split_by_base_count(os.path.join(data_dir, 'sequences_test_split_test.long.fa'), outprefix, 2)
- self.assertTrue(filecmp.cmp(outprefix + '.1', os.path.join(data_dir, 'sequences_test_split_test.long.fa.2.1')))
- self.assertTrue(filecmp.cmp(outprefix + '.2', os.path.join(data_dir, 'sequences_test_split_test.long.fa.2.2')))
- os.unlink(outprefix + '.1')
- os.unlink(outprefix + '.2')
-
- def test_split_by_fixed_size(self):
- '''Test fasta/q file split by fixed size'''
- infile = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa')
- outprefix = 'tmp.sequences_test_split'
- tasks.split_by_fixed_size(infile, outprefix, 4, 1)
-
- for i in range(1,7,1):
- correct = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.' + str(i))
- test = outprefix + '.' + str(i)
- self.assertTrue(filecmp.cmp(test, correct))
- os.unlink(test)
-
- test_coords = outprefix + '.coords'
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.coords'), test_coords))
- os.unlink(test_coords)
-
- def test_split_by_fixed_size_exclude_Ns(self):
- infile = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa')
- outprefix = 'tmp.sequences_test_split'
- tasks.split_by_fixed_size(infile, outprefix, 4, 1, skip_if_all_Ns=True)
-
- for i in range(1,5,1):
- correct = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.' + str(i))
- test = outprefix + '.' + str(i)
- self.assertTrue(filecmp.cmp(test, correct))
- os.unlink(test)
-
- test_coords = outprefix + '.coords'
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords'), test_coords))
- os.unlink(test_coords)
-
-class TestCountSequences(unittest.TestCase):
- def test_count_sequences(self):
- '''Check that count_sequences does as expected'''
- self.assertEqual(2, tasks.count_sequences(os.path.join(data_dir, 'sequences_test_good_file.fq')))
- self.assertEqual(4, tasks.count_sequences(os.path.join(data_dir, 'sequences_test.fa')))
- self.assertEqual(0, tasks.count_sequences(os.path.join(data_dir, 'sequences_test_empty_file')))
-
-class TestGetIds(unittest.TestCase):
- def test_get_ids(self):
- '''Check that IDs extracted correctly from fasta/q file'''
- tmpfile = 'tmp.ids'
- tasks.get_ids(os.path.join(data_dir, 'sequences_test.fa'), tmpfile)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test.fa.ids'), tmpfile))
- os.unlink(tmpfile)
-
-class TestFastaToFastq(unittest.TestCase):
- def test_fasta_to_fastq(self):
- '''Check fasta_to_fastq converts files as expected'''
- tasks.fasta_to_fastq(os.path.join(data_dir, 'sequences_test.fa'),
- os.path.join(data_dir, 'sequences_test.fa.qual'),
- 'tmp.fq')
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test.fasta_to_fastq.fq'), 'tmp.fq'))
-
- with self.assertRaises(tasks.Error):
- tasks.fasta_to_fastq(os.path.join(data_dir, 'sequences_test.fa'),
- os.path.join(data_dir, 'sequences_test.fa.qual.bad'),
- 'tmp.fq')
-
- os.unlink('tmp.fq')
-
-
-class TestReplaceBases(unittest.TestCase):
- def test_sequences_replace_bases(self):
- '''Check that fasta file gets all bases replaced OK'''
- tmpfile = 'tmp.replace_bases.fa'
- tasks.replace_bases(os.path.join(data_dir, 'sequences_test_fastaq_replace_bases.fa'), tmpfile, 'T', 'X')
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_fastaq_replace_bases.expected.fa'), tmpfile))
- os.unlink(tmpfile)
-
-
-class TestStripIlluminaSuffix(unittest.TestCase):
- def test_strip_illumina_suffix(self):
- '''Check illumina suffixes stripped correctly off read names'''
- tmpfile = 'tmp.stripped.fa'
- tasks.strip_illumina_suffix(os.path.join(data_dir, 'sequences_test_strip_illumina_suffix.fq'), tmpfile)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_strip_illumina_suffix.fq.stripped'), tmpfile))
- os.unlink(tmpfile)
-
-
-class TestToQuasrPrimers(unittest.TestCase):
- def test_to_quasr_primers(self):
- '''Check that fasta file gets converted to QUASR sequence file'''
- tmpfile = 'tmp.primers'
- tasks.to_quasr_primers(os.path.join(data_dir, 'sequences_test_fastaq_to_quasr_primers.fa'), tmpfile)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_fastaq_to_quasr_primers.expected'), tmpfile))
- os.unlink(tmpfile)
-
-
-class TestToFasta(unittest.TestCase):
- def test_to_fasta(self):
- '''Test to_fasta'''
- tmpfile = 'tmp.to_fasta'
- infiles = [
- 'sequences_test_good_file.fq',
- 'sequences_test_gffv3.gff',
- 'sequences_test_gffv3.no_FASTA_line.gff',
- 'sequences_test.embl',
- 'sequences_test.gbk',
- 'sequences_test_phylip.interleaved',
- 'sequences_test_phylip.interleaved2',
- 'sequences_test_phylip.sequential'
- ]
- infiles = [os.path.join(data_dir, x) for x in infiles]
- expected_outfiles = [x + '.to_fasta' for x in infiles]
-
- for i in range(len(infiles)):
- tasks.to_fasta(infiles[i], tmpfile)
- self.assertTrue(filecmp.cmp(expected_outfiles[i], tmpfile))
-
- tasks.to_fasta(os.path.join(data_dir, 'sequences_test.fa'), tmpfile, line_length=3)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test.line_length3.fa'), tmpfile))
- tasks.to_fasta(os.path.join(data_dir, 'sequences_test_strip_after_whitespace.fa'), tmpfile, strip_after_first_whitespace=True)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_strip_after_whitespace.fa.to_fasta'), tmpfile))
- os.unlink(tmpfile)
-
-
-class TestToUniqueByID(unittest.TestCase):
- def test_to_unique_by_id(self):
- '''Test to_unique_by_id()'''
- tmpfile = 'tmp.unique_by_id.fa'
- tasks.to_unique_by_id(os.path.join(data_dir, 'sequences_test_to_unique_by_id.fa'), tmpfile)
- self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_to_unique_by_id.fa.out'), tmpfile))
- os.unlink(tmpfile)
-
-
-if __name__ == '__main__':
- unittest.main()
-
diff --git a/fastaq/tests/utils_test.py b/fastaq/tests/utils_test.py
deleted file mode 100644
index 731c944..0000000
--- a/fastaq/tests/utils_test.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import os
-import filecmp
-import unittest
-from fastaq import utils
-
-modules_dir = os.path.dirname(os.path.abspath(utils.__file__))
-data_dir = os.path.join(modules_dir, 'tests', 'data')
-
-class TestUtils(unittest.TestCase):
- def test_write_and_read(self):
- '''open_file_write() and open_file_read() should do the right thing depending gzipped or not'''
- for filename in ['utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz']:
- f = utils.open_file_write(filename)
- for i in range(3):
- print(i, file=f)
- utils.close(f)
-
- counter = 0
-
- f = utils.open_file_read(filename)
- for line in f:
- self.assertEqual(counter, int(line.strip()))
- counter += 1
- utils.close(f)
-
- os.unlink(filename)
-
- f = utils.open_file_read('-')
- self.assertEqual(sys.stdin, f)
- f = utils.open_file_write('-')
- self.assertEqual(sys.stdout, f)
-
- def test_raise_exception(self):
- '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
- with self.assertRaises(utils.Error):
- utils.open_file_read('this_file_is_not_here_so_throw_error')
- with self.assertRaises(utils.Error):
- utils.open_file_read('this_file_is_not_here_so_throw_error.gz')
- with self.assertRaises(utils.Error):
- utils.open_file_read(os.path.join(data_dir, 'utils_test_not_really_zipped.gz'))
-
- with self.assertRaises(utils.Error):
- utils.open_file_write(os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'))
- with self.assertRaises(utils.Error):
- utils.open_file_write(os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'))
-
- def test_file_transpose(self):
- '''Test that file_transpose() does what it should'''
- infile = os.path.join(data_dir, 'utils_test_file_transpose.txt')
- tmp_out = 'utils_test_file_transpose.tmp'
- correct_file = os.path.join(data_dir, 'utils_test_file_transposed.txt')
- utils.file_transpose(infile, tmp_out)
- self.assertTrue(filecmp.cmp(tmp_out, correct_file))
- os.unlink(tmp_out)
-
- def test_system_call(self):
- '''Test that system call appears to work and die as it should'''
- test_file = os.path.join(data_dir, 'utils_test_system_call.txt')
- tmp_out = 'utils_test_syscall.tmp'
- utils.syscall('cat ' + test_file + ' > ' + tmp_out)
- self.assertTrue(filecmp.cmp(tmp_out, test_file))
- os.unlink(tmp_out)
-
- with self.assertRaises(utils.Error):
- utils.syscall('thisisveryunlikelytoebarealcommandandshouldthrowerror')
-
- utils.syscall('echo "this is not the right string" > ' + tmp_out)
- self.assertFalse(filecmp.cmp(tmp_out, test_file))
- os.unlink(tmp_out)
-
- s = utils.syscall_get_stdout('echo bingo')
- self.assertListEqual(["bingo"], s)
- with self.assertRaises(utils.Error):
- utils.syscall_get_stdout('thisisveryunlikelytoebarealcommandandshouldthrowerror')
-
-if __name__ == '__main__':
- unittest.main()
diff --git a/fastaq/utils.py b/fastaq/utils.py
deleted file mode 100644
index c30273b..0000000
--- a/fastaq/utils.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import os
-import sys
-import subprocess
-import shlex
-
-class Error (Exception): pass
-
-def open_file_read(filename):
- if filename == '-':
- f = sys.stdin
- elif filename.endswith('.gz'):
- # first check that the file is OK according to gunzip
- retcode = subprocess.call('gunzip -t ' + filename, shell=True)
- if retcode != 0:
- raise Error("Error opening for reading gzipped file '" + filename + "'")
-
- # now open the file
- f = os.popen('gunzip -c ' + filename)
- else:
- try:
- f = open(filename)
- except:
- raise Error("Error opening for reading file '" + filename + "'")
-
- return f
-
-
-def open_file_write(filename):
- if filename == '-':
- f = sys.stdout
- elif filename.endswith('.gz'):
- if not os.path.exists(os.path.abspath(os.path.dirname(filename))):
- raise Error("Error opening for writing gzipped file '" + filename + "'")
-
- try:
- f = os.popen('gzip -9 -c > ' + filename, 'w')
- except:
- raise Error("Error opening for writing gzipped file '" + filename + "'")
- else:
- try:
- f = open(filename, 'w')
- except:
- raise Error("Error opening for writing file '" + filename + "'")
-
- return f
-
-
-def close(filehandle):
- if filehandle not in [sys.stdout, sys.stderr]:
- filehandle.close()
-
-
-def file_transpose(f_in, f_out, sep_in=None, sep_out='\t'):
- rows = []
- f = open_file_read(f_in)
- for line in f:
- rows.append(line.rstrip().split(sep_in))
- close(f)
-
- columns_out = max([len(x) for x in rows])
-
- for r in rows:
- r += ['.'] * (columns_out - len(r))
-
- f = open_file_write(f_out)
- for i in range(columns_out):
- print(sep_out.join([str(rows[x][i]) for x in range(len(rows))]), file=f)
-
- close(f)
-
-
-def syscall(cmd):
- retcode = subprocess.call(cmd, shell=True)
-
- if retcode != 0:
- raise Error("Error in system call. Command was:\n" + cmd)
-
-
-def syscall_get_stdout(cmd):
- try:
- out = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE).communicate()[0].decode('utf-8').rstrip()
- return out.split('\n')
- except:
- raise Error('Error in system call. I tried to run:\n' + str(cmd))
-
-
diff --git a/scripts/fastaq_capillary_to_pairs b/src/fastaq_capillary_to_pairs
similarity index 100%
rename from scripts/fastaq_capillary_to_pairs
rename to src/fastaq_capillary_to_pairs
diff --git a/scripts/fastaq_chunker b/src/fastaq_chunker
similarity index 100%
rename from scripts/fastaq_chunker
rename to src/fastaq_chunker
diff --git a/scripts/fastaq_count_sequences b/src/fastaq_count_sequences
similarity index 100%
rename from scripts/fastaq_count_sequences
rename to src/fastaq_count_sequences
diff --git a/scripts/fastaq_deinterleave b/src/fastaq_deinterleave
similarity index 100%
rename from scripts/fastaq_deinterleave
rename to src/fastaq_deinterleave
diff --git a/scripts/fastaq_enumerate_names b/src/fastaq_enumerate_names
similarity index 100%
rename from scripts/fastaq_enumerate_names
rename to src/fastaq_enumerate_names
diff --git a/scripts/fastaq_extend_gaps b/src/fastaq_extend_gaps
similarity index 100%
rename from scripts/fastaq_extend_gaps
rename to src/fastaq_extend_gaps
diff --git a/scripts/fastaq_fasta_to_fastq b/src/fastaq_fasta_to_fastq
similarity index 100%
rename from scripts/fastaq_fasta_to_fastq
rename to src/fastaq_fasta_to_fastq
diff --git a/scripts/fastaq_filter b/src/fastaq_filter
similarity index 100%
rename from scripts/fastaq_filter
rename to src/fastaq_filter
diff --git a/scripts/fastaq_get_ids b/src/fastaq_get_ids
similarity index 100%
rename from scripts/fastaq_get_ids
rename to src/fastaq_get_ids
diff --git a/scripts/fastaq_get_seq_flanking_gaps b/src/fastaq_get_seq_flanking_gaps
similarity index 100%
rename from scripts/fastaq_get_seq_flanking_gaps
rename to src/fastaq_get_seq_flanking_gaps
diff --git a/scripts/fastaq_insert_or_delete_bases b/src/fastaq_insert_or_delete_bases
similarity index 100%
rename from scripts/fastaq_insert_or_delete_bases
rename to src/fastaq_insert_or_delete_bases
diff --git a/scripts/fastaq_interleave b/src/fastaq_interleave
similarity index 100%
rename from scripts/fastaq_interleave
rename to src/fastaq_interleave
diff --git a/scripts/fastaq_make_random_contigs b/src/fastaq_make_random_contigs
similarity index 100%
rename from scripts/fastaq_make_random_contigs
rename to src/fastaq_make_random_contigs
diff --git a/scripts/fastaq_replace_bases b/src/fastaq_replace_bases
similarity index 100%
rename from scripts/fastaq_replace_bases
rename to src/fastaq_replace_bases
diff --git a/scripts/fastaq_reverse_complement b/src/fastaq_reverse_complement
similarity index 100%
rename from scripts/fastaq_reverse_complement
rename to src/fastaq_reverse_complement
diff --git a/scripts/fastaq_scaffolds_to_contigs b/src/fastaq_scaffolds_to_contigs
similarity index 100%
rename from scripts/fastaq_scaffolds_to_contigs
rename to src/fastaq_scaffolds_to_contigs
diff --git a/scripts/fastaq_search_for_seq b/src/fastaq_search_for_seq
similarity index 100%
rename from scripts/fastaq_search_for_seq
rename to src/fastaq_search_for_seq
diff --git a/scripts/fastaq_split_by_base_count b/src/fastaq_split_by_base_count
similarity index 100%
rename from scripts/fastaq_split_by_base_count
rename to src/fastaq_split_by_base_count
diff --git a/scripts/fastaq_strip_illumina_suffix b/src/fastaq_strip_illumina_suffix
similarity index 100%
rename from scripts/fastaq_strip_illumina_suffix
rename to src/fastaq_strip_illumina_suffix
diff --git a/scripts/fastaq_to_fasta b/src/fastaq_to_fasta
similarity index 100%
rename from scripts/fastaq_to_fasta
rename to src/fastaq_to_fasta
diff --git a/scripts/fastaq_to_mira_xml b/src/fastaq_to_mira_xml
similarity index 100%
rename from scripts/fastaq_to_mira_xml
rename to src/fastaq_to_mira_xml
diff --git a/scripts/fastaq_to_perfect_reads b/src/fastaq_to_perfect_reads
similarity index 100%
rename from scripts/fastaq_to_perfect_reads
rename to src/fastaq_to_perfect_reads
diff --git a/scripts/fastaq_to_quasr_primers_file b/src/fastaq_to_quasr_primers_file
similarity index 100%
rename from scripts/fastaq_to_quasr_primers_file
rename to src/fastaq_to_quasr_primers_file
diff --git a/scripts/fastaq_to_random_subset b/src/fastaq_to_random_subset
similarity index 100%
rename from scripts/fastaq_to_random_subset
rename to src/fastaq_to_random_subset
diff --git a/scripts/fastaq_to_tiling_bam b/src/fastaq_to_tiling_bam
similarity index 100%
rename from scripts/fastaq_to_tiling_bam
rename to src/fastaq_to_tiling_bam
diff --git a/scripts/fastaq_to_unique_by_id b/src/fastaq_to_unique_by_id
similarity index 100%
rename from scripts/fastaq_to_unique_by_id
rename to src/fastaq_to_unique_by_id
diff --git a/scripts/fastaq_translate b/src/fastaq_translate
similarity index 100%
rename from scripts/fastaq_translate
rename to src/fastaq_translate
diff --git a/scripts/fastaq_trim_Ns_at_end b/src/fastaq_trim_Ns_at_end
similarity index 100%
rename from scripts/fastaq_trim_Ns_at_end
rename to src/fastaq_trim_Ns_at_end
diff --git a/scripts/fastaq_trim_ends b/src/fastaq_trim_ends
similarity index 100%
rename from scripts/fastaq_trim_ends
rename to src/fastaq_trim_ends
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fastaq.git
More information about the debian-med-commit
mailing list