[med-svn] [fastaq] 01/02: Imported Upstream version 1.5.0

Jorge Soares jssoares-guest at moszumanska.debian.org
Tue Oct 21 08:03:21 UTC 2014


This is an automated email from the git hooks/post-receive script.

jssoares-guest pushed a commit to branch master
in repository fastaq.

commit 6657045bf845939f475c6f84c89f87d438223403
Author: js21 <js21 at localhost.localdomain>
Date:   Tue Oct 21 09:01:56 2014 +0100

    Imported Upstream version 1.5.0
---
 AUTHORS                                            |   1 +
 fastaq/sequences.py                                | 107 +++++++++++-
 fastaq/tasks.py                                    | 181 ++++++++++++++++++++-
 .../tests/data/sequences_test_merge_to_one_seq.fa  |   8 +
 .../tests/data/sequences_test_merge_to_one_seq.fq  |   8 +
 .../data/sequences_test_merge_to_one_seq.merged.fa |   2 +
 .../data/sequences_test_merge_to_one_seq.merged.fq |   4 +
 fastaq/tests/data/sequences_test_orfs.fa           |  18 ++
 fastaq/tests/data/sequences_test_orfs.gff          |  15 ++
 .../tests/data/sequences_test_to_fasta_union.in.fa |   7 +
 .../data/sequences_test_to_fasta_union.out.fa      |   2 +
 .../tests/data/tasks_test_expend_nucleotides.in.fa |   4 +
 .../tests/data/tasks_test_expend_nucleotides.in.fq |   8 +
 .../data/tasks_test_expend_nucleotides.out.fa      |   6 +
 .../data/tasks_test_expend_nucleotides.out.fq      |  12 ++
 .../tests/data/tasks_test_fasta_to_fake_qual.in.fa |   5 +
 .../tasks_test_fasta_to_fake_qual.out.default.qual |   5 +
 .../tasks_test_fasta_to_fake_qual.out.q42.qual     |   5 +
 .../tests/data/tasks_test_make_long_reads.input.fa |   2 +
 .../data/tasks_test_make_long_reads.output.fa      |  10 ++
 fastaq/tests/data/tasks_test_sequence_trim_1.fa    |  12 ++
 .../data/tasks_test_sequence_trim_1.trimmed.fa     |   8 +
 fastaq/tests/data/tasks_test_sequence_trim_2.fa    |  12 ++
 .../data/tasks_test_sequence_trim_2.trimmed.fa     |   8 +
 fastaq/tests/data/tasks_test_sequences_to_trim.fa  |   8 +
 fastaq/tests/sequences_test.py                     | 135 ++++++++++++++-
 fastaq/tests/tasks_test.py                         |  99 ++++++++++-
 scripts/fastaq_expand_nucleotides                  |  15 ++
 scripts/fastaq_long_read_simulate                  |  50 ++++++
 scripts/fastaq_merge                               |  18 ++
 scripts/fastaq_sequence_trim                       |  23 +++
 scripts/fastaq_to_fake_qual                        |  18 ++
 scripts/fastaq_to_orfs_gff                         |  13 ++
 setup.py                                           |   4 +-
 34 files changed, 805 insertions(+), 28 deletions(-)

diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..711ae85
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1 @@
+Martin Hunt (mh12 at sanger.ac.uk)
diff --git a/fastaq/sequences.py b/fastaq/sequences.py
index 0ce03f8..f7efb1a 100644
--- a/fastaq/sequences.py
+++ b/fastaq/sequences.py
@@ -1,5 +1,7 @@
 import re
 import string
+import random
+import itertools
 
 from fastaq import utils, intervals
 
@@ -79,6 +81,21 @@ codon2aa = {
 'TAG': '*',
 'TGA': '*'}
 
+
+redundant_nts = {
+    'R': ('A', 'G'),
+    'Y': ('C', 'T'),
+    'S': ('C', 'G'),
+    'W': ('A', 'T'),
+    'K': ('G', 'T'),
+    'M': ('A', 'C'),
+    'B': ('C', 'G', 'T'),
+    'D': ('A', 'G', 'T'),
+    'H': ('A', 'C', 'T'),
+    'V': ('A', 'C', 'G'),
+    'N': ('A', 'C', 'G', 'T')
+}
+
 def file_reader(fname, read_quals=False):
     '''Iterates over a FASTA or FASTQ file, yielding the next sequence in the file until there are no more sequences'''
     f = utils.open_file_read(fname)
@@ -98,7 +115,7 @@ def file_reader(fname, read_quals=False):
             if not line:
                 utils.close(f)
                 raise Error('No sequences found in GFF file "' + fname + '"')
-            
+
         seq = Fasta()
         previous_lines[f] = line
     elif line.startswith('ID   ') and line[5] != ' ':
@@ -137,7 +154,7 @@ def file_reader(fname, read_quals=False):
             sequential = True
         else:
             sequential = False
-            
+
         # if the 11th char of second sequence line is a space,  then the file is sequential, e.g.:
         # GAGCCCGGGC AATACAGGGT AT
         # as opposed to:
@@ -153,9 +170,9 @@ def file_reader(fname, read_quals=False):
                     current_id, new_bases = line[0:10].rstrip(), line.rstrip()[10:]
                 else:
                     new_bases = line.rstrip()
-                       
+
                 current_seq += new_bases.replace(' ','')
-            
+
             yield Fasta(current_id, current_seq.replace('-', ''))
         else:
             # seaview files start all seqs at pos >=12. Other files start
@@ -173,14 +190,14 @@ def file_reader(fname, read_quals=False):
             for i in range(number_of_seqs):
                 name, bases = seq_lines[i][0:first_seq_base].rstrip(), seq_lines[i][first_seq_base:]
                 seqs.append(Fasta(name, bases))
-            
+
             for i in range(number_of_seqs, len(seq_lines)):
                 seqs[i%number_of_seqs].seq += seq_lines[i]
 
             for fa in seqs:
                 fa.seq = fa.seq.replace(' ','').replace('-','')
                 yield fa
-                
+
         return
     elif line == '':
         utils.close(f)
@@ -236,6 +253,18 @@ class Fasta:
         except:
             raise Error('Error in split_capillary_id() on ID', self.id)
 
+    def expand_nucleotides(self):
+        '''Assumes sequence is nucleotides. Returns list of all combinations of redundant nucleotides. e.g. R is A or G, so CRT would have combinations CAT and CGT'''
+        s = list(self.seq)
+        for i in range(len(s)):
+            if s[i] in redundant_nts:
+                s[i] = ''.join(redundant_nts[s[i]])
+
+        seqs = []
+        for x in itertools.product(*s):
+            seqs.append(Fasta(self.id + '.' + str(len(seqs) + 1), ''.join(x)))
+        return seqs
+
     def strip_after_first_whitespace(self):
         '''Removes everything in the name after the first whitespace character'''
         self.id = self.id.split()[0]
@@ -267,6 +296,19 @@ class Fasta:
         '''Removes any leading or trailing N or n characters from the sequence'''
         self.seq = self.seq.strip('Nn')
 
+    def add_insertions(self, skip=10, window=1, test=False):
+        '''Adds a random base within window bases around every skip bases. e.g. skip=10, window=1 means a random base added somwhere in theintervals [9,11], [19,21] ... '''
+        assert 2 * window < skip
+        new_seq = list(self.seq)
+        for i in range(len(self) - skip, 0, -skip):
+            pos = random.randrange(i - window, i + window + 1)
+            base = random.choice(['A', 'C', 'G', 'T'])
+            if test:
+                base = 'N'
+            new_seq.insert(pos, base)
+
+        self.seq = ''.join(new_seq)
+
     def replace_bases(self, old, new):
         '''Replaces all occurences of 'old' with 'new' '''
         self.seq = self.seq.replace(old, new)
@@ -309,6 +351,38 @@ class Fasta:
 
 
 
+
+    def orfs(self, frame=0, revcomp=False):
+        assert frame in [0,1,2]
+        if revcomp:
+            self.revcomp()
+
+        aa_seq = self.translate(frame=frame).seq.rstrip('X')
+        if revcomp:
+            self.revcomp()
+
+        orfs = _orfs_from_aa_seq(aa_seq)
+        for i in range(len(orfs)):
+            if revcomp:
+                start = len(self) - (orfs[i].end * 3 + 3) - frame
+                end = len(self) - (orfs[i].start * 3) - 1 - frame
+            else:
+                start = orfs[i].start * 3 + frame
+                end = orfs[i].end * 3 + 2 + frame
+
+            orfs[i] = intervals.Interval(start, end)
+
+        return orfs
+
+
+    def all_orfs(self, min_length=300):
+        orfs = []
+        for frame in [0,1,2]:
+            for revcomp in [False, True]:
+                orfs.extend([(t, revcomp) for t in self.orfs(frame=frame, revcomp=revcomp) if len(t)>=min_length])
+
+        return sorted(orfs, key=lambda t:t[0])
+
     # Fills the object with the next sequence in the file. Returns
     # True if this was successful, False if no more sequences in the file.
     # If reading a file of quality scores, set read_quals = True
@@ -428,12 +502,12 @@ class Embl(Fasta):
 
         self.seq = ''
         seq_lines = []
- 
+
         while not (line.startswith('SQ') or line.rstrip() == 'ORIGIN'):
             line = f.readline()
             if line == '':
                 raise Error('Error! No SQ or ORIGIN line found for sequence ' + self.id)
-        
+
         line = f.readline()
 
         while not line.startswith('//'):
@@ -441,7 +515,7 @@ class Embl(Fasta):
                 raise Error('Error! Did not find end of sequence ' + self.id)
             seq_lines.append(''.join(line.rstrip().strip(' 0123456789').split()))
             line = f.readline()
-            
+
 
         while 1:
             if line.startswith('ID') or line.startswith('LOCUS'):
@@ -519,6 +593,8 @@ class Fastq(Fasta):
         quals = [ord(x) - 33 for x in self.qual]
         return (Fasta(self.id, self.seq), quals)
 
+    def expand_nucleotides(self):
+        return [Fastq(x.id, x.seq, self.qual) for x in super().expand_nucleotides()]
 
     def trim_Ns(self):
         '''Removes any leading or trailing N or n characters from the sequence'''
@@ -547,3 +623,16 @@ class Fastq(Fasta):
         fa = super().translate()
         return Fastq(fa.id, fa.seq, 'I'*len(fa.seq))
 
+
+def _orfs_from_aa_seq(seq):
+    orfs = []
+    pos = 0
+    while pos < len(seq):
+        next_stop = seq.find('*', pos)
+        if next_stop == -1:
+            orfs.append(intervals.Interval(pos, len(seq)-1))
+            break
+        elif next_stop > pos:
+            orfs.append(intervals.Interval(pos, next_stop))
+        pos = next_stop + 1
+    return orfs
diff --git a/fastaq/tasks.py b/fastaq/tasks.py
index ad10b06..068a640 100644
--- a/fastaq/tasks.py
+++ b/fastaq/tasks.py
@@ -1,6 +1,8 @@
 import re
+import sys
 import copy
 import random
+import numpy
 from fastaq import sequences, utils
 
 class Error (Exception): pass
@@ -120,6 +122,19 @@ def enumerate_names(infile, outfile, start_index=1, keep_illumina_suffix=False,
         utils.close(fout_rename)
 
 
+def expand_nucleotides(infile, outfile):
+    seq_reader = sequences.file_reader(infile)
+    fout = utils.open_file_write(outfile)
+
+    for seq in seq_reader:
+        seqs = seq.expand_nucleotides()
+        if len(seqs) > 1:
+            for s in seqs:
+                print(s, file=fout)
+        else:
+            print(seq, file=fout)
+
+
 def extend_gaps(infile, outfile, trim):
     seq_reader = sequences.file_reader(infile)
     fout = utils.open_file_write(outfile)
@@ -156,6 +171,21 @@ def extend_gaps(infile, outfile, trim):
     utils.close(fout)
 
 
+def fastaq_to_fake_qual(infile, outfile, q=40):
+    seq_reader = sequences.file_reader(infile)
+    fout = utils.open_file_write(outfile)
+
+    for seq in seq_reader:
+        print('>' + seq.id, file=fout)
+        if sequences.Fasta.line_length == 0:
+            print(' '.join([str(q)] * len(seq)), file=fout)
+        else:
+            for i in range(0, len(seq), sequences.Fasta.line_length):
+                print(' '.join([str(q)] * min(sequences.Fasta.line_length, len(seq) - i)), file=fout)
+
+    utils.close(fout)
+
+
 def fasta_to_fastq(fasta_in, qual_in, outfile):
     fa_reader = sequences.file_reader(fasta_in)
     qual_reader = sequences.file_reader(qual_in, read_quals=True)
@@ -190,6 +220,22 @@ def fastaq_to_mira_xml(infile, outfile):
     utils.close(fout)
 
 
+def fastaq_to_orfs_gff(infile, outfile, min_length=300, tool_name='fastaq'):
+    seq_reader = sequences.file_reader(infile)
+    fout = utils.open_file_write(outfile)
+    for seq in seq_reader:
+        orfs = seq.all_orfs(min_length=min_length)
+        for coords, revcomp in orfs:
+            if revcomp:
+                strand = '-'
+            else:
+                strand = '+'
+
+            print(seq.id, tool_name, 'CDS', coords.start+1, coords.end+1, '.', strand, '.', sep='\t', file=fout)
+
+    utils.close(fout)
+
+
 def file_to_dict(infile, d):
     seq_reader = sequences.file_reader(infile)
     for seq in seq_reader:
@@ -217,7 +263,7 @@ def filter(infile, outfile, minlength=0, maxlength=float('inf'), regex=None, ids
         if hit != invert:
             print(seq, file=f_out)
     utils.close(f_out)
-    
+
 
 def get_ids(infile, outfile):
     seq_reader = sequences.file_reader(infile)
@@ -225,7 +271,7 @@ def get_ids(infile, outfile):
     for seq in seq_reader:
         print(seq.id, file=f_out)
     utils.close(f_out)
-    
+
 
 def get_seqs_flanking_gaps(infile, outfile, left, right):
     seq_reader = sequences.file_reader(infile)
@@ -298,6 +344,85 @@ def make_random_contigs(contigs, length, outfile, name_by_letters=False, prefix=
     utils.close(fout)
 
 
+def make_long_reads(infile, outfile, method='tiling', fixed_read_length=20000, tile_step=10000, gamma_shape=1.2,  gamma_scale=6000, coverage=10, gamma_min_length=20000, seed=None, ins_skip=None, ins_window=None,):
+    assert method in ['tiling', 'gamma', 'uniform']
+    assert ins_skip == ins_window == None or None not in [ins_skip, ins_window]
+    if seed is not None:
+        random.seed(a=seed)
+    seq_reader = sequences.file_reader(infile)
+    f = utils.open_file_write(outfile)
+
+    for seq in seq_reader:
+        if method == 'tiling':
+            if len(seq) < fixed_read_length:
+                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
+                continue
+            for i in range(0, len(seq), tile_step):
+                end = min(len(seq), i + fixed_read_length)
+                fa = sequences.Fasta('_'.join([seq.id, str(i + 1), str(end)]), seq[i:end])
+                if ins_skip:
+                    fa.add_insertions(skip=ins_skip, window=ins_window)
+                print(fa, file=f)
+                if end >= len(seq):
+                    break
+        elif method == 'gamma':
+            if len(seq) < gamma_min_length:
+                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
+                continue
+            total_read_length = 0
+            while total_read_length < coverage * len(seq) - 0.5 * gamma_min_length:
+                read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))
+                while read_length < gamma_min_length or read_length > len(seq):
+                    read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))
+
+                start = random.randint(0, len(seq) - read_length)
+                end = start + read_length - 1
+                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
+                total_read_length += len(fa)
+                if ins_skip:
+                    fa.add_insertions(skip=ins_skip, window=ins_window)
+                print(fa, file=f)
+        elif method == 'uniform':
+            if len(seq) < fixed_read_length:
+                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
+                continue
+            total_read_length = 0
+            while total_read_length < coverage * len(seq) - 0.5 * fixed_read_length:
+                start = random.randint(0, len(seq) - fixed_read_length)
+                end = start + fixed_read_length - 1
+                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
+                total_read_length += len(fa)
+                if ins_skip:
+                    fa.add_insertions(skip=ins_skip, window=ins_window)
+                print(fa, file=f)
+
+
+    utils.close(f)
+
+
+def merge_to_one_seq(infile, outfile, seqname='union'):
+    '''Takes a multi fasta or fastq file and writes a new file that contains just one sequence, with the original sequences catted together, preserving their order'''
+    seq_reader = sequences.file_reader(infile)
+    seqs = []
+
+    for seq in seq_reader:
+        seqs.append(copy.copy(seq))
+
+    new_seq = ''.join([seq.seq for seq in seqs])
+
+    if type(seqs[0]) == sequences.Fastq:
+        new_qual = ''.join([seq.qual for seq in seqs])
+        seqs[:] = []
+        merged = sequences.Fastq(seqname, new_seq, new_qual)
+    else:
+        merged = sequences.Fasta(seqname, new_seq)
+        seqs[:] = []
+
+    f = utils.open_file_write(outfile)
+    print(merged, file=f)
+    utils.close(f)
+
+
 def reverse_complement(infile, outfile):
     seq_reader = sequences.file_reader(infile)
     fout = utils.open_file_write(outfile)
@@ -342,6 +467,39 @@ def search_for_seq(infile, outfile, search_string):
     utils.close(fout)
 
 
+def sequence_trim(infile_1, infile_2, outfile_1, outfile_2, to_trim_file, min_length=50):
+    trim_seqs = {}
+    file_to_dict(to_trim_file, trim_seqs)
+    trim_seqs = [x.seq for x in trim_seqs.values()]
+
+    seq_reader_1 = sequences.file_reader(infile_1)
+    seq_reader_2 = sequences.file_reader(infile_2)
+    f_out_1 = utils.open_file_write(outfile_1)
+    f_out_2 = utils.open_file_write(outfile_2)
+
+    for seq_1 in seq_reader_1:
+        try:
+            seq_2 = next(seq_reader_2)
+        except:
+            utils.close(f_out)
+            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')
+
+        for seq in seq_1, seq_2:
+            for trim_seq in trim_seqs:
+                if seq.seq.startswith(trim_seq):
+                    seq.trim(len(trim_seq),0)
+                    break
+
+        if len(seq_1) >= min_length and len(seq_2) >= min_length:
+            print(seq_1, file=f_out_1)
+            print(seq_2, file=f_out_2)
+
+
+    utils.close(f_out_1)
+    utils.close(f_out_2)
+
+
+
 def translate(infile, outfile, frame=0):
     seq_reader = sequences.file_reader(infile)
     fout = utils.open_file_write(outfile)
@@ -350,7 +508,7 @@ def translate(infile, outfile, frame=0):
         print(seq.translate(frame=frame), file=fout)
 
     utils.close(fout)
-    
+
 
 def trim(infile, outfile, start, end):
     seq_reader = sequences.file_reader(infile)
@@ -473,7 +631,7 @@ def split_by_fixed_size(infile, outfiles_prefix, chunk_size, tolerance, skip_if_
                 f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                 file_count += 1
                 base_count = 0
-              
+
             print(seq, file=f)
             base_count += len(seq)
 
@@ -532,7 +690,20 @@ def to_quasr_primers(infile, outfile):
 
     utils.close(f_out)
 
-    
+
+def to_fasta_union(infile, outfile, seqname='union'):
+    seq_reader = sequences.file_reader(infile)
+    new_seq = []
+
+    for seq in seq_reader:
+        new_seq.append(seq.seq)
+
+    f_out = utils.open_file_write(outfile)
+    print(sequences.Fasta(seqname, ''.join(new_seq)), file=f_out)
+    utils.close(f_out)
+
+
+
 def to_unique_by_id(infile, outfile):
     seq_reader = sequences.file_reader(infile)
     seqs = {}
diff --git a/fastaq/tests/data/sequences_test_merge_to_one_seq.fa b/fastaq/tests/data/sequences_test_merge_to_one_seq.fa
new file mode 100644
index 0000000..dc12fa4
--- /dev/null
+++ b/fastaq/tests/data/sequences_test_merge_to_one_seq.fa
@@ -0,0 +1,8 @@
+>1
+A
+>2
+G
+>3
+C
+>4
+T
diff --git a/fastaq/tests/data/sequences_test_merge_to_one_seq.fq b/fastaq/tests/data/sequences_test_merge_to_one_seq.fq
new file mode 100644
index 0000000..cd2d877
--- /dev/null
+++ b/fastaq/tests/data/sequences_test_merge_to_one_seq.fq
@@ -0,0 +1,8 @@
+ at 1
+A
++
+I
+ at 2
+G
++
+H
diff --git a/fastaq/tests/data/sequences_test_merge_to_one_seq.merged.fa b/fastaq/tests/data/sequences_test_merge_to_one_seq.merged.fa
new file mode 100644
index 0000000..ee6ca7b
--- /dev/null
+++ b/fastaq/tests/data/sequences_test_merge_to_one_seq.merged.fa
@@ -0,0 +1,2 @@
+>union
+AGCT
diff --git a/fastaq/tests/data/sequences_test_merge_to_one_seq.merged.fq b/fastaq/tests/data/sequences_test_merge_to_one_seq.merged.fq
new file mode 100644
index 0000000..2a93ab6
--- /dev/null
+++ b/fastaq/tests/data/sequences_test_merge_to_one_seq.merged.fq
@@ -0,0 +1,4 @@
+ at union
+AG
++
+IH
diff --git a/fastaq/tests/data/sequences_test_orfs.fa b/fastaq/tests/data/sequences_test_orfs.fa
new file mode 100644
index 0000000..71b972a
--- /dev/null
+++ b/fastaq/tests/data/sequences_test_orfs.fa
@@ -0,0 +1,18 @@
+>1
+GTATGACGACTTCTCGGTCAAAGGTAAGGTGAACAAGGGATTGAATGCTTAAATCCCGTG
+CCTACACTCAGTACCGGTGCTTGGCTGAAGCGTTCCTATGCAAGAATGAGAACTGGCAAC
+ACGTCGCGGCCAGCCCGGGACCATCAGGACCCGAACGTGTACCGCGAATGTTTACATTTC
+ACCCAGTTACCCGGATTCGGGCCAAAGCAGGAGAGCCTCTGAATTAGATGGTGCCACGTA
+AGTCTATTTTCGCACGTTTTATTGATTCAAGTGAGTGTCAACGTAGATTTATTGGTGCTT
+GGCTAAAGACGTATGGATCACGGGATGGAACATCTGGATCCCCCATGTACGTAAGTGTGT
+CGTCAAACAAAATTCTGTATCCCGTCGCTCCTGCCAGGGCAATCGCGGAGCTACGGACAT
+AGTCCTTAGTGAACTAATGATGATGAACATCTCGAACCAGGTTAACACGATACGATGAAG
+CGGGTTACTGAACACACTTAACAGGAGCCTGAGCAAATGTCATTTACAAAAGGTTTCTAG
+ACCCCCTTGGTAAGTCACTTGACACGTCTCATGCGGGGCCTACGGTAAACCAGATGCTAG
+AGTAGCGAACGGTGGGTGCGCAGGCATGTCCGGTCTCTCGATGGTGCACTTACGGACATC
+TCCCTATACAGATCTATTCAGTCACGAAGGTCAGCGAACATAACCCACGGGAGTTATCTC
+AACGAGTACGGGAGCGAACGGTGCACGGATCTGTCTTAGCTCAGAGGCGTCACGCGGTCC
+TATCTAACGCAAGAGCATGTGCCATTCCGGCCCTCTGATGTGCCTATGTACATAGAGCCG
+ACCCCGGCGGATTGGAGTCCCTAGCTACCGTCGACAGAGACGCAAAGACTCAATTGCTAT
+GTATATTGTTACTCTTCAACCACTGGAAAGACAAATAATTGCGGGCAAGTGCGTTACCCA
+TCACTCTGTTCTGTACACGAAAGGCTGAATAGCAAGTGGC
diff --git a/fastaq/tests/data/sequences_test_orfs.gff b/fastaq/tests/data/sequences_test_orfs.gff
new file mode 100644
index 0000000..f89b79c
--- /dev/null
+++ b/fastaq/tests/data/sequences_test_orfs.gff
@@ -0,0 +1,15 @@
+1	fastaq	CDS	28	222	.	+	.
+1	fastaq	CDS	45	227	.	+	.
+1	fastaq	CDS	49	171	.	-	.
+1	fastaq	CDS	110	241	.	+	.
+1	fastaq	CDS	144	266	.	-	.
+1	fastaq	CDS	228	422	.	+	.
+1	fastaq	CDS	278	433	.	-	.
+1	fastaq	CDS	287	478	.	+	.
+1	fastaq	CDS	289	519	.	-	.
+1	fastaq	CDS	563	703	.	+	.
+1	fastaq	CDS	601	759	.	+	.
+1	fastaq	CDS	606	818	.	+	.
+1	fastaq	CDS	819	938	.	+	.
+1	fastaq	CDS	836	988	.	+	.
+1	fastaq	CDS	865	999	.	+	.
diff --git a/fastaq/tests/data/sequences_test_to_fasta_union.in.fa b/fastaq/tests/data/sequences_test_to_fasta_union.in.fa
new file mode 100644
index 0000000..922d090
--- /dev/null
+++ b/fastaq/tests/data/sequences_test_to_fasta_union.in.fa
@@ -0,0 +1,7 @@
+>1
+AA
+A
+>2
+GGG
+>3
+TTT
diff --git a/fastaq/tests/data/sequences_test_to_fasta_union.out.fa b/fastaq/tests/data/sequences_test_to_fasta_union.out.fa
new file mode 100644
index 0000000..2f55fb2
--- /dev/null
+++ b/fastaq/tests/data/sequences_test_to_fasta_union.out.fa
@@ -0,0 +1,2 @@
+>testname
+AAAGGGTTT
diff --git a/fastaq/tests/data/tasks_test_expend_nucleotides.in.fa b/fastaq/tests/data/tasks_test_expend_nucleotides.in.fa
new file mode 100644
index 0000000..55d2ad3
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_expend_nucleotides.in.fa
@@ -0,0 +1,4 @@
+>1
+A
+>2
+ART
diff --git a/fastaq/tests/data/tasks_test_expend_nucleotides.in.fq b/fastaq/tests/data/tasks_test_expend_nucleotides.in.fq
new file mode 100644
index 0000000..4aaf860
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_expend_nucleotides.in.fq
@@ -0,0 +1,8 @@
+ at 1
+A
++
+I
+ at 2
+ART
++
+HIG
diff --git a/fastaq/tests/data/tasks_test_expend_nucleotides.out.fa b/fastaq/tests/data/tasks_test_expend_nucleotides.out.fa
new file mode 100644
index 0000000..9d7b593
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_expend_nucleotides.out.fa
@@ -0,0 +1,6 @@
+>1
+A
+>2.1
+AAT
+>2.2
+AGT
diff --git a/fastaq/tests/data/tasks_test_expend_nucleotides.out.fq b/fastaq/tests/data/tasks_test_expend_nucleotides.out.fq
new file mode 100644
index 0000000..97090c1
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_expend_nucleotides.out.fq
@@ -0,0 +1,12 @@
+ at 1
+A
++
+I
+ at 2.1
+AAT
++
+HIG
+ at 2.2
+AGT
++
+HIG
diff --git a/fastaq/tests/data/tasks_test_fasta_to_fake_qual.in.fa b/fastaq/tests/data/tasks_test_fasta_to_fake_qual.in.fa
new file mode 100644
index 0000000..56c943a
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_fasta_to_fake_qual.in.fa
@@ -0,0 +1,5 @@
+>1
+CATTCATGATGCAGTCATCGATGCATGATGCATGCATGCATGCTGCAGTCAGTCATGCAA
+TC
+>2
+CATGT
diff --git a/fastaq/tests/data/tasks_test_fasta_to_fake_qual.out.default.qual b/fastaq/tests/data/tasks_test_fasta_to_fake_qual.out.default.qual
new file mode 100644
index 0000000..0420243
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_fasta_to_fake_qual.out.default.qual
@@ -0,0 +1,5 @@
+>1
+40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
+40 40
+>2
+40 40 40 40 40
diff --git a/fastaq/tests/data/tasks_test_fasta_to_fake_qual.out.q42.qual b/fastaq/tests/data/tasks_test_fasta_to_fake_qual.out.q42.qual
new file mode 100644
index 0000000..196a1d8
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_fasta_to_fake_qual.out.q42.qual
@@ -0,0 +1,5 @@
+>1
+42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42 42
+42 42
+>2
+42 42 42 42 42
diff --git a/fastaq/tests/data/tasks_test_make_long_reads.input.fa b/fastaq/tests/data/tasks_test_make_long_reads.input.fa
new file mode 100644
index 0000000..e4862bc
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_make_long_reads.input.fa
@@ -0,0 +1,2 @@
+>1
+ACGCTCTCGAGCGCGAGCGCGAGCGAC
diff --git a/fastaq/tests/data/tasks_test_make_long_reads.output.fa b/fastaq/tests/data/tasks_test_make_long_reads.output.fa
new file mode 100644
index 0000000..09ab3b0
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_make_long_reads.output.fa
@@ -0,0 +1,10 @@
+>1_1_10
+ACGCTCTCGA
+>1_6_15
+CTCGAGCGCG
+>1_11_20
+GCGCGAGCGC
+>1_16_25
+AGCGCGAGCG
+>1_21_27
+GAGCGAC
diff --git a/fastaq/tests/data/tasks_test_sequence_trim_1.fa b/fastaq/tests/data/tasks_test_sequence_trim_1.fa
new file mode 100644
index 0000000..28f665b
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_sequence_trim_1.fa
@@ -0,0 +1,12 @@
+>1/1
+TRIM1GCTCGAGCT
+>2/1
+TRIM1AGCTAGCTAG
+>3/1
+CGCTAGCTAG
+>4/1
+TRIM2AGCTAGCTAG
+>5/1
+AGCTAGCTAG
+>6/1
+TRIM4AGCTAGCTAG
diff --git a/fastaq/tests/data/tasks_test_sequence_trim_1.trimmed.fa b/fastaq/tests/data/tasks_test_sequence_trim_1.trimmed.fa
new file mode 100644
index 0000000..0bebad8
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_sequence_trim_1.trimmed.fa
@@ -0,0 +1,8 @@
+>3/1
+CGCTAGCTAG
+>4/1
+AGCTAGCTAG
+>5/1
+AGCTAGCTAG
+>6/1
+AGCTAGCTAG
diff --git a/fastaq/tests/data/tasks_test_sequence_trim_2.fa b/fastaq/tests/data/tasks_test_sequence_trim_2.fa
new file mode 100644
index 0000000..7514250
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_sequence_trim_2.fa
@@ -0,0 +1,12 @@
+>1/2
+TRIM1ACGTACGTAC
+>2/2
+TRIM2ACGTAGTGA
+>3/2
+ACGCTGCAGTCAGTCAGTAT
+>4/2
+TRIM3CGATCGATCG
+>5/2
+TRIM3CGATCGATCG
+>6/2
+CGATCGATCG
diff --git a/fastaq/tests/data/tasks_test_sequence_trim_2.trimmed.fa b/fastaq/tests/data/tasks_test_sequence_trim_2.trimmed.fa
new file mode 100644
index 0000000..ec80f40
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_sequence_trim_2.trimmed.fa
@@ -0,0 +1,8 @@
+>3/2
+ACGCTGCAGTCAGTCAGTAT
+>4/2
+CGATCGATCG
+>5/2
+CGATCGATCG
+>6/2
+CGATCGATCG
diff --git a/fastaq/tests/data/tasks_test_sequences_to_trim.fa b/fastaq/tests/data/tasks_test_sequences_to_trim.fa
new file mode 100644
index 0000000..395eaaa
--- /dev/null
+++ b/fastaq/tests/data/tasks_test_sequences_to_trim.fa
@@ -0,0 +1,8 @@
+>1
+TRIM1
+>2
+TRIM2
+>3
+TRIM3
+>4
+TRIM4
diff --git a/fastaq/tests/sequences_test.py b/fastaq/tests/sequences_test.py
index 4bafb66..fad6098 100644
--- a/fastaq/tests/sequences_test.py
+++ b/fastaq/tests/sequences_test.py
@@ -4,7 +4,7 @@ import sys
 import filecmp
 import os
 import unittest
-from fastaq import sequences, utils, intervals
+from fastaq import sequences, utils, intervals, tasks
 
 modules_dir = os.path.dirname(os.path.abspath(sequences.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
@@ -160,6 +160,70 @@ class TestFasta(unittest.TestCase):
             gaps = test_seqs[i].contig_coords()
             self.assertListEqual(correct_coords[i], gaps)
 
+
+
+
+    def test_orfs(self):
+        '''Test orfs()'''
+        test_seqs = [(sequences.Fasta('ID', 'AAACCCGG'), 0, False, [intervals.Interval(0,5)]),
+                     (sequences.Fasta('ID', 'AAAACCCGG'), 1, False, [intervals.Interval(1,6)]),
+                     (sequences.Fasta('ID', 'AAAAACCCGG'), 2, False, [intervals.Interval(2,7)]),
+                     (sequences.Fasta('ID', 'CCGGGTTT'), 0, True, [intervals.Interval(2,7)]),
+                     (sequences.Fasta('ID', 'CCGGGTTTT'), 1, True, [intervals.Interval(2,7)]),
+                     (sequences.Fasta('ID', 'CCGGGTTTTT'), 2, True, [intervals.Interval(2,7)]),
+                     (sequences.Fasta('ID', 'AAACCCTGA'), 0, False, [intervals.Interval(0,8)]),
+                     (sequences.Fasta('ID', 'AAACCCTGATAG'), 0, False, [intervals.Interval(0,8)]),
+                     (sequences.Fasta('ID', 'AAACCCTGA'), 1, False, [intervals.Interval(1,6)]),
+                     (sequences.Fasta('ID', ''), 0, False, []),
+                     (sequences.Fasta('ID', 'A'), 0, False, []),
+                     (sequences.Fasta('ID', 'AA'), 0, False, []),
+                     (sequences.Fasta('ID', 'AAA'), 0, False, [intervals.Interval(0,2)]),
+                     (sequences.Fasta('ID', 'AAAAAA'), 0, False, [intervals.Interval(0,5)]),
+                     (sequences.Fasta('ID', 'AAA'), 1, False, []),
+                     (sequences.Fasta('ID', 'AAA'), 2, False, []),
+                     (sequences.Fasta('ID', 'AAA'), 0, True, [intervals.Interval(0,2)]),
+                     (sequences.Fasta('ID', 'AAA'), 1, True, []),
+                     (sequences.Fasta('ID', 'AAA'), 2, True, []),
+                     (sequences.Fasta('ID', 'TAA'), 0, False, []),
+                     (sequences.Fasta('ID', 'CTA'), 0, True, [])]
+
+
+        for t in test_seqs:
+            orfs = t[0].orfs(frame=t[1], revcomp=t[2])
+            self.assertListEqual(orfs, t[3])
+
+    def test_all_orfs(self):
+        '''Test all_orfs()'''
+        d = {}
+        tasks.file_to_dict(os.path.join(data_dir, 'sequences_test_orfs.fa'), d)
+        seq = d['1']
+        orfs = seq.all_orfs(min_length=120)
+        expected = [
+            (intervals.Interval(27, 221), False),
+            (intervals.Interval(44, 226), False),
+            (intervals.Interval(48, 170), True),
+            (intervals.Interval(109, 240), False),
+            (intervals.Interval(143, 265), True),
+            (intervals.Interval(227, 421), False),
+            (intervals.Interval(277, 432), True),
+            (intervals.Interval(286, 477), False),
+            (intervals.Interval(288, 518), True),
+            (intervals.Interval(562, 702), False),
+            (intervals.Interval(600, 758), False),
+            (intervals.Interval(605, 817), False),
+            (intervals.Interval(818, 937), False),
+            (intervals.Interval(835, 987), False),
+            (intervals.Interval(864, 998), False)
+        ]
+
+        self.assertEqual(len(orfs), len(expected))
+
+        for i in range(len(orfs)):
+            print(orfs[i][0], expected[i][0])
+            self.assertEqual(orfs[i][0], expected[i][0])
+            self.assertEqual(orfs[i][1], expected[i][1])
+
+
     def test_is_all_Ns(self):
         '''Test is_all_Ns()'''
         self.assertTrue(sequences.Fasta('ID', 'n').is_all_Ns())
@@ -195,6 +259,12 @@ class TestFasta(unittest.TestCase):
             s.trim_Ns()
             self.assertEqual(fa, s)
 
+    def test_add_insertions(self):
+        '''Test add_insertions'''
+        fa = sequences.Fasta('X', 'acgtacgtacgt')
+        fa.add_insertions(skip=4, window=0, test=True)
+        self.assertEqual(fa, sequences.Fasta('X', 'acgtNacgtNacgt'))
+
     def test_replace_bases(self):
         '''Check that bases get replaced correctly'''
         fa = sequences.Fasta('X', 'AUCGTUUACT')
@@ -267,6 +337,32 @@ class TestFasta(unittest.TestCase):
         print(fa.translate(frame=1))
         self.assertEqual(sequences.Fasta('ID', 'SRG*KATPA*Q*RLL*RATGRRGSPYNHFIATPA*KDVLSTPA*QFILVYNHDLVLCSRGLIV'), fa.translate(frame=2))
 
+    def test_expand_nucleotides(self):
+        '''Test expand_nucleotides'''
+        tests = [
+            (sequences.Fasta('1', 'A'), [sequences.Fasta('1.1', 'A')]),
+            (sequences.Fasta('2', 'C'), [sequences.Fasta('2.1', 'C')]),
+            (sequences.Fasta('3', 'G'), [sequences.Fasta('3.1', 'G')]),
+            (sequences.Fasta('4', 'T'), [sequences.Fasta('4.1', 'T')]),
+            (sequences.Fasta('6', 'R'), [sequences.Fasta('6.1', 'A'), sequences.Fasta('6.2', 'G')]),
+            (sequences.Fasta('7', 'Y'), [sequences.Fasta('7.1', 'C'), sequences.Fasta('7.2', 'T')]),
+            (sequences.Fasta('8', 'S'), [sequences.Fasta('8.1', 'C'), sequences.Fasta('8.2', 'G')]),
+            (sequences.Fasta('9', 'W'), [sequences.Fasta('9.1', 'A'), sequences.Fasta('9.2', 'T')]),
+            (sequences.Fasta('10', 'K'), [sequences.Fasta('10.1', 'G'), sequences.Fasta('10.2', 'T')]),
+            (sequences.Fasta('11', 'M'), [sequences.Fasta('11.1', 'A'), sequences.Fasta('11.2', 'C')]),
+            (sequences.Fasta('12', 'B'), [sequences.Fasta('12.1', 'C'), sequences.Fasta('12.2', 'G'), sequences.Fasta('12.3', 'T')]),
+            (sequences.Fasta('13', 'D'), [sequences.Fasta('13.1', 'A'), sequences.Fasta('13.2', 'G'), sequences.Fasta('13.3', 'T')]),
+            (sequences.Fasta('14', 'H'), [sequences.Fasta('14.1', 'A'), sequences.Fasta('14.2', 'C'), sequences.Fasta('14.3', 'T')]),
+            (sequences.Fasta('15', 'V'), [sequences.Fasta('15.1', 'A'), sequences.Fasta('15.2', 'C'), sequences.Fasta('15.3', 'G')]),
+            (sequences.Fasta('16', 'N'), [sequences.Fasta('16.1', 'A'), sequences.Fasta('16.2', 'C'), sequences.Fasta('16.3', 'G'), sequences.Fasta('16.4', 'T')]),
+            (sequences.Fasta('17', 'ART'), [sequences.Fasta('17.1', 'AAT'), sequences.Fasta('17.2', 'AGT')]),
+            (sequences.Fasta('18', 'ARRT'), [sequences.Fasta('18.1', 'AAAT'), sequences.Fasta('18.2', 'AAGT'), sequences.Fasta('18.3', 'AGAT'), sequences.Fasta('18.4', 'AGGT')]),
+            (sequences.Fasta('19', 'ARTR'), [sequences.Fasta('19.1', 'AATA'), sequences.Fasta('19.2', 'AATG'), sequences.Fasta('19.3', 'AGTA'), sequences.Fasta('19.4', 'AGTG')]),
+            (sequences.Fastq('20', 'ART', 'GHI'), [sequences.Fastq('20.1', 'AAT', 'GHI'), sequences.Fastq('20.2', 'AGT', 'GHI')]),
+        ]
+
+        for t in tests:
+            self.assertListEqual(t[0].expand_nucleotides(), t[1])
 
     def test_split_capillary_id(self):
         '''Tests that we get information from a sanger capillary read name OK'''
@@ -323,7 +419,7 @@ class TestEmbl(unittest.TestCase):
             counter += 1
 
         utils.close(f_in)
- 
+
 
 class TestFastq(unittest.TestCase):
     def setUp(self):
@@ -458,7 +554,7 @@ class TestFileReader(unittest.TestCase):
             for seq in reader:
                 self.assertEqual(seq, sequences.Fasta('seq' + str(counter), 'ACGTACGTAC'))
                 counter += 1
-        
+
         bad_files = [
             'sequences_test_gffv3.no_seq.gff',
             'sequences_test_gffv3.no_seq.2.gff'
@@ -479,7 +575,7 @@ class TestFileReader(unittest.TestCase):
         for seq in reader:
             self.assertEqual(seq, sequences.Fasta('seq' + str(counter), expected_embl[counter-1]))
             counter += 1
-        
+
         bad_files = [
             'sequences_test.embl.bad',
             'sequences_test.embl.bad2',
@@ -514,14 +610,14 @@ class TestFileReader(unittest.TestCase):
             for seq in reader:
                 self.assertEqual(expected_seqs[i], seq)
                 i += 1
-        
+
         # files made by seaview are a little different in the first line.
         # Test one of these
         expected_seqs = [
             sequences.Fasta('seq1', 96 * 'G' + 'T'),
             sequences.Fasta('seq2', 94 * 'A' + 'G')
         ]
-        
+
         reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_phylip.made_by_seaview'))
         i = 0
         for seq in reader:
@@ -530,6 +626,31 @@ class TestFileReader(unittest.TestCase):
             i += 1
 
 
+class TestOther(unittest.TestCase):
+    def test_orfs_from_aa_seq(self):
+        '''Test _orfs_from_aa_seq()'''
+        test_seqs = ['',
+                     '*',
+                     '**',
+                     'A',
+                     'A*A*A',
+                     'AB**CDE*AB',
+                     '*ABCDE*',
+                     '**ABCDE**']
+
+        correct_coords = [[],
+                          [],
+                          [],
+                          [intervals.Interval(0, 0)],
+                          [intervals.Interval(0, 1), intervals.Interval(2, 3),intervals.Interval(4, 4)],
+                          [intervals.Interval(0, 2), intervals.Interval(4, 7), intervals.Interval(8, 9)],
+                          [intervals.Interval(1, 6)],
+                          [intervals.Interval(2, 7)]]
+
+        for i in range(len(test_seqs)):
+            orfs = sequences._orfs_from_aa_seq(test_seqs[i])
+            self.assertListEqual(correct_coords[i], orfs)
+
+
 if __name__ == '__main__':
     unittest.main()
-
diff --git a/fastaq/tests/tasks_test.py b/fastaq/tests/tasks_test.py
index 084425e..36ebfba 100644
--- a/fastaq/tests/tasks_test.py
+++ b/fastaq/tests/tasks_test.py
@@ -70,6 +70,22 @@ class TestEnumerateNames(unittest.TestCase):
         os.unlink(rename_out)
 
 
+class TestExpandNucleotides(unittest.TestCase):
+    def test_expand_nucleoties(self):
+        '''Test expand_nucleotides'''
+        tmp = 'tmp.expanded'
+        fq_in = os.path.join(data_dir, 'tasks_test_expend_nucleotides.in.fq')
+        fa_in = os.path.join(data_dir, 'tasks_test_expend_nucleotides.in.fa')
+        fq_expected = os.path.join(data_dir, 'tasks_test_expend_nucleotides.out.fq')
+        fa_expected = os.path.join(data_dir, 'tasks_test_expend_nucleotides.out.fa')
+        tasks.expand_nucleotides(fq_in, tmp)
+        self.assertTrue(filecmp.cmp(fq_expected, tmp, shallow=False))
+        os.unlink(tmp)
+        tasks.expand_nucleotides(fa_in, tmp)
+        self.assertTrue(filecmp.cmp(fa_expected, tmp, shallow=False))
+        os.unlink(tmp)
+
+
 class TestExtendGaps(unittest.TestCase):
     def test_extend_gaps(self):
         '''Test that gap extension works'''
@@ -88,6 +104,15 @@ class TestFastqToMiraXml(unittest.TestCase):
         os.unlink(tmp)
 
 
+class TestFastaqToOrfsGFF(unittest.TestCase):
+    def test_fastaq_to_orfs_gff(self):
+        '''Test fastaq_to_orfs_gff'''
+        outfile = 'tmp.orfs.gff'
+        tasks.fastaq_to_orfs_gff(os.path.join(data_dir, 'sequences_test_orfs.fa'), outfile, min_length=120)
+        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_orfs.gff'), outfile, shallow=False))
+        os.unlink(outfile)
+
+
 class TestFilter(unittest.TestCase):
     def test_length_filter(self):
         '''Check that filtering by length works as expected'''
@@ -198,6 +223,30 @@ class TestMakeRandomContigs(unittest.TestCase):
         os.unlink(tmp)
 
 
+class TestMakeLongReads(unittest.TestCase):
+    def test_tiling_reads(self):
+        tmp = 'tmp.out.fa'
+        fa_in = os.path.join(data_dir, 'tasks_test_make_long_reads.input.fa')
+        tasks.make_long_reads(fa_in, tmp, method='tiling', fixed_read_length=10, tile_step=5)
+        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_make_long_reads.output.fa'), tmp, shallow=False))
+        os.unlink(tmp)
+
+
+class TestMergeToOneSeq(unittest.TestCase):
+    def test_merge_to_one_seq_fa(self):
+        '''Test merge_to_one_seq with fasta'''
+        tmp = 'tmp.merged.fa'
+        tasks.merge_to_one_seq(os.path.join(data_dir, 'sequences_test_merge_to_one_seq.fa'), tmp)
+        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_merge_to_one_seq.merged.fa'), tmp, shallow=False))
+        os.unlink(tmp)
+
+    def test_merge_to_one_seq_fq(self):
+        '''Test merge_to_one_seq with fastq'''
+        tmp = 'tmp.merged.fq'
+        tasks.merge_to_one_seq(os.path.join(data_dir, 'sequences_test_merge_to_one_seq.fq'), tmp)
+        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_merge_to_one_seq.merged.fq'), tmp, shallow=False))
+        os.unlink(tmp)
+
 class TestReverseComplement(unittest.TestCase):
     def test_reverse_complement(self):
         '''reverse_complement should correctly reverse complement each seq in a file'''
@@ -232,6 +281,23 @@ class TestSearchForSeq(unittest.TestCase):
         os.unlink(tmp)
 
 
+class TestSequenceTrim(unittest.TestCase):
+    def test_sequence_trim(self):
+        '''Test sequence_trim'''
+        tmp1 = 'tmp.trimmed_1.fa'
+        tmp2 = 'tmp.trimmed_2.fa'
+        in1 = os.path.join(data_dir, 'tasks_test_sequence_trim_1.fa')
+        in2 = os.path.join(data_dir, 'tasks_test_sequence_trim_2.fa')
+        to_trim = os.path.join(data_dir, 'tasks_test_sequences_to_trim.fa')
+        expected1 = os.path.join(data_dir, 'tasks_test_sequence_trim_1.trimmed.fa')
+        expected2 = os.path.join(data_dir, 'tasks_test_sequence_trim_2.trimmed.fa')
+        tasks.sequence_trim(in1, in2, tmp1, tmp2, to_trim, min_length=10)
+        self.assertTrue(filecmp.cmp(expected1, tmp1))
+        self.assertTrue(filecmp.cmp(expected2, tmp2))
+        os.unlink(tmp1)
+        os.unlink(tmp2)
+
+
 class TestTranslate(unittest.TestCase):
     def test_translate(self):
         '''Test translate works in each frame'''
@@ -323,13 +389,13 @@ class TestSplit(unittest.TestCase):
         infile = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa')
         outprefix = 'tmp.sequences_test_split'
         tasks.split_by_fixed_size(infile, outprefix, 4, 1)
-  
+
         for i in range(1,7,1):
             correct = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.' + str(i))
             test = outprefix + '.' + str(i)
             self.assertTrue(filecmp.cmp(test, correct))
             os.unlink(test)
- 
+
         test_coords = outprefix + '.coords'
         self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.coords'), test_coords))
         os.unlink(test_coords)
@@ -338,13 +404,13 @@ class TestSplit(unittest.TestCase):
         infile = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa')
         outprefix = 'tmp.sequences_test_split'
         tasks.split_by_fixed_size(infile, outprefix, 4, 1, skip_if_all_Ns=True)
-  
+
         for i in range(1,5,1):
             correct = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.' + str(i))
             test = outprefix + '.' + str(i)
             self.assertTrue(filecmp.cmp(test, correct))
             os.unlink(test)
- 
+
         test_coords = outprefix + '.coords'
         self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.coords'), test_coords))
         os.unlink(test_coords)
@@ -364,6 +430,20 @@ class TestGetIds(unittest.TestCase):
         self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test.fa.ids'), tmpfile))
         os.unlink(tmpfile)
 
+
+class TestFastaToFakeQual(unittest.TestCase):
+    def test_fasta_to_fake_qual(self):
+        '''Test fasta_to_fake_qual'''
+        tmpfile = 'tmp.qual'
+        infile = os.path.join(data_dir, 'tasks_test_fasta_to_fake_qual.in.fa')
+        tasks.fastaq_to_fake_qual(infile, tmpfile)
+        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_fasta_to_fake_qual.out.default.qual'), tmpfile, shallow=False))
+        os.unlink(tmpfile)
+        tasks.fastaq_to_fake_qual(infile, tmpfile, q=42)
+        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'tasks_test_fasta_to_fake_qual.out.q42.qual'), tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
 class TestFastaToFastq(unittest.TestCase):
     def test_fasta_to_fastq(self):
         '''Check fasta_to_fastq converts files as expected'''
@@ -396,7 +476,7 @@ class TestStripIlluminaSuffix(unittest.TestCase):
         tasks.strip_illumina_suffix(os.path.join(data_dir, 'sequences_test_strip_illumina_suffix.fq'), tmpfile)
         self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_strip_illumina_suffix.fq.stripped'), tmpfile))
         os.unlink(tmpfile)
-      
+
 
 class TestToQuasrPrimers(unittest.TestCase):
     def test_to_quasr_primers(self):
@@ -444,6 +524,15 @@ class TestToUniqueByID(unittest.TestCase):
         os.unlink(tmpfile)
 
 
+class TestToFastaUnion(unittest.TestCase):
+    def test_to_fasta_union(self):
+        '''Test to_fasta_union'''
+        tmpfile = 'tmp.to_fasta_union'
+        tasks.to_fasta_union(os.path.join(data_dir, 'sequences_test_to_fasta_union.in.fa'), tmpfile, seqname='testname')
+        self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_to_fasta_union.out.fa'), tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
 if __name__ == '__main__':
     unittest.main()
 
diff --git a/scripts/fastaq_expand_nucleotides b/scripts/fastaq_expand_nucleotides
new file mode 100755
index 0000000..2dbde36
--- /dev/null
+++ b/scripts/fastaq_expand_nucleotides
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+
+import argparse
+from fastaq import tasks
+
+parser = argparse.ArgumentParser(
+    description = 'Makes all combinations of sequences in input file by using all possibilities of redundant bases. e.g. ART could be AAT or AGT. Assumes input is nucleotides, not amino acids',
+    usage = '%(prog)s <infile> <outfile>')
+parser.add_argument('infile', help='Name of input file. Can be any of FASTA, FASTQ, GFF3, EMBL, GBK, Phylip')
+parser.add_argument('outfile', help='Name of output file')
+options = parser.parse_args()
+tasks.expand_nucleotides(
+    options.infile,
+    options.outfile,
+)
diff --git a/scripts/fastaq_long_read_simulate b/scripts/fastaq_long_read_simulate
new file mode 100755
index 0000000..23106f3
--- /dev/null
+++ b/scripts/fastaq_long_read_simulate
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+import argparse
+from fastaq import tasks
+
+parser = argparse.ArgumentParser(
+    description = 'Simulates long reads from a fasta/q file. Can optionally make insertions into the reads, like pacbio does. If insertions made, coverage calculation is done before the insertions (so total read length may appear longer then expected).',
+    usage = '%(prog)s [options] <infile> <outfile>')
+
+parser.add_argument('infile', help='Name of input fasta/q file')
+parser.add_argument('outfile', help='Name of output fasta file')
+
+parser.add_argument('--method', help='How to sample the read positions and lengths. Choose from 1) "tiling", where reads of fixed length are taken at equal intervals from the reference. 2) "unfiform", where reads of fixed length taken at positions sampled uniformly. 3) "gamma", where reads lengths are taken from a gamma distribution, and positions sampled uniformly. [%(default)s]', default='tiling', choices=['tiling', 'uniform', 'gamma'], metavar='tiling|uniform|gamma')
+parser.add_argument('--seed', type=int, help='Seed for random number generator [default: use python\'s default]', metavar='INT')
+parser.add_argument('--qual', help='Write a file of fake quality scores called outfile.qual, all bases same quality [%(default)s]', metavar='INT')
+parser.add_argument('--fixed_read_length', type=int, help='Length of each read. Only applies if method is tile or uniform. [%(default)s]', default=20000, metavar='INT')
+parser.add_argument('--coverage', type=float, help='Read coverage. Only applies if method is gamma or uniform. [%(default)s]', default=2, metavar='FLOAT')
+
+
+tiling_group = parser.add_argument_group('tiling options')
+tiling_group.add_argument('--tile_step', type=int, help='Distance between start of each read [%(default)s]', default=10000, metavar='INT')
+
+gamma_group = parser.add_argument_group('gamma options')
+gamma_group.add_argument('--gamma_shape', type=float, help='Shape parameter of gamma distribution [%(default)s]', default=1.2, metavar='FLOAT')
+gamma_group.add_argument('--gamma_scale', type=float, help='Scale parameter of gamma distribution [%(default)s]', default=6000, metavar='FLOAT')
+gamma_group.add_argument('--gamma_min_length', type=int, help='Minimum read length [%(default)s]', default=20000, metavar='INT')
+
+ins_group = parser.add_argument_group('options to add insertions to reads')
+ins_group.add_argument('--ins_skip', type=int, help='Insert a random base every --skip bases plus or minus --ins_window. If this option is used, must also use --ins_window.', metavar='INT')
+ins_group.add_argument('--ins_window', type=int, help='See --ins_skip. If this option is used, must also use --ins_skip.', metavar='INT')
+
+
+options = parser.parse_args()
+tasks.make_long_reads(
+    options.infile,
+    options.outfile,
+    method=options.method,
+    fixed_read_length=options.fixed_read_length,
+    coverage=options.coverage,
+    tile_step=options.tile_step,
+    gamma_shape=options.gamma_shape,
+    gamma_scale=options.gamma_scale,
+    gamma_min_length=options.gamma_min_length,
+    seed=options.seed,
+    ins_skip=options.ins_skip,
+    ins_window=options.ins_window
+)
+
+if options.qual:
+    tasks.fastaq_to_fake_qual(options.outfile, options.outfile + '.qual', q=options.qual)
diff --git a/scripts/fastaq_merge b/scripts/fastaq_merge
new file mode 100755
index 0000000..d919323
--- /dev/null
+++ b/scripts/fastaq_merge
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+import argparse
+from fastaq import tasks
+
+parser = argparse.ArgumentParser(
+    description = 'Converts multi fasta/q file to single sequence file, preserving original order of sequences',
+    usage = '%(prog)s <infile> <outfile>')
+parser.add_argument('infile', help='Name of input file. Can be any of FASTA, FASTQ, GFF3, EMBL, GBK, Phylip')
+parser.add_argument('outfile', help='Name of output file')
+parser.add_argument('-n', '--name', help='Name of sequence in output file [%(default)s]', default='union')
+options = parser.parse_args()
+tasks.merge_to_one_seq(
+    options.infile,
+    options.outfile,
+    seqname=options.name
+)
+
diff --git a/scripts/fastaq_sequence_trim b/scripts/fastaq_sequence_trim
new file mode 100755
index 0000000..50a4f34
--- /dev/null
+++ b/scripts/fastaq_sequence_trim
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+
+import argparse
+from fastaq import tasks
+
+parser = argparse.ArgumentParser(
+    description = 'Trims sequences off the start of all sequences in a pair of fasta/q files, whenever there is a perfect match. Only keeps a read pair if both reads of the pair are at least a minimum length after any trimming',
+    usage = '%(prog)s [options] <fasta/q 1 in> <fastaq/2 in> <out 1> <out 2> <trim_seqs>')
+parser.add_argument('--min_length', type=int, help='Minimum length of output sequences [%(default)s]', default=50, metavar='INT')
+parser.add_argument('infile_1', help='Name of forward fasta/q file to be trimmed', metavar='fasta/q 1 in')
+parser.add_argument('infile_2', help='Name of reverse fasta/q file to be trimmed', metavar='fasta/q 2 in')
+parser.add_argument('outfile_1', help='Name of output forward fasta/q file', metavar='out_1')
+parser.add_argument('outfile_2', help='Name of output reverse fasta/q file', metavar='out_2')
+parser.add_argument('trim_seqs', help='Name of fasta/q file of sequences to search for at the start of each input sequence', metavar='trim_seqs')
+options = parser.parse_args()
+tasks.sequence_trim(
+    options.infile_1,
+    options.infile_2,
+    options.outfile_1,
+    options.outfile_2,
+    options.trim_seqs,
+    min_length=options.min_length
+)
diff --git a/scripts/fastaq_to_fake_qual b/scripts/fastaq_to_fake_qual
new file mode 100755
index 0000000..272f7a3
--- /dev/null
+++ b/scripts/fastaq_to_fake_qual
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+import argparse
+from fastaq import tasks
+
+parser = argparse.ArgumentParser(
+    description = 'Makes fake quality scores file from a fasta/q file',
+    usage = '%(prog)s <infile> <outfile>')
+parser.add_argument('infile', help='Name of input file')
+parser.add_argument('outfile', help='Name of output file')
+parser.add_argument('-q', '--qual', type=int, help='Quality score to assign to all bases [%(default)s]', default=40)
+options = parser.parse_args()
+tasks.fastaq_to_fake_qual(
+    options.infile,
+    options.outfile,
+    q=options.qual
+)
+
diff --git a/scripts/fastaq_to_orfs_gff b/scripts/fastaq_to_orfs_gff
new file mode 100755
index 0000000..0098023
--- /dev/null
+++ b/scripts/fastaq_to_orfs_gff
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import argparse
+from fastaq import tasks
+
+parser = argparse.ArgumentParser(
+    description = 'Writes a GFF file of open reading frames from a fasta/q file',
+    usage = '%(prog)s [options] <fasta/q in> <gff_out>')
+parser.add_argument('--min_length', type=int, help='Minimum length of ORF, in nucleotides [%(default)s]', default=300, metavar='INT')
+parser.add_argument('infile', help='Name of input fasta/q file')
+parser.add_argument('gff_out', help='Name of output gff file')
+options = parser.parse_args()
+tasks.fastaq_to_orfs_gff(options.infile, options.gff_out, min_length=options.min_length)
diff --git a/setup.py b/setup.py
index 2fe7a90..3064862 100644
--- a/setup.py
+++ b/setup.py
@@ -7,13 +7,13 @@ def read(fname):
 
 setup(
     name='Fastaq',
-    version='0.1',
+    version='1.5.0',
     description='Scripts to manipulate FASTA and FASTQ files, plus API for developers',
     long_description=read('README.md'),
     packages = find_packages(),
     author='Martin Hunt',
     author_email='mh12 at sanger.ac.uk',
-    url='https://github.com/martinghunt/Fastaq',
+    url='https://github.com/sanger-pathogens/Fastaq',
     scripts=glob.glob('scripts/*'),
     test_suite='nose.collector',
     install_requires=['nose >= 1.3'],

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fastaq.git



More information about the debian-med-commit mailing list