[med-svn] [fastaq] 01/05: Imported Upstream version 3.6.1

Fri Aug 21 22:32:22 UTC 2015

This is an automated email from the git hooks/post-receive script.

sascha-guest pushed a commit to branch master
in repository fastaq.

commit 08dda22ae0c1c9ee5e2d9ebcc49c01252ae66389
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date:   Fri Aug 21 08:55:00 2015 +0000

    Imported Upstream version 3.6.1
---
 pyfastaq/common.py                                 |  2 +-
 pyfastaq/genetic_codes.py                          | 31 +++++++++++
 pyfastaq/intervals.py                              |  7 +++
 pyfastaq/runners/to_fasta.py                       |  4 +-
 pyfastaq/runners/to_orfs_gff.py                    |  2 +-
 pyfastaq/sequences.py                              | 24 +++++++--
 pyfastaq/tasks.py                                  | 60 +++++++++++++++++++++-
 ...o_fasta.strip_after_whitespace_non_unique.in.fa |  6 +++
 ..._fasta.strip_after_whitespace_non_unique.out.fa |  6 +++
 ...st.to_fasta.strip_after_whitespace_unique.in.fa |  6 +++
 ...t.to_fasta.strip_after_whitespace_unique.out.fa |  6 +++
 pyfastaq/tests/data/tasks_test_to_fastg.fasta      |  4 ++
 pyfastaq/tests/data/tasks_test_to_fastg.fastg      |  8 +++
 .../data/tasks_test_to_fastg.ids_to_circularise    |  1 +
 pyfastaq/tests/intervals_test.py                   | 10 ++++
 pyfastaq/tests/sequences_test.py                   | 30 +++++++++++
 pyfastaq/tests/tasks_test.py                       | 45 ++++++++++++++++
 setup.py                                           | 13 ++---
 18 files changed, 248 insertions(+), 17 deletions(-)

diff --git a/pyfastaq/common.py b/pyfastaq/common.py
index 42b6ccb..f157366 100644
--- a/pyfastaq/common.py
+++ b/pyfastaq/common.py
@@ -1 +1 @@
-version = '3.2.0'
+version = '3.6.1'
diff --git a/pyfastaq/genetic_codes.py b/pyfastaq/genetic_codes.py
index c32c065..81a2279 100644
--- a/pyfastaq/genetic_codes.py
+++ b/pyfastaq/genetic_codes.py
@@ -1,4 +1,6 @@
+# see http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
 codes = {}
+starts = {}
 
 #standard genetic code
 codes[1] = {
@@ -68,6 +70,11 @@ codes[1] = {
     'GGG': 'G',
 }
 
+starts[1] = set([
+    'TTG',
+    'CTG',
+    'ATG',
+])
 
 #mycoplasma genetic code
 codes[4] = {
@@ -137,3 +144,27 @@ codes[4] = {
     'GGG': 'G'
 } 
 
+starts[4] = set([
+    'TTA',
+    'TTG',
+    'CTG',
+    'ATT',
+    'ATC',
+    'ATA',
+    'ATG',
+    'GTG',
+])
+
+
+# Bacterial, Archaeal and Plant Plastid Code
+codes[11] = codes[1]
+
+starts[11] = set([
+    'TTG',
+    'CTG',
+    'ATT',
+    'ATC',
+    'ATA',
+    'ATG',
+    'GTG',
+])
diff --git a/pyfastaq/intervals.py b/pyfastaq/intervals.py
index b320c63..ef344ed 100644
--- a/pyfastaq/intervals.py
+++ b/pyfastaq/intervals.py
@@ -31,6 +31,13 @@ class Interval:
     def __le__(self, i):
         return self.start < i.start or (self.start == i.start and self.end <= i.end)
 
+    def distance_to_point(self, p):
+        '''Returns the distance from the point to the interval. Zero if the point lies inside the interval.'''
+        if self.start <= p <= self.end:
+            return 0
+        else:
+            return min(abs(self.start - p), abs(self.end - p))
+
     def intersects(self, i):
         '''Returns true iff this interval intersects the interval i'''
         return self.start <= i.end and i.start <= self.end
diff --git a/pyfastaq/runners/to_fasta.py b/pyfastaq/runners/to_fasta.py
index 379abc6..7cecf1c 100644
--- a/pyfastaq/runners/to_fasta.py
+++ b/pyfastaq/runners/to_fasta.py
@@ -9,12 +9,14 @@ def run(description):
     parser.add_argument('outfile', help='Name of output file')
     parser.add_argument('-l', '--line_length', type=int, help='Number of bases on each sequence line of output file. Set to zero for no linebreaks in sequences [%(default)s]', default=60)
     parser.add_argument('-s', '--strip_after_whitespace', action='store_true', help='Remove everything after first whitespace in every sequence name')
+    parser.add_argument('-u', '--check_unique', action='store_true', help='Die if any of the output sequence names are not unique')
     options = parser.parse_args()
 
     tasks.to_fasta(
         options.infile,
         options.outfile,
         line_length=options.line_length,
-        strip_after_first_whitespace=options.strip_after_whitespace
+        strip_after_first_whitespace=options.strip_after_whitespace,
+        check_unique=options.check_unique
     )
 
diff --git a/pyfastaq/runners/to_orfs_gff.py b/pyfastaq/runners/to_orfs_gff.py
index 039016c..46b6e40 100644
--- a/pyfastaq/runners/to_orfs_gff.py
+++ b/pyfastaq/runners/to_orfs_gff.py
@@ -9,4 +9,4 @@ def run(description):
     parser.add_argument('infile', help='Name of input file')
     parser.add_argument('outfile', help='Name of output GFF file')
     options = parser.parse_args()
-    tasks.fastaq_to_orfs_gff(options.infile, options.gff_out, min_length=options.min_length)
+    tasks.fastaq_to_orfs_gff(options.infile, options.outfile, min_length=options.min_length)
diff --git a/pyfastaq/sequences.py b/pyfastaq/sequences.py
index 4a3c2a1..fdffc60 100644
--- a/pyfastaq/sequences.py
+++ b/pyfastaq/sequences.py
@@ -14,8 +14,7 @@ class Error (Exception): pass
 # of the file, for any given filehandle
 previous_lines = {}
 
-
-codon2aa = genetic_codes.codes[1]
+genetic_code = 1
 
 redundant_nts = {
     'R': ('A', 'G'),
@@ -322,6 +321,25 @@ class Fasta:
 
         return sorted(orfs, key=lambda t:t[0])
 
+
+    def is_complete_orf(self):
+        '''Returns true iff length is >= 6, is a multiple of 3, and there is exactly one stop codon in the sequence and it is at the end'''
+        if len(self) %3 != 0 or len(self) < 6:
+            return False
+
+        orfs = self.orfs()
+        complete_orf = intervals.Interval(0, len(self) - 1)
+        for orf in orfs:
+            if orf == complete_orf:
+                return True
+        return False
+
+
+    def looks_like_gene(self, translation_table=1):
+        '''Returns true iff: length >=6, length is a multiple of 3, first codon is start, last codon is a stop and has no other stop codons'''
+        return self.is_complete_orf() and len(self) >= 6 and len(self) %3 == 0 and self.seq[0:3] in genetic_codes.starts[genetic_code]
+        
+
     # Fills the object with the next sequence in the file. Returns
     # True if this was successful, False if no more sequences in the file.
     # If reading a file of quality scores, set read_quals = True
@@ -409,7 +427,7 @@ class Fasta:
 
     def translate(self, frame=0):
         '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2'''
-        return Fasta(self.id, ''.join([codon2aa.get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)]))
+        return Fasta(self.id, ''.join([genetic_codes.codes[genetic_code].get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)]))
 
 
 class Embl(Fasta):
diff --git a/pyfastaq/tasks.py b/pyfastaq/tasks.py
index 7527910..e77e40e 100644
--- a/pyfastaq/tasks.py
+++ b/pyfastaq/tasks.py
@@ -597,6 +597,47 @@ def sort_by_size(infile, outfile, smallest_first=False):
     utils.close(fout)
 
 
+def to_fastg(infile, outfile, circular=None):
+    '''Writes a FASTG file in SPAdes format from input file. Currently only whether or not a sequence is circular is supported. Put circular=set of ids, or circular=filename to make those sequences circular in the output. Puts coverage=1 on all contigs'''
+    if circular is None:
+        to_circularise = set()
+    elif type(circular) is not set:
+        f = utils.open_file_read(circular)
+        to_circularise = set([x.rstrip() for x in f.readlines()])
+        utils.close(f)
+    else:
+        to_circularise = circular
+
+    seq_reader = sequences.file_reader(infile)
+    fout = utils.open_file_write(outfile)
+    nodes = 1
+
+    for seq in seq_reader:
+        new_id = '_'.join([
+            'NODE', str(nodes),
+            'length', str(len(seq)),
+            'cov', '1',
+            'ID', seq.id
+        ])
+
+        if seq.id in to_circularise:
+            seq.id = new_id + ':' + new_id + ';'
+            print(seq, file=fout)
+            seq.revcomp()
+            seq.id = new_id + "':" + new_id + "';"
+            print(seq, file=fout)
+        else:
+            seq.id = new_id + ';'
+            print(seq, file=fout)
+            seq.revcomp()
+            seq.id = new_id + "';"
+            print(seq, file=fout)
+
+        nodes += 1
+
+    utils.close(fout)
+
+
 def translate(infile, outfile, frame=0):
     seq_reader = sequences.file_reader(infile)
     fout = utils.open_file_write(outfile)
@@ -780,16 +821,21 @@ def strip_illumina_suffix(infile, outfile):
     utils.close(f_out)
 
 
-def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False):
+def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False, check_unique=False):
     seq_reader = sequences.file_reader(infile)
     f_out = utils.open_file_write(outfile)
     original_line_length = sequences.Fasta.line_length
     sequences.Fasta.line_length = line_length
+    if check_unique:
+        used_names = {} 
 
     for seq in seq_reader:
         if strip_after_first_whitespace:
             seq.strip_after_first_whitespace()
 
+        if check_unique:
+            used_names[seq.id] = used_names.get(seq.id, 0) + 1
+
         if type(seq) == sequences.Fastq:
             print(sequences.Fasta(seq.id, seq.seq), file=f_out)
         else:
@@ -798,6 +844,18 @@ def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False
     utils.close(f_out)
     sequences.Fasta.line_length = original_line_length
 
+    if check_unique:
+        all_unique = True
+
+        for name, count in used_names.items():
+            if count > 1:
+                print('Sequence name "' + name + '" not unique. Found', count, 'times', file=sys.stderr)
+                all_unique = False
+
+        if not all_unique:
+            raise Error('Not all sequence names unique. Cannot continue')
+    
+
 
 def to_fasta_union(infile, outfile, seqname='union'):
     seq_reader = sequences.file_reader(infile)
diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa
new file mode 100644
index 0000000..c2e1044
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa
@@ -0,0 +1,6 @@
+>1 spam
+ACGT
+>1 eggs
+A
+>2
+GTTTG
diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa
new file mode 100644
index 0000000..d3080af
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa
@@ -0,0 +1,6 @@
+>1
+ACGT
+>1
+A
+>2
+GTTTG
diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.in.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.in.fa
new file mode 100644
index 0000000..d3000d3
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.in.fa
@@ -0,0 +1,6 @@
+>1 abcde
+ACGT
+>2 abcde
+G
+>3 hello
+GTACCA
diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.out.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.out.fa
new file mode 100644
index 0000000..dc8e2e1
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.out.fa
@@ -0,0 +1,6 @@
+>1
+ACGT
+>2
+G
+>3
+GTACCA
diff --git a/pyfastaq/tests/data/tasks_test_to_fastg.fasta b/pyfastaq/tests/data/tasks_test_to_fastg.fasta
new file mode 100644
index 0000000..2c60bfd
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_to_fastg.fasta
@@ -0,0 +1,4 @@
+>seq1
+ATTTG
+>seq2
+ACCG
diff --git a/pyfastaq/tests/data/tasks_test_to_fastg.fastg b/pyfastaq/tests/data/tasks_test_to_fastg.fastg
new file mode 100644
index 0000000..1721e72
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_to_fastg.fastg
@@ -0,0 +1,8 @@
+>NODE_1_length_5_cov_1_ID_seq1;
+ATTTG
+>NODE_1_length_5_cov_1_ID_seq1';
+CAAAT
+>NODE_2_length_4_cov_1_ID_seq2:NODE_2_length_4_cov_1_ID_seq2;
+ACCG
+>NODE_2_length_4_cov_1_ID_seq2':NODE_2_length_4_cov_1_ID_seq2';
+CGGT
diff --git a/pyfastaq/tests/data/tasks_test_to_fastg.ids_to_circularise b/pyfastaq/tests/data/tasks_test_to_fastg.ids_to_circularise
new file mode 100644
index 0000000..a2da667
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_to_fastg.ids_to_circularise
@@ -0,0 +1 @@
+seq2
diff --git a/pyfastaq/tests/intervals_test.py b/pyfastaq/tests/intervals_test.py
index e899a63..2878859 100644
--- a/pyfastaq/tests/intervals_test.py
+++ b/pyfastaq/tests/intervals_test.py
@@ -32,6 +32,16 @@ class TestIntervals(unittest.TestCase):
         self.assertEqual(len(intervals.Interval(1,1)), 1)
         self.assertEqual(len(intervals.Interval(10,20)), 11)
 
+    def test_distance_to_point(self):
+        '''Test distance_to_point'''
+        self.assertEqual(0, intervals.Interval(42, 50).distance_to_point(42))
+        self.assertEqual(0, intervals.Interval(42, 50).distance_to_point(44))
+        self.assertEqual(0, intervals.Interval(42, 50).distance_to_point(50))
+        self.assertEqual(1, intervals.Interval(42, 50).distance_to_point(41))
+        self.assertEqual(1, intervals.Interval(42, 50).distance_to_point(51))
+        self.assertEqual(5, intervals.Interval(42, 50).distance_to_point(55))
+        self.assertEqual(5, intervals.Interval(42, 50).distance_to_point(37))
+
     def test_intersects(self):
         '''Intersection of two intervals should do the right thing'''
         a = intervals.Interval(5, 10)
diff --git a/pyfastaq/tests/sequences_test.py b/pyfastaq/tests/sequences_test.py
index cc22c8d..51e8e2e 100644
--- a/pyfastaq/tests/sequences_test.py
+++ b/pyfastaq/tests/sequences_test.py
@@ -231,6 +231,36 @@ class TestFasta(unittest.TestCase):
             self.assertEqual(orfs[i][1], expected[i][1])
 
 
+    def test_is_complete_orf(self):
+        '''Test is_complete_orf'''
+        tests = [
+            (sequences.Fasta('ID', 'TTT'), False),
+            (sequences.Fasta('ID', 'TTTTAA'), True),
+            (sequences.Fasta('ID', 'TTTTAATAA'), False),
+            (sequences.Fasta('ID', 'TTGTAA'), True),
+            (sequences.Fasta('ID', 'TTTAAC'), True),
+            (sequences.Fasta('ID', 'TGA'), False),
+            (sequences.Fasta('ID', 'TGAA'), False),
+        ]
+
+        for t in tests:
+            self.assertEqual(t[0].is_complete_orf(), t[1])
+
+
+    def test_looks_like_gene(self):
+        '''Test looks_like_gene'''
+        tests = [
+            (sequences.Fasta('ID', 'TTT'), False),
+            (sequences.Fasta('ID', 'TTGTAA'), True),
+            (sequences.Fasta('ID', 'TTGTTTTAA'), True),
+            (sequences.Fasta('ID', 'TTGTAATTTTAA'), False),
+            (sequences.Fasta('ID', 'TTGTTTTGAA'), False),
+        ]
+
+        for t in tests:
+            self.assertEqual(t[0].looks_like_gene(), t[1])
+
+
     def test_is_all_Ns(self):
         '''Test is_all_Ns()'''
         self.assertTrue(sequences.Fasta('ID', 'n').is_all_Ns())
diff --git a/pyfastaq/tests/tasks_test.py b/pyfastaq/tests/tasks_test.py
index 6d14ef6..12a9870 100644
--- a/pyfastaq/tests/tasks_test.py
+++ b/pyfastaq/tests/tasks_test.py
@@ -357,6 +357,29 @@ class TestSequenceTrim(unittest.TestCase):
         os.unlink(tmp2)
 
 
+class ToFastg(unittest.TestCase):
+    def test_to_fastg_ids_set(self):
+        '''Test to_fastg when ids are a set'''
+        infile = os.path.join(data_dir, 'tasks_test_to_fastg.fasta')
+        tmpfile = 'tmp.to_fastg.fastg'
+        expected = os.path.join(data_dir, 'tasks_test_to_fastg.fastg')
+        ids = {'seq2'}
+        tasks.to_fastg(infile, tmpfile, circular=ids)
+        self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
+    def test_to_fastg_ids_file(self):
+        '''Test to_fastg when ids in a file'''
+        infile = os.path.join(data_dir, 'tasks_test_to_fastg.fasta')
+        tmpfile = 'tmp.to_fastg.fastg'
+        expected = os.path.join(data_dir, 'tasks_test_to_fastg.fastg')
+        ids_file = os.path.join(data_dir, 'tasks_test_to_fastg.ids_to_circularise')
+        tasks.to_fastg(infile, tmpfile, circular=ids_file)
+        self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
 class TestTranslate(unittest.TestCase):
     def test_translate(self):
         '''Test translate works in each frame'''
@@ -544,6 +567,7 @@ class TestReplaceBases(unittest.TestCase):
         os.unlink(tmpfile)
 
 
+
 class TestSortBySize(unittest.TestCase):
     def test_sort_by_size(self):
         '''Test sort_by_size'''
@@ -592,6 +616,27 @@ class TestToFasta(unittest.TestCase):
         self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_strip_after_whitespace.fa.to_fasta'), tmpfile))
         os.unlink(tmpfile)
 
+    def test_to_fasta_strip_after_whitespace_non_unique(self):
+        '''Test strip_after_whitespace with non-unique names'''
+        tmpfile = 'tmp.strip_after_whitespace.fa'
+        infile = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa')
+        expected = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa')
+
+        with self.assertRaises(tasks.Error):
+            tasks.to_fasta(infile, tmpfile, strip_after_first_whitespace=True, check_unique=True)
+
+        tasks.to_fasta(infile, tmpfile, strip_after_first_whitespace=True, check_unique=False)
+        self.assertTrue(filecmp.cmp(tmpfile, expected, shallow=False))
+        os.unlink(tmpfile)
+
+    def test_to_fasta_strip_after_whitespace_unique(self):
+        '''Test strip_after_whitespace with unique names'''
+        tmpfile = 'tmp.strip_after_whitespace.fa'
+        infile = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_unique.in.fa')
+        expected = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_unique.out.fa')
+        tasks.to_fasta(infile, tmpfile, strip_after_first_whitespace=True, check_unique=True)
+        self.assertTrue(filecmp.cmp(tmpfile, expected, shallow=False))
+        os.unlink(tmpfile)
 
 class TestToUniqueByID(unittest.TestCase):
     def test_to_unique_by_id(self):
diff --git a/setup.py b/setup.py
index ec726f7..9c355f2 100644
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,10 @@
-import os
 import glob
-import sys
 from setuptools import setup, find_packages
 
 
-try:
-    import numpy
-except ImportError:
-    print("Error! numpy for Python3 not found.\nPlease install it (e.g. apt-get install python3-numpy)", file=sys.stderr)
-    sys.exit(1)
-
 setup(
     name='pyfastaq',
-    version='3.2.0',
+    version='3.6.1',
     description='Script to manipulate FASTA and FASTQ files, plus API for developers',
     packages = find_packages(),
     author='Martin Hunt',
@@ -20,7 +12,8 @@ setup(
     url='https://github.com/sanger-pathogens/Fastaq',
     scripts=glob.glob('scripts/*'),
     test_suite='nose.collector',
-    install_requires=['nose >= 1.3'],
+    tests_require=['nose >= 1.3'],
+    install_requires=['numpy >= 1.7.1'],
     license='GPLv3',
     classifiers=[
         'Development Status :: 4 - Beta',

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fastaq.git