[med-svn] [fastaq] 01/05: Imported Upstream version 3.6.1
Sascha Steinbiss
sascha-guest at moszumanska.debian.org
Fri Aug 21 22:32:22 UTC 2015
This is an automated email from the git hooks/post-receive script.
sascha-guest pushed a commit to branch master
in repository fastaq.
commit 08dda22ae0c1c9ee5e2d9ebcc49c01252ae66389
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date: Fri Aug 21 08:55:00 2015 +0000
Imported Upstream version 3.6.1
---
pyfastaq/common.py | 2 +-
pyfastaq/genetic_codes.py | 31 +++++++++++
pyfastaq/intervals.py | 7 +++
pyfastaq/runners/to_fasta.py | 4 +-
pyfastaq/runners/to_orfs_gff.py | 2 +-
pyfastaq/sequences.py | 24 +++++++--
pyfastaq/tasks.py | 60 +++++++++++++++++++++-
...o_fasta.strip_after_whitespace_non_unique.in.fa | 6 +++
..._fasta.strip_after_whitespace_non_unique.out.fa | 6 +++
...st.to_fasta.strip_after_whitespace_unique.in.fa | 6 +++
...t.to_fasta.strip_after_whitespace_unique.out.fa | 6 +++
pyfastaq/tests/data/tasks_test_to_fastg.fasta | 4 ++
pyfastaq/tests/data/tasks_test_to_fastg.fastg | 8 +++
.../data/tasks_test_to_fastg.ids_to_circularise | 1 +
pyfastaq/tests/intervals_test.py | 10 ++++
pyfastaq/tests/sequences_test.py | 30 +++++++++++
pyfastaq/tests/tasks_test.py | 45 ++++++++++++++++
setup.py | 13 ++---
18 files changed, 248 insertions(+), 17 deletions(-)
diff --git a/pyfastaq/common.py b/pyfastaq/common.py
index 42b6ccb..f157366 100644
--- a/pyfastaq/common.py
+++ b/pyfastaq/common.py
@@ -1 +1 @@
-version = '3.2.0'
+version = '3.6.1'
diff --git a/pyfastaq/genetic_codes.py b/pyfastaq/genetic_codes.py
index c32c065..81a2279 100644
--- a/pyfastaq/genetic_codes.py
+++ b/pyfastaq/genetic_codes.py
@@ -1,4 +1,6 @@
+# see http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
codes = {}
+starts = {}
#standard genetic code
codes[1] = {
@@ -68,6 +70,11 @@ codes[1] = {
'GGG': 'G',
}
+starts[1] = set([
+ 'TTG',
+ 'CTG',
+ 'ATG',
+])
#mycoplasma genetic code
codes[4] = {
@@ -137,3 +144,27 @@ codes[4] = {
'GGG': 'G'
}
+starts[4] = set([
+ 'TTA',
+ 'TTG',
+ 'CTG',
+ 'ATT',
+ 'ATC',
+ 'ATA',
+ 'ATG',
+ 'GTG',
+])
+
+
+# Bacterial, Archaeal and Plant Plastid Code
+codes[11] = codes[1]
+
+starts[11] = set([
+ 'TTG',
+ 'CTG',
+ 'ATT',
+ 'ATC',
+ 'ATA',
+ 'ATG',
+ 'GTG',
+])
diff --git a/pyfastaq/intervals.py b/pyfastaq/intervals.py
index b320c63..ef344ed 100644
--- a/pyfastaq/intervals.py
+++ b/pyfastaq/intervals.py
@@ -31,6 +31,13 @@ class Interval:
def __le__(self, i):
return self.start < i.start or (self.start == i.start and self.end <= i.end)
+ def distance_to_point(self, p):
+ '''Returns the distance from the point to the interval. Zero if the point lies inside the interval.'''
+ if self.start <= p <= self.end:
+ return 0
+ else:
+ return min(abs(self.start - p), abs(self.end - p))
+
def intersects(self, i):
'''Returns true iff this interval intersects the interval i'''
return self.start <= i.end and i.start <= self.end
diff --git a/pyfastaq/runners/to_fasta.py b/pyfastaq/runners/to_fasta.py
index 379abc6..7cecf1c 100644
--- a/pyfastaq/runners/to_fasta.py
+++ b/pyfastaq/runners/to_fasta.py
@@ -9,12 +9,14 @@ def run(description):
parser.add_argument('outfile', help='Name of output file')
parser.add_argument('-l', '--line_length', type=int, help='Number of bases on each sequence line of output file. Set to zero for no linebreaks in sequences [%(default)s]', default=60)
parser.add_argument('-s', '--strip_after_whitespace', action='store_true', help='Remove everything after first whitespace in every sequence name')
+ parser.add_argument('-u', '--check_unique', action='store_true', help='Die if any of the output sequence names are not unique')
options = parser.parse_args()
tasks.to_fasta(
options.infile,
options.outfile,
line_length=options.line_length,
- strip_after_first_whitespace=options.strip_after_whitespace
+ strip_after_first_whitespace=options.strip_after_whitespace,
+ check_unique=options.check_unique
)
diff --git a/pyfastaq/runners/to_orfs_gff.py b/pyfastaq/runners/to_orfs_gff.py
index 039016c..46b6e40 100644
--- a/pyfastaq/runners/to_orfs_gff.py
+++ b/pyfastaq/runners/to_orfs_gff.py
@@ -9,4 +9,4 @@ def run(description):
parser.add_argument('infile', help='Name of input file')
parser.add_argument('outfile', help='Name of output GFF file')
options = parser.parse_args()
- tasks.fastaq_to_orfs_gff(options.infile, options.gff_out, min_length=options.min_length)
+ tasks.fastaq_to_orfs_gff(options.infile, options.outfile, min_length=options.min_length)
diff --git a/pyfastaq/sequences.py b/pyfastaq/sequences.py
index 4a3c2a1..fdffc60 100644
--- a/pyfastaq/sequences.py
+++ b/pyfastaq/sequences.py
@@ -14,8 +14,7 @@ class Error (Exception): pass
# of the file, for any given filehandle
previous_lines = {}
-
-codon2aa = genetic_codes.codes[1]
+genetic_code = 1
redundant_nts = {
'R': ('A', 'G'),
@@ -322,6 +321,25 @@ class Fasta:
return sorted(orfs, key=lambda t:t[0])
+
+ def is_complete_orf(self):
+ '''Returns true iff length is >= 6, is a multiple of 3, and there is exactly one stop codon in the sequence and it is at the end'''
+ if len(self) %3 != 0 or len(self) < 6:
+ return False
+
+ orfs = self.orfs()
+ complete_orf = intervals.Interval(0, len(self) - 1)
+ for orf in orfs:
+ if orf == complete_orf:
+ return True
+ return False
+
+
+ def looks_like_gene(self, translation_table=1):
+ '''Returns true iff: length >=6, length is a multiple of 3, first codon is start, last codon is a stop and has no other stop codons'''
+ return self.is_complete_orf() and len(self) >= 6 and len(self) %3 == 0 and self.seq[0:3] in genetic_codes.starts[genetic_code]
+
+
# Fills the object with the next sequence in the file. Returns
# True if this was successful, False if no more sequences in the file.
# If reading a file of quality scores, set read_quals = True
@@ -409,7 +427,7 @@ class Fasta:
def translate(self, frame=0):
'''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2'''
- return Fasta(self.id, ''.join([codon2aa.get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)]))
+ return Fasta(self.id, ''.join([genetic_codes.codes[genetic_code].get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)]))
class Embl(Fasta):
diff --git a/pyfastaq/tasks.py b/pyfastaq/tasks.py
index 7527910..e77e40e 100644
--- a/pyfastaq/tasks.py
+++ b/pyfastaq/tasks.py
@@ -597,6 +597,47 @@ def sort_by_size(infile, outfile, smallest_first=False):
utils.close(fout)
+def to_fastg(infile, outfile, circular=None):
+ '''Writes a FASTG file in SPAdes format from input file. Currently only whether or not a sequence is circular is supported. Put circular=set of ids, or circular=filename to make those sequences circular in the output. Puts coverage=1 on all contigs'''
+ if circular is None:
+ to_circularise = set()
+ elif type(circular) is not set:
+ f = utils.open_file_read(circular)
+ to_circularise = set([x.rstrip() for x in f.readlines()])
+ utils.close(f)
+ else:
+ to_circularise = circular
+
+ seq_reader = sequences.file_reader(infile)
+ fout = utils.open_file_write(outfile)
+ nodes = 1
+
+ for seq in seq_reader:
+ new_id = '_'.join([
+ 'NODE', str(nodes),
+ 'length', str(len(seq)),
+ 'cov', '1',
+ 'ID', seq.id
+ ])
+
+ if seq.id in to_circularise:
+ seq.id = new_id + ':' + new_id + ';'
+ print(seq, file=fout)
+ seq.revcomp()
+ seq.id = new_id + "':" + new_id + "';"
+ print(seq, file=fout)
+ else:
+ seq.id = new_id + ';'
+ print(seq, file=fout)
+ seq.revcomp()
+ seq.id = new_id + "';"
+ print(seq, file=fout)
+
+ nodes += 1
+
+ utils.close(fout)
+
+
def translate(infile, outfile, frame=0):
seq_reader = sequences.file_reader(infile)
fout = utils.open_file_write(outfile)
@@ -780,16 +821,21 @@ def strip_illumina_suffix(infile, outfile):
utils.close(f_out)
-def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False):
+def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False, check_unique=False):
seq_reader = sequences.file_reader(infile)
f_out = utils.open_file_write(outfile)
original_line_length = sequences.Fasta.line_length
sequences.Fasta.line_length = line_length
+ if check_unique:
+ used_names = {}
for seq in seq_reader:
if strip_after_first_whitespace:
seq.strip_after_first_whitespace()
+ if check_unique:
+ used_names[seq.id] = used_names.get(seq.id, 0) + 1
+
if type(seq) == sequences.Fastq:
print(sequences.Fasta(seq.id, seq.seq), file=f_out)
else:
@@ -798,6 +844,18 @@ def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False
utils.close(f_out)
sequences.Fasta.line_length = original_line_length
+ if check_unique:
+ all_unique = True
+
+ for name, count in used_names.items():
+ if count > 1:
+ print('Sequence name "' + name + '" not unique. Found', count, 'times', file=sys.stderr)
+ all_unique = False
+
+ if not all_unique:
+ raise Error('Not all sequence names unique. Cannot continue')
+
+
def to_fasta_union(infile, outfile, seqname='union'):
seq_reader = sequences.file_reader(infile)
diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa
new file mode 100644
index 0000000..c2e1044
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa
@@ -0,0 +1,6 @@
+>1 spam
+ACGT
+>1 eggs
+A
+>2
+GTTTG
diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa
new file mode 100644
index 0000000..d3080af
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa
@@ -0,0 +1,6 @@
+>1
+ACGT
+>1
+A
+>2
+GTTTG
diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.in.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.in.fa
new file mode 100644
index 0000000..d3000d3
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.in.fa
@@ -0,0 +1,6 @@
+>1 abcde
+ACGT
+>2 abcde
+G
+>3 hello
+GTACCA
diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.out.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.out.fa
new file mode 100644
index 0000000..dc8e2e1
--- /dev/null
+++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.out.fa
@@ -0,0 +1,6 @@
+>1
+ACGT
+>2
+G
+>3
+GTACCA
diff --git a/pyfastaq/tests/data/tasks_test_to_fastg.fasta b/pyfastaq/tests/data/tasks_test_to_fastg.fasta
new file mode 100644
index 0000000..2c60bfd
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_to_fastg.fasta
@@ -0,0 +1,4 @@
+>seq1
+ATTTG
+>seq2
+ACCG
diff --git a/pyfastaq/tests/data/tasks_test_to_fastg.fastg b/pyfastaq/tests/data/tasks_test_to_fastg.fastg
new file mode 100644
index 0000000..1721e72
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_to_fastg.fastg
@@ -0,0 +1,8 @@
+>NODE_1_length_5_cov_1_ID_seq1;
+ATTTG
+>NODE_1_length_5_cov_1_ID_seq1';
+CAAAT
+>NODE_2_length_4_cov_1_ID_seq2:NODE_2_length_4_cov_1_ID_seq2;
+ACCG
+>NODE_2_length_4_cov_1_ID_seq2':NODE_2_length_4_cov_1_ID_seq2';
+CGGT
diff --git a/pyfastaq/tests/data/tasks_test_to_fastg.ids_to_circularise b/pyfastaq/tests/data/tasks_test_to_fastg.ids_to_circularise
new file mode 100644
index 0000000..a2da667
--- /dev/null
+++ b/pyfastaq/tests/data/tasks_test_to_fastg.ids_to_circularise
@@ -0,0 +1 @@
+seq2
diff --git a/pyfastaq/tests/intervals_test.py b/pyfastaq/tests/intervals_test.py
index e899a63..2878859 100644
--- a/pyfastaq/tests/intervals_test.py
+++ b/pyfastaq/tests/intervals_test.py
@@ -32,6 +32,16 @@ class TestIntervals(unittest.TestCase):
self.assertEqual(len(intervals.Interval(1,1)), 1)
self.assertEqual(len(intervals.Interval(10,20)), 11)
+ def test_distance_to_point(self):
+ '''Test distance_to_point'''
+ self.assertEqual(0, intervals.Interval(42, 50).distance_to_point(42))
+ self.assertEqual(0, intervals.Interval(42, 50).distance_to_point(44))
+ self.assertEqual(0, intervals.Interval(42, 50).distance_to_point(50))
+ self.assertEqual(1, intervals.Interval(42, 50).distance_to_point(41))
+ self.assertEqual(1, intervals.Interval(42, 50).distance_to_point(51))
+ self.assertEqual(5, intervals.Interval(42, 50).distance_to_point(55))
+ self.assertEqual(5, intervals.Interval(42, 50).distance_to_point(37))
+
def test_intersects(self):
'''Intersection of two intervals should do the right thing'''
a = intervals.Interval(5, 10)
diff --git a/pyfastaq/tests/sequences_test.py b/pyfastaq/tests/sequences_test.py
index cc22c8d..51e8e2e 100644
--- a/pyfastaq/tests/sequences_test.py
+++ b/pyfastaq/tests/sequences_test.py
@@ -231,6 +231,36 @@ class TestFasta(unittest.TestCase):
self.assertEqual(orfs[i][1], expected[i][1])
+ def test_is_complete_orf(self):
+ '''Test is_complete_orf'''
+ tests = [
+ (sequences.Fasta('ID', 'TTT'), False),
+ (sequences.Fasta('ID', 'TTTTAA'), True),
+ (sequences.Fasta('ID', 'TTTTAATAA'), False),
+ (sequences.Fasta('ID', 'TTGTAA'), True),
+ (sequences.Fasta('ID', 'TTTAAC'), True),
+ (sequences.Fasta('ID', 'TGA'), False),
+ (sequences.Fasta('ID', 'TGAA'), False),
+ ]
+
+ for t in tests:
+ self.assertEqual(t[0].is_complete_orf(), t[1])
+
+
+ def test_looks_like_gene(self):
+ '''Test looks_like_gene'''
+ tests = [
+ (sequences.Fasta('ID', 'TTT'), False),
+ (sequences.Fasta('ID', 'TTGTAA'), True),
+ (sequences.Fasta('ID', 'TTGTTTTAA'), True),
+ (sequences.Fasta('ID', 'TTGTAATTTTAA'), False),
+ (sequences.Fasta('ID', 'TTGTTTTGAA'), False),
+ ]
+
+ for t in tests:
+ self.assertEqual(t[0].looks_like_gene(), t[1])
+
+
def test_is_all_Ns(self):
'''Test is_all_Ns()'''
self.assertTrue(sequences.Fasta('ID', 'n').is_all_Ns())
diff --git a/pyfastaq/tests/tasks_test.py b/pyfastaq/tests/tasks_test.py
index 6d14ef6..12a9870 100644
--- a/pyfastaq/tests/tasks_test.py
+++ b/pyfastaq/tests/tasks_test.py
@@ -357,6 +357,29 @@ class TestSequenceTrim(unittest.TestCase):
os.unlink(tmp2)
+class ToFastg(unittest.TestCase):
+ def test_to_fastg_ids_set(self):
+ '''Test to_fastg when ids are a set'''
+ infile = os.path.join(data_dir, 'tasks_test_to_fastg.fasta')
+ tmpfile = 'tmp.to_fastg.fastg'
+ expected = os.path.join(data_dir, 'tasks_test_to_fastg.fastg')
+ ids = {'seq2'}
+ tasks.to_fastg(infile, tmpfile, circular=ids)
+ self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
+ os.unlink(tmpfile)
+
+
+ def test_to_fastg_ids_file(self):
+ '''Test to_fastg when ids in a file'''
+ infile = os.path.join(data_dir, 'tasks_test_to_fastg.fasta')
+ tmpfile = 'tmp.to_fastg.fastg'
+ expected = os.path.join(data_dir, 'tasks_test_to_fastg.fastg')
+ ids_file = os.path.join(data_dir, 'tasks_test_to_fastg.ids_to_circularise')
+ tasks.to_fastg(infile, tmpfile, circular=ids_file)
+ self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
+ os.unlink(tmpfile)
+
+
class TestTranslate(unittest.TestCase):
def test_translate(self):
'''Test translate works in each frame'''
@@ -544,6 +567,7 @@ class TestReplaceBases(unittest.TestCase):
os.unlink(tmpfile)
+
class TestSortBySize(unittest.TestCase):
def test_sort_by_size(self):
'''Test sort_by_size'''
@@ -592,6 +616,27 @@ class TestToFasta(unittest.TestCase):
self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_strip_after_whitespace.fa.to_fasta'), tmpfile))
os.unlink(tmpfile)
+ def test_to_fasta_strip_after_whitespace_non_unique(self):
+ '''Test strip_after_whitespace with non-unique names'''
+ tmpfile = 'tmp.strip_after_whitespace.fa'
+ infile = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa')
+ expected = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa')
+
+ with self.assertRaises(tasks.Error):
+ tasks.to_fasta(infile, tmpfile, strip_after_first_whitespace=True, check_unique=True)
+
+ tasks.to_fasta(infile, tmpfile, strip_after_first_whitespace=True, check_unique=False)
+ self.assertTrue(filecmp.cmp(tmpfile, expected, shallow=False))
+ os.unlink(tmpfile)
+
+ def test_to_fasta_strip_after_whitespace_unique(self):
+ '''Test strip_after_whitespace with unique names'''
+ tmpfile = 'tmp.strip_after_whitespace.fa'
+ infile = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_unique.in.fa')
+ expected = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_unique.out.fa')
+ tasks.to_fasta(infile, tmpfile, strip_after_first_whitespace=True, check_unique=True)
+ self.assertTrue(filecmp.cmp(tmpfile, expected, shallow=False))
+ os.unlink(tmpfile)
class TestToUniqueByID(unittest.TestCase):
def test_to_unique_by_id(self):
diff --git a/setup.py b/setup.py
index ec726f7..9c355f2 100644
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,10 @@
-import os
import glob
-import sys
from setuptools import setup, find_packages
-try:
- import numpy
-except ImportError:
- print("Error! numpy for Python3 not found.\nPlease install it (e.g. apt-get install python3-numpy)", file=sys.stderr)
- sys.exit(1)
-
setup(
name='pyfastaq',
- version='3.2.0',
+ version='3.6.1',
description='Script to manipulate FASTA and FASTQ files, plus API for developers',
packages = find_packages(),
author='Martin Hunt',
@@ -20,7 +12,8 @@ setup(
url='https://github.com/sanger-pathogens/Fastaq',
scripts=glob.glob('scripts/*'),
test_suite='nose.collector',
- install_requires=['nose >= 1.3'],
+ tests_require=['nose >= 1.3'],
+ install_requires=['numpy >= 1.7.1'],
license='GPLv3',
classifiers=[
'Development Status :: 4 - Beta',
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fastaq.git
More information about the debian-med-commit
mailing list