[med-svn] [fastaq] 01/04: Imported Upstream version 3.14.0

Sascha Steinbiss satta at debian.org
Thu Sep 1 12:51:01 UTC 2016


This is an automated email from the git hooks/post-receive script.

satta pushed a commit to branch master
in repository fastaq.

commit ac3122418d00a5f7e110c391707cd09dc5c617c4
Author: Sascha Steinbiss <satta at debian.org>
Date:   Thu Sep 1 12:21:02 2016 +0000

    Imported Upstream version 3.14.0
---
 README.md                                       |  1 +
 pyfastaq/runners/acgtn_only.py                  | 12 ++++++++++++
 pyfastaq/runners/to_random_subset.py            |  2 +-
 pyfastaq/sequences.py                           |  6 ++++++
 pyfastaq/tasks.py                               | 19 ++++++++++++++-----
 pyfastaq/tests/data/test_acgtn_only.expected.fa |  4 ++++
 pyfastaq/tests/data/test_acgtn_only.in.fa       |  4 ++++
 pyfastaq/tests/sequences_test.py                | 14 ++++++++++++++
 pyfastaq/tests/tasks_test.py                    | 12 ++++++++++++
 scripts/fastaq                                  |  3 ++-
 setup.py                                        |  2 +-
 11 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 6986255..c17c54f 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,7 @@ Available commands
 
 | Command               | Description                                                          |
 |-----------------------|----------------------------------------------------------------------|
+| acgtn_only            | Replace every non acgtnACGTN with an N                               |
 | add_indels            | Deletes or inserts bases at given position(s)                        |
 | caf_to_fastq          | Converts a CAF file to FASTQ format                                  |
 | capillary_to_pairs    | Converts file of capillary reads to paired and unpaired files        |
diff --git a/pyfastaq/runners/acgtn_only.py b/pyfastaq/runners/acgtn_only.py
new file mode 100644
index 0000000..cbcf793
--- /dev/null
+++ b/pyfastaq/runners/acgtn_only.py
@@ -0,0 +1,12 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+    parser = argparse.ArgumentParser(
+    description = 'Replaces any character that is not one of acgtACGTnN with an N',
+    usage = 'fastaq acgtn_only [options] <infile> <outfile>')
+    parser.add_argument('infile', help='Name of input file')
+    parser.add_argument('outfile', help='Name of output file')
+    options = parser.parse_args()
+    tasks.acgtn_only(options.infile, options.outfile)
+
diff --git a/pyfastaq/runners/to_random_subset.py b/pyfastaq/runners/to_random_subset.py
index 0f94c33..2ca6cd3 100644
--- a/pyfastaq/runners/to_random_subset.py
+++ b/pyfastaq/runners/to_random_subset.py
@@ -6,7 +6,7 @@ from pyfastaq import sequences, utils
 def run(description):
     parser = argparse.ArgumentParser(
         description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
-                      'from a mates file.  Ouptut is interleaved if mates file given',
+                      'from a mates file.  Output is interleaved if mates file given',
         usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
     parser.add_argument('--mate_file', help='Name of mates file')
     parser.add_argument('--seed', help='Seed for random number generator. If not given, python\'s default is used', metavar='INT')
diff --git a/pyfastaq/sequences.py b/pyfastaq/sequences.py
index 55dc7e5..ee20b7c 100644
--- a/pyfastaq/sequences.py
+++ b/pyfastaq/sequences.py
@@ -252,6 +252,12 @@ class Fasta:
         '''Replaces all occurrences of 'old' with 'new' '''
         self.seq = self.seq.replace(old, new)
 
+
+    def replace_non_acgt(self):
+        '''Replace all non acgtn characters with an N (case insensitive)'''
+        self.seq = re.sub(r'''[^acgtACGTnN]''', 'N', self.seq)
+
+
     def replace_interval(self, start, end, new):
         '''Replaces the sequence from start to end with the sequence "new"'''
         if start > end or start > len(self) - 1 or end > len(self) - 1:
diff --git a/pyfastaq/tasks.py b/pyfastaq/tasks.py
index 0085b83..3107672 100644
--- a/pyfastaq/tasks.py
+++ b/pyfastaq/tasks.py
@@ -6,6 +6,15 @@ from pyfastaq import sequences, utils, caf
 
 class Error (Exception): pass
 
+def acgtn_only(infile, outfile):
+    '''Replace every non-acgtn (case insensitive) character with an N'''
+    f = utils.open_file_write(outfile)
+    for seq in sequences.file_reader(infile):
+        seq.replace_non_acgt()
+        print(seq, file=f)
+    utils.close(f)
+
+
 def caf_to_fastq(infile, outfile, min_length=0, trim=False):
     '''Convert a CAF file to fastq. Reads shorter than min_length are not output. If clipping information is in the CAF file (with a line Clipping QUAL ...) and trim=True, then trim the reads'''
     caf_reader = caf.file_reader(infile)
@@ -355,7 +364,7 @@ def get_seqs_flanking_gaps(infile, outfile, left, right):
 
 
 def interleave(infile_1, infile_2, outfile, suffix1=None, suffix2=None):
-    '''Makes interleaved file from two sequence files. If used, will append suffix1 onto end 
+    '''Makes interleaved file from two sequence files. If used, will append suffix1 onto end
     of every sequence name in infile_1, unless it already ends with suffix1. Similar for sufffix2.'''
     seq_reader_1 = sequences.file_reader(infile_1)
     seq_reader_2 = sequences.file_reader(infile_2)
@@ -810,7 +819,7 @@ def stats_from_fai(infile):
         stats['total_length'] = sum(lengths)
         stats['mean'] = stats['total_length'] / len(lengths)
         stats['number'] = len(lengths)
- 
+
         cumulative_length = 0
         for length in lengths:
             cumulative_length += length
@@ -819,7 +828,7 @@ def stats_from_fai(infile):
                 break
     else:
         stats = {x: 0 for x in ('longest', 'shortest', 'mean', 'N50', 'total_length', 'number')}
-           
+
     return stats
 
 
@@ -842,7 +851,7 @@ def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False
     original_line_length = sequences.Fasta.line_length
     sequences.Fasta.line_length = line_length
     if check_unique:
-        used_names = {} 
+        used_names = {}
 
     for seq in seq_reader:
         if strip_after_first_whitespace:
@@ -869,7 +878,7 @@ def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False
 
         if not all_unique:
             raise Error('Not all sequence names unique. Cannot continue')
-    
+
 
 
 def to_fasta_union(infile, outfile, seqname='union'):
diff --git a/pyfastaq/tests/data/test_acgtn_only.expected.fa b/pyfastaq/tests/data/test_acgtn_only.expected.fa
new file mode 100644
index 0000000..9f6a22d
--- /dev/null
+++ b/pyfastaq/tests/data/test_acgtn_only.expected.fa
@@ -0,0 +1,4 @@
+>seq1
+acgtACGTnN
+>seq2
+aNcNgNNT
diff --git a/pyfastaq/tests/data/test_acgtn_only.in.fa b/pyfastaq/tests/data/test_acgtn_only.in.fa
new file mode 100644
index 0000000..9b14690
--- /dev/null
+++ b/pyfastaq/tests/data/test_acgtn_only.in.fa
@@ -0,0 +1,4 @@
+>seq1
+acgtACGTnN
+>seq2
+aXcRg.?T
diff --git a/pyfastaq/tests/sequences_test.py b/pyfastaq/tests/sequences_test.py
index 2cc2396..8e4e18b 100644
--- a/pyfastaq/tests/sequences_test.py
+++ b/pyfastaq/tests/sequences_test.py
@@ -396,6 +396,20 @@ class TestFasta(unittest.TestCase):
         fa.replace_bases('U', 'T')
         self.assertEqual(fa, sequences.Fasta('X', 'ATCGTTTACT'))
 
+
+    def test_replace_non_acgt(self):
+        '''test replace_non_acgt'''
+        tests = [
+            ('acgtACGTnN', 'acgtACGTnN'),
+            ('abc.g-T?aRC1T', 'aNcNgNTNaNCNT')
+        ]
+
+        for seq, expected in tests:
+            fa = sequences.Fasta('id', seq)
+            fa.replace_non_acgt()
+            self.assertEqual(expected, fa.seq)
+
+
     def test_replace_interval(self):
         '''Test replace_interval()'''
         fa = sequences.Fasta('ID', 'ACGTA')
diff --git a/pyfastaq/tests/tasks_test.py b/pyfastaq/tests/tasks_test.py
index e8688b9..b77dbf8 100644
--- a/pyfastaq/tests/tasks_test.py
+++ b/pyfastaq/tests/tasks_test.py
@@ -11,6 +11,18 @@ data_dir = os.path.join(modules_dir, 'tests', 'data')
 
 class Error (Exception): pass
 
+
+class TestACGTN_only(unittest.TestCase):
+    def test_acgtn_only(self):
+        '''Test acgtn_only'''
+        tmpfile = 'tmp.test_acgtn_only.fa'
+        infile = os.path.join(data_dir, 'test_acgtn_only.in.fa')
+        expected = os.path.join(data_dir, 'test_acgtn_only.expected.fa')
+        tasks.acgtn_only(infile, tmpfile)
+        self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
 class TestCafToFastq(unittest.TestCase):
     def test_caf_to_fastq_default(self):
         '''Test caf_to_fastq with no filtering'''
diff --git a/scripts/fastaq b/scripts/fastaq
index cffaf4b..e0c470a 100755
--- a/scripts/fastaq
+++ b/scripts/fastaq
@@ -4,6 +4,7 @@ import argparse
 import sys
 
 tasks = {
+    'acgtn_only':             'Replace every non acgtnACGTN with an N',
     'add_indels':             'Deletes or inserts bases at given position(s)',
     'caf_to_fastq':           'Converts a CAF file to FASTQ format',
     'capillary_to_pairs':     'Converts file of capillary reads to paired and unpaired files',
@@ -20,7 +21,7 @@ tasks = {
     'make_random_contigs':    'Make contigs of random sequence',
     'merge':                  'Converts multi sequence file to a single sequence',
     'replace_bases':          'Replaces all occurrences of one letter with another',
-    'reverse_complement':     'Reverse complement all sequences', 
+    'reverse_complement':     'Reverse complement all sequences',
     'scaffolds_to_contigs':   'Creates a file of contigs from a file of scaffolds',
     'search_for_seq':         'Find all exact matches to a string (and its reverse complement)',
     'sequence_trim':          'Trim exact matches to a given string off the start of every sequence',
diff --git a/setup.py b/setup.py
index cb326b6..f9a6ed2 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='pyfastaq',
-    version='3.13.0',
+    version='3.14.0',
     description='Script to manipulate FASTA and FASTQ files, plus API for developers',
     packages = find_packages(),
     author='Martin Hunt',

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fastaq.git



More information about the debian-med-commit mailing list