[med-svn] [fastaq] 01/04: Imported Upstream version 3.14.0
Sascha Steinbiss
satta at debian.org
Thu Sep 1 12:51:01 UTC 2016
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository fastaq.
commit ac3122418d00a5f7e110c391707cd09dc5c617c4
Author: Sascha Steinbiss <satta at debian.org>
Date: Thu Sep 1 12:21:02 2016 +0000
Imported Upstream version 3.14.0
---
README.md | 1 +
pyfastaq/runners/acgtn_only.py | 12 ++++++++++++
pyfastaq/runners/to_random_subset.py | 2 +-
pyfastaq/sequences.py | 6 ++++++
pyfastaq/tasks.py | 19 ++++++++++++++-----
pyfastaq/tests/data/test_acgtn_only.expected.fa | 4 ++++
pyfastaq/tests/data/test_acgtn_only.in.fa | 4 ++++
pyfastaq/tests/sequences_test.py | 14 ++++++++++++++
pyfastaq/tests/tasks_test.py | 12 ++++++++++++
scripts/fastaq | 3 ++-
setup.py | 2 +-
11 files changed, 71 insertions(+), 8 deletions(-)
diff --git a/README.md b/README.md
index 6986255..c17c54f 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,7 @@ Available commands
| Command | Description |
|-----------------------|----------------------------------------------------------------------|
+| acgtn_only | Replace every non acgtnACGTN with an N |
| add_indels | Deletes or inserts bases at given position(s) |
| caf_to_fastq | Converts a CAF file to FASTQ format |
| capillary_to_pairs | Converts file of capillary reads to paired and unpaired files |
diff --git a/pyfastaq/runners/acgtn_only.py b/pyfastaq/runners/acgtn_only.py
new file mode 100644
index 0000000..cbcf793
--- /dev/null
+++ b/pyfastaq/runners/acgtn_only.py
@@ -0,0 +1,12 @@
+import argparse
+from pyfastaq import tasks
+
+def run(description):
+ parser = argparse.ArgumentParser(
+ description = 'Replaces any character that is not one of acgtACGTnN with an N',
+ usage = 'fastaq acgtn_only [options] <infile> <outfile>')
+ parser.add_argument('infile', help='Name of input file')
+ parser.add_argument('outfile', help='Name of output file')
+ options = parser.parse_args()
+ tasks.acgtn_only(options.infile, options.outfile)
+
diff --git a/pyfastaq/runners/to_random_subset.py b/pyfastaq/runners/to_random_subset.py
index 0f94c33..2ca6cd3 100644
--- a/pyfastaq/runners/to_random_subset.py
+++ b/pyfastaq/runners/to_random_subset.py
@@ -6,7 +6,7 @@ from pyfastaq import sequences, utils
def run(description):
parser = argparse.ArgumentParser(
description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
- 'from a mates file. Ouptut is interleaved if mates file given',
+ 'from a mates file. Output is interleaved if mates file given',
usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
parser.add_argument('--mate_file', help='Name of mates file')
parser.add_argument('--seed', help='Seed for random number generator. If not given, python\'s default is used', metavar='INT')
diff --git a/pyfastaq/sequences.py b/pyfastaq/sequences.py
index 55dc7e5..ee20b7c 100644
--- a/pyfastaq/sequences.py
+++ b/pyfastaq/sequences.py
@@ -252,6 +252,12 @@ class Fasta:
'''Replaces all occurrences of 'old' with 'new' '''
self.seq = self.seq.replace(old, new)
+
+ def replace_non_acgt(self):
+ '''Replace all characters other than acgtn (case insensitive) with an N'''
+ self.seq = re.sub(r'''[^acgtACGTnN]''', 'N', self.seq)
+
+
def replace_interval(self, start, end, new):
'''Replaces the sequence from start to end with the sequence "new"'''
if start > end or start > len(self) - 1 or end > len(self) - 1:
diff --git a/pyfastaq/tasks.py b/pyfastaq/tasks.py
index 0085b83..3107672 100644
--- a/pyfastaq/tasks.py
+++ b/pyfastaq/tasks.py
@@ -6,6 +6,15 @@ from pyfastaq import sequences, utils, caf
class Error (Exception): pass
+def acgtn_only(infile, outfile):
+ '''Replace every non-acgtn (case insensitive) character with an N'''
+ f = utils.open_file_write(outfile)
+ for seq in sequences.file_reader(infile):
+ seq.replace_non_acgt()
+ print(seq, file=f)
+ utils.close(f)
+
+
def caf_to_fastq(infile, outfile, min_length=0, trim=False):
'''Convert a CAF file to fastq. Reads shorter than min_length are not output. If clipping information is in the CAF file (with a line Clipping QUAL ...) and trim=True, then trim the reads'''
caf_reader = caf.file_reader(infile)
@@ -355,7 +364,7 @@ def get_seqs_flanking_gaps(infile, outfile, left, right):
def interleave(infile_1, infile_2, outfile, suffix1=None, suffix2=None):
- '''Makes interleaved file from two sequence files. If used, will append suffix1 onto end
+ '''Makes interleaved file from two sequence files. If used, will append suffix1 onto end
of every sequence name in infile_1, unless it already ends with suffix1. Similar for sufffix2.'''
seq_reader_1 = sequences.file_reader(infile_1)
seq_reader_2 = sequences.file_reader(infile_2)
@@ -810,7 +819,7 @@ def stats_from_fai(infile):
stats['total_length'] = sum(lengths)
stats['mean'] = stats['total_length'] / len(lengths)
stats['number'] = len(lengths)
-
+
cumulative_length = 0
for length in lengths:
cumulative_length += length
@@ -819,7 +828,7 @@ def stats_from_fai(infile):
break
else:
stats = {x: 0 for x in ('longest', 'shortest', 'mean', 'N50', 'total_length', 'number')}
-
+
return stats
@@ -842,7 +851,7 @@ def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False
original_line_length = sequences.Fasta.line_length
sequences.Fasta.line_length = line_length
if check_unique:
- used_names = {}
+ used_names = {}
for seq in seq_reader:
if strip_after_first_whitespace:
@@ -869,7 +878,7 @@ def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False
if not all_unique:
raise Error('Not all sequence names unique. Cannot continue')
-
+
def to_fasta_union(infile, outfile, seqname='union'):
diff --git a/pyfastaq/tests/data/test_acgtn_only.expected.fa b/pyfastaq/tests/data/test_acgtn_only.expected.fa
new file mode 100644
index 0000000..9f6a22d
--- /dev/null
+++ b/pyfastaq/tests/data/test_acgtn_only.expected.fa
@@ -0,0 +1,4 @@
+>seq1
+acgtACGTnN
+>seq2
+aNcNgNNT
diff --git a/pyfastaq/tests/data/test_acgtn_only.in.fa b/pyfastaq/tests/data/test_acgtn_only.in.fa
new file mode 100644
index 0000000..9b14690
--- /dev/null
+++ b/pyfastaq/tests/data/test_acgtn_only.in.fa
@@ -0,0 +1,4 @@
+>seq1
+acgtACGTnN
+>seq2
+aXcRg.?T
diff --git a/pyfastaq/tests/sequences_test.py b/pyfastaq/tests/sequences_test.py
index 2cc2396..8e4e18b 100644
--- a/pyfastaq/tests/sequences_test.py
+++ b/pyfastaq/tests/sequences_test.py
@@ -396,6 +396,20 @@ class TestFasta(unittest.TestCase):
fa.replace_bases('U', 'T')
self.assertEqual(fa, sequences.Fasta('X', 'ATCGTTTACT'))
+
+ def test_replace_non_acgt(self):
+ '''test replace_non_acgt'''
+ tests = [
+ ('acgtACGTnN', 'acgtACGTnN'),
+ ('abc.g-T?aRC1T', 'aNcNgNTNaNCNT')
+ ]
+
+ for seq, expected in tests:
+ fa = sequences.Fasta('id', seq)
+ fa.replace_non_acgt()
+ self.assertEqual(expected, fa.seq)
+
+
def test_replace_interval(self):
'''Test replace_interval()'''
fa = sequences.Fasta('ID', 'ACGTA')
diff --git a/pyfastaq/tests/tasks_test.py b/pyfastaq/tests/tasks_test.py
index e8688b9..b77dbf8 100644
--- a/pyfastaq/tests/tasks_test.py
+++ b/pyfastaq/tests/tasks_test.py
@@ -11,6 +11,18 @@ data_dir = os.path.join(modules_dir, 'tests', 'data')
class Error (Exception): pass
+
+class TestACGTN_only(unittest.TestCase):
+ def test_acgtn_only(self):
+ '''Test acgtn_only'''
+ tmpfile = 'tmp.test_acgtn_only.fa'
+ infile = os.path.join(data_dir, 'test_acgtn_only.in.fa')
+ expected = os.path.join(data_dir, 'test_acgtn_only.expected.fa')
+ tasks.acgtn_only(infile, tmpfile)
+ self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
+ os.unlink(tmpfile)
+
+
class TestCafToFastq(unittest.TestCase):
def test_caf_to_fastq_default(self):
'''Test caf_to_fastq with no filtering'''
diff --git a/scripts/fastaq b/scripts/fastaq
index cffaf4b..e0c470a 100755
--- a/scripts/fastaq
+++ b/scripts/fastaq
@@ -4,6 +4,7 @@ import argparse
import sys
tasks = {
+ 'acgtn_only': 'Replace every non acgtnACGTN with an N',
'add_indels': 'Deletes or inserts bases at given position(s)',
'caf_to_fastq': 'Converts a CAF file to FASTQ format',
'capillary_to_pairs': 'Converts file of capillary reads to paired and unpaired files',
@@ -20,7 +21,7 @@ tasks = {
'make_random_contigs': 'Make contigs of random sequence',
'merge': 'Converts multi sequence file to a single sequence',
'replace_bases': 'Replaces all occurrences of one letter with another',
- 'reverse_complement': 'Reverse complement all sequences',
+ 'reverse_complement': 'Reverse complement all sequences',
'scaffolds_to_contigs': 'Creates a file of contigs from a file of scaffolds',
'search_for_seq': 'Find all exact matches to a string (and its reverse complement)',
'sequence_trim': 'Trim exact matches to a given string off the start of every sequence',
diff --git a/setup.py b/setup.py
index cb326b6..f9a6ed2 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ from setuptools import setup, find_packages
setup(
name='pyfastaq',
- version='3.13.0',
+ version='3.14.0',
description='Script to manipulate FASTA and FASTQ files, plus API for developers',
packages = find_packages(),
author='Martin Hunt',
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fastaq.git
More information about the debian-med-commit
mailing list