[med-svn] [ariba] 01/03: New upstream version 2.10.0+ds
Sascha Steinbiss
satta at debian.org
Wed Jun 28 17:08:22 UTC 2017
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository ariba.
commit 5fd263169499b4c03aa3a95136fe246cead68a7e
Author: Sascha Steinbiss <satta at debian.org>
Date: Wed Jun 28 19:04:37 2017 +0200
New upstream version 2.10.0+ds
---
ariba/__init__.py | 1 +
ariba/flag.py | 3 ++
ariba/mic_plotter.py | 2 +-
ariba/ref_preparer.py | 8 ++++-
ariba/reference_data.py | 19 +++++++----
ariba/report_filter.py | 3 ++
ariba/report_flag_expander.py | 37 ++++++++++++++++++++++
ariba/tasks/__init__.py | 1 +
ariba/tasks/expandflag.py | 8 +++++
.../01.filter.check_genes.log | 10 +++---
.../data/reference_data_test_remove_bad_genes.log | 10 +++---
ariba/tests/data/report_flag_expander.run.in.tsv | 3 ++
ariba/tests/data/report_flag_expander.run.out.tsv | 3 ++
ariba/tests/flag_test.py | 8 +++++
ariba/tests/reference_data_test.py | 17 ++++++----
ariba/tests/report_filter_test.py | 15 +++++++++
ariba/tests/report_flag_expander_test.py | 20 ++++++++++++
scripts/ariba | 12 +++++++
setup.py | 2 +-
19 files changed, 157 insertions(+), 25 deletions(-)
diff --git a/ariba/__init__.py b/ariba/__init__.py
index ca6fc15..57037a3 100644
--- a/ariba/__init__.py
+++ b/ariba/__init__.py
@@ -39,6 +39,7 @@ __all__ = [
'ref_seq_chooser',
'report',
'report_filter',
+ 'report_flag_expander',
'scaffold_graph',
'samtools_variants',
'sequence_metadata',
diff --git a/ariba/flag.py b/ariba/flag.py
index 201d4e7..f0e9171 100644
--- a/ariba/flag.py
+++ b/ariba/flag.py
@@ -61,3 +61,6 @@ class Flag:
def has(self, s):
return self.flags[s]
+
+ def to_comma_separated_string(self):
+ return ','.join([f for f in flags_in_order if self.flags[f]])
diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py
index ca5992f..644e5b2 100644
--- a/ariba/mic_plotter.py
+++ b/ariba/mic_plotter.py
@@ -605,7 +605,7 @@ class MicPlotter:
y_tick_positions, y_tick_labels = MicPlotter._top_plot_y_ticks(mic_data, self.antibiotic, self.log_y)
plots[0].yaxis.set_ticks(y_tick_positions)
plots[0].set_yticklabels(y_tick_labels)
- ylabel = r'$\log_' + str(int(self.log_y)) + '$(MIC) $\mu$g/mL' if self.log_y > 0 else r'MIC $\mu$g/mL'
+ ylabel = r'MIC ($\mu$g/mL)'
plots[0].set_ylabel(ylabel)
plots[0].set_xticklabels([])
plots[0].set_title(self.main_title, fontsize=18)
diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py
index e139dd8..4f2ab81 100644
--- a/ariba/ref_preparer.py
+++ b/ariba/ref_preparer.py
@@ -183,7 +183,7 @@ class RefPreparer:
print('\nLoading and checking input data', flush=True)
self.refdata.rename_sequences(os.path.join(outdir, '00.rename_info'))
- self.refdata.sanity_check(os.path.join(outdir, '01.filter'))
+ number_of_removed_seqs, number_of_bad_variants_logged = self.refdata.sanity_check(os.path.join(outdir, '01.filter'))
if self.verbose:
print('\nRunning cdhit', flush=True)
@@ -210,3 +210,9 @@ class RefPreparer:
with open(clusters_pickle_file, 'wb') as f:
pickle.dump(clusters, f)
+ if number_of_removed_seqs > 0:
+ print('WARNING.', number_of_removed_seqs, 'sequence(s) excluded. Please see the log file 01.filter.check_genes.log for details. This will show them:', file=sys.stderr)
+ print(' grep REMOVE', os.path.join(outdir, '01.filter.check_genes.log'), file=sys.stderr)
+
+ if number_of_bad_variants_logged > 0:
+ print('WARNING. Problem with at least one variant. Problem variants are rmoved. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr)
diff --git a/ariba/reference_data.py b/ariba/reference_data.py
index 69f46f5..92e56d4 100644
--- a/ariba/reference_data.py
+++ b/ariba/reference_data.py
@@ -196,10 +196,12 @@ class ReferenceData:
log_file = out_prefix + '.check_metadata.log'
tsv_file = out_prefix + '.check_metadata.tsv'
log_fh = pyfastaq.utils.open_file_write(log_file)
+ log_lines = 0
for sequence_name, metadata_dict in sorted(all_metadata.items()):
if sequence_name in removed_sequences:
print(sequence_name, 'was removed because does not look like a gene, so removing its metadata', file=log_fh)
+ log_lines += 1
del all_metadata[sequence_name]
continue
@@ -210,6 +212,7 @@ class ReferenceData:
for position in metadata_dict['p']:
for metadata in metadata_dict['p'][position]:
print(sequence_name, 'variant is an amino acid change, but sequence is non-coding. Removing. Line of file was:', metadata, file=log_fh)
+ log_lines += 1
metadata_dict['p'] = {}
@@ -223,6 +226,7 @@ class ReferenceData:
if not metadata.variant.sanity_check_against_seq(sequences[sequence_name], translate_seq=to_translate):
print(sequence_name, 'variant does not match reference. Removing. Line of file was:', metadata, file=log_fh)
+ log_lines += 1
meta_to_remove.append(metadata)
continue
@@ -239,6 +243,7 @@ class ReferenceData:
if metadata_dict['variant_only'] and len(metadata_dict['n']) == len(metadata_dict['p']) == len(metadata_dict['.']) == 0:
print(sequence_name, 'No remaining data after checks. Removing this sequence because it is variants only', file=log_fh)
+ log_lines += 1
genes_to_remove.add(sequence_name)
for sequence_name in genes_to_remove:
@@ -247,21 +252,22 @@ class ReferenceData:
pyfastaq.utils.close(log_fh)
ReferenceData._write_metadata_tsv(all_metadata, tsv_file)
+ return log_lines
@classmethod
def _try_to_get_gene_seq(cls, seq, min_length, max_length):
seq.seq = seq.seq.upper()
if len(seq) < min_length:
- return None, 'Remove: too short. Length: ' + str(len(seq))
+ return None, 'REMOVE\tToo short. Length: ' + str(len(seq))
elif len(seq) > max_length:
- return None, 'Remove: too long. Length: ' + str(len(seq))
+ return None, 'REMOVE\tToo long. Length: ' + str(len(seq))
else:
got = seq.make_into_gene()
if got is None:
- return None, 'Does not look like a gene (tried both strands and all reading frames) ' + seq.seq
+ return None, 'REMOVE\tDoes not look like a gene (tried both strands and all reading frames) ' + seq.seq
else:
- return got[0], 'Made ' + seq.id + ' into gene. strand=' + got[1] + ', frame=' + str(got[2])
+ return got[0], 'KEEP\tMade into gene. strand=' + got[1] + ', frame=' + str(got[2])
@classmethod
@@ -284,7 +290,7 @@ class ReferenceData:
sequences[name] = new_seq
if message is not None:
- print(name, message, file=log_fh)
+ print(name, message, sep='\t', file=log_fh)
pyfastaq.utils.close(log_fh)
@@ -296,7 +302,8 @@ class ReferenceData:
def sanity_check(self, outprefix):
removed_seqs = self._remove_bad_genes(self.sequences, self.metadata, outprefix + '.check_genes.log', self.min_gene_length, self.max_gene_length)
- ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, removed_seqs)
+ log_lines = ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, removed_seqs)
+ return len(removed_seqs), log_lines
@classmethod
diff --git a/ariba/report_filter.py b/ariba/report_filter.py
index b89de9b..d7cf54d 100644
--- a/ariba/report_filter.py
+++ b/ariba/report_filter.py
@@ -152,6 +152,9 @@ class ReportFilter:
@staticmethod
def _remove_all_after_first_frameshift(dicts_list):
+ if len(dicts_list) == 0 or dicts_list[0]['flag'].has('complete_gene'):
+ return dicts_list
+
fshift_starts = [int(d['ref_start']) for d in dicts_list if d.get('ref_ctg_effect', None) == 'FSHIFT']
if len(fshift_starts) == 0:
return dicts_list
diff --git a/ariba/report_flag_expander.py b/ariba/report_flag_expander.py
new file mode 100644
index 0000000..da72fea
--- /dev/null
+++ b/ariba/report_flag_expander.py
@@ -0,0 +1,37 @@
+import copy
+import sys
+
+import pyfastaq
+
+from ariba import flag
+
+class Error (Exception): pass
+
+class ReportFlagExpander:
+ def __init__(self, infile, outfile):
+ self.infile = infile
+ self.outfile = outfile
+
+
+ def run(self):
+ f_in = pyfastaq.utils.open_file_read(self.infile)
+ f_out = pyfastaq.utils.open_file_write(self.outfile)
+ flag_index = None
+
+ for line in f_in:
+ fields = line.rstrip().split()
+
+ if flag_index is None:
+ try:
+ flag_index = fields.index('flag')
+ except:
+ raise Error('"flag" column not found in first line of file ' + self.infile +'. Cannot continue')
+ else:
+ f = flag.Flag(int(fields[flag_index]))
+ fields[flag_index] = f.to_comma_separated_string()
+
+ print(*fields, sep='\t', file=f_out)
+
+ f_in.close()
+ f_out.close()
+
diff --git a/ariba/tasks/__init__.py b/ariba/tasks/__init__.py
index 299f518..5d501d7 100644
--- a/ariba/tasks/__init__.py
+++ b/ariba/tasks/__init__.py
@@ -1,5 +1,6 @@
__all__ = [
'aln2meta',
+ 'expandflag',
'flag',
'getref',
'micplot',
diff --git a/ariba/tasks/expandflag.py b/ariba/tasks/expandflag.py
new file mode 100644
index 0000000..bf257e7
--- /dev/null
+++ b/ariba/tasks/expandflag.py
@@ -0,0 +1,8 @@
+import argparse
+import sys
+import ariba
+
+def run(options):
+ expander = ariba.report_flag_expander.ReportFlagExpander(options.infile, options.outfile)
+ expander.run()
+
diff --git a/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_genes.log b/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_genes.log
index 26666ff..1ba935e 100644
--- a/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_genes.log
+++ b/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_genes.log
@@ -1,5 +1,5 @@
-cannot_make_into_a_gene Does not look like a gene (tried both strands and all reading frames) AAAAAAAAAAAAAAAA
-gene1 Made gene1 into gene. strand=+, frame=0
-gene2 Made gene2 into gene. strand=+, frame=0
-gene3 Made gene3 into gene. strand=+, frame=0
-gene4.var_only Made gene4.var_only into gene. strand=+, frame=0
+cannot_make_into_a_gene REMOVE Does not look like a gene (tried both strands and all reading frames) AAAAAAAAAAAAAAAA
+gene1 KEEP Made into gene. strand=+, frame=0
+gene2 KEEP Made into gene. strand=+, frame=0
+gene3 KEEP Made into gene. strand=+, frame=0
+gene4.var_only KEEP Made into gene. strand=+, frame=0
diff --git a/ariba/tests/data/reference_data_test_remove_bad_genes.log b/ariba/tests/data/reference_data_test_remove_bad_genes.log
index a1223f6..9314ba2 100644
--- a/ariba/tests/data/reference_data_test_remove_bad_genes.log
+++ b/ariba/tests/data/reference_data_test_remove_bad_genes.log
@@ -1,5 +1,5 @@
-g1 Remove: too short. Length: 5
-g2 Remove: too long. Length: 100
-g3 Does not look like a gene (tried both strands and all reading frames) GAGGAGCCG
-g4 Does not look like a gene (tried both strands and all reading frames) ATGTAACCT
-g5 Made g5 into gene. strand=+, frame=0
+g1 REMOVE Too short. Length: 5
+g2 REMOVE Too long. Length: 100
+g3 REMOVE Does not look like a gene (tried both strands and all reading frames) GAGGAGCCG
+g4 REMOVE Does not look like a gene (tried both strands and all reading frames) ATGTAACCT
+g5 KEEP Made into gene. strand=+, frame=0
diff --git a/ariba/tests/data/report_flag_expander.run.in.tsv b/ariba/tests/data/report_flag_expander.run.in.tsv
new file mode 100644
index 0000000..c1919fa
--- /dev/null
+++ b/ariba/tests/data/report_flag_expander.run.in.tsv
@@ -0,0 +1,3 @@
+#ariba column1 flag foo
+name 1 1 foo
+name 2 27 bar
diff --git a/ariba/tests/data/report_flag_expander.run.out.tsv b/ariba/tests/data/report_flag_expander.run.out.tsv
new file mode 100644
index 0000000..c520cf9
--- /dev/null
+++ b/ariba/tests/data/report_flag_expander.run.out.tsv
@@ -0,0 +1,3 @@
+#ariba column1 flag foo
+name 1 assembled foo
+name 2 assembled,assembled_into_one_contig,complete_gene,unique_contig bar
diff --git a/ariba/tests/flag_test.py b/ariba/tests/flag_test.py
index 7704965..5ebe403 100644
--- a/ariba/tests/flag_test.py
+++ b/ariba/tests/flag_test.py
@@ -64,3 +64,11 @@ class TestFlag(unittest.TestCase):
self.assertFalse(f.has(x))
f.add(x)
self.assertTrue(f.has(x))
+
+
+ def test_to_comma_separated_string(self):
+ '''Test to_comma_separated_string'''
+ f = flag.Flag(27)
+ expected = 'assembled,assembled_into_one_contig,complete_gene,unique_contig'
+ self.assertEqual(expected, f.to_comma_separated_string())
+
diff --git a/ariba/tests/reference_data_test.py b/ariba/tests/reference_data_test.py
index 02f74bd..fc36c1f 100644
--- a/ariba/tests/reference_data_test.py
+++ b/ariba/tests/reference_data_test.py
@@ -228,9 +228,14 @@ class TestReferenceData(unittest.TestCase):
metadata_tsv = os.path.join(data_dir, 'reference_data_filter_bad_data_metadata.in.tsv')
sequences, metadata = reference_data.ReferenceData._load_input_files_and_check_seq_names([fasta_in], [metadata_tsv])
tmp_prefix = 'tmp.test_filter_bad_variant_data'
- reference_data.ReferenceData._filter_bad_variant_data(sequences, metadata, tmp_prefix, set())
+ got_line_count = reference_data.ReferenceData._filter_bad_variant_data(sequences, metadata, tmp_prefix, set())
expected_prefix = os.path.join(data_dir, 'reference_data_filter_bad_data.expected')
+ with open(os.path.join(data_dir, 'reference_data_filter_bad_data.expected.check_metadata.log')) as f:
+ expected_line_count = len(f.readlines())
+
+ self.assertEqual(expected_line_count, got_line_count)
+
for suffix in ['check_metadata.log', 'check_metadata.tsv']:
expected = expected_prefix + '.' + suffix
got = tmp_prefix + '.' + suffix
@@ -245,11 +250,11 @@ class TestReferenceData(unittest.TestCase):
def test_try_to_get_gene_seq(self):
'''Test _try_to_get_gene_seq'''
tests = [
- (pyfastaq.sequences.Fasta('x', 'ACGTG'), None, 'Remove: too short. Length: 5'),
- (pyfastaq.sequences.Fasta('x', 'A' * 100), None, 'Remove: too long. Length: 100'),
- (pyfastaq.sequences.Fasta('x', 'GAGGAGCCG'), None, 'Does not look like a gene (tried both strands and all reading frames) GAGGAGCCG'),
- (pyfastaq.sequences.Fasta('x', 'ATGTAACCT'), None, 'Does not look like a gene (tried both strands and all reading frames) ATGTAACCT'),
- (pyfastaq.sequences.Fasta('x', 'ATGCCTTAA'), pyfastaq.sequences.Fasta('x', 'ATGCCTTAA'), 'Made x into gene. strand=+, frame=0')
+ (pyfastaq.sequences.Fasta('x', 'ACGTG'), None, 'REMOVE\tToo short. Length: 5'),
+ (pyfastaq.sequences.Fasta('x', 'A' * 100), None, 'REMOVE\tToo long. Length: 100'),
+ (pyfastaq.sequences.Fasta('x', 'GAGGAGCCG'), None, 'REMOVE\tDoes not look like a gene (tried both strands and all reading frames) GAGGAGCCG'),
+ (pyfastaq.sequences.Fasta('x', 'ATGTAACCT'), None, 'REMOVE\tDoes not look like a gene (tried both strands and all reading frames) ATGTAACCT'),
+ (pyfastaq.sequences.Fasta('x', 'ATGCCTTAA'), pyfastaq.sequences.Fasta('x', 'ATGCCTTAA'), 'KEEP\tMade into gene. strand=+, frame=0')
]
for seq, got_seq, message in tests:
diff --git a/ariba/tests/report_filter_test.py b/ariba/tests/report_filter_test.py
index 31ff036..8c70455 100644
--- a/ariba/tests/report_filter_test.py
+++ b/ariba/tests/report_filter_test.py
@@ -266,6 +266,21 @@ class TestReportFilter(unittest.TestCase):
self.assertEqual([dict1, dict2], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1, dict2, dict3]))
+ def test_remove_all_after_first_frameshift_complete_gene(self):
+ '''Test _remove_all_after_first_frameshift when gene is complete'''
+ self.assertEqual([], report_filter.ReportFilter._remove_all_after_first_frameshift([]))
+ line1 = 'ariba_cluster1\tcluster1\t1\t0\t538\t1874\tcluster1\t1188\t1097\t92.43\tcluster1.scaffold.1\t2218\t42.42\t0\t.\tp\t.\t0\tE89G\tNONSYN\t65\t265\tA;A\t766\t766\tG;C\t88;90\t.;.\t87;90\t.\t.'
+ line2 = 'ariba_cluster1\tcluster1\t1\t0\t538\t1874\tcluster1\t1188\t1097\t92.43\tcluster1.scaffold.1\t2218\t42.42\t0\t.\tp\t.\t0\tQ37fs\tFSHIFT\t109\t109\tA\t634\t634\t.\t67\t.\t67\t.\t.'
+ line3 = 'ariba_cluster1\tcluster1\t1\t0\t538\t1874\tcluster1\t1188\t1097\t92.43\tcluster1.scaffold.1\t2218\t42.42\t0\t.\tp\t.\t0\tE89G\tNONSYN\t265\t265\tA;A\t766\t766\tG;C\t88;90\t.;.\t87;90\t.\t.'
+ dict1 = report_filter.ReportFilter._report_line_to_dict(line1)
+ dict2 = report_filter.ReportFilter._report_line_to_dict(line2)
+ dict3 = report_filter.ReportFilter._report_line_to_dict(line3)
+ self.assertEqual([dict1], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1]))
+ self.assertEqual([dict1, dict2], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1, dict2]))
+ self.assertEqual([dict2, dict3], report_filter.ReportFilter._remove_all_after_first_frameshift([dict2, dict3]))
+ self.assertEqual([dict1, dict2, dict3], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1, dict2, dict3]))
+
+
def test_filter_dicts(self):
'''Test _filter_dicts'''
rf = report_filter.ReportFilter(min_ref_base_assembled=10, ignore_not_has_known_variant=True)
diff --git a/ariba/tests/report_flag_expander_test.py b/ariba/tests/report_flag_expander_test.py
new file mode 100644
index 0000000..99f6671
--- /dev/null
+++ b/ariba/tests/report_flag_expander_test.py
@@ -0,0 +1,20 @@
+import unittest
+import os
+import filecmp
+from ariba import report_flag_expander
+
+modules_dir = os.path.dirname(os.path.abspath(report_flag_expander.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestReportFlagExpander(unittest.TestCase):
+ def test_run(self):
+ '''test run'''
+ infile = os.path.join(data_dir, 'report_flag_expander.run.in.tsv')
+ expected = os.path.join(data_dir, 'report_flag_expander.run.out.tsv')
+ tmp_out = 'tmp.report_flag_expander.out.tsv'
+ expander = report_flag_expander.ReportFlagExpander(infile, tmp_out)
+ expander.run()
+ self.assertTrue(filecmp.cmp(expected, tmp_out, shallow=False))
+ os.unlink(tmp_out)
+
diff --git a/scripts/ariba b/scripts/ariba
index 9a76385..498dddd 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -29,6 +29,18 @@ subparser_aln2meta.add_argument('outprefix', help='Prefix of output filenames')
subparser_aln2meta.set_defaults(func=ariba.tasks.aln2meta.run)
+#---------------------------- expandflag ------------------------------
+subparser_expandflag = subparsers.add_parser(
+ 'expandflag',
+ help='Expands flag column of report file',
+ usage='ariba expandflag <in.report.tsv> <out.tsv',
+ description='Expands the flag column in a report file from number to comma-separated list of flag bits',
+)
+
+subparser_expandflag.add_argument('infile', help='Name of input report TSV file')
+subparser_expandflag.add_argument('outfile', help='Name of output report TSV file')
+subparser_expandflag.set_defaults(func=ariba.tasks.expandflag.run)
+
#---------------------------- flag ------------------------------------
subparser_flag = subparsers.add_parser(
diff --git a/setup.py b/setup.py
index f391e55..eda17e3 100644
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,7 @@ vcfcall_mod = Extension(
setup(
ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod],
name='ariba',
- version='2.9.4',
+ version='2.10.0',
description='ARIBA: Antibiotic Resistance Identification By Assembly',
packages = find_packages(),
package_data={'ariba': ['test_run_data/*']},
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ariba.git
More information about the debian-med-commit
mailing list