[med-svn] [ariba] 01/03: New upstream version 2.10.0+ds

Sascha Steinbiss satta at debian.org
Wed Jun 28 17:08:22 UTC 2017


This is an automated email from the git hooks/post-receive script.

satta pushed a commit to branch master
in repository ariba.

commit 5fd263169499b4c03aa3a95136fe246cead68a7e
Author: Sascha Steinbiss <satta at debian.org>
Date:   Wed Jun 28 19:04:37 2017 +0200

    New upstream version 2.10.0+ds
---
 ariba/__init__.py                                  |  1 +
 ariba/flag.py                                      |  3 ++
 ariba/mic_plotter.py                               |  2 +-
 ariba/ref_preparer.py                              |  8 ++++-
 ariba/reference_data.py                            | 19 +++++++----
 ariba/report_filter.py                             |  3 ++
 ariba/report_flag_expander.py                      | 37 ++++++++++++++++++++++
 ariba/tasks/__init__.py                            |  1 +
 ariba/tasks/expandflag.py                          |  8 +++++
 .../01.filter.check_genes.log                      | 10 +++---
 .../data/reference_data_test_remove_bad_genes.log  | 10 +++---
 ariba/tests/data/report_flag_expander.run.in.tsv   |  3 ++
 ariba/tests/data/report_flag_expander.run.out.tsv  |  3 ++
 ariba/tests/flag_test.py                           |  8 +++++
 ariba/tests/reference_data_test.py                 | 17 ++++++----
 ariba/tests/report_filter_test.py                  | 15 +++++++++
 ariba/tests/report_flag_expander_test.py           | 20 ++++++++++++
 scripts/ariba                                      | 12 +++++++
 setup.py                                           |  2 +-
 19 files changed, 157 insertions(+), 25 deletions(-)

diff --git a/ariba/__init__.py b/ariba/__init__.py
index ca6fc15..57037a3 100644
--- a/ariba/__init__.py
+++ b/ariba/__init__.py
@@ -39,6 +39,7 @@ __all__ = [
     'ref_seq_chooser',
     'report',
     'report_filter',
+    'report_flag_expander',
     'scaffold_graph',
     'samtools_variants',
     'sequence_metadata',
diff --git a/ariba/flag.py b/ariba/flag.py
index 201d4e7..f0e9171 100644
--- a/ariba/flag.py
+++ b/ariba/flag.py
@@ -61,3 +61,6 @@ class Flag:
     def has(self, s):
         return self.flags[s]
 
+
+    def to_comma_separated_string(self):
+        return ','.join([f for f in flags_in_order if self.flags[f]])
diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py
index ca5992f..644e5b2 100644
--- a/ariba/mic_plotter.py
+++ b/ariba/mic_plotter.py
@@ -605,7 +605,7 @@ class MicPlotter:
         y_tick_positions, y_tick_labels = MicPlotter._top_plot_y_ticks(mic_data, self.antibiotic, self.log_y)
         plots[0].yaxis.set_ticks(y_tick_positions)
         plots[0].set_yticklabels(y_tick_labels)
-        ylabel = r'$\log_' + str(int(self.log_y)) + '$(MIC) $\mu$g/mL' if self.log_y > 0 else r'MIC $\mu$g/mL'
+        ylabel = r'MIC ($\mu$g/mL)'
         plots[0].set_ylabel(ylabel)
         plots[0].set_xticklabels([])
         plots[0].set_title(self.main_title, fontsize=18)
diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py
index e139dd8..4f2ab81 100644
--- a/ariba/ref_preparer.py
+++ b/ariba/ref_preparer.py
@@ -183,7 +183,7 @@ class RefPreparer:
             print('\nLoading and checking input data', flush=True)
 
         self.refdata.rename_sequences(os.path.join(outdir, '00.rename_info'))
-        self.refdata.sanity_check(os.path.join(outdir, '01.filter'))
+        number_of_removed_seqs, number_of_bad_variants_logged = self.refdata.sanity_check(os.path.join(outdir, '01.filter'))
 
         if self.verbose:
             print('\nRunning cdhit', flush=True)
@@ -210,3 +210,9 @@ class RefPreparer:
         with open(clusters_pickle_file, 'wb') as f:
             pickle.dump(clusters, f)
 
+        if number_of_removed_seqs > 0:
+            print('WARNING.', number_of_removed_seqs, 'sequence(s) excluded. Please see the log file 01.filter.check_genes.log for details. This will show them:', file=sys.stderr)
+            print('    grep REMOVE', os.path.join(outdir, '01.filter.check_genes.log'), file=sys.stderr)
+
+        if number_of_bad_variants_logged > 0:
+            print('WARNING. Problem with at least one variant. Problem variants are rmoved. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr)
diff --git a/ariba/reference_data.py b/ariba/reference_data.py
index 69f46f5..92e56d4 100644
--- a/ariba/reference_data.py
+++ b/ariba/reference_data.py
@@ -196,10 +196,12 @@ class ReferenceData:
         log_file = out_prefix + '.check_metadata.log'
         tsv_file = out_prefix + '.check_metadata.tsv'
         log_fh = pyfastaq.utils.open_file_write(log_file)
+        log_lines = 0
 
         for sequence_name, metadata_dict in sorted(all_metadata.items()):
             if sequence_name in removed_sequences:
                 print(sequence_name, 'was removed because does not look like a gene, so removing its metadata', file=log_fh)
+                log_lines += 1
                 del all_metadata[sequence_name]
                 continue
 
@@ -210,6 +212,7 @@ class ReferenceData:
                 for position in metadata_dict['p']:
                     for metadata in metadata_dict['p'][position]:
                         print(sequence_name, 'variant is an amino acid change, but sequence is non-coding. Removing. Line of file was:', metadata, file=log_fh)
+                        log_lines += 1
 
                 metadata_dict['p'] = {}
 
@@ -223,6 +226,7 @@ class ReferenceData:
 
                         if not metadata.variant.sanity_check_against_seq(sequences[sequence_name], translate_seq=to_translate):
                             print(sequence_name, 'variant does not match reference. Removing. Line of file was:', metadata, file=log_fh)
+                            log_lines += 1
                             meta_to_remove.append(metadata)
                             continue
 
@@ -239,6 +243,7 @@ class ReferenceData:
 
             if metadata_dict['variant_only'] and len(metadata_dict['n']) == len(metadata_dict['p']) == len(metadata_dict['.']) == 0:
                 print(sequence_name, 'No remaining data after checks. Removing this sequence because it is variants only', file=log_fh)
+                log_lines += 1
                 genes_to_remove.add(sequence_name)
 
         for sequence_name in genes_to_remove:
@@ -247,21 +252,22 @@ class ReferenceData:
 
         pyfastaq.utils.close(log_fh)
         ReferenceData._write_metadata_tsv(all_metadata, tsv_file)
+        return log_lines
 
 
     @classmethod
     def _try_to_get_gene_seq(cls, seq, min_length, max_length):
         seq.seq = seq.seq.upper()
         if len(seq) < min_length:
-            return None, 'Remove: too short. Length: ' + str(len(seq))
+            return None, 'REMOVE\tToo short. Length: ' + str(len(seq))
         elif len(seq) > max_length:
-            return None, 'Remove: too long. Length: ' + str(len(seq))
+            return None, 'REMOVE\tToo long. Length: ' + str(len(seq))
         else:
             got = seq.make_into_gene()
             if got is None:
-                return None, 'Does not look like a gene (tried both strands and all reading frames) ' + seq.seq
+                return None, 'REMOVE\tDoes not look like a gene (tried both strands and all reading frames) ' + seq.seq
             else:
-                return got[0], 'Made ' + seq.id + ' into gene. strand=' + got[1] + ', frame=' + str(got[2])
+                return got[0], 'KEEP\tMade into gene. strand=' + got[1] + ', frame=' + str(got[2])
 
 
     @classmethod
@@ -284,7 +290,7 @@ class ReferenceData:
                 sequences[name] = new_seq
 
             if message is not None:
-                print(name, message, file=log_fh)
+                print(name, message, sep='\t', file=log_fh)
 
         pyfastaq.utils.close(log_fh)
 
@@ -296,7 +302,8 @@ class ReferenceData:
 
     def sanity_check(self, outprefix):
         removed_seqs = self._remove_bad_genes(self.sequences, self.metadata, outprefix + '.check_genes.log', self.min_gene_length, self.max_gene_length)
-        ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, removed_seqs)
+        log_lines = ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, removed_seqs)
+        return len(removed_seqs), log_lines
 
 
     @classmethod
diff --git a/ariba/report_filter.py b/ariba/report_filter.py
index b89de9b..d7cf54d 100644
--- a/ariba/report_filter.py
+++ b/ariba/report_filter.py
@@ -152,6 +152,9 @@ class ReportFilter:
 
     @staticmethod
     def _remove_all_after_first_frameshift(dicts_list):
+        if len(dicts_list) == 0 or dicts_list[0]['flag'].has('complete_gene'):
+            return dicts_list
+
         fshift_starts = [int(d['ref_start']) for d in dicts_list if d.get('ref_ctg_effect', None) == 'FSHIFT']
         if len(fshift_starts) == 0:
             return dicts_list
diff --git a/ariba/report_flag_expander.py b/ariba/report_flag_expander.py
new file mode 100644
index 0000000..da72fea
--- /dev/null
+++ b/ariba/report_flag_expander.py
@@ -0,0 +1,37 @@
+import copy
+import sys
+
+import pyfastaq
+
+from ariba import flag
+
+class Error (Exception): pass
+
+class ReportFlagExpander:
+    def __init__(self, infile, outfile):
+        self.infile = infile
+        self.outfile = outfile
+
+
+    def run(self):
+        f_in = pyfastaq.utils.open_file_read(self.infile)
+        f_out = pyfastaq.utils.open_file_write(self.outfile)
+        flag_index = None
+
+        for line in f_in:
+            fields = line.rstrip().split()
+
+            if flag_index is None:
+                try:
+                    flag_index = fields.index('flag')
+                except:
+                    raise Error('"flag" column not found in first line of file ' + self.infile +'. Cannot continue')
+            else:
+                f = flag.Flag(int(fields[flag_index]))
+                fields[flag_index] = f.to_comma_separated_string()
+
+            print(*fields, sep='\t', file=f_out)
+
+        f_in.close()
+        f_out.close()
+
diff --git a/ariba/tasks/__init__.py b/ariba/tasks/__init__.py
index 299f518..5d501d7 100644
--- a/ariba/tasks/__init__.py
+++ b/ariba/tasks/__init__.py
@@ -1,5 +1,6 @@
 __all__ = [
     'aln2meta',
+    'expandflag',
     'flag',
     'getref',
     'micplot',
diff --git a/ariba/tasks/expandflag.py b/ariba/tasks/expandflag.py
new file mode 100644
index 0000000..bf257e7
--- /dev/null
+++ b/ariba/tasks/expandflag.py
@@ -0,0 +1,8 @@
+import argparse
+import sys
+import ariba
+
+def run(options):
+    expander = ariba.report_flag_expander.ReportFlagExpander(options.infile, options.outfile)
+    expander.run()
+
diff --git a/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_genes.log b/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_genes.log
index 26666ff..1ba935e 100644
--- a/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_genes.log
+++ b/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_genes.log
@@ -1,5 +1,5 @@
-cannot_make_into_a_gene Does not look like a gene (tried both strands and all reading frames) AAAAAAAAAAAAAAAA
-gene1 Made gene1 into gene. strand=+, frame=0
-gene2 Made gene2 into gene. strand=+, frame=0
-gene3 Made gene3 into gene. strand=+, frame=0
-gene4.var_only Made gene4.var_only into gene. strand=+, frame=0
+cannot_make_into_a_gene	REMOVE	Does not look like a gene (tried both strands and all reading frames) AAAAAAAAAAAAAAAA
+gene1	KEEP	Made into gene. strand=+, frame=0
+gene2	KEEP	Made into gene. strand=+, frame=0
+gene3	KEEP	Made into gene. strand=+, frame=0
+gene4.var_only	KEEP	Made into gene. strand=+, frame=0
diff --git a/ariba/tests/data/reference_data_test_remove_bad_genes.log b/ariba/tests/data/reference_data_test_remove_bad_genes.log
index a1223f6..9314ba2 100644
--- a/ariba/tests/data/reference_data_test_remove_bad_genes.log
+++ b/ariba/tests/data/reference_data_test_remove_bad_genes.log
@@ -1,5 +1,5 @@
-g1 Remove: too short. Length: 5
-g2 Remove: too long. Length: 100
-g3 Does not look like a gene (tried both strands and all reading frames) GAGGAGCCG
-g4 Does not look like a gene (tried both strands and all reading frames) ATGTAACCT
-g5 Made g5 into gene. strand=+, frame=0
+g1	REMOVE	Too short. Length: 5
+g2	REMOVE	Too long. Length: 100
+g3	REMOVE	Does not look like a gene (tried both strands and all reading frames) GAGGAGCCG
+g4	REMOVE	Does not look like a gene (tried both strands and all reading frames) ATGTAACCT
+g5	KEEP	Made into gene. strand=+, frame=0
diff --git a/ariba/tests/data/report_flag_expander.run.in.tsv b/ariba/tests/data/report_flag_expander.run.in.tsv
new file mode 100644
index 0000000..c1919fa
--- /dev/null
+++ b/ariba/tests/data/report_flag_expander.run.in.tsv
@@ -0,0 +1,3 @@
+#ariba	column1	flag	foo
+name	1	1	foo
+name	2	27	bar
diff --git a/ariba/tests/data/report_flag_expander.run.out.tsv b/ariba/tests/data/report_flag_expander.run.out.tsv
new file mode 100644
index 0000000..c520cf9
--- /dev/null
+++ b/ariba/tests/data/report_flag_expander.run.out.tsv
@@ -0,0 +1,3 @@
+#ariba	column1	flag	foo
+name	1	assembled	foo
+name	2	assembled,assembled_into_one_contig,complete_gene,unique_contig	bar
diff --git a/ariba/tests/flag_test.py b/ariba/tests/flag_test.py
index 7704965..5ebe403 100644
--- a/ariba/tests/flag_test.py
+++ b/ariba/tests/flag_test.py
@@ -64,3 +64,11 @@ class TestFlag(unittest.TestCase):
             self.assertFalse(f.has(x))
             f.add(x)
             self.assertTrue(f.has(x))
+
+
+    def test_to_comma_separated_string(self):
+        '''Test to_comma_separated_string'''
+        f = flag.Flag(27)
+        expected = 'assembled,assembled_into_one_contig,complete_gene,unique_contig'
+        self.assertEqual(expected, f.to_comma_separated_string())
+
diff --git a/ariba/tests/reference_data_test.py b/ariba/tests/reference_data_test.py
index 02f74bd..fc36c1f 100644
--- a/ariba/tests/reference_data_test.py
+++ b/ariba/tests/reference_data_test.py
@@ -228,9 +228,14 @@ class TestReferenceData(unittest.TestCase):
         metadata_tsv = os.path.join(data_dir, 'reference_data_filter_bad_data_metadata.in.tsv')
         sequences, metadata = reference_data.ReferenceData._load_input_files_and_check_seq_names([fasta_in], [metadata_tsv])
         tmp_prefix = 'tmp.test_filter_bad_variant_data'
-        reference_data.ReferenceData._filter_bad_variant_data(sequences, metadata, tmp_prefix, set())
+        got_line_count = reference_data.ReferenceData._filter_bad_variant_data(sequences, metadata, tmp_prefix, set())
         expected_prefix = os.path.join(data_dir, 'reference_data_filter_bad_data.expected')
 
+        with open(os.path.join(data_dir, 'reference_data_filter_bad_data.expected.check_metadata.log')) as f:
+            expected_line_count = len(f.readlines())
+
+        self.assertEqual(expected_line_count, got_line_count)
+
         for suffix in ['check_metadata.log', 'check_metadata.tsv']:
             expected = expected_prefix + '.' + suffix
             got = tmp_prefix + '.' + suffix
@@ -245,11 +250,11 @@ class TestReferenceData(unittest.TestCase):
     def test_try_to_get_gene_seq(self):
         '''Test _try_to_get_gene_seq'''
         tests = [
-            (pyfastaq.sequences.Fasta('x', 'ACGTG'), None, 'Remove: too short. Length: 5'),
-            (pyfastaq.sequences.Fasta('x', 'A' * 100), None, 'Remove: too long. Length: 100'),
-            (pyfastaq.sequences.Fasta('x', 'GAGGAGCCG'), None, 'Does not look like a gene (tried both strands and all reading frames) GAGGAGCCG'),
-            (pyfastaq.sequences.Fasta('x', 'ATGTAACCT'), None, 'Does not look like a gene (tried both strands and all reading frames) ATGTAACCT'),
-            (pyfastaq.sequences.Fasta('x', 'ATGCCTTAA'), pyfastaq.sequences.Fasta('x', 'ATGCCTTAA'), 'Made x into gene. strand=+, frame=0')
+            (pyfastaq.sequences.Fasta('x', 'ACGTG'), None, 'REMOVE\tToo short. Length: 5'),
+            (pyfastaq.sequences.Fasta('x', 'A' * 100), None, 'REMOVE\tToo long. Length: 100'),
+            (pyfastaq.sequences.Fasta('x', 'GAGGAGCCG'), None, 'REMOVE\tDoes not look like a gene (tried both strands and all reading frames) GAGGAGCCG'),
+            (pyfastaq.sequences.Fasta('x', 'ATGTAACCT'), None, 'REMOVE\tDoes not look like a gene (tried both strands and all reading frames) ATGTAACCT'),
+            (pyfastaq.sequences.Fasta('x', 'ATGCCTTAA'), pyfastaq.sequences.Fasta('x', 'ATGCCTTAA'), 'KEEP\tMade into gene. strand=+, frame=0')
         ]
 
         for seq, got_seq, message in tests:
diff --git a/ariba/tests/report_filter_test.py b/ariba/tests/report_filter_test.py
index 31ff036..8c70455 100644
--- a/ariba/tests/report_filter_test.py
+++ b/ariba/tests/report_filter_test.py
@@ -266,6 +266,21 @@ class TestReportFilter(unittest.TestCase):
         self.assertEqual([dict1, dict2], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1, dict2, dict3]))
 
 
+    def test_remove_all_after_first_frameshift_complete_gene(self):
+        '''Test _remove_all_after_first_frameshift when gene is complete'''
+        self.assertEqual([], report_filter.ReportFilter._remove_all_after_first_frameshift([]))
+        line1 = 'ariba_cluster1\tcluster1\t1\t0\t538\t1874\tcluster1\t1188\t1097\t92.43\tcluster1.scaffold.1\t2218\t42.42\t0\t.\tp\t.\t0\tE89G\tNONSYN\t65\t265\tA;A\t766\t766\tG;C\t88;90\t.;.\t87;90\t.\t.'
+        line2 = 'ariba_cluster1\tcluster1\t1\t0\t538\t1874\tcluster1\t1188\t1097\t92.43\tcluster1.scaffold.1\t2218\t42.42\t0\t.\tp\t.\t0\tQ37fs\tFSHIFT\t109\t109\tA\t634\t634\t.\t67\t.\t67\t.\t.'
+        line3 = 'ariba_cluster1\tcluster1\t1\t0\t538\t1874\tcluster1\t1188\t1097\t92.43\tcluster1.scaffold.1\t2218\t42.42\t0\t.\tp\t.\t0\tE89G\tNONSYN\t265\t265\tA;A\t766\t766\tG;C\t88;90\t.;.\t87;90\t.\t.'
+        dict1 = report_filter.ReportFilter._report_line_to_dict(line1)
+        dict2 = report_filter.ReportFilter._report_line_to_dict(line2)
+        dict3 = report_filter.ReportFilter._report_line_to_dict(line3)
+        self.assertEqual([dict1], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1]))
+        self.assertEqual([dict1, dict2], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1, dict2]))
+        self.assertEqual([dict2, dict3], report_filter.ReportFilter._remove_all_after_first_frameshift([dict2, dict3]))
+        self.assertEqual([dict1, dict2, dict3], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1, dict2, dict3]))
+
+
     def test_filter_dicts(self):
         '''Test _filter_dicts'''
         rf = report_filter.ReportFilter(min_ref_base_assembled=10, ignore_not_has_known_variant=True)
diff --git a/ariba/tests/report_flag_expander_test.py b/ariba/tests/report_flag_expander_test.py
new file mode 100644
index 0000000..99f6671
--- /dev/null
+++ b/ariba/tests/report_flag_expander_test.py
@@ -0,0 +1,20 @@
+import unittest
+import os
+import filecmp
+from ariba import report_flag_expander
+
+modules_dir = os.path.dirname(os.path.abspath(report_flag_expander.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestReportFlagExpander(unittest.TestCase):
+    def test_run(self):
+        '''test run'''
+        infile = os.path.join(data_dir, 'report_flag_expander.run.in.tsv')
+        expected = os.path.join(data_dir, 'report_flag_expander.run.out.tsv')
+        tmp_out = 'tmp.report_flag_expander.out.tsv'
+        expander = report_flag_expander.ReportFlagExpander(infile, tmp_out)
+        expander.run()
+        self.assertTrue(filecmp.cmp(expected, tmp_out, shallow=False))
+        os.unlink(tmp_out)
+
diff --git a/scripts/ariba b/scripts/ariba
index 9a76385..498dddd 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -29,6 +29,18 @@ subparser_aln2meta.add_argument('outprefix', help='Prefix of output filenames')
 subparser_aln2meta.set_defaults(func=ariba.tasks.aln2meta.run)
 
 
+#---------------------------- expandflag ------------------------------
+subparser_expandflag = subparsers.add_parser(
+    'expandflag',
+    help='Expands flag column of report file',
+    usage='ariba expandflag <in.report.tsv> <out.tsv',
+    description='Expands the flag column in a report file from number to comma-separated list of flag bits',
+)
+
+subparser_expandflag.add_argument('infile', help='Name of input report TSV file')
+subparser_expandflag.add_argument('outfile', help='Name of output report TSV file')
+subparser_expandflag.set_defaults(func=ariba.tasks.expandflag.run)
+
 
 #---------------------------- flag ------------------------------------
 subparser_flag = subparsers.add_parser(
diff --git a/setup.py b/setup.py
index f391e55..eda17e3 100644
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,7 @@ vcfcall_mod = Extension(
 setup(
     ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod],
     name='ariba',
-    version='2.9.4',
+    version='2.10.0',
     description='ARIBA: Antibiotic Resistance Identification By Assembly',
     packages = find_packages(),
     package_data={'ariba': ['test_run_data/*']},

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ariba.git



More information about the debian-med-commit mailing list