[med-svn] [ariba] 01/02: New upstream version 2.9.0+ds
Sascha Steinbiss
satta at debian.org
Thu Apr 27 08:15:20 UTC 2017
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository ariba.
commit 83edd461dc1e38a8b31fb423e6f2d75315a17a6e
Author: Sascha Steinbiss <sascha.steinbiss at dcso.de>
Date: Tue Apr 4 13:45:15 2017 +0200
New upstream version 2.9.0+ds
---
README.md | 10 +-
ariba/__init__.py | 1 +
ariba/assembly_compare.py | 33 ++
ariba/assembly_variants.py | 23 +-
ariba/cluster.py | 4 +-
ariba/ext/vcfcall_ariba.cpp | 3 +-
ariba/mic_plotter.py | 651 +++++++++++++++++++++
ariba/ref_genes_getter.py | 53 ++
ariba/summary.py | 37 +-
ariba/summary_cluster.py | 23 +-
ariba/summary_cluster_variant.py | 58 +-
ariba/summary_sample.py | 2 +-
ariba/tasks/__init__.py | 1 +
ariba/tasks/micplot.py | 39 ++
ariba/tasks/summary.py | 10 +-
ariba/tests/assembly_compare_test.py | 31 +
ariba/tests/assembly_variants_test.py | 17 +-
ariba/tests/data/mic_plotter_load_mic_file.tsv | 7 +
ariba/tests/data/mic_plotter_load_summary_file.tsv | 3 +
...oxplot_tsv.antibio1.exclude.no_combinations.tsv | 3 +
...mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv | 2 +
..._to_boxplot_tsv.antibio1.no.no_combinations.tsv | 4 +
.../mic_plotter_to_boxplot_tsv.antibio1.no.tsv | 3 +
...to_boxplot_tsv.antibio1.yes.no_combinations.tsv | 5 +
.../mic_plotter_to_boxplot_tsv.antibio1.yes.tsv | 3 +
...oxplot_tsv.antibio2.exclude.no_combinations.tsv | 4 +
...mic_plotter_to_boxplot_tsv.antibio2.exclude.tsv | 3 +
..._to_boxplot_tsv.antibio2.no.no_combinations.tsv | 4 +
.../mic_plotter_to_boxplot_tsv.antibio2.no.tsv | 3 +
...to_boxplot_tsv.antibio2.yes.no_combinations.tsv | 4 +
.../mic_plotter_to_boxplot_tsv.antibio2.yes.tsv | 3 +
ariba/tests/data/summary_test_load_fofn | 4 +
ariba/tests/data/summary_test_whole_run.out.csv | 6 +-
ariba/tests/mic_plotter_test.py | 283 +++++++++
ariba/tests/summary_cluster_test.py | 16 +-
ariba/tests/summary_cluster_variant_test.py | 104 ++--
ariba/tests/summary_sample_test.py | 9 +-
ariba/tests/summary_test.py | 129 ++--
scripts/ariba | 55 +-
setup.py | 3 +-
40 files changed, 1494 insertions(+), 162 deletions(-)
diff --git a/README.md b/README.md
index beb1ae1..d67ab84 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ user support is not provided at this time.
ARIBA
=====
-Antibiotic Resistance Identification By Assembly
+Antimicrobial Resistance Identification By Assembly
For how to use ARIBA, please see the [ARIBA wiki page][ARIBA wiki].
@@ -32,7 +32,8 @@ Once the dependencies are installed, install ARIBA using pip:
ARIBA also depends on several Python packages, all of which are available
via pip, so the above command will get those automatically if they
-are not installed. The packages are dendropy >= 4.2.0,
+are not installed. The packages are dendropy >= 4.2.0, matplotlib (no
+minimum version required, but only tested on 2.0.0),
pyfastaq >= 3.12.0, pysam >= 0.9.1, and pymummer >= 0.10.1.
Alternatively, you can download the latest release from this github repository,
@@ -86,8 +87,9 @@ it would try to use
$HOME/bowtie2-2.1.0/bowtie2-build
-
-### Temporary files
+
+### Temporary files
+
ARIBA can temporarily make a large number of files whilst running, which
are put in a temporary directory made by ARIBA. The total size of these
diff --git a/ariba/__init__.py b/ariba/__init__.py
index 1ca0437..ca6fc15 100644
--- a/ariba/__init__.py
+++ b/ariba/__init__.py
@@ -25,6 +25,7 @@ __all__ = [
'mapping',
'megares_data_finder',
'megares_zip_parser',
+ 'mic_plotter',
'mlst_profile',
'mlst_reporter',
'pubmlst_getter',
diff --git a/ariba/assembly_compare.py b/ariba/assembly_compare.py
index a0eb6a0..bd8215d 100644
--- a/ariba/assembly_compare.py
+++ b/ariba/assembly_compare.py
@@ -135,6 +135,39 @@ class AssemblyCompare:
return coords
+ @classmethod
+ def nucmer_hits_to_ref_and_qry_coords(cls, nucmer_hits, contig=None):
+ '''Same as nucmer_hits_to_ref_coords, except removes containing hits first,
+ and returns ref and qry coords lists'''
+ if contig is None:
+ ctg_coords = {key: [] for key in nucmer_hits.keys()}
+ else:
+ ctg_coords = {contig: []}
+
+ ref_coords = {}
+
+ for key in ctg_coords:
+ hits = copy.copy(nucmer_hits[key])
+ hits.sort(key=lambda x: len(x.ref_coords()))
+
+ if len(hits) > 1:
+ i = 0
+ while i < len(hits) - 1:
+ c1 = hits[i].ref_coords()
+ c2 = hits[i+1].ref_coords()
+ if c2.contains(c1):
+ hits.pop(i)
+ else:
+ i += 1
+
+ ref_coords[key] = [hit.ref_coords() for hit in hits]
+ ctg_coords[key] = [hit.qry_coords() for hit in hits]
+ pyfastaq.intervals.merge_overlapping_in_list(ref_coords[key])
+ pyfastaq.intervals.merge_overlapping_in_list(ctg_coords[key])
+
+ return ctg_coords, ref_coords
+
+
@staticmethod
def ref_cov_per_contig(nucmer_hits):
'''Input is hits made by self._parse_nucmer_coords_file.
diff --git a/ariba/assembly_variants.py b/ariba/assembly_variants.py
index 733334c..fa4e2e2 100644
--- a/ariba/assembly_variants.py
+++ b/ariba/assembly_variants.py
@@ -260,7 +260,7 @@ class AssemblyVariants:
return variants
- def get_variants(self, ref_sequence_name, nucmer_coords):
+ def get_variants(self, ref_sequence_name, allowed_ctg_coords, allowed_ref_coords, nucmer_matches=None):
'''Nucmr coords = dict. Key=contig name. Value = list of intervals of ref coords that match the contig.
Made by assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords
Returns dictionary. Key=contig name. Value = list of variants. Each variant
@@ -287,12 +287,27 @@ class AssemblyVariants:
known_non_wild_variants_in_ref = self.refdata.all_non_wild_type_variants(ref_sequence_name)
- for contig in nucmer_coords:
+ for contig in allowed_ctg_coords:
+ if contig not in allowed_ref_coords:
+ continue
+
used_known_variants = set()
variants[contig] = []
if contig in mummer_variants:
for mummer_variant_list in mummer_variants[contig]:
+ ref_start = min([x.ref_start for x in mummer_variant_list])
+ ref_end = max([x.ref_end for x in mummer_variant_list])
+ ctg_start = min([x.qry_start for x in mummer_variant_list])
+ ctg_end = min([x.qry_end for x in mummer_variant_list])
+ ref_interval = intervals.Interval(ref_start, ref_end)
+ ctg_interval = intervals.Interval(ctg_start, ctg_end)
+ ref_ok = True in {x.intersects(ref_interval) for x in allowed_ref_coords[contig]}
+ qry_ok = True in {x.intersects(ctg_interval) for x in allowed_ctg_coords[contig]}
+
+ if not (ref_ok and qry_ok):
+ continue
+
if seq_type == 'p':
new_variant, used_variants = self._get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mummer_variant_list)
else:
@@ -306,10 +321,10 @@ class AssemblyVariants:
# for this contig, need to know all the ref sequence and coords it maps to.
# Then report just the unused known variants, as the contig also has these variants
if seq_type == 'p':
- new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['p'], used_known_variants, nucmer_coords[contig])
+ new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['p'], used_known_variants, allowed_ref_coords[contig])
else:
- new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['n'], used_known_variants, nucmer_coords[contig])
+ new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['n'], used_known_variants, allowed_ref_coords[contig])
if is_variant_only:
new_variants = [x for x in new_variants if len(x[5]) > 0]
diff --git a/ariba/cluster.py b/ariba/cluster.py
index 4d41d60..6285513 100644
--- a/ariba/cluster.py
+++ b/ariba/cluster.py
@@ -382,9 +382,9 @@ class Cluster:
self.assembly_compare.run()
self.status_flag = self.assembly_compare.update_flag(self.status_flag)
- nucmer_hits_to_ref = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(self.assembly_compare.nucmer_hits)
+ allowed_ctg_pos, allowed_ref_pos = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(self.assembly_compare.nucmer_hits)
assembly_variants_obj = assembly_variants.AssemblyVariants(self.refdata, self.assembly_compare.nucmer_snps_file)
- self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, nucmer_hits_to_ref)
+ self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, allowed_ctg_pos, allowed_ref_pos)
for var_list in self.assembly_variants.values():
for var in var_list:
diff --git a/ariba/ext/vcfcall_ariba.cpp b/ariba/ext/vcfcall_ariba.cpp
index 3ead497..c2dba61 100644
--- a/ariba/ext/vcfcall_ariba.cpp
+++ b/ariba/ext/vcfcall_ariba.cpp
@@ -133,7 +133,8 @@ void vectorSumAndMax(const std::vector<uint32_t>& v, uint32_t& maxValue, uint32_
bool depthOk(const uint32_t& totalDepth, const uint32_t& testNum, const uint32_t& minSecondDepth, const float& maxAlleleFreq)
{
- return (testNum >= minSecondDepth && 1.0 * testNum / totalDepth <= maxAlleleFreq);
+ double depthFreq = 1.0 * testNum / totalDepth;
+ return (testNum >= minSecondDepth && depthFreq >= (1 - maxAlleleFreq) && depthFreq <= maxAlleleFreq);
}
diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py
new file mode 100644
index 0000000..ca5992f
--- /dev/null
+++ b/ariba/mic_plotter.py
@@ -0,0 +1,651 @@
+import csv
+import sys
+import random
+import re
+import os
+import itertools
+import collections
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import matplotlib.cm as cmx
+import math
+import pyfastaq
+from ariba import common, reference_data
+
+class Error (Exception): pass
+
+regex_string_to_float = re.compile(r'\s*(?P<lt_or_gt>[<>]?)\s*(?P<equals>=?)\s*(?P<number>[0-9.]+)\s*$')
+
+regex_position_from_var = re.compile(r'^[^0-9]*(?P<coord>[0-9]+)[^0-9]*$')
+
+class MicPlotter:
+ def __init__(self,
+ refdata_dir,
+ antibiotic,
+ mic_file,
+ summary_file,
+ outprefix,
+ use_hets='yes',
+ main_title=None,
+ plot_height=7,
+ plot_width=7,
+ log_y=2,
+ plot_types="points,violin",
+ jitter_width=0.1,
+ no_combinations=False,
+ hlines='',
+ point_size=4,
+ point_scale=1,
+ dot_size=100,
+ dot_outline=False,
+ dot_y_text_size=7,
+ panel_heights='9,2',
+ panel_widths='5,1',
+ colourmap='Accent',
+ number_of_colours=0,
+ colour_skip=None,
+ interrupted=False,
+ violin_width=0.75,
+ xkcd=False,
+ min_samples=1,
+ count_legend_x=-2,
+ out_format='pdf',
+ p_cutoff=0.05
+ ):
+ refdata_fa = os.path.join(refdata_dir, '02.cdhit.all.fa')
+ refdata_tsv = os.path.join(refdata_dir, '01.filter.check_metadata.tsv')
+ self.refdata = reference_data.ReferenceData([refdata_fa], [refdata_tsv])
+ self.antibiotic = antibiotic
+ self.mic_file = mic_file
+ self.summary_file = summary_file
+ self.outprefix = outprefix
+
+ allowed_use_hets = {'yes', 'no', 'exclude'}
+ if not use_hets in allowed_use_hets:
+ raise Error('Error in use_hets option. Allowed options are: ' + str(allowed_use_hets) + '. Got: ' + use_hets)
+ self.use_hets = use_hets
+
+ self.main_title = self.antibiotic if main_title is None else main_title
+ self.plot_height = plot_height
+ self.plot_width = plot_width
+ self.log_y = log_y
+ self.plot_types = set(plot_types.split(','))
+
+ allowed_plot_types = {'point', 'violin'}
+ if not self.plot_types.issubset(allowed_plot_types):
+ raise Error('Error in plot_types option. Allowed types are: ' + str(allowed_plot_types) + '. Got: ' + str(self.plot_types))
+
+ self.jitter_width = jitter_width
+ self.no_combinations = no_combinations
+
+ try:
+ if len(hlines) == 0:
+ self.hlines = []
+ else:
+ self.hlines = [float(x) for x in hlines.split(',')]
+ except:
+ raise Error('Error in hlines option. Needs to be a list of numbers separated by commas, or empty. Got this:\n' + hlines)
+
+ self.point_size = point_size
+ self.point_scale = point_scale
+ self.dot_size = dot_size
+ self.dot_outline = dot_outline
+ self.dot_y_text_size = dot_y_text_size
+
+ try:
+ self.panel_heights = [int(x) for x in panel_heights.split(',')]
+ except:
+ raise Error('Error in panel_heights option. Needs to be of the form integer1,integer2. Got this:\n' + panel_heights)
+
+ try:
+ self.panel_widths = [int(x) for x in panel_widths.split(',')]
+ except:
+ raise Error('Error in panel_widths option. Needs to be of the form integer1,integer2. Got this:\n' + panel_widths)
+
+ self.colourmap = colourmap
+ self.number_of_colours = number_of_colours
+
+ if colour_skip is None:
+ self.colour_skip = None
+ else:
+ try:
+ self.colour_skip = [float(x) for x in colour_skip.split(',')]
+ except:
+ raise Error('Error in colour_skip option. Needs to be of the form a,b where 0 <= a < b <= 1. Got this:\n' + colour_skip)
+
+ self.interrupted = interrupted
+ self.violin_width = violin_width
+ if xkcd:
+ plt.xkcd()
+ self.min_samples = min_samples
+ self.count_legend_x = count_legend_x
+ self.out_format = out_format
+ self.p_cutoff = p_cutoff
+
+
+ @classmethod
+ def _mic_string_to_float(cls, s):
+ regex_match = regex_string_to_float.match(s)
+
+ if regex_match is None or regex_match.group('number') == '.':
+ if s.strip() in {'NA', 'na', '', '.'}:
+ return 'NA'
+ else:
+ return None
+
+ try:
+ flt = float(regex_match.group('number'))
+ except:
+ return None
+
+ if regex_match.group('equals') == '':
+ if regex_match.group('lt_or_gt') == '<':
+ return 0.5 * flt
+ elif regex_match.group('lt_or_gt') == '>':
+ return 2 * flt
+
+ return flt
+
+
+ @classmethod
+ def _load_mic_file(cls, infile):
+ mic_data = {}
+
+ with open(infile) as f:
+ reader = csv.DictReader(f, delimiter='\t')
+ if reader.fieldnames[0] != 'Sample':
+ raise Error('Error. Expected first column of MIC file "' + infile + '" to be "Sample"')
+
+ for row in reader:
+ mic_data[row['Sample']] = {x: MicPlotter._mic_string_to_float(row[x]) for x in reader.fieldnames[1:]}
+
+ return mic_data
+
+
+ @classmethod
+ def _load_summary_file(cls, infile):
+ data = {}
+
+ with open(infile) as f:
+ reader = csv.DictReader(f, delimiter=',')
+
+ if reader.fieldnames[0] != 'name':
+ raise Error('Error. Expected first column of summary file "' + infile + '" to be "name"')
+
+ clusters = [x.split('.', maxsplit=1)[0] for x in reader.fieldnames[1:]]
+
+ for row in reader:
+ data[row['name']] = {}
+
+ for field in row:
+ if field == 'name':
+ continue
+
+ cluster, col = field.split('.', maxsplit=1)
+ if cluster not in clusters:
+ raise Error('Cluster "' + cluster + '" not recognised. Cannot continue')
+ if cluster not in data[row['name']]:
+ data[row['name']][cluster] = {}
+
+ try:
+ value = float(row[field])
+ except:
+ value = row[field]
+ data[row['name']][cluster][col] = value
+
+ return data
+
+
+ @classmethod
+ def _get_colours(cls, total_length, number_of_colours, colormap, skip=None):
+ if number_of_colours == 1:
+ return ["black"] * total_length
+ elif number_of_colours == 0:
+ cmap = cmx.get_cmap(colormap)
+ if skip is None:
+ vals = [1.0 * x / (total_length - 1) for x in range(total_length)]
+ else:
+ assert len(skip) == 2 and 0 <= skip[0] <= 1 and 0 <= skip[1] <= 1
+ if skip[-1] == 1:
+ vals = [skip[0] * x / (total_length - 1) for x in range(total_length)]
+ elif skip[0] == 0:
+ vals = [skip[1] + (1 - skip[1]) * x / (total_length - 1) for x in range(total_length)]
+ else:
+ length = 1 - (skip[1] - skip[0])
+ vals = [(length) * x / (total_length - 1) for x in range(total_length)]
+ vals = [x if x < skip[0] else x + (1-length) for x in vals]
+
+ return [cmap(x) for x in vals]
+ else:
+ cmap = cmx.get_cmap(colormap)
+ colours = []
+ for i in itertools.cycle(range(number_of_colours)):
+ colours.append(cmap(i))
+ if len(colours) >= total_length:
+ break
+ return colours
+
+
+ @classmethod
+ def _get_top_plot_data(cls, summary_data, mic_data, antibiotic, use_hets, refdata=None, no_combinations=False, interrupted=False, outfile=None):
+ assert use_hets in {'yes', 'no', 'exclude'}
+ if outfile is not None:
+ f = pyfastaq.utils.open_file_write(outfile)
+ print('Sample\tMIC\tMutations', file=f)
+
+ ignore_columns = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var', 'MULTIPLE'}
+ all_mutations = set()
+ all_mutations_seen_combinations = set()
+ top_plot_data = {} # cluster combination -> list of y coords (MIC values)
+
+ for sample in sorted(summary_data):
+ if sample not in mic_data:
+ raise Error('No MIC data found for sample "' + sample + '". Cannot continue')
+ if antibiotic not in mic_data[sample]:
+ raise Error('Antibiotic "' + antibiotic + '" not found. Cannot continue')
+
+ if mic_data[sample][antibiotic] == 'NA':
+ continue
+
+ mutations = set()
+ found_het_and_exclude = False
+
+ for cluster in summary_data[sample]:
+ if 'assembled' in summary_data[sample][cluster] and summary_data[sample][cluster]['assembled'] == 'interrupted' and interrupted:
+ mutations.add(cluster + '.interrupted')
+
+ if refdata is not None and 'match' in summary_data[sample][cluster] and summary_data[sample][cluster]['match'] == 'yes' and 'ref_seq' in summary_data[sample][cluster]:
+ ref_type, variant_only = refdata.sequence_type(summary_data[sample][cluster]['ref_seq'])
+ if not variant_only:
+ mutations.add(cluster + '.present')
+
+ for column, value in summary_data[sample][cluster].items():
+ if column in ignore_columns or column.endswith('.%'):
+ continue
+
+ if value == 'yes' or (use_hets == 'yes' and value == 'het'):
+ mutations.add(cluster + '.' + column.strip())
+ elif use_hets == 'exclude' and value == 'het':
+ found_het_and_exclude = True
+ break
+
+ if found_het_and_exclude:
+ break
+
+ if found_het_and_exclude:
+ continue
+
+
+ if len(mutations) == 0:
+ mutations.add('without_mutation')
+
+ all_mutations.update(mutations)
+ mutations = list(mutations)
+ mutations.sort()
+ if no_combinations:
+ for mutation in mutations:
+ all_mutations_seen_combinations.add((mutation,))
+ if mutation not in top_plot_data:
+ top_plot_data[mutation] = []
+ top_plot_data[mutation].append(mic_data[sample][antibiotic])
+ if outfile is not None:
+ print(sample, mic_data[sample][antibiotic], mutation, sep='\t', file=f)
+ else:
+ all_mutations_seen_combinations.add(tuple(mutations))
+ mutations = '.'.join(mutations)
+ if mutations not in top_plot_data:
+ top_plot_data[mutations] = []
+ top_plot_data[mutations].append(mic_data[sample][antibiotic])
+ if outfile is not None:
+ print(sample, mic_data[sample][antibiotic], mutations, sep='\t', file=f)
+
+
+ if outfile is not None:
+ pyfastaq.utils.close(f)
+
+ return top_plot_data, all_mutations, all_mutations_seen_combinations
+
+
+ @classmethod
+ def _filter_top_plot_data(cls, top_plot_data, all_mutations, seen_combinations, min_samples):
+ if min_samples == 1:
+ return top_plot_data, all_mutations, seen_combinations
+
+ new_top_plot_data = {}
+ new_all_mutations = set()
+ new_seen_combinations = set()
+
+ for mutation_tuple in seen_combinations:
+ mutation_string = '.'.join(mutation_tuple)
+ mics = top_plot_data[mutation_string]
+
+ if len(mics) >= min_samples:
+ new_top_plot_data[mutation_string] = mics
+ new_seen_combinations.add(mutation_tuple)
+ new_all_mutations.update(mutation_tuple)
+
+ return new_top_plot_data, new_all_mutations, new_seen_combinations
+
+
+ @classmethod
+ def _top_plot_y_ticks(cls, mic_data, antibiotic, log_y):
+ mic_values = set()
+ for sample in mic_data:
+ mic = mic_data[sample][antibiotic]
+ if mic not in [None, 'NA']:
+ mic_values.add(mic)
+
+ max_mic = max(mic_values)
+ min_mic = min(mic_values)
+ new_mic_values = []
+ i = 1
+ while i < max_mic * 2:
+ new_mic_values.append(i)
+ i *= 2
+
+ i = 0.5
+ while i > min_mic / 2:
+ new_mic_values.append(i)
+ i *= 0.5
+
+ new_mic_values.sort()
+ new_mic_values = [round(x, 4) for x in new_mic_values]
+
+ if log_y > 0:
+ tick_positions = [math.log(x, log_y) for x in new_mic_values]
+ else:
+ tick_positions = new_mic_values
+
+ return tick_positions, new_mic_values
+
+
+ @classmethod
+ def _top_plot_scatter_counts(cls, mutations, top_plot_data, colours, log_y):
+ x_coords = []
+ y_coords = []
+ sizes = []
+ colour_list = []
+
+ for i, mutation in enumerate(mutations):
+ counts = collections.Counter(top_plot_data[mutation])
+ for mic in sorted(counts):
+ x_coords.append(i + 1)
+ if log_y > 0:
+ y_coords.append(math.log(mic, log_y))
+ else:
+ y_coords.append(mic)
+ sizes.append(counts[mic])
+ colour_list.append(colours[i])
+
+ return x_coords, y_coords, sizes, colour_list
+
+
+ @classmethod
+ def _top_plot_scatter_data(cls, mutations, top_plot_data, colours, log_y, x_jitter):
+ x_coords = []
+ y_coords = []
+ colour_list = []
+
+ for i, mutation in enumerate(mutations):
+ for mic in top_plot_data[mutation]:
+ if len(top_plot_data[mutation]) > 1:
+ x_coords.append(i + 1 + random.uniform(-x_jitter, x_jitter))
+ else:
+ x_coords.append(i + 1)
+
+ if log_y > 0:
+ y_coords.append(math.log(mic, log_y))
+ else:
+ y_coords.append(mic)
+ colour_list.append(colours[i])
+
+ return x_coords, y_coords, colour_list
+
+
+ @classmethod
+ def _top_plot_violin_data(cls, mutations, top_plot_data, log_y):
+ violin_data = []
+ violin_pos = []
+
+ for i, mutation in enumerate(mutations):
+ if log_y > 0:
+ violin_data.append([math.log(x, log_y) for x in top_plot_data[mutation]])
+ else:
+ violin_data.append(top_plot_data[mutation])
+ violin_pos.append(i + 1)
+
+ return violin_data, violin_pos
+
+
+ @classmethod
+ def _ordered_bottom_plot_rows(cls, mutations):
+ l = []
+ infinity = float('inf')
+
+ for x in mutations:
+ try:
+ cluster, variant = x.split('.', maxsplit=1)
+ except:
+ l.append((x, infinity, x))
+ continue
+
+ if '.' in variant:
+ try:
+ var_group, var = variant.split('.', maxsplit=1)
+ except:
+ var_group = None
+ var = variant
+
+ variant = var
+
+ regex_match = regex_position_from_var.match(variant)
+ if regex_match is not None and regex_match.group('coord') != '':
+ coord = int(regex_match.group('coord'))
+ else:
+ coord = infinity
+
+ l.append((cluster, coord, x))
+
+ l.sort()
+ return [x[-1] for x in l]
+
+
+ @classmethod
+ def _ordered_columns(cls, mutations, top_plot_data):
+ # FIXME
+ return sorted(list(mutations))
+
+
+ @classmethod
+ def _bottom_scatter_data(cls, bottom_plot_rows, columns, colours, outline=False):
+ x_coords = []
+ y_coords = []
+ colour_list = []
+
+ for i, row in enumerate(bottom_plot_rows):
+ for j, col in enumerate(columns):
+ if row in col:
+ x_coords.append(j + 1)
+ y_coords.append(len(bottom_plot_rows) - i)
+ colour_list.append(colours[j])
+ elif outline:
+ x_coords.append(j + 1)
+ y_coords.append(len(bottom_plot_rows) - i)
+ colour_list.append("white")
+
+ return x_coords, y_coords, colour_list
+
+
+ @classmethod
+ def _right_plot_data(cls, scatter_count_sizes, x_pos):
+ y_max = max(scatter_count_sizes)
+ if y_max > 100:
+ y_max = int(math.ceil(y_max / 100.0)) * 100
+ sizes = [5, 50]
+ else:
+ y_max = int(math.ceil(y_max / 10.0)) * 10
+ sizes = [5, 10]
+
+ while sizes[-1] < y_max:
+ sizes.append(sizes[-1]*2)
+
+ x_coords = [x_pos] * len(sizes)
+ y_coords = [x + 1 for x in range(len(sizes))]
+ y_coords.reverse()
+ return x_coords, y_coords, sizes
+
+
+ @classmethod
+ def _pairwise_compare(cls, violin_data, columns, outfile, p_cutoff, compare_test):
+ try:
+ import scipy.stats
+ except:
+ print('WARNING: skipping Mann Whitney tests because scipy.stats not found', file=sys.stderr)
+ return
+
+ output = []
+
+ for i, list1 in enumerate(violin_data):
+ for j, list2 in enumerate(violin_data):
+ if j <= i or len(list1) < 2 or len(list2) < 2:
+ continue
+
+ list1set = set(list1)
+
+ if len(list1set) == 1 and list1set == set(list2):
+ statistic = 'NA'
+ pvalue = 1
+ else:
+ if compare_test == 'mannwhitneyu':
+ statistic, pvalue = scipy.stats.mannwhitneyu(list1, list2, alternative='two-sided')
+ elif compare_test == 'ks_2samp':
+ statistic, pvalue = scipy.stats.ks_2samp(list1, list2)
+ else:
+ raise Error('Test "' + compare_test + '" not recognised. Cannot continue')
+
+ effect_size = abs(scipy.stats.norm.ppf(pvalue) / math.sqrt(len(list1) + len(list2)))
+ significant = 'yes' if pvalue < p_cutoff else 'no'
+ output.append((columns[i], columns[j], len(list1), len(list2), pvalue, significant, effect_size))
+
+ output.sort(key=lambda x: x[4])
+
+ with open(outfile, 'w') as f:
+ print('Combination1', 'Combination2', 'Size1', 'Size2', 'p-value', 'significant', 'effect_size', 'corrected_p-value', 'corrected_significant', 'corrected_effect_size', sep='\t', file=f)
+ for x in output:
+ corrected_p = min(1, len(output) * x[4])
+ corrected_significant = 'yes' if corrected_p < p_cutoff else 'no'
+ corrected_effect_size = scipy.stats.norm.ppf(corrected_p) / math.sqrt(x[2] + x[3])
+ print('\t'.join([str(z) for z in x]), corrected_p, corrected_significant, corrected_effect_size, sep='\t', file=f)
+
+
+ def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations):
+ bottom_plot_rows = MicPlotter._ordered_bottom_plot_rows(all_mutations)
+ columns = MicPlotter._ordered_columns(mut_combinations, top_plot_data)
+ colours = MicPlotter._get_colours(len(columns), self.number_of_colours, self.colourmap, self.colour_skip)
+ bottom_scatter_x, bottom_scatter_y, bottom_colours = MicPlotter._bottom_scatter_data(bottom_plot_rows, columns, colours, outline=self.dot_outline)
+ columns = ['.'.join(x) for x in columns]
+ assert len(colours) == len(columns)
+ max_x = len(colours) + 1
+
+ scatter_count_x, scatter_count_y, scatter_count_sizes, scatter_count_colours = MicPlotter._top_plot_scatter_counts(columns, top_plot_data, colours, self.log_y)
+ scatter_data_x, scatter_data_y, scatter_data_colours = MicPlotter._top_plot_scatter_data(columns, top_plot_data, colours, self.log_y, self.jitter_width)
+ violin_data, violin_positions = MicPlotter._top_plot_violin_data(columns, top_plot_data, self.log_y)
+ MicPlotter._pairwise_compare(violin_data, columns, self.outprefix + '.mannwhitney.tsv', self.p_cutoff, 'mannwhitneyu')
+ MicPlotter._pairwise_compare(violin_data, columns, self.outprefix + '.ks_2sample.tsv', self.p_cutoff, 'ks_2samp')
+
+ # -------------------- SET UP GRID & PLOTS -----------------
+ fig=plt.figure(figsize=(self.plot_width, self.plot_height))
+ if 'point' not in self.plot_types:
+ self.point_size = 42
+
+ if self.point_size == 0:
+ gs = gridspec.GridSpec(2, 2, height_ratios=self.panel_heights, width_ratios=self.panel_widths)
+ else:
+ gs = gridspec.GridSpec(2, 1, height_ratios=self.panel_heights)
+
+ plots=[]
+ plots.append(plt.subplot(gs[0]))
+ plots.append(plt.subplot(gs[1]))
+ if self.point_size == 0:
+ plots.append(plt.subplot(gs[2]))
+ bottom_plot_index = 2
+ else:
+ bottom_plot_index = 1
+
+ # ------------------------- TOP PLOT -----------------------
+ for h in self.hlines:
+ if self.log_y > 0:
+ h = math.log(h, self.log_y)
+ plots[0].hlines(h, 0, max_x, linestyle='--', linewidth=1, color='black')
+
+
+ if 'violin' in self.plot_types:
+ violins = plots[0].violinplot(violin_data, violin_positions, widths=self.violin_width, showmeans=False, showextrema=False, showmedians=False)
+ for x, pc in enumerate(violins['bodies']):
+ pc.set_facecolor(colours[x])
+ pc.set_edgecolor(colours[x])
+
+ scaled_count_sizes = [self.point_scale * x for x in scatter_count_sizes]
+
+ if 'point' in self.plot_types:
+ if self.point_size == 0:
+ plots[0].scatter(scatter_count_x, scatter_count_y, s=scaled_count_sizes, c=scatter_count_colours, linewidth=0)
+ else:
+ plots[0].scatter(scatter_data_x, scatter_data_y, c=scatter_data_colours, s=self.point_size)
+
+ if self.log_y > 0:
+ miny = min(scatter_count_y) - 0.5
+ maxy = max(scatter_count_y) + 0.5
+ else:
+ miny = 0
+ maxy = 1.05 * max(scatter_count_y)
+
+ plots[0].axis([0,max(bottom_scatter_x) + 1, miny, maxy])
+
+ y_tick_positions, y_tick_labels = MicPlotter._top_plot_y_ticks(mic_data, self.antibiotic, self.log_y)
+ plots[0].yaxis.set_ticks(y_tick_positions)
+ plots[0].set_yticklabels(y_tick_labels)
+ ylabel = r'$\log_' + str(int(self.log_y)) + '$(MIC) $\mu$g/mL' if self.log_y > 0 else r'MIC $\mu$g/mL'
+ plots[0].set_ylabel(ylabel)
+ plots[0].set_xticklabels([])
+ plots[0].set_title(self.main_title, fontsize=18)
+
+ # ------------------------- BOTTOM PLOT -----------------------
+ edgecolor = "black" if self.dot_outline else bottom_colours
+ plots[bottom_plot_index].axis([0,max(bottom_scatter_x) + 1,0,max(bottom_scatter_y) + 1])
+ plots[bottom_plot_index].scatter(bottom_scatter_x, bottom_scatter_y, marker='o', s=self.dot_size, c=bottom_colours, edgecolor=edgecolor, lw=1)
+ plots[bottom_plot_index].spines["top"].set_visible(False)
+ plots[bottom_plot_index].spines["right"].set_visible(False)
+ plots[bottom_plot_index].spines["bottom"].set_visible(False)
+ plots[bottom_plot_index].spines["left"].set_visible(False)
+ plots[bottom_plot_index].yaxis.set_tick_params(length=0)
+ plots[bottom_plot_index].xaxis.set_ticks([])
+ plots[bottom_plot_index].set_xticklabels([])
+ plots[bottom_plot_index].yaxis.set_ticks([(i+1) for i in range(len(bottom_plot_rows))])
+ plots[bottom_plot_index].set_yticklabels(bottom_plot_rows[::-1], fontsize=self.dot_y_text_size)
+
+ # ------------------------- RIGHT PLOT -------------------------
+ if self.point_size == 0:
+ right_x_coord = 0.75
+ right_x, right_y, right_sizes = MicPlotter._right_plot_data(scatter_count_sizes, right_x_coord)
+ right_scaled_sizes = [self.point_scale * x for x in right_sizes]
+ plots[1].scatter(right_x, right_y, s=right_scaled_sizes, c="black")
+ plots[1].axis('off')
+ plots[1].axis([0,4,-2*len(right_y),len(right_y)+1])
+ for i, y in enumerate(right_y):
+ plots[1].annotate(right_sizes[i], [right_x_coord + 0.75, y-0.2])
+ plots[1].annotate("Counts", [right_x_coord - 0.1, len(right_y) + 0.5])
+
+ plt.tight_layout(w_pad=self.count_legend_x)
+ plt.savefig(self.outprefix + '.' + self.out_format)
+
+
+
+
+ def run(self):
+ mic_data = MicPlotter._load_mic_file(self.mic_file)
+ summary_data = MicPlotter._load_summary_file(self.summary_file)
+ boxplot_tsv = self.outprefix + '.data.tsv'
+ top_plot_data, all_mutations, combinations = MicPlotter._get_top_plot_data(summary_data, mic_data, self.antibiotic, self.use_hets, refdata=self.refdata, no_combinations=self.no_combinations, interrupted=self.interrupted, outfile=boxplot_tsv)
+ top_plot_data, all_mutations, combinations = MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, combinations, self.min_samples)
+ self._make_plot(mic_data, top_plot_data, all_mutations, combinations)
diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py
index a6d82e9..82dc57f 100644
--- a/ariba/ref_genes_getter.py
+++ b/ariba/ref_genes_getter.py
@@ -19,6 +19,7 @@ allowed_ref_dbs = {
'srst2_argannot',
'vfdb_core',
'vfdb_full',
+ 'virulencefinder',
}
argannot_ref = '"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n'
@@ -425,6 +426,58 @@ class RefGenesGetter:
print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
+ def _get_from_virulencefinder(self, outprefix):
+ outprefix = os.path.abspath(outprefix)
+ final_fasta = outprefix + '.fa'
+ final_tsv = outprefix + '.tsv'
+ tmpdir = outprefix + '.tmp.download'
+ current_dir = os.getcwd()
+
+ try:
+ os.mkdir(tmpdir)
+ os.chdir(tmpdir)
+ except:
+ raise Error('Error mkdir/chdir ' + tmpdir)
+
+ zipfile = 'plasmidfinder.zip'
+ cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
+ print('Downloading data with:', cmd, sep='\n')
+ common.syscall(cmd)
+ common.syscall('unzip ' + zipfile)
+
+ print('Combining downloaded fasta files...')
+ fout_fa = pyfastaq.utils.open_file_write(final_fasta)
+ fout_tsv = pyfastaq.utils.open_file_write(final_tsv)
+ name_count = {}
+
+ for filename in os.listdir(tmpdir):
+ if filename.endswith('.fsa'):
+ print(' ', filename)
+ file_reader = pyfastaq.sequences.file_reader(os.path.join(tmpdir, filename))
+ for seq in file_reader:
+ original_id = seq.id
+ seq.id = seq.id.replace('_', '.', 1)
+ seq.id = seq.id.replace(' ', '_')
+ if seq.id in name_count:
+ name_count[seq.id] += 1
+ seq.id = seq.id + '.' + str(name_count[seq.id])
+ else:
+ name_count[seq.id] = 1
+ print(seq, file=fout_fa)
+ print(seq.id, '0', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=fout_tsv)
+
+ pyfastaq.utils.close(fout_fa)
+ pyfastaq.utils.close(fout_tsv)
+ print('\nFinished combining files\n')
+ os.chdir(current_dir)
+ if not self.debug:
+ shutil.rmtree(tmpdir)
+ print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
+ print('You can use them with ARIBA like this:')
+ print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
+ print('If you use this downloaded data, please cite:')
+ print('"Real-time whole-genome sequencing for routine typing, surveillance, and outbreak detection of verotoxigenic Escherichia coli", Joensen al 2014, PMID: 24574290\n')
+
def run(self, outprefix):
exec('self._get_from_' + self.ref_db + '(outprefix)')
diff --git a/ariba/summary.py b/ariba/summary.py
index de9f06f..b98f35e 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -18,7 +18,7 @@ class Summary:
filter_rows=True,
filter_columns=True,
min_id=90.0,
- cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var',
+ cluster_cols='assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
make_phandango_tree=True,
only_clusters=None,
show_var_groups=False,
@@ -30,12 +30,12 @@ class Summary:
raise Error('Error! Must supply filenames or fofn to Summary(). Cannot continue')
if filenames is None:
- self.filenames = []
+ self.filenames = {}
else:
- self.filenames = filenames
+ self.filenames = {x: None for x in filenames}
if fofn is not None:
- self.filenames.extend(self._load_fofn(fofn))
+ self.filenames.update(self._load_fofn(fofn))
self.cluster_columns = self._determine_cluster_cols(cluster_cols)
self.filter_rows = filter_rows
@@ -62,13 +62,25 @@ class Summary:
@staticmethod
def _determine_cluster_cols(cols_string):
- allowed_cols = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'}
+ allowed_cols = {'assembled', 'match', 'ref_seq', 'pct_id', 'ctg_cov', 'known_var', 'novel_var'}
return Summary._determine_cols(cols_string, allowed_cols, 'cluster columns')
- def _load_fofn(self, fofn):
+ @classmethod
+ def _load_fofn(cls, fofn):
+ '''Returns dictionary of filename -> short name. Value is None
+ whenever short name is not provided'''
+ filenames = {}
f = pyfastaq.utils.open_file_read(fofn)
- filenames = [x.rstrip() for x in f.readlines()]
+ for line in f:
+ fields = line.rstrip().split()
+ if len(fields) == 1:
+ filenames[fields[0]] = None
+ elif len(fields) == 2:
+ filenames[fields[0]] = fields[1]
+ else:
+ raise Error('Error at the following line of file ' + fofn + '. Expected at most 2 fields.\n' + line)
+
pyfastaq.utils.close(f)
return filenames
@@ -110,6 +122,7 @@ class Summary:
'match': 'no',
'novel_var': 'NA',
'pct_id': 'NA',
+ 'ctg_cov': 'NA',
'ref_seq': 'NA'
}
else:
@@ -152,15 +165,15 @@ class Summary:
matrix = []
making_header_lines = True
phandango_header = ['name']
- phandango_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'}
+ phandango_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'ctg_cov': ':c3', 'known_var': ':o1', 'novel_var': ':o1'}
ref_seq_counter = 2
csv_header = ['name']
- summary_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']
- summary_cols_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'])
+ summary_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'ctg_cov', 'known_var', 'novel_var']
+ summary_cols_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'ctg_cov', 'known_var', 'novel_var'])
summary_cols_in_order = [x for x in summary_cols_in_order if cluster_cols[x]]
- for filename in filenames:
- line = [filename]
+ for filename in sorted(filenames):
+ line = [filename if filenames[filename] is None else filenames[filename]]
for cluster_name in sorted(all_potential_columns):
group_cols = sorted(list(all_potential_columns[cluster_name]['groups']))
diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py
index f3f952c..2676852 100644
--- a/ariba/summary_cluster.py
+++ b/ariba/summary_cluster.py
@@ -1,3 +1,4 @@
+import sys
from ariba import flag, report, summary_cluster_variant
class Error (Exception): pass
@@ -14,7 +15,8 @@ int_columns = [
]
-float_columns = ['pc_ident']
+float_columns = ['pc_ident', 'ctg_cov']
+
class SummaryCluster:
def __init__(self, min_pc_id=90):
@@ -30,10 +32,15 @@ class SummaryCluster:
@classmethod
- def line2dict(cls, line):
+ def line2dict(cls, line, filename=None):
data = line.rstrip().split('\t')
if len(data) != len(report.columns):
- raise Error('Wrong number of columns in the following line. Expected ' + str(len(report.columns)) + ' but got ' + str(len(data)) + '\n' + line)
+ if filename is not None:
+ filename_message = 'Error reading ariba summary file "' + filename + '". '
+ else:
+ filename_message = ''
+ raise Error(filename_message + 'Wrong number of columns in the following line. Expected ' + str(len(report.columns)) + ' but got ' + str(len(data)) + '\n' + line)
+
d = {report.columns[i]: data[i] for i in range(len(data))}
try:
d['flag'] = flag.Flag(int(d['flag']) )
@@ -84,16 +91,18 @@ class SummaryCluster:
self.data.append(data_dict)
- def pc_id_of_longest(self):
+ def _pc_id_and_read_depth_of_longest(self):
longest = 0
identity = 0
+ depth = 0
for d in self.data:
if d['ref_base_assembled'] > longest:
longest = d['ref_base_assembled']
identity = d['pc_ident']
+ depth = d['ctg_cov']
- return identity
+ return identity, depth
def _has_any_part_of_ref_assembled(self):
@@ -310,12 +319,14 @@ class SummaryCluster:
def column_summary_data(self):
'''Returns a dictionary of column name -> value, for cluster-level results'''
assembled_summary = self._to_cluster_summary_assembled()
+ pct_id, read_depth = self._pc_id_and_read_depth_of_longest()
columns = {
'assembled': self._to_cluster_summary_assembled(),
'match': self._has_match(assembled_summary),
'ref_seq': self.ref_name,
- 'pct_id': str(self.pc_id_of_longest()),
+ 'pct_id': str(pct_id),
+ 'ctg_cov': str(read_depth),
'known_var': self._to_cluster_summary_has_known_nonsynonymous(assembled_summary),
'novel_var': self._to_cluster_summary_has_novel_nonsynonymous(assembled_summary)
}
diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py
index 62c231a..709a2b6 100644
--- a/ariba/summary_cluster_variant.py
+++ b/ariba/summary_cluster_variant.py
@@ -1,3 +1,5 @@
+import re
+
class Error (Exception): pass
class SummaryClusterVariant:
@@ -32,21 +34,34 @@ class SummaryClusterVariant:
@classmethod
- def _depths_look_het(cls, depths):
- sorted_depths = sorted(depths)
- min_var_read_depth = 5
- min_second_var_read_depth = 2
- max_allele_freq = 0.90
- total_depth = sum(depths)
- second_depth_ok = len(depths) == 1 or (len(depths) > 1 and sorted_depths[-2] >= min_second_var_read_depth)
- max_depth_ok = total_depth >= min_var_read_depth and sorted_depths[-1] / total_depth <= max_allele_freq
- return depths[0] < sorted_depths[-1] or (second_depth_ok and max_depth_ok)
+ def _filter_depths(cls, ref_base, depths):
+ if ref_base not in depths:
+ return {}
+
+ min_freq = 0.1
+ max_freq = 0.9
+ new_depths = {}
+ total_depth = sum(depths.values())
+ ref_depth = depths[ref_base]
+ return {x: depths[x] for x in depths if depths[x] >= ref_depth or min_freq <= depths[x] / total_depth <= max_freq}
+
+
+ #@classmethod
+ #def _depths_look_het(cls, depths):
+ # sorted_depths = sorted(depths)
+ # min_var_read_depth = 5
+ # min_second_var_read_depth = 2
+ # max_allele_freq = 0.90
+ # total_depth = sum(depths)
+ # second_depth_ok = len(depths) == 1 or (len(depths) > 1 and sorted_depths[-2] >= min_second_var_read_depth)
+ # max_depth_ok = total_depth >= min_var_read_depth and sorted_depths[-1] / total_depth <= max_allele_freq
+ # return depths[0] < sorted_depths[-1] or (second_depth_ok and max_depth_ok)
@classmethod
def _get_is_het_and_percent(cls, data_dict):
if data_dict['gene'] == '1' or not (data_dict['known_var'] == '1' or data_dict['ref_ctg_effect'] == 'SNP' or data_dict['var_type'] == 'HET') or data_dict['smtls_nts'] == '.' or ';' in data_dict['smtls_nts'] or data_dict['smtls_nts_depth'] == 'ND':
- return False, None
+ return False, None, None
else:
nucleotides = data_dict['smtls_nts'].split(',')
depths = data_dict['smtls_nts_depth'].split(',')
@@ -72,21 +87,27 @@ class SummaryClusterVariant:
elif data_dict['known_var_change'] != '.':
var_nucleotide = data_dict['known_var_change'][-1]
else:
- return False, None
+ return False, None, None
if var_nucleotide == '.':
- return False, None
+ return False, None, None
total_depth = sum(depths)
if max([len(x) for x in nucleotides]) == 1:
var_depth = nuc_to_depth.get(var_nucleotide, 0)
else:
var_depth = sum([nuc_to_depth[x] for x in nuc_to_depth if x[0] == var_nucleotide])
- is_het = SummaryClusterVariant._depths_look_het(depths)
+ filtered_depths = SummaryClusterVariant._filter_depths(nucleotides[0], nuc_to_depth)
+
+ if len(filtered_depths) > 0:
+ bases = ''.join(sorted(list(filtered_depths.keys())))
+ return len(filtered_depths) > 1, round(100 * var_depth / total_depth, 1), bases
+ else:
+ return False, None, None
return is_het, round(100 * var_depth / total_depth, 1)
except:
- return False, None
+ return False, None, None
def _get_nonsynon_variant_data(self, data_dict):
@@ -101,10 +122,13 @@ class SummaryClusterVariant:
else:
self.var_string = data_dict['ref_ctg_effect']
- self.is_het, self.het_percent = SummaryClusterVariant._get_is_het_and_percent(data_dict)
+ self.is_het, self.het_percent, var_bases = SummaryClusterVariant._get_is_het_and_percent(data_dict)
+ if var_bases is not None:
+ self.var_string = re.sub(r'[^0-9]', '', self.var_string) + var_bases
+
+ self.has_nonsynon = SummaryClusterVariant._has_nonsynonymous(data_dict) and not (data_dict['var_type'] == 'HET' and not self.is_het)
- if not SummaryClusterVariant._has_nonsynonymous(data_dict):
- self.has_nonsynon = False
+ if not self.has_nonsynon:
return
self.has_nonsynon = True
diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py
index bc1ea25..1110728 100644
--- a/ariba/summary_sample.py
+++ b/ariba/summary_sample.py
@@ -27,7 +27,7 @@ class SummarySample:
raise Error('Error parsing the following line.\n' + line)
continue
- data_dict = summary_cluster.SummaryCluster.line2dict(line)
+ data_dict = summary_cluster.SummaryCluster.line2dict(line, filename=filename)
cluster = data_dict['cluster']
if only_clusters is not None and cluster not in only_clusters:
continue
diff --git a/ariba/tasks/__init__.py b/ariba/tasks/__init__.py
index 769af32..299f518 100644
--- a/ariba/tasks/__init__.py
+++ b/ariba/tasks/__init__.py
@@ -2,6 +2,7 @@ __all__ = [
'aln2meta',
'flag',
'getref',
+ 'micplot',
'prepareref',
'pubmlstget',
'pubmlstspecies',
diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py
new file mode 100644
index 0000000..28ad144
--- /dev/null
+++ b/ariba/tasks/micplot.py
@@ -0,0 +1,39 @@
+import argparse
+import ariba
+
+def run(options):
+ plotter = ariba.mic_plotter.MicPlotter(
+ options.prepareref_dir,
+ options.antibiotic,
+ options.mic_file,
+ options.summary_file,
+ options.outprefix,
+ use_hets=options.use_hets,
+ main_title=options.main_title,
+ plot_height=options.plot_height,
+ plot_width=options.plot_width,
+ log_y=options.log_y,
+ plot_types=options.plot_types,
+ jitter_width=options.jitter_width,
+ no_combinations=options.no_combinations,
+ hlines=options.hlines,
+ point_size=options.point_size,
+ point_scale=options.point_scale,
+ dot_size=options.dot_size,
+ dot_outline=options.dot_outline,
+ dot_y_text_size=options.dot_y_text_size,
+ panel_heights=options.panel_heights,
+ panel_widths=options.panel_widths,
+ colourmap=options.colourmap,
+ number_of_colours=options.number_of_colours,
+ colour_skip=options.colour_skip,
+ interrupted=options.interrupted,
+ violin_width=options.violin_width,
+ xkcd=options.xkcd,
+ min_samples=options.min_samples,
+ count_legend_x=options.count_legend_x,
+ out_format=options.out_format,
+ p_cutoff=options.p_cutoff
+ )
+
+ plotter.run()
diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
index c91e982..821d7f4 100644
--- a/ariba/tasks/summary.py
+++ b/ariba/tasks/summary.py
@@ -18,22 +18,22 @@ def use_preset(options):
'row_filter': 'y',
},
'cluster_all': {
- 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
+ 'cluster_cols': 'assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
'col_filter': 'y',
'row_filter': 'y',
},
'cluster_var_groups': {
- 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
+ 'cluster_cols': 'assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
'col_filter': 'y',
'row_filter': 'y',
},
'all': {
- 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
+ 'cluster_cols': 'assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
'col_filter': 'y',
'row_filter': 'y',
},
'all_no_filter': {
- 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
+ 'cluster_cols': 'assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
'col_filter': 'n',
'row_filter': 'n',
},
@@ -69,7 +69,7 @@ def run(options):
min_id=options.min_id,
cluster_cols=options.cluster_cols,
make_phandango_tree=(not options.no_tree),
- only_clusters=None if options.only_cluster is None else {options.only_cluster},
+ only_clusters=None if options.only_clusters is None else set(options.only_clusters.split(',')),
show_var_groups=options.v_groups,
show_known_vars=options.known_variants,
show_novel_vars=options.novel_variants,
diff --git a/ariba/tests/assembly_compare_test.py b/ariba/tests/assembly_compare_test.py
index 7d4648e..03da24b 100644
--- a/ariba/tests/assembly_compare_test.py
+++ b/ariba/tests/assembly_compare_test.py
@@ -107,6 +107,37 @@ class TestAssemblyCompare(unittest.TestCase):
self.assertEqual(expected, got)
+ def test_nucmer_hits_to_ref_and_qry_coords(self):
+ '''test _nucmer_hits_to_ref_and_qry_coords'''
+ hits = [
+ ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'],
+ ['31', '52', '1', '22', '22', '22', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'],
+ ['11', '32', '1000', '1022', '22', '22', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'],
+ ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'ref', 'contig1'],
+ ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'ref', 'contig2'],
+ ]
+ nucmer_hits = {
+ 'contig1': [
+ pymummer.alignment.Alignment('\t'.join(hits[0])),
+ pymummer.alignment.Alignment('\t'.join(hits[1])),
+ pymummer.alignment.Alignment('\t'.join(hits[2])),
+ ],
+ 'contig2': [
+ pymummer.alignment.Alignment('\t'.join(hits[3])),
+ ]
+ }
+
+ got_ctg, got_ref = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(nucmer_hits)
+ expected_ctg = {
+ 'contig1': [pyfastaq.intervals.Interval(0,51), pyfastaq.intervals.Interval(99, 141)],
+ 'contig2': [pyfastaq.intervals.Interval(99, 109)]
+ }
+ expected_ref = {
+ 'contig1': [pyfastaq.intervals.Interval(0,51), pyfastaq.intervals.Interval(99, 141)],
+ 'contig2': [pyfastaq.intervals.Interval(99, 109)]
+ }
+
+
def test_ref_cov_per_contig(self):
'''test ref_cov_per_contig'''
hits = [
diff --git a/ariba/tests/assembly_variants_test.py b/ariba/tests/assembly_variants_test.py
index 898106b..a149fd5 100644
--- a/ariba/tests/assembly_variants_test.py
+++ b/ariba/tests/assembly_variants_test.py
@@ -309,7 +309,12 @@ class TestAssemblyVariants(unittest.TestCase):
v2 = pymummer.variant.Variant(pymummer.snp.Snp('14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))
v3 = pymummer.variant.Variant(pymummer.snp.Snp('15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))
- nucmer_coords = {
+ ref_nucmer_coords = {
+ 'contig1': [pyfastaq.intervals.Interval(0, 30)],
+ 'contig2': [pyfastaq.intervals.Interval(10, 41)],
+ }
+
+ ctg_nucmer_coords = {
'contig1': [pyfastaq.intervals.Interval(0, 30)],
'contig2': [pyfastaq.intervals.Interval(10, 41)],
}
@@ -328,7 +333,7 @@ class TestAssemblyVariants(unittest.TestCase):
}
a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file)
- got = a_variants.get_variants('presence_absence', nucmer_coords)
+ got = a_variants.get_variants('presence_absence', ctg_nucmer_coords, ref_nucmer_coords)
self.assertEqual(expected, got)
@@ -352,11 +357,15 @@ class TestAssemblyVariants(unittest.TestCase):
v2 = pymummer.variant.Variant(pymummer.snp.Snp('14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
v3 = pymummer.variant.Variant(pymummer.snp.Snp('15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
- nucmer_coords = {
+ ctg_nucmer_coords = {
'contig1': [pyfastaq.intervals.Interval(0, 41)],
'contig2': [pyfastaq.intervals.Interval(10, 41)],
}
+ ref_nucmer_coords = {
+ 'contig1': [pyfastaq.intervals.Interval(0, 41)],
+ 'contig2': [pyfastaq.intervals.Interval(10, 41)],
+ }
expected = {
'contig1': [
(4, 'p', 'A5D', 'NONSYN', [v2, v3], set(), set()),
@@ -367,6 +376,6 @@ class TestAssemblyVariants(unittest.TestCase):
}
a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file)
- got = a_variants.get_variants('variants_only', nucmer_coords)
+ got = a_variants.get_variants('variants_only', ctg_nucmer_coords, ref_nucmer_coords)
self.assertEqual(expected, got)
diff --git a/ariba/tests/data/mic_plotter_load_mic_file.tsv b/ariba/tests/data/mic_plotter_load_mic_file.tsv
new file mode 100644
index 0000000..70aa315
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_load_mic_file.tsv
@@ -0,0 +1,7 @@
+Sample antibio1 antibio2
+sample1 0.25 0.004
+sample2 <0.25 <=0.004
+sample3 < 0.25 <= 0.004
+sample4 >256 >=256
+sample5 > 256 >= 256
+sample6 NA 1
diff --git a/ariba/tests/data/mic_plotter_load_summary_file.tsv b/ariba/tests/data/mic_plotter_load_summary_file.tsv
new file mode 100644
index 0000000..05c5677
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_load_summary_file.tsv
@@ -0,0 +1,3 @@
+name,cluster1.assembled,cluster1.match,cluster1.ref_seq,cluster1.pct_id,cluster1.known_var,cluster1.novel_var,cluster1.group1.A42T,cluster1.group1.A42T.%,cluster2.assembled,cluster2.match,cluster2.ref_seq,cluster2.pct_id,cluster2.known_var,cluster2.novel_var,cluster2.group1.A42T,cluster2.group1.A42T.%
+name1,yes,yes,ref1,100.0,no,no,no,NA,yes,yes,ref2,99.0,yes,no,yes,95.42
+name2,yes,yes_nonunique,ref3,99.0,yes,no,yes,90.90,no,no,NA,NA,NA,NA,NA,NA
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.no_combinations.tsv
new file mode 100644
index 0000000..b4c4325
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.no_combinations.tsv
@@ -0,0 +1,3 @@
+Sample MIC Mutations
+name1 0.25 cluster2.group2.A43T
+name1 0.25 cluster3.interrupted
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv
new file mode 100644
index 0000000..59743fb
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv
@@ -0,0 +1,2 @@
+Sample MIC Mutations
+name1 0.25 cluster2.group2.A43T.cluster3.interrupted
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.no_combinations.tsv
new file mode 100644
index 0000000..7ef4771
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.no_combinations.tsv
@@ -0,0 +1,4 @@
+Sample MIC Mutations
+name1 0.25 cluster2.group2.A43T
+name1 0.25 cluster3.interrupted
+name2 0.125 cluster1.group1.A42T
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.tsv
new file mode 100644
index 0000000..4806ebc
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.tsv
@@ -0,0 +1,3 @@
+Sample MIC Mutations
+name1 0.25 cluster2.group2.A43T.cluster3.interrupted
+name2 0.125 cluster1.group1.A42T
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.no_combinations.tsv
new file mode 100644
index 0000000..0092bde
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.no_combinations.tsv
@@ -0,0 +1,5 @@
+Sample MIC Mutations
+name1 0.25 cluster2.group2.A43T
+name1 0.25 cluster3.interrupted
+name2 0.125 cluster1.group1.A42T
+name2 0.125 cluster4.group4.A44T
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.tsv
new file mode 100644
index 0000000..cb8b103
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.tsv
@@ -0,0 +1,3 @@
+Sample MIC Mutations
+name1 0.25 cluster2.group2.A43T.cluster3.interrupted
+name2 0.125 cluster1.group1.A42T.cluster4.group4.A44T
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.no_combinations.tsv
new file mode 100644
index 0000000..1912500
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.no_combinations.tsv
@@ -0,0 +1,4 @@
+Sample MIC Mutations
+name1 0.004 cluster2.group2.A43T
+name1 0.004 cluster3.interrupted
+name3 0.002 without_mutation
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.tsv
new file mode 100644
index 0000000..37b2097
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.tsv
@@ -0,0 +1,3 @@
+Sample MIC Mutations
+name1 0.004 cluster2.group2.A43T.cluster3.interrupted
+name3 0.002 without_mutation
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.no_combinations.tsv
new file mode 100644
index 0000000..1912500
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.no_combinations.tsv
@@ -0,0 +1,4 @@
+Sample MIC Mutations
+name1 0.004 cluster2.group2.A43T
+name1 0.004 cluster3.interrupted
+name3 0.002 without_mutation
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.tsv
new file mode 100644
index 0000000..37b2097
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.tsv
@@ -0,0 +1,3 @@
+Sample MIC Mutations
+name1 0.004 cluster2.group2.A43T.cluster3.interrupted
+name3 0.002 without_mutation
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.no_combinations.tsv
new file mode 100644
index 0000000..1912500
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.no_combinations.tsv
@@ -0,0 +1,4 @@
+Sample MIC Mutations
+name1 0.004 cluster2.group2.A43T
+name1 0.004 cluster3.interrupted
+name3 0.002 without_mutation
diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.tsv
new file mode 100644
index 0000000..37b2097
--- /dev/null
+++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.tsv
@@ -0,0 +1,3 @@
+Sample MIC Mutations
+name1 0.004 cluster2.group2.A43T.cluster3.interrupted
+name3 0.002 without_mutation
diff --git a/ariba/tests/data/summary_test_load_fofn b/ariba/tests/data/summary_test_load_fofn
new file mode 100644
index 0000000..95e7d6d
--- /dev/null
+++ b/ariba/tests/data/summary_test_load_fofn
@@ -0,0 +1,4 @@
+/foo/bar/abc.tsv
+/spam/eggs/file1.tsv short_name1
+/spam/eggs/file2.tsv short_name2
+/spam/eggs/file3.tsv
diff --git a/ariba/tests/data/summary_test_whole_run.out.csv b/ariba/tests/data/summary_test_whole_run.out.csv
index 1a282e3..2d4bd9e 100644
--- a/ariba/tests/data/summary_test_whole_run.out.csv
+++ b/ariba/tests/data/summary_test_whole_run.out.csv
@@ -1,3 +1,3 @@
-name,23S.assembled,23S.match,23S.ref_seq,23S.pct_id,23S.known_var,23S.C2597T,23S.C2597T.%,coding1.assembled,coding1.match,coding1.ref_seq,coding1.pct_id,coding2.assembled,coding2.match,coding2.ref_seq,coding2.pct_id,coding3.assembled,coding3.match,coding3.ref_seq,coding3.pct_id,coding5.assembled,coding5.match,coding5.ref_seq,coding5.pct_id,coding5.known_var,coding5.A42S,coding6.assembled,coding6.match,coding6.ref_seq,coding6.pct_id,coding6.known_var,coding6.A52S,coding7.assembled,coding7 [...]
-/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/summary_test_whole_run.in.1.tsv,yes,yes,23S.rDNA_WHO_F_01358c,99.86,yes,yes,100.0,interrupted,no,coding1_ref1,99.1,yes,yes,coding2_ref1,98.2,no,no,NA,NA,yes,no,coding5_ref1,97.4,no,no,yes,yes,coding6_ref1,95.5,yes,yes,yes,coding7_ref1,95.4,yes,yes,coding8_ref1,95.3,yes,yes,interrupted,mdfA.3001328.JQ394987.0_1233.561,97.0,yes,yes,yes,yes,noncoding1_ref1,99.1,yes,yes,noncoding10_ref1,95.1,yes,yes,99.0,yes,yes,noncoding11_ref1,9 [...]
-/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/summary_test_whole_run.in.2.tsv,yes_nonunique,no,23S.rDNA_WHO_F_01358c,99.84,het,het,12.8,yes,yes,coding1_ref2,99.2,no,no,NA,NA,yes,yes,coding3_ref1,97.6,yes,yes,coding5_ref1,97.4,yes,yes,no,no,NA,NA,NA,NA,no,NA,NA,no,no,NA,NA,NA,NA,no,NA,NA,NA,NA,yes,yes,noncoding1_ref2,99.2,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,yes,yes,noncoding3_ref1,97.6,yes,no,noncoding5_ref1,99.42,no,no,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA [...]
+name,23S.assembled,23S.match,23S.ref_seq,23S.pct_id,23S.ctg_cov,23S.known_var,23S.2597CT,23S.2597CT.%,23S.2597TC,23S.2597TC.%,coding1.assembled,coding1.match,coding1.ref_seq,coding1.pct_id,coding1.ctg_cov,coding2.assembled,coding2.match,coding2.ref_seq,coding2.pct_id,coding2.ctg_cov,coding3.assembled,coding3.match,coding3.ref_seq,coding3.pct_id,coding3.ctg_cov,coding5.assembled,coding5.match,coding5.ref_seq,coding5.pct_id,coding5.ctg_cov,coding5.known_var,coding5.A42S,coding6.assembled,c [...]
+summary_test_whole_run.in.1.tsv,yes,yes,23S.rDNA_WHO_F_01358c,99.86,744.8,yes,no,NA,yes,100.0,interrupted,no,coding1_ref1,99.1,10.1,yes,yes,coding2_ref1,98.2,42.42,no,no,NA,NA,NA,yes,no,coding5_ref1,97.4,14.1,no,no,yes,yes,coding6_ref1,95.5,24.32,yes,yes,yes,coding7_ref1,95.4,24.32,yes,yes,coding8_ref1,95.3,24.31,yes,yes,interrupted,mdfA.3001328.JQ394987.0_1233.561,97.0,16.2,yes,yes,yes,noncoding1_ref1,99.1,10.1,yes,yes,noncoding10_ref1,95.1,24.27,yes,yes,99.0,yes,yes,noncoding11_ref1,95 [...]
+summary_test_whole_run.in.2.tsv,yes_nonunique,no,23S.rDNA_WHO_F_01358c,99.84,344.0,het,het,12.8,no,NA,yes,yes,coding1_ref2,99.2,10.1,no,no,NA,NA,NA,yes,yes,coding3_ref1,97.6,37.6,yes,yes,coding5_ref1,97.4,14.1,yes,yes,no,no,NA,NA,NA,NA,NA,no,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,NA,NA,NA,NA,yes,yes,noncoding1_ref2,99.2,10.1,no,no,NA,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,NA,no,no,NA,NA,NA,yes,yes,noncoding3_ref1,97.6,37.6,yes,no,noncoding5_ref1,99.42,14.1,no,no,NA,no,no,NA,NA,NA,NA,NA,NA,no,no,N [...]
diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py
new file mode 100644
index 0000000..4ee3377
--- /dev/null
+++ b/ariba/tests/mic_plotter_test.py
@@ -0,0 +1,283 @@
+import unittest
+import copy
+import filecmp
+import os
+from ariba import mic_plotter
+
+modules_dir = os.path.dirname(os.path.abspath(mic_plotter.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestMicPlotter(unittest.TestCase):
+ def test_mic_string_to_float(self):
+ '''Test _mic_string_to_float'''
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('42.42'), 42.42)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('42'), 42.0)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('>42'), 84.0)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('> 42'), 84.0)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('>=42'), 42.0)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('>= 42'), 42.0)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('<42'), 21.0)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('< 42'), 21.0)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('<=42'), 42.0)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('<= 42'), 42.0)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float(' <= 42.0 '), 42.0)
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('na'), 'NA')
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('NA'), 'NA')
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('.'), 'NA')
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float(' '), 'NA')
+ self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float(''), 'NA')
+
+
+ def test_load_mic_file(self):
+ '''Test _load_mic_file'''
+ infile = os.path.join(data_dir, 'mic_plotter_load_mic_file.tsv')
+ got = mic_plotter.MicPlotter._load_mic_file(infile)
+ expected = {
+ 'sample1': {'antibio1': 0.25, 'antibio2': 0.004},
+ 'sample2': {'antibio1': 0.125, 'antibio2': 0.004},
+ 'sample3': {'antibio1': 0.125, 'antibio2': 0.004},
+ 'sample4': {'antibio1': 512.0, 'antibio2': 256.0},
+ 'sample5': {'antibio1': 512.0, 'antibio2': 256.0},
+ 'sample6': {'antibio1': 'NA', 'antibio2': 1.0},
+ }
+
+ self.assertEqual(got, expected)
+
+
+ def test_load_summary_file(self):
+ '''Test _load_summary_file'''
+ infile = os.path.join(data_dir, 'mic_plotter_load_summary_file.tsv')
+ got = mic_plotter.MicPlotter._load_summary_file(infile)
+ expected = {
+ 'name1': {
+ 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref1', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'},
+ 'cluster2': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref2', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group1.A42T': 'yes', 'group1.A42T.%': 95.42},
+ },
+ 'name2': {
+ 'cluster1': {'assembled': 'yes', 'match': 'yes_nonunique', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group1.A42T': 'yes', 'group1.A42T.%': 90.90},
+ 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group1.A42T': 'NA', 'group1.A42T.%': 'NA'},
+ },
+ }
+ self.maxDiff = None
+ self.assertEqual(got, expected)
+
+
+ def test_get_colours(self):
+ '''test _get_colours'''
+ col1 = (0.0, 0.0, 0.5, 1.0)
+ col2 = (0.0, 0.0, 0.517825311942959, 1.0)
+
+ tests = [
+ (1, 1, 'jet', ["black"]),
+ (2, 1, 'jet', ["black", "black"]),
+ (3, 1, 'jet', ["black", "black", "black"]),
+ (2, 2, 'jet', [col1, col2]),
+ (3, 2, 'jet', [col1, col2, col1]),
+ (4, 2, 'jet', [col1, col2, col1, col2]),
+ (3, 0, 'jet', [(0.0, 0.0, 0.5, 1.0), (0.49019607843137247, 1.0, 0.47754585705249841, 1.0), (0.5, 0.0, 0.0, 1.0)])
+ ]
+
+ for total_length, number_of_colours, colormap, expected in tests:
+ self.assertEqual(expected, mic_plotter.MicPlotter._get_colours(total_length, number_of_colours, colormap))
+
+
+ def test_get_top_plot_data(self):
+ '''Test _get_top_plot_data'''
+ mic_data = {
+ 'name1': {'antibio1': 0.25, 'antibio2': 0.004},
+ 'name2': {'antibio1': 0.125, 'antibio2': 'NA'},
+ 'name3': {'antibio1': 'NA', 'antibio2': 0.002},
+ }
+
+ summary_data = {
+ 'name1': {
+ 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref1', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'},
+ 'cluster2': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref2', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group2.A43T': 'yes', 'group2.A43T.%': 95.42},
+ 'cluster3': {'assembled': 'interrupted', 'match': 'no', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'no', 'novel_var': 'yes', 'A42T': 'no', 'A44T.%': 'NA'},
+ 'cluster4': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref4', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group4.A44T': 'no', 'group4.A44T.%': 'NA'},
+ },
+ 'name2': {
+ 'cluster1': {'assembled': 'yes', 'match': 'yes_nonunique', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group1.A42T': 'yes', 'group1.A42T.%': 90.90},
+ 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group2.A43T': 'NA', 'group2.A43T.%': 'NA'},
+ 'cluster3': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'no', 'novel_var': 'no', 'A42T': 'no', 'A44T.%': 'NA'},
+ 'cluster4': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref4', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group4.A44T': 'het', 'group4.A44T.%': '50.0'},
+ },
+ 'name3': {
+ 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref_seq42', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'},
+ 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group2.A43T': 'NA', 'group2.A43T.%': 'NA'},
+ 'cluster3': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'no', 'novel_var': 'no', 'A42T': 'no', 'A44T.%': 'NA'},
+ 'cluster4': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref4', 'pct_id': 100.0, 'known_var': 'yes', 'novel_var': 'no', 'group4.A44T': 'no', 'group4.A44T.%': 'NA'},
+ },
+ }
+
+ expected_top_plot_data = {
+ 'antibio1': {
+ 'yes': {'cluster1.group1.A42T.cluster4.group4.A44T': [0.125], 'cluster2.group2.A43T.cluster3.interrupted': [0.25]},
+ 'no': {'cluster1.group1.A42T': [0.125], 'cluster2.group2.A43T.cluster3.interrupted': [0.25]},
+ 'exclude': {'cluster2.group2.A43T.cluster3.interrupted': [0.25]},
+ },
+ 'antibio2': {
+ 'yes': {'without_mutation': [0.002], 'cluster2.group2.A43T.cluster3.interrupted': [0.004]},
+ 'no': {'without_mutation': [0.002], 'cluster2.group2.A43T.cluster3.interrupted': [0.004]},
+ 'exclude': {'without_mutation': [0.002], 'cluster2.group2.A43T.cluster3.interrupted': [0.004]},
+ }
+ }
+
+ expected_top_plot_data_no_combs = {
+ 'antibio1': {
+ 'yes': {'cluster2.group2.A43T': [0.25], 'cluster4.group4.A44T': [0.125], 'cluster3.interrupted': [0.25], 'cluster1.group1.A42T': [0.125]},
+ 'no': {'cluster2.group2.A43T': [0.25], 'cluster3.interrupted': [0.25], 'cluster1.group1.A42T': [0.125]},
+ 'exclude': {'cluster2.group2.A43T': [0.25], 'cluster3.interrupted': [0.25]},
+ },
+ 'antibio2': {
+ 'yes': {'cluster2.group2.A43T': [0.004], 'without_mutation': [0.002], 'cluster3.interrupted': [0.004]},
+ 'no': {'cluster2.group2.A43T': [0.004], 'without_mutation': [0.002], 'cluster3.interrupted': [0.004]},
+ 'exclude': {'cluster2.group2.A43T': [0.004], 'without_mutation': [0.002], 'cluster3.interrupted': [0.004]},
+ }
+ }
+
+ expected_mutations = {
+ 'antibio1': {
+ 'yes': {'cluster1.group1.A42T', 'cluster2.group2.A43T', 'cluster3.interrupted', 'cluster4.group4.A44T'},
+ 'no': {'cluster1.group1.A42T', 'cluster2.group2.A43T', 'cluster3.interrupted'},
+ 'exclude': {'cluster2.group2.A43T', 'cluster3.interrupted'},
+ },
+ 'antibio2': {
+ 'yes': {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'},
+ 'no': {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'},
+ 'exclude': {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'},
+ }
+ }
+
+ expected_combs = {
+ 'antibio1': {
+ 'yes': {('cluster2.group2.A43T', 'cluster3.interrupted'), ('cluster1.group1.A42T', 'cluster4.group4.A44T')},
+ 'no': {('cluster2.group2.A43T', 'cluster3.interrupted'), ('cluster1.group1.A42T',)},
+ 'exclude': {('cluster2.group2.A43T', 'cluster3.interrupted')}
+ },
+ 'antibio2': {
+ 'yes': {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')},
+ 'no': {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')},
+ 'exclude': {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')}
+ }
+ }
+
+
+ expected_no_combs = {
+ 'antibio1': {
+ 'yes': {('cluster2.group2.A43T',), ('cluster3.interrupted',), ('cluster1.group1.A42T',), ('cluster4.group4.A44T',)},
+ 'no': {('cluster2.group2.A43T',), ('cluster3.interrupted',), ('cluster1.group1.A42T',)},
+ 'exclude': {('cluster2.group2.A43T',), ('cluster3.interrupted',)}
+ },
+ 'antibio2': {
+ 'yes': {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)},
+ 'no': {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)},
+ 'exclude': {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)}
+ }
+ }
+
+
+ tmp_tsv = 'tmp.mic_plotter_test.to_boxplot.tsv'
+
+ for antibio in ['antibio1', 'antibio2']:
+ for use_het in ['no', 'yes', 'exclude']:
+ got_data, got_muts, got_combs = mic_plotter.MicPlotter._get_top_plot_data(summary_data, mic_data, antibio, use_het, interrupted=True, outfile=tmp_tsv)
+ expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.' + antibio + '.' + use_het + '.tsv')
+ self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False))
+ self.assertEqual(got_muts, expected_mutations[antibio][use_het])
+ self.assertEqual(got_combs, expected_combs[antibio][use_het])
+ self.assertEqual(got_data, expected_top_plot_data[antibio][use_het])
+ os.unlink(tmp_tsv)
+
+ got_data, got_muts, got_combs = mic_plotter.MicPlotter._get_top_plot_data(summary_data, mic_data, antibio, use_het, no_combinations=True, interrupted=True, outfile=tmp_tsv)
+ expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.' + antibio + '.' + use_het + '.no_combinations.tsv')
+ self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False))
+ self.assertEqual(got_muts, expected_mutations[antibio][use_het])
+ self.assertEqual(got_combs, expected_no_combs[antibio][use_het])
+ self.assertEqual(got_data, expected_top_plot_data_no_combs[antibio][use_het])
+ os.unlink(tmp_tsv)
+
+
+ def test_filter_top_plot_data(self):
+ '''test _filter_top_plot_data'''
+ top_plot_data = {
+ 'var1': [1, 2, 3],
+ 'var2.var3': [1],
+ 'var1.var3': [1, 2],
+ }
+
+ all_mutations = {'var1', 'var2', 'var3'}
+ seen_combinations = {('var1',), ('var1', 'var3'), ('var2', 'var3')}
+
+ got_top, got_all, got_seen = mic_plotter.MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, seen_combinations, 1)
+ self.assertEqual(got_top, top_plot_data)
+ self.assertEqual(got_all, all_mutations)
+ self.assertEqual(got_seen, seen_combinations)
+
+
+ got_top, got_all, got_seen = mic_plotter.MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, seen_combinations, 2)
+ expected_top_plot_data = {
+ 'var1': [1, 2, 3],
+ 'var1.var3': [1, 2],
+ }
+ expected_all_mutations = {'var1', 'var3'}
+ expected_seen_combinations = {('var1',), ('var1', 'var3')}
+ self.assertEqual(got_top, expected_top_plot_data)
+ self.assertEqual(got_all, expected_all_mutations)
+ self.assertEqual(got_seen, expected_seen_combinations)
+
+ got_top, got_all, got_seen = mic_plotter.MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, seen_combinations, 3)
+ expected_top_plot_data = {
+ 'var1': [1, 2, 3],
+ }
+ expected_all_mutations = {'var1'}
+ expected_seen_combinations = {('var1',),}
+ self.assertEqual(got_top, expected_top_plot_data)
+ self.assertEqual(got_all, expected_all_mutations)
+ self.assertEqual(got_seen, expected_seen_combinations)
+
+
+ def test_ordered_bottom_plot_rows(self):
+ '''test _ordered_bottom_plot_rows'''
+ to_order = {'clust1.grp1.42T', 'clust1.grp1.47G', 'clust0.10T', 'abcdefg'}
+ got = mic_plotter.MicPlotter._ordered_bottom_plot_rows(to_order)
+ expected = ['abcdefg', 'clust0.10T', 'clust1.grp1.42T', 'clust1.grp1.47G']
+ self.assertEqual(expected, got)
+
+
+ def test_ordered_columns(self):
+ '''test _ordered_colunns'''
+ top_plot_data = {}
+ # FIXME
+
+
+ def test_bottom_scatter_data(self):
+ '''test _bottom_scatter_data'''
+ #FIXME
+ pass
+
+
+ def test_top_plot_y_ticks(self):
+ '''test _top_plot_y_ticks'''
+ # FIXME
+ pass
+
+
+ def test_top_plot_scatter_counts(self):
+ '''test _top_plot_scatter_counts'''
+ top_plot_data = {}
+ # FIXME
+
+
+ def test_top_plot_scatter_data(self):
+ '''test _top_plot_scatter_data'''
+ top_plot_data = {}
+ # FIXME
+
+
+ def test_top_plot_violin_data(self):
+ '''test _top_plot_violin_data'''
+ top_plot_data = {}
+ # FIXME
+
diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py
index 2cf8f19..e87ddcf 100644
--- a/ariba/tests/summary_cluster_test.py
+++ b/ariba/tests/summary_cluster_test.py
@@ -23,7 +23,7 @@ class TestSummaryCluster(unittest.TestCase):
'pc_ident': 98.33,
'ctg': 'ctg_name',
'ctg_len': 279,
- 'ctg_cov': '24.4',
+ 'ctg_cov': 24.4,
'known_var': '1',
'var_type': 'SNP',
'var_seq_type': 'n',
@@ -85,20 +85,20 @@ class TestSummaryCluster(unittest.TestCase):
self.assertTrue(cluster._has_any_part_of_ref_assembled())
- def test_pc_id_of_longest(self):
- '''Test pc_id_of_longest'''
+ def test_pc_id_and_read_depth_of_longest(self):
+ '''Test _pc_id_and_read_depth_of_longest'''
cluster = summary_cluster.SummaryCluster()
self.assertTrue(cluster.name is None)
- line1 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
- line2 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
- line3 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
+ line1 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t42.2\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
+ line2 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t42.42\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
+ line3 = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t42.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
data_dict3 = summary_cluster.SummaryCluster.line2dict(line3)
cluster.add_data_dict(data_dict1)
cluster.add_data_dict(data_dict2)
cluster.add_data_dict(data_dict3)
- self.assertEqual(98.2, cluster.pc_id_of_longest())
+ self.assertEqual((98.2, 42.42), cluster._pc_id_and_read_depth_of_longest())
def test_to_cluster_summary_number(self):
@@ -465,6 +465,7 @@ class TestSummaryCluster(unittest.TestCase):
'novel_var': 'no',
'known_var': 'yes',
'pct_id': '98.33',
+ 'ctg_cov': '24.4',
}
got = cluster.column_summary_data()
self.assertEqual(expected, got)
@@ -543,6 +544,7 @@ class TestSummaryCluster(unittest.TestCase):
'match': 'yes',
'ref_seq': 'ref1',
'pct_id': '98.33',
+ 'ctg_cov': '24.4',
'known_var': 'yes',
'novel_var': 'no',
}
diff --git a/ariba/tests/summary_cluster_variant_test.py b/ariba/tests/summary_cluster_variant_test.py
index f88cc0c..10af7d9 100644
--- a/ariba/tests/summary_cluster_variant_test.py
+++ b/ariba/tests/summary_cluster_variant_test.py
@@ -23,51 +23,73 @@ class TestSummaryClusterVariant(unittest.TestCase):
self.assertEqual(expected[i], summary_cluster_variant.SummaryClusterVariant._has_nonsynonymous(dicts[i]))
- def test_depths_look_het(self):
- '''test _depths_look_het'''
+ def test_filter_depths(self):
+ '''test _filter_depths'''
tests = [
- ([1], False),
- ([2], False),
- ([3], False),
- ([4], False),
- ([5], False),
- ([90, 1], False),
- ([90, 9], False),
- ([90, 10], True),
- ([9, 1], False),
- ([9, 2], True),
- ([1, 2], True),
- ([90, 5, 5], True),
- ([90, 2, 1, 1], False),
- ([97, 2, 1], False),
+ ('A', {'A': 1}, {'A': 1}),
+ ('A', {'A': 2}, {'A': 2}),
+ ('A', {'A': 3}, {'A': 3}),
+ ('A', {'A': 4}, {'A': 4}),
+ ('A', {'A': 5}, {'A': 5}),
+ ('A', {'A': 90, 'C': 9}, {'A': 90}),
+ ('C', {'A': 90, 'C': 9}, {'A': 90, 'C': 9}),
+ ('C', {'A': 90, 'C': 9, 'G':1}, {'A': 90, 'C': 9}),
+ ('A', {'A': 90, 'C': 10}, {'A': 90, 'C': 10}),
+ ('A', {'A': 90, 'C': 5, 'G': 5}, {'A': 90}),
+ ('A', {'A': 89, 'C': 10, 'G': 1}, {'A': 89, 'C': 10}),
+ ('A', {'A': 80, 'C': 10, 'G': 10}, {'A': 80, 'C': 10, 'G': 10}),
]
- for depths, expected in tests:
- self.assertEqual(expected, summary_cluster_variant.SummaryClusterVariant._depths_look_het(depths))
+ for ref_base, depths, expected in tests:
+ self.assertEqual(expected, summary_cluster_variant.SummaryClusterVariant._filter_depths(ref_base, depths))
+
+
+ #def test_depths_look_het(self):
+ # '''test _depths_look_het'''
+ # tests = [
+ # ([1], False),
+ # ([2], False),
+ # ([3], False),
+ # ([4], False),
+ # ([5], False),
+ # ([90, 1], False),
+ # ([90, 9], False),
+ # ([90, 10], True),
+ # ([9, 1], False),
+ # ([9, 2], True),
+ # ([1, 2], True),
+ # ([89, 10, 1], True),
+ # ([89, 9, 2], False),
+ # ([90, 2, 1, 1], False),
+ # ([97, 2, 1], False),
+ # ]
+
+ # for depths, expected in tests:
+ # self.assertEqual(expected, summary_cluster_variant.SummaryClusterVariant._depths_look_het(depths))
def test_get_is_het_and_percent(self):
'''test _get_is_het_and_percent'''
tests = [
- ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', (False, 100.0)),
- ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tT,A\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', (True, 25.0)),
- ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tA,T\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', (True, 75.0)),
- ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 40.0)),
- ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t95,5\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (False, 5.0)),
- ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t90,10\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 10.0)),
- ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t90,6,4\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 6.0)),
- ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t3,7,90\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 7.0)),
- ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tHET\t.\t.\t.\t.\t.\t.\t.\t.\t84\t84\tA\t50\tA,T\t40,10\t.\t.', (True, 20.0)),
- ('ariba_ref1\t23S.rDNA_WHO_F_01358c\t0\t1\t531\t9914\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3120\t744.8\t1\tSNP\tn\tC2597T\t1\tC2597T\tSNP\t2597\t2597\tC\t2755\t2755\tT\t823\tTC,T\t487,1\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T.\tHigh-level resistance to Azithromycin', (False, 100.0)),
- ('ariba\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0)),
- ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 9.0)),
- ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0)),
- ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t70,30\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 30.0)),
- ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 91.0)),
- ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 90.0)),
- ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0)),
- ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t10,90\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0)),
- ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t9,91\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 9.0)),
+ ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', (False, 100.0, 'T')),
+ ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tT,A\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', (True, 25.0, 'AT')),
+ ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tA,T\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', (True, 75.0, 'AT')),
+ ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 40.0, 'AGT')),
+ ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t95,5\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (False, 5.0, 'A')),
+ ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t90,10\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 10.0, 'AT')),
+ ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t90,6,4\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (False, 6.0, 'A')),
+ ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t3,7,90\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 7.0, 'ACT')),
+ ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tHET\t.\t.\t.\t.\t.\t.\t.\t.\t84\t84\tA\t50\tA,T\t40,10\t.\t.', (True, 20.0, 'AT')),
+ ('ariba_ref1\t23S.rDNA_WHO_F_01358c\t0\t1\t531\t9914\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3120\t744.8\t1\tSNP\tn\tC2597T\t1\tC2597T\tSNP\t2597\t2597\tC\t2755\t2755\tT\t823\tC,T\t487,1\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T.\tHigh-level resistance to Azithromycin', (False, 0.2, 'C')),
+ ('ariba\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0, 'CT')),
+ ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 9.0, 'C')),
+ ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0, 'CT')),
+ ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t70,30\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 30.0, 'CT')),
+ ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 91.0, 'T')),
+ ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 90.0, 'CT')),
+ ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0, 'CT')),
+ ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t10,90\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0, 'CT')),
+ ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t9,91\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 9.0, 'CT')),
]
for line, expected in tests:
@@ -88,10 +110,10 @@ class TestSummaryClusterVariant(unittest.TestCase):
expected = [
{'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None},
- {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 100.0},
- {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0},
- {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0},
- {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 10.0},
+ {'coding': False, 'known': True, 'var_string': '14T', 'var_group': 'id1', 'het_percent': 100.0},
+ {'coding': False, 'known': True, 'var_string': '14AT', 'var_group': 'id1', 'het_percent': 25.0},
+ {'coding': False, 'known': True, 'var_string': '14AGT', 'var_group': 'id1', 'het_percent': 50.0},
+ {'coding': False, 'known': True, 'var_string': '14AT', 'var_group': 'id1', 'het_percent': 10.0},
]
assert len(lines) == len(expected)
diff --git a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py
index 67ca2bc..f17591c 100644
--- a/ariba/tests/summary_sample_test.py
+++ b/ariba/tests/summary_sample_test.py
@@ -52,7 +52,8 @@ class TestSummarySample(unittest.TestCase):
'ref_seq': 'noncoding1',
'known_var': 'yes',
'novel_var': 'yes',
- 'pct_id': '98.33'
+ 'pct_id': '98.33',
+ 'ctg_cov': '35.4',
},
'cluster.p': {
'assembled': 'yes',
@@ -60,7 +61,8 @@ class TestSummarySample(unittest.TestCase):
'ref_seq': 'presence_absence1',
'known_var': 'yes',
'novel_var': 'no',
- 'pct_id': '98.96'
+ 'pct_id': '98.96',
+ 'ctg_cov': '35.1',
},
'cluster.v': {
'assembled': 'yes',
@@ -68,7 +70,8 @@ class TestSummarySample(unittest.TestCase):
'ref_seq': 'variants_only1',
'known_var': 'yes',
'novel_var': 'no',
- 'pct_id': '100.0'
+ 'pct_id': '100.0',
+ 'ctg_cov': '42.4',
}
}
self.maxDiff = None
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py
index f98e6e6..4e2ef18 100644
--- a/ariba/tests/summary_test.py
+++ b/ariba/tests/summary_test.py
@@ -11,15 +11,16 @@ class TestSummary(unittest.TestCase):
'''Test init'''
fofn = os.path.join(data_dir, 'summary_test_init.fofn')
s = summary.Summary('out', fofn=fofn)
- self.assertEqual(s.filenames, ['file1', 'file2'])
- s = summary.Summary('out', filenames=['file42'])
- self.assertEqual(s.filenames, ['file42'])
- s = summary.Summary('out', fofn=fofn, filenames=['file42'])
- self.assertEqual(s.filenames, ['file42', 'file1', 'file2'])
+ self.assertEqual(s.filenames, {'file1': None, 'file2': None})
+ s = summary.Summary('out', filenames={'file42': None})
+ self.assertEqual(s.filenames, {'file42': None})
+ s = summary.Summary('out', fofn=fofn, filenames={'file42': None})
+ self.assertEqual(s.filenames, {'file42': None, 'file1': None, 'file2': None})
def test_determine_cluster_cols(self):
col_strings = [
+ 'assembled,match,ref_seq,pct_id,ctg_cov,known_var,novel_var',
'assembled,match,ref_seq,pct_id,known_var,novel_var',
'ref_seq,pct_id,known_var,novel_var',
'assembled,pct_id,known_var,novel_var',
@@ -29,12 +30,13 @@ class TestSummary(unittest.TestCase):
]
expected = [
- {'assembled': True, 'match': True, 'ref_seq': True, 'pct_id': True, 'known_var': True, 'novel_var': True},
- {'assembled': False, 'match': False, 'ref_seq': True, 'pct_id': True, 'known_var': True, 'novel_var': True},
- {'assembled': True, 'match': False, 'ref_seq': False, 'pct_id': True, 'known_var': True, 'novel_var': True},
- {'assembled': True, 'match': False, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False},
- {'assembled': False, 'match': False, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False},
- {'assembled': False, 'match': False, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False},
+ {'assembled': True, 'match': True, 'ref_seq': True, 'pct_id': True, 'ctg_cov': True, 'known_var': True, 'novel_var': True},
+ {'assembled': True, 'match': True, 'ref_seq': True, 'pct_id': True, 'ctg_cov': False, 'known_var': True, 'novel_var': True},
+ {'assembled': False, 'match': False, 'ref_seq': True, 'pct_id': True, 'ctg_cov': False, 'known_var': True, 'novel_var': True},
+ {'assembled': True, 'match': False, 'ref_seq': False, 'pct_id': True, 'ctg_cov': False, 'known_var': True, 'novel_var': True},
+ {'assembled': True, 'match': False, 'ref_seq': False, 'pct_id': False, 'ctg_cov': False, 'known_var': False, 'novel_var': False},
+ {'assembled': False, 'match': False, 'ref_seq': False, 'pct_id': False, 'ctg_cov': False, 'known_var': False, 'novel_var': False},
+ {'assembled': False, 'match': False, 'ref_seq': False, 'pct_id': False, 'ctg_cov': False, 'known_var': False, 'novel_var': False},
]
assert len(col_strings) == len(expected)
@@ -43,6 +45,19 @@ class TestSummary(unittest.TestCase):
self.assertEqual(expected[i], summary.Summary._determine_cluster_cols(col_strings[i]))
+ def test_load_fofn(self):
+ '''Test _load_fofn'''
+ infile = os.path.join(data_dir, 'summary_test_load_fofn')
+ got = summary.Summary._load_fofn(infile)
+ expected = {
+ '/foo/bar/abc.tsv': None,
+ '/spam/eggs/file1.tsv': 'short_name1',
+ '/spam/eggs/file2.tsv': 'short_name2',
+ '/spam/eggs/file3.tsv': None
+ }
+ self.assertEqual(expected, got)
+
+
def test_load_input_files(self):
'''Test _load_input_files'''
file1 = os.path.join(data_dir, 'summary_test_load_input_files.1.tsv')
@@ -80,6 +95,7 @@ class TestSummary(unittest.TestCase):
'match': 'yes',
'novel_var': 'no',
'pct_id': '98.33',
+ 'ctg_cov': '10.0',
'ref_seq': 'noncoding_ref1'
},
'groups': {},
@@ -92,6 +108,7 @@ class TestSummary(unittest.TestCase):
'match': 'yes',
'novel_var': 'no',
'pct_id': '98.33',
+ 'ctg_cov': '10.0',
'ref_seq': 'noncoding_ref2'
},
'groups': {},
@@ -104,6 +121,7 @@ class TestSummary(unittest.TestCase):
'match': 'yes',
'novel_var': 'yes',
'pct_id': '98.96',
+ 'ctg_cov': '20.1',
'ref_seq': 'presence_absence_ref1'
},
'groups': {},
@@ -116,6 +134,7 @@ class TestSummary(unittest.TestCase):
'match': 'no',
'novel_var': 'yes',
'pct_id': '99.1',
+ 'ctg_cov': '22.3',
'ref_seq': 'presence_absence_ref2'
},
'groups': {},
@@ -129,6 +148,7 @@ class TestSummary(unittest.TestCase):
'match': 'yes',
'novel_var': 'no',
'pct_id': '98.33',
+ 'ctg_cov': '50.1',
'ref_seq': 'noncoding_ref1'
},
'groups': {},
@@ -141,6 +161,7 @@ class TestSummary(unittest.TestCase):
'match': 'yes',
'novel_var': 'no',
'pct_id': '98.33',
+ 'ctg_cov': '10.0',
'ref_seq': 'noncoding_ref2'
},
'groups': {},
@@ -153,6 +174,7 @@ class TestSummary(unittest.TestCase):
'match': 'yes',
'novel_var': 'yes',
'pct_id': '98.96',
+ 'ctg_cov': '51.1',
'ref_seq': 'presence_absence1'
},
'groups': {},
@@ -169,6 +191,7 @@ class TestSummary(unittest.TestCase):
'match',
'novel_var',
'pct_id',
+ 'ctg_cov',
'ref_seq'
},
'groups': set(),
@@ -181,6 +204,7 @@ class TestSummary(unittest.TestCase):
'match',
'novel_var',
'pct_id',
+ 'ctg_cov',
'ref_seq'
},
'groups': set(),
@@ -193,6 +217,7 @@ class TestSummary(unittest.TestCase):
'match',
'novel_var',
'pct_id',
+ 'ctg_cov',
'ref_seq'
},
'groups': set(),
@@ -205,6 +230,7 @@ class TestSummary(unittest.TestCase):
'match',
'novel_var',
'pct_id',
+ 'ctg_cov',
'ref_seq'
},
'groups': set(),
@@ -214,7 +240,7 @@ class TestSummary(unittest.TestCase):
self.maxDiff = None
s = summary.Summary('out', filenames=infiles)
- s.samples = summary.Summary._load_input_files(infiles, 90)
+ s.samples = summary.Summary._load_input_files(s.filenames, 90)
s._gather_unfiltered_output_data()
self.assertEqual(expected_potential_cols, s.all_potential_columns)
self.assertEqual(expected_all, s.all_data)
@@ -226,20 +252,20 @@ class TestSummary(unittest.TestCase):
expected_all[infiles[1]]['noncoding1']['groups'] = {'id1': 'het', 'id1.%': 80.0, 'id3': 'yes', 'id3.%': 100.0}
expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0}
s = summary.Summary('out', filenames=infiles, show_var_groups=True)
- s.samples = summary.Summary._load_input_files(infiles, 90)
+ s.samples = summary.Summary._load_input_files(s.filenames, 90)
s._gather_unfiltered_output_data()
self.assertEqual(expected_potential_cols, s.all_potential_columns)
self.assertEqual(expected_all, s.all_data)
- expected_potential_cols['noncoding1']['vars'] = {'A14T.%', 'A6G', 'A6G.%', 'A14T'}
- expected_potential_cols['noncoding2']['vars'] = {'A52T', 'A52T.%', 'A42T', 'A42T.%'}
+ expected_potential_cols['noncoding1']['vars'] = {'14T', '14T.%', '14GT', '14GT.%', '6G', '6G.%'}
+ expected_potential_cols['noncoding2']['vars'] = {'52GT', '52GT.%', '42T', '42T.%'}
- expected_all[infiles[0]]['noncoding1']['vars'] = {'A14T': 'yes', 'A14T.%': 100.0}
- expected_all[infiles[0]]['noncoding2']['vars'] = {'A42T': 'yes', 'A42T.%': 100.0, 'A52T': 'het', 'A52T.%': 40.0}
- expected_all[infiles[1]]['noncoding1']['vars'] = {'A14T': 'het', 'A14T.%': 80.0, 'A6G': 'yes', 'A6G.%': 100.0}
- expected_all[infiles[1]]['noncoding2']['vars'] = {'A52T': 'het', 'A52T.%': 40.0}
+ expected_all[infiles[0]]['noncoding1']['vars'] = {'14T': 'yes', '14T.%': 100.0}
+ expected_all[infiles[0]]['noncoding2']['vars'] = {'42T': 'yes', '42T.%': 100.0, '52GT': 'het', '52GT.%': 40.0}
+ expected_all[infiles[1]]['noncoding1']['vars'] = {'14GT': 'het', '14GT.%': 80.0, '6G': 'yes', '6G.%': 100.0}
+ expected_all[infiles[1]]['noncoding2']['vars'] = {'52GT': 'het', '52GT.%': 40.0}
s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True)
- s.samples = summary.Summary._load_input_files(infiles, 90)
+ s.samples = summary.Summary._load_input_files(s.filenames, 90)
s._gather_unfiltered_output_data()
self.assertEqual(expected_potential_cols, s.all_potential_columns)
self.assertEqual(expected_all, s.all_data)
@@ -250,7 +276,7 @@ class TestSummary(unittest.TestCase):
expected_all[infiles[0]]['presence_absence2']['vars'] = {'V175L': 'yes'}
expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'}
s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True, show_novel_vars=True)
- s.samples = summary.Summary._load_input_files(infiles, 90)
+ s.samples = summary.Summary._load_input_files(s.filenames, 90)
s._gather_unfiltered_output_data()
self.assertEqual(expected_potential_cols, s.all_potential_columns)
self.assertEqual(expected_all, s.all_data)
@@ -263,16 +289,23 @@ class TestSummary(unittest.TestCase):
os.path.join(data_dir, 'summary_to_matrix.2.tsv')
]
- s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True, show_novel_vars=True)
- s.samples = summary.Summary._load_input_files(infiles, 90)
+ fofn = 'tmp.summary_to_matrix_all_cols'
+ with open(fofn, 'w') as f:
+ print(infiles[0], 'sample1', file=f)
+ print(infiles[1], file=f)
+
+
+ s = summary.Summary('out', fofn=fofn, show_var_groups=True, show_known_vars=True, show_novel_vars=True)
+ os.unlink(fofn)
+ s.samples = summary.Summary._load_input_files(s.filenames, 90)
s._gather_unfiltered_output_data()
- got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+ got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns)
- expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding1.A6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known [...]
- expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding1.A6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2 [...]
+ expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.ctg_cov:c3', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding1.14GT:o1', 'noncoding1.14GT.%:c2', 'noncoding1.14T:o1', 'noncoding1.14T.%:c2', 'noncoding1.6G:o1', 'noncoding1.6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', [...]
+ expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.ctg_cov', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding1.14GT', 'noncoding1.14GT.%', 'noncoding1.14T', 'noncoding1.14T.%', 'noncoding1.6G', 'noncoding1.6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.ctg_cov [...]
expected_matrix = [
- [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'],
- [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'het', 80.0, 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes']
+ ['sample1', 'yes', 'yes', 'noncoding_ref1', '98.33', '10.0', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', '10.0', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', '20.1', 'no', 'yes', 'yes'],
+ [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', '50.1', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'het', 80.0, 'no', 'NA', 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', '10.0', 'yes', 'no', 'het', 40.0, 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', '51.1', 'no', 'yes', 'yes']
]
self.assertEqual(expected_phandango_header, got_phandango_header)
@@ -288,15 +321,15 @@ class TestSummary(unittest.TestCase):
]
s = summary.Summary('out', filenames=infiles, show_var_groups=True)
- s.samples = summary.Summary._load_input_files(infiles, 90)
+ s.samples = summary.Summary._load_input_files(s.filenames, 90)
s._gather_unfiltered_output_data()
- got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+ got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns)
- expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'presence [...]
- expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_a [...]
+ expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.ctg_cov:c3', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.ctg_cov:c3', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'no [...]
+ expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.ctg_cov', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.ctg_cov', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'presence_absence1.assemb [...]
expected_matrix = [
- [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes'],
- [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes']
+ [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', '10.0', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', '10.0', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 'yes', 'presence_absence_ref1', '98.96', '20.1', 'no', 'yes'],
+ [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', '50.1', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', '10.0', 'yes', 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', '51.1', 'no', 'yes']
]
self.assertEqual(expected_phandango_header, got_phandango_header)
@@ -312,15 +345,15 @@ class TestSummary(unittest.TestCase):
]
s = summary.Summary('out', filenames=infiles, show_known_vars=True, show_novel_vars=True)
- s.samples = summary.Summary._load_input_files(infiles, 90)
+ s.samples = summary.Summary._load_input_files(s.filenames, 90)
s._gather_unfiltered_output_data()
- got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+ got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns)
- expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding1.A6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.A42T:o1', 'noncoding2.A42T.%:c2', 'nonc [...]
- expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding1.A6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.A42T', 'noncoding2.A42T.%', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assemb [...]
+ expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.ctg_cov:c3', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.14GT:o1', 'noncoding1.14GT.%:c2', 'noncoding1.14T:o1', 'noncoding1.14T.%:c2', 'noncoding1.6G:o1', 'noncoding1.6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.ctg_cov:c3', 'noncoding2.k [...]
+ expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.ctg_cov', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.14GT', 'noncoding1.14GT.%', 'noncoding1.14T', 'noncoding1.14T.%', 'noncoding1.6G', 'noncoding1.6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.ctg_cov', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.42T', 'noncod [...]
expected_matrix = [
- [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'],
- [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes']
+ [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', '10.0', 'yes', 'no', 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', '10.0', 'yes', 'no', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', '20.1', 'no', 'yes', 'yes'],
+ [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', '50.1', 'yes', 'no', 'het', 80.0, 'no', 'NA', 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', '10.0', 'yes', 'no', 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', '51.1', 'no', 'yes', 'yes']
]
self.assertEqual(expected_phandango_header, got_phandango_header)
@@ -336,15 +369,15 @@ class TestSummary(unittest.TestCase):
]
s = summary.Summary('out', filenames=infiles)
- s.samples = summary.Summary._load_input_files(infiles, 90)
+ s.samples = summary.Summary._load_input_files(s.filenames, 90)
s._gather_unfiltered_output_data()
- got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+ got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns)
- expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_abse [...]
- expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var']
+ expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.ctg_cov:c3', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.ctg_cov:c3', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq: [...]
+ expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.ctg_cov', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.ctg_cov', 'noncoding2.known_var', 'noncoding2.novel_var', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.ctg_c [...]
expected_matrix = [
- [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes'],
- [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes']
+ [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', '10.0', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', '10.0', 'yes', 'no', 'yes', 'yes', 'presence_absence_ref1', '98.96', '20.1', 'no', 'yes'],
+ [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', '50.1', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', '10.0', 'yes', 'no', 'yes', 'yes', 'presence_absence1', '98.96', '51.1', 'no', 'yes']
]
self.assertEqual(expected_phandango_header, got_phandango_header)
@@ -360,9 +393,9 @@ class TestSummary(unittest.TestCase):
]
s = summary.Summary('out', filenames=infiles, cluster_cols='assembled')
- s.samples = summary.Summary._load_input_files(infiles, 90)
+ s.samples = summary.Summary._load_input_files(s.filenames, 90)
s._gather_unfiltered_output_data()
- got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+ got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns)
expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding2.assembled:o1', 'presence_absence1.assembled:o1']
expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding2.assembled', 'presence_absence1.assembled']
diff --git a/scripts/ariba b/scripts/ariba
index 2ba73d8..9a76385 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -56,6 +56,55 @@ subparser_getref.add_argument('outprefix', help='Prefix of output filenames')
subparser_getref.set_defaults(func=ariba.tasks.getref.run)
+#----------------------------- micplot -------------------------------
+subparser_micplot = subparsers.add_parser(
+ 'micplot',
+ help='Make violin/dot plots using MIC data',
+ usage='ariba prepareref [options] <prepareref_dir> <antibiotic> <MIC file> <summary file> <outprefix>',
+ description='Makes a violin and scatter plot of MIC per variant in the summary file',
+)
+subparser_micplot.add_argument('prepareref_dir', help='Name of output directory when "ariba prepareref" was run')
+subparser_micplot.add_argument('antibiotic', help='Antibiotic name. Must exactly match a column from the MIC file')
+subparser_micplot.add_argument('mic_file', help='File containing MIC data for each sample and one or more antibiotics')
+subparser_micplot.add_argument('summary_file', help='File made by running "ariba summary"')
+subparser_micplot.add_argument('outprefix', help='Prefix of output files')
+
+micplot_general_group = subparser_micplot.add_argument_group('General options')
+micplot_general_group.add_argument('--out_format', help='Output format of image file. Use anything that matplotlib can save to, eg pdf or png [%(default)s]', default='pdf')
+micplot_general_group.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name', metavar='"title in quotes"')
+micplot_general_group.add_argument('--plot_height', help='Height of plot in inches [%(default)s]', default=7, type=float, metavar='FLOAT')
+micplot_general_group.add_argument('--plot_width', help='Width of plot in inches [%(default)s]', default=7, type=float, metavar='FLOAT')
+micplot_general_group.add_argument('--use_hets', choices=['yes', 'no', 'exclude'], help='How to deal with HET snps. Choose from yes,no,exclude. yes: count a het SNP as present. no: do not count a het SNP as present. exclude: completely remove any sample with any het SNP [%(default)s]', default='yes')
+micplot_general_group.add_argument('--interrupted', action='store_true', help='Include interrupted genes (as in the assembled column of the ariba summary files)')
+micplot_general_group.add_argument('--min_samples', type=int, help='Minimum number of samples in each column required to include in plot [%(default)s]', metavar='INT', default=1)
+micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.')
+micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='9,2', metavar='height1,height2')
+micplot_general_group.add_argument('--panel_widths', help='Two integers that determine relative width of plots and space used by counts legend. eg 5,1 means ratio of 5:1 between top and bottom panel widths. Only applies when plotting points and --point_size 0 [%(default)s]', default='5,1', metavar='width1,width2')
+micplot_general_group.add_argument('--count_legend_x', type=float, help='Control x position of counts legend when plotting points and --point_size 0 [%(default)s]', default=-2, metavar='FLOAT')
+micplot_general_group.add_argument('--p_cutoff', type=float, help='p-value cutoff for Mann-Whitney tests [%(default)s]', default=0.05)
+micplot_general_group.add_argument('--xkcd', action='store_true', help='Best used with xkcd font installed ;)')
+
+micplot_colour_group = subparser_micplot.add_argument_group('Colour options')
+micplot_colour_group.add_argument('--colourmap', help='Colours to use. See http://matplotlib.org/users/colormaps.html [%(default)s]', default='Accent', metavar='colourmap name')
+micplot_colour_group.add_argument('--number_of_colours', type=int, help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colourmap specified by --colourmap and cycle them [%(default)s]', default=0, metavar='INT')
+micplot_colour_group.add_argument('--colour_skip', help='If using a continuous colourmap, --colour_skip a,b (where 0 <= a < b <= 1) will skip the range between a and b. Useful for excluding near-white colours', metavar='FLOAT1,FLOAT2')
+
+micplot_upper_plot_group = subparser_micplot.add_argument_group('Upper plot options')
+micplot_upper_plot_group.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...')
+micplot_upper_plot_group.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. Default is to draw no lines.', metavar='float1,float2,...', default='')
+micplot_upper_plot_group.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT')
+micplot_upper_plot_group.add_argument('--log_y', type=float, help='Base of log applied to y values. Set to zero to not log [%(default)s]', default=2, metavar='FLOAT')
+micplot_upper_plot_group.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT')
+micplot_upper_plot_group.add_argument('--point_scale', type=float, help='Scale point sizes when --point_size 0. All point sizes are multiplied by this number. Useful if you have large data set [%(default)s]', default=1, metavar='FLOAT')
+micplot_upper_plot_group.add_argument('--violin_width', type=float, help='Width of violins [%(default)s]', default=0.75)
+
+micplot_lower_plot_group = subparser_micplot.add_argument_group('Lower plot options')
+micplot_lower_plot_group.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=100, metavar='FLOAT')
+micplot_lower_plot_group.add_argument('--dot_outline', action='store_true', help='Black outline around all dots (whether coloured or not) in lower part of plots')
+micplot_lower_plot_group.add_argument('--dot_y_text_size', type=int, help='Text size of labels [%(default)s]', default=7, metavar='INT')
+
+subparser_micplot.set_defaults(func=ariba.tasks.micplot.run)
+
#----------------------------- prepareref -------------------------------
subparser_prepareref = subparsers.add_parser(
'prepareref',
@@ -187,14 +236,14 @@ subparser_summary = subparsers.add_parser(
epilog='Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input.'
)
-subparser_summary.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME')
+subparser_summary.add_argument('-f', '--fofn', help='File of filenames of ariba reports to be summarised. Must be used if no input files listed after the outfile. The first column should be the filename. An optional second column can be used to specify a sample name for that file, which will be used instead of the filename in output files. Columns separated by whitespace.', metavar='FILENAME')
subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--v_groups,--variants. Using this overrides those options', metavar='|'.join(summary_presets))
-subparser_summary.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, match, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='match', metavar='col1,col2,...')
+subparser_summary.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, match, ref_seq, pct_id, ctg_cov, known_var, novel_var [%(default)s]', default='match', metavar='col1,col2,...')
subparser_summary.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
subparser_summary.add_argument('--no_tree', action='store_true', help='Do not make phandango tree')
subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
-subparser_summary.add_argument('--only_cluster', help='Only report data for the given cluster name', metavar='Cluster_name')
+subparser_summary.add_argument('--only_clusters', help='Only report data for the given comma-separated list of cluster names, eg: cluster1,cluster2,cluster42', metavar='Cluster_names')
subparser_summary.add_argument('--v_groups', action='store_true', help='Show a group column for each group of variants')
subparser_summary.add_argument('--known_variants', action='store_true', help='Report all known variants')
subparser_summary.add_argument('--novel_variants', action='store_true', help='Report all novel variants')
diff --git a/setup.py b/setup.py
index b50f6a6..a874476 100644
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,7 @@ vcfcall_mod = Extension(
setup(
ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod],
name='ariba',
- version='2.7.2',
+ version='2.9.0',
description='ARIBA: Antibiotic Resistance Identification By Assembly',
packages = find_packages(),
package_data={'ariba': ['test_run_data/*']},
@@ -68,6 +68,7 @@ setup(
install_requires=[
'BeautifulSoup4 >= 4.1.0',
'dendropy >= 4.2.0',
+ 'matplotlib',
'pyfastaq >= 3.12.0',
'pysam >= 0.9.1',
'pymummer>=0.10.2',
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ariba.git
More information about the debian-med-commit
mailing list