[med-svn] [ariba] 01/03: Imported Upstream version 2.2.0+ds
Sascha Steinbiss
satta at debian.org
Thu Aug 18 15:09:52 UTC 2016
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository ariba.
commit 61985a317010528545f5f75742e7c0d10eb652cc
Author: Sascha Steinbiss <satta at debian.org>
Date: Thu Aug 18 15:02:10 2016 +0000
Imported Upstream version 2.2.0+ds
---
ariba/__init__.py | 1 +
ariba/ref_genes_getter.py | 116 ++++-
ariba/ref_preparer.py | 41 +-
ariba/reference_data.py | 7 +-
ariba/report.py | 3 +-
ariba/summary.py | 289 +++++------
ariba/summary_cluster.py | 30 +-
ariba/summary_cluster_variant.py | 83 ++++
ariba/summary_sample.py | 21 +-
ariba/tasks/getref.py | 6 +-
ariba/tasks/prepareref.py | 3 +-
ariba/tasks/summary.py | 47 +-
ariba/tests/assembly_variants_test.py | 12 +-
ariba/tests/cluster_test.py | 111 +++--
ariba/tests/clusters_test.py | 24 +-
...fa => assembly_variants_one_var_one_ctg_cdg.fa} | 0
...v => assembly_variants_one_var_one_ctg_cdg.tsv} | 0
...=> assembly_variants_one_var_one_ctg_noncdg.fa} | 0
...> assembly_variants_one_var_one_ctg_noncdg.tsv} | 0
...uster_full_run_known_smtls_snp_presabs_gene.fa} | 0
..._known_smtls_snp_presabs_gene.ref_for_reads.fa} | 0
...ster_full_run_known_smtls_snp_presabs_gene.tsv} | 0
.../reads_1.fq | 0
.../reads_2.fq | 0
.../references.fa | 0
...uster_full_run_known_smtls_snp_presabs_nonc.fa} | 0
...ster_full_run_known_smtls_snp_presabs_nonc.tsv} | 0
.../reads_1.fq | 0
.../reads_2.fq | 0
.../references.fa | 0
... => cluster_full_run_smtls_snp_presabs_gene.fa} | 0
...ll_run_smtls_snp_presabs_gene.ref_for_reads.fa} | 0
...=> cluster_full_run_smtls_snp_presabs_gene.tsv} | 0
.../reads_1.fq | 0
.../reads_2.fq | 0
.../references.fa | 0
... => cluster_full_run_smtls_snp_presabs_nonc.fa} | 0
...=> cluster_full_run_smtls_snp_presabs_nonc.tsv} | 0
.../reads_1.fq | 0
.../reads_2.fq | 0
.../references.fa | 0
... => cluster_full_run_smtls_snp_varonly_gene.fa} | 0
...=> cluster_full_run_smtls_snp_varonly_gene.tsv} | 0
.../reads_1.fq | 0
.../reads_2.fq | 0
.../references.fa | 0
...> cluster_full_run_smtls_snp_varonly_gene_2.fa} | 0
... cluster_full_run_smtls_snp_varonly_gene_2.tsv} | 0
.../reads_1.fq | 0
.../reads_2.fq | 0
.../references.fa | 0
...ster_full_run_smtls_snp_varonly_gene_no_snp.fa} | 0
...ter_full_run_smtls_snp_varonly_gene_no_snp.tsv} | 0
.../reads_1.fq | 0
.../reads_2.fq | 0
.../references.fa | 0
... => cluster_full_run_smtls_snp_varonly_nonc.fa} | 0
...=> cluster_full_run_smtls_snp_varonly_nonc.tsv} | 0
.../reads_1.fq | 0
.../reads_2.fq | 0
.../references.fa | 0
...ster_full_run_smtls_snp_varonly_nonc_no_snp.fa} | 0
...ter_full_run_smtls_snp_varonly_nonc_no_snp.tsv} | 0
.../reads_1.fq | 0
.../reads_2.fq | 0
.../references.fa | 0
...full_run_varonly.not_present.always_report.tsv} | 0
...ll_run_ok_gene_start_mismatch.ref_for_reads.fa} | 0
..._test_full_run_ok_samtools_snp_pres_abs_gene.fa | 3 -
..._full_run_ok_samtools_snp_pres_abs_noncoding.fa | 3 -
..._test_full_run_ok_samtools_snp_var_only_gene.fa | 3 -
..._full_run_ok_samtools_snp_var_only_noncoding.fa | 3 -
....fa => cluster_test_full_run_partial_asmbly.fa} | 2 +-
.../data/cluster_test_full_run_partial_asmbly.tsv | 1 +
.../reads_1.fq | 432 ++++++++++++++++
.../reads_2.fq | 432 ++++++++++++++++
.../references.fa | 2 +-
...luster_test_full_run_smtls_snp_varonly_nonc.fa} | 0
...uster_test_full_run_smtls_snp_varonly_nonc.tsv} | 0
.../reads_1.fq | 0
.../reads_2.fq | 0
.../references.fa | 0
...cted.out.fa => clusters_cat_genes_match_ref.fa} | 0
.../00.info.txt | 0
.../01.filter.check_metadata.tsv | 0
.../02.cdhit.all.fa | 0
.../02.cdhit.clusters.pickle | Bin
... clusters_minimap_reads_to_all_refs.clstrs.tsv} | 0
...usters_minimap_reads_to_all_refs.out.clstr2rep} | 0
...ters_minimap_reads_to_all_refs.out.clstr_count} | 0
...=> clusters_minimap_reads_to_all_refs.out.hist} | 0
...> clusters_minimap_reads_to_all_refs.out.pairs} | 0
... clusters_minimap_reads_to_all_refs.reads_1.fq} | 0
... clusters_minimap_reads_to_all_refs.reads_2.fq} | 0
...a => clusters_minimap_reads_to_all_refs.ref.fa} | 0
.../ref_preparer_test_fasta_to_metadata.coding.tsv | 3 +
.../data/ref_preparer_test_fasta_to_metadata.fa | 6 +
...f_preparer_test_fasta_to_metadata.noncoding.tsv | 3 +
.../data/ref_preparer_test_run.out/00.info.txt | 10 +-
.../ref_preparer_test_run.out/00.version_info.txt | 4 +-
.../02.cdhit.clusters.pickle | Bin 276 -> 312 bytes
.../02.cdhit.clusters.tsv | 4 +-
.../00.auto_metadata.tsv | 9 +
.../00.info.txt | 5 +
.../00.version_info.txt | 5 +
.../01.filter.check_genes.log | 0
.../01.filter.check_metadata.log | 0
.../01.filter.check_metadata.tsv | 9 +
.../02.cdhit.all.fa | 18 +
.../02.cdhit.clusters.pickle | Bin 0 -> 344 bytes
.../02.cdhit.clusters.tsv | 6 +-
.../02.cdhit.gene.fa | 0
.../02.cdhit.gene.varonly.fa | 0
.../02.cdhit.noncoding.fa | 18 +
.../02.cdhit.noncoding.varonly.fa | 0
..._cluster_w_cdhit_clstrs_file.expect.clstrs.tsv} | 0
...data_cluster_w_cdhit_clstrs_file.in.clstrs.tsv} | 0
...ference_data_cluster_w_cdhit_clstrs_file.in.fa} | 0
...e_data_cluster_w_cdhit_clstrs_file.in.meta.tsv} | 0
...ence_data_cluster_w_cdhit_nocluster.expect.tsv} | 0
...reference_data_cluster_w_cdhit_nocluster.in.fa} | 0
...eference_data_cluster_w_cdhit_nocluster.in.tsv} | 0
ariba/tests/data/reference_data_load_fasta_file.fa | 2 +-
...ence_data_load_input_check_seq_names.bad.csv.1} | 0
...ence_data_load_input_check_seq_names.bad.csv.2} | 0
...rence_data_load_input_check_seq_names.bad.fa.1} | 0
...rence_data_load_input_check_seq_names.bad.fa.2} | 0
...nce_data_load_input_check_seq_names.good.csv.1} | 0
...nce_data_load_input_check_seq_names.good.csv.2} | 0
...ence_data_load_input_check_seq_names.good.fa.1} | 0
...ence_data_load_input_check_seq_names.good.fa.2} | 0
.../tests/data/reference_data_rename_sequences.fa | 8 +-
.../reference_data_rename_sequences_metadata.tsv | 12 +-
.../data/reference_data_test_rename_sequences.out | 10 +-
...s_variants_make_vcf_and_depths_files.asmbly.fa} | 0
...riants_make_vcf_and_depths_files.asmbly.fa.fai} | 0
...amtools_variants_make_vcf_and_depths_files.bam} | Bin
...nts_make_vcf_and_depths_files.expect.depths.gz} | Bin
...make_vcf_and_depths_files.expect.depths.gz.tbi} | Bin
..._variants_make_vcf_and_depths_files.expect.vcf} | 0
.../summary_gather_unfiltered_output_data.in.1.tsv | 6 +
.../summary_gather_unfiltered_output_data.in.2.tsv | 6 +
...ample_test_column_names_tuples_and_het_snps.tsv | 8 +-
.../summary_sample_test_column_summary_data.tsv | 8 +-
.../tests/data/summary_sample_test_var_groups.tsv | 8 +-
.../data/summary_test_gather_output_rows.in.1.tsv | 3 -
.../data/summary_test_gather_output_rows.in.2.tsv | 5 -
.../data/summary_test_get_all_cluster_names.1.tsv | 3 -
.../data/summary_test_get_all_cluster_names.2.tsv | 5 -
.../tests/data/summary_test_get_all_het_snps.1.tsv | 3 -
.../tests/data/summary_test_get_all_het_snps.2.tsv | 5 -
.../data/summary_test_get_all_var_groups.1.tsv | 3 -
.../data/summary_test_get_all_var_groups.2.tsv | 5 -
ariba/tests/data/summary_to_matrix.1.tsv | 5 +
ariba/tests/data/summary_to_matrix.2.tsv | 6 +
ariba/tests/data/vfdb_parser_test_run.out.fa | 2 +-
ariba/tests/data/vfdb_parser_test_run.out.tsv | 6 +-
ariba/tests/ref_preparer_test.py | 108 +++-
ariba/tests/reference_data_test.py | 69 ++-
ariba/tests/samtools_variants_test.py | 12 +-
ariba/tests/summary_cluster_test.py | 191 +++++---
ariba/tests/summary_cluster_variant_test.py | 67 +++
ariba/tests/summary_sample_test.py | 8 +-
ariba/tests/summary_test.py | 541 ++++++++++++---------
ariba/vfdb_parser.py | 5 +-
scripts/ariba | 26 +-
setup.py | 2 +-
167 files changed, 2165 insertions(+), 758 deletions(-)
diff --git a/ariba/__init__.py b/ariba/__init__.py
index 0c36b1a..1d589dc 100644
--- a/ariba/__init__.py
+++ b/ariba/__init__.py
@@ -39,6 +39,7 @@ __all__ = [
'sequence_variant',
'summary',
'summary_cluster',
+ 'summary_cluster_variant',
'summary_sample',
'tasks',
'versions',
diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py
index 1ae8d39..541f0c4 100644
--- a/ariba/ref_genes_getter.py
+++ b/ariba/ref_genes_getter.py
@@ -12,20 +12,33 @@ import json
from ariba import common, card_record, vfdb_parser
+allowed_ref_dbs = {
+ 'argannot',
+ 'card',
+ 'plasmidfinder',
+ 'resfinder',
+ 'srst2_argannot',
+ 'vfdb_core',
+ 'vfdb_full',
+}
+
+argannot_ref = '"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n'
+
+
class RefGenesGetter:
- def __init__(self, ref_db, genetic_code=11):
- allowed_ref_dbs = {'card', 'argannot', 'plasmidfinder', 'resfinder','vfdb'}
+ def __init__(self, ref_db, genetic_code=11, version=None):
if ref_db not in allowed_ref_dbs:
raise Error('Error in RefGenesGetter. ref_db must be one of: ' + str(allowed_ref_dbs) + ', but I got "' + ref_db)
self.ref_db=ref_db
self.genetic_code = genetic_code
self.max_download_attempts = 3
self.sleep_time = 2
+ self.version = version
pyfastaq.sequences.genetic_code = self.genetic_code
def _download_file(self, url, outfile):
- print('Downloading "', url, '" and saving as "', outfile, '" ...', end='', sep='')
+ print('Downloading "', url, '" and saving as "', outfile, '" ...', end='', sep='', flush=True)
for i in range(self.max_download_attempts):
time.sleep(self.sleep_time)
try:
@@ -38,6 +51,30 @@ class RefGenesGetter:
print(' done', flush=True)
+ def _get_card_versions(self, tmp_file):
+ print('Getting available CARD versions')
+ self._download_file('https://card.mcmaster.ca/download', tmp_file)
+ p = re.compile(r'''href="(/download/.*?broad.*?v([0-9]+\.[0-9]+\.[0-9]+)\.tar\.gz)"''')
+ versions = {}
+
+ with open(tmp_file) as f:
+ for line in f:
+ got = p.findall(line)
+ for match in got:
+ key = tuple([int(x) for x in match[1].split('.')])
+ versions[key] = 'https://card.mcmaster.ca' + match[0]
+
+ if len(versions) == 0:
+ raise Error('Error getting CARD versions. Cannot continue')
+
+ print('Found versions:')
+
+ for key, url in sorted(versions.items()):
+ print('.'.join([str(x) for x in key]), url, sep='\t')
+
+ os.unlink(tmp_file)
+ return versions
+
def _get_from_card(self, outprefix):
outprefix = os.path.abspath(outprefix)
@@ -50,8 +87,17 @@ class RefGenesGetter:
except:
raise Error('Error mkdir/chdir ' + tmpdir)
- card_version = '1.0.9'
- card_tarball_url = 'https://card.mcmaster.ca/download/0/broadstreet-v' + card_version + '.tar.gz'
+ versions = self._get_card_versions('download.html')
+ if self.version is not None:
+ key = tuple([int(x) for x in self.version.split('.')])
+ if key not in versions:
+ raise Error('Error! Did not find requested version ' + self.version)
+ else:
+ key = sorted(list(versions.keys()))[-1]
+ self.version = '.'.join([str(x) for x in key])
+
+ print('Getting version', self.version)
+ card_tarball_url = versions[key]
card_tarball = 'card.tar.gz'
print('Working in temporary directory', tmpdir)
print('Downloading data from card:', card_tarball_url, flush=True)
@@ -146,7 +192,7 @@ class RefGenesGetter:
print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
print('If you use this downloaded data, please cite:')
print('"The Comprehensive Antibiotic Resistance Database", McArthur et al 2013, PMID: 23650175')
- print('and in your methods say that version', card_version, 'of the database was used')
+ print('and in your methods say that version', self.version, 'of the database was used')
def _get_from_resfinder(self, outprefix):
@@ -221,9 +267,9 @@ class RefGenesGetter:
for seq in seq_reader:
original_id = seq.id
- seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id)
+ seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
print(seq, file=f_out_fa)
- print(seq.id, '1', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=f_out_tsv)
+ print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_tsv)
pyfastaq.utils.close(f_out_tsv)
@@ -234,7 +280,7 @@ class RefGenesGetter:
print('You can use them with ARIBA like this:')
print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
print('If you use this downloaded data, please cite:')
- print('"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n')
+ print(argannot_ref)
def _get_from_plasmidfinder(self, outprefix):
@@ -289,7 +335,49 @@ class RefGenesGetter:
print('"PlasmidFinder and pMLST: in silico detection and typing of plasmids", Carattoli et al 2014, PMID: 24777092\n')
- def _get_from_vfdb(self, outprefix):
+ def _get_from_srst2_argannot(self, outprefix):
+ srst2_version = '0.2.0'
+ srst2_url = 'https://github.com/katholt/srst2/raw/v' + srst2_version + '/data/ARGannot.r1.fasta'
+ srst2_fa = outprefix + '.original.fa'
+ command = 'wget -O ' + srst2_fa + ' ' + srst2_url
+ common.syscall(command, verbose=True)
+
+ final_fasta = outprefix + '.fa'
+ final_tsv = outprefix + '.tsv'
+
+ f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
+ f_out_meta = pyfastaq.utils.open_file_write(final_tsv)
+ seq_reader = pyfastaq.sequences.file_reader(srst2_fa)
+
+ for seq in seq_reader:
+ original_id = seq.id
+ name, extra = seq.id.split()
+ cluster_id, cluster_name, allele_name, allele_id = name.split('__')
+ seq.id = cluster_name + '.' + name
+ print(seq, file=f_out_fa)
+ print(seq.id, 1, 0, '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_meta)
+
+ pyfastaq.utils.close(f_out_fa)
+ pyfastaq.utils.close(f_out_meta)
+
+ print('Finished downloading and converting data. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
+ print('You can use them with ARIBA like this:')
+ print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
+ print('If you use this downloaded data, please cite:')
+ print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
+ print(argannot_ref)
+ print('and in your methods say that the ARG-ANNOT sequences were used from version', srst2_version, 'of SRST2.')
+
+
+ def _get_from_vfdb_core(self, outprefix):
+ self._get_from_vfdb_common(outprefix, 'VFDB_setA_nt.fas.gz','core')
+
+
+ def _get_from_vfdb_full(self, outprefix):
+ self._get_from_vfdb_common(outprefix, 'VFDB_setB_nt.fas.gz','full')
+
+
+ def _get_from_vfdb_common(self, outprefix, filename, info_text):
outprefix = os.path.abspath(outprefix)
tmpdir = outprefix + '.tmp.download'
@@ -298,12 +386,13 @@ class RefGenesGetter:
except:
raise Error('Error mkdir ' + tmpdir)
- zipfile = os.path.join(tmpdir, 'VFDB_setA_nt.fas.gz')
- self._download_file('http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz', zipfile)
+ zipfile = os.path.join(tmpdir, filename)
+ self._download_file('http://www.mgc.ac.cn/VFs/Down/' + filename, zipfile)
+ print('Extracting files ... ', end='', flush=True)
vparser = vfdb_parser.VfdbParser(zipfile, outprefix)
vparser.run()
shutil.rmtree(tmpdir)
- print('Extracted files.')
+ print('done')
final_fasta = outprefix + '.fa'
final_tsv = outprefix + '.tsv'
@@ -313,6 +402,7 @@ class RefGenesGetter:
print('If you use this downloaded data, please cite:')
print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
+
def run(self, outprefix):
exec('self._get_from_' + self.ref_db + '(outprefix)')
diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py
index 9fee5fd..c2cb310 100644
--- a/ariba/ref_preparer.py
+++ b/ariba/ref_preparer.py
@@ -1,6 +1,7 @@
import sys
import os
import pickle
+import pyfastaq
from ariba import reference_data
class Error (Exception): pass
@@ -9,8 +10,9 @@ class Error (Exception): pass
class RefPreparer:
def __init__(self,
fasta_files,
- metadata_tsv_files,
extern_progs,
+ metadata_tsv_files=None,
+ all_coding=None,
version_report_lines=None,
min_gene_length=6,
max_gene_length=10000,
@@ -30,7 +32,8 @@ class RefPreparer:
self.version_report_lines = version_report_lines
self.fasta_files = fasta_files
- self.metadata_tsv_files = metadata_tsv_files
+ self.metadata_tsv_files = [] if metadata_tsv_files is None else metadata_tsv_files
+ self.all_coding = all_coding
self.min_gene_length = min_gene_length
self.max_gene_length = max_gene_length
self.genetic_code = genetic_code
@@ -42,6 +45,21 @@ class RefPreparer:
self.verbose = verbose
+ @classmethod
+ def _fasta_to_metadata(cls, infile, out_fh, all_coding):
+ seq_reader = pyfastaq.sequences.file_reader(infile)
+ coding = '1' if all_coding else '0'
+
+ for seq in seq_reader:
+ fields = seq.id.split(maxsplit=1)
+ if len(fields) > 1:
+ info_column = 'Original name: ' + seq.id
+ seq.id = fields[0]
+ else:
+ info_column = '.'
+ print(seq.id, coding, 0, '.', '.', info_column, sep='\t', file=out_fh)
+
+
def _write_info_file(self, outfile):
with open(outfile, 'w') as fout:
for filename in self.fasta_files:
@@ -83,6 +101,13 @@ class RefPreparer:
else:
new_key = common_prefix + '-'
+ i = 1
+ new_new_key = new_key
+ while new_new_key in new_clusters:
+ new_new_key = new_key + '_' + str(i)
+ i += 1
+ new_key = new_new_key
+
if new_key in key_count:
if new_key in new_clusters:
assert key_count[new_key] == 1
@@ -126,6 +151,18 @@ class RefPreparer:
print(file=f)
print(*self.version_report_lines, sep='\n', file=f)
+ if self.all_coding is not None:
+ assert len(self.metadata_tsv_files) == 0
+ assert self.all_coding in {'yes', 'no'}
+ self.metadata_tsv_files = [os.path.join(outdir, '00.auto_metadata.tsv')]
+ f_out = pyfastaq.utils.open_file_write(self.metadata_tsv_files[0])
+ for fasta_file in self.fasta_files:
+ RefPreparer._fasta_to_metadata(fasta_file, f_out, self.all_coding=='yes')
+ pyfastaq.utils.close(f_out)
+ else:
+ assert self.all_coding is None
+ assert len(self.metadata_tsv_files) > 0
+
self._write_info_file(os.path.join(outdir, '00.info.txt'))
self.refdata = reference_data.ReferenceData(
diff --git a/ariba/reference_data.py b/ariba/reference_data.py
index ed5ef8b..8d25e92 100644
--- a/ariba/reference_data.py
+++ b/ariba/reference_data.py
@@ -8,7 +8,7 @@ from ariba import sequence_metadata, cdhit
class Error (Exception): pass
-rename_sub_regex = re.compile(r'[^\w.-]')
+rename_sub_regex = re.compile(r'''[':!@,-]''')
class ReferenceData:
@@ -83,6 +83,8 @@ class ReferenceData:
if filename is not None:
seq_reader = pyfastaq.sequences.file_reader(filename)
for seq in seq_reader:
+ seq.id = seq.id.split()[0]
+
if seq.id in seq_dict:
raise Error('Duplicate name "' + seq.id + '" found in file ' + filename + '. Cannot continue)')
seq_dict[seq.id] = copy.copy(seq)
@@ -281,7 +283,7 @@ class ReferenceData:
@classmethod
def _new_seq_name(cls, name):
- name = name.split()[0]
+ assert len(name.split()) == 1 and name.strip() == name
return re.sub(rename_sub_regex, '_', name)
@@ -291,6 +293,7 @@ class ReferenceData:
old_name_to_new = {}
for old_name in sorted(names):
+ assert len(old_name.split()) == 1 and old_name.strip() == old_name
new_name = ReferenceData._new_seq_name(old_name)
if new_name in used_names:
i = 1
diff --git a/ariba/report.py b/ariba/report.py
index fdbe50f..1884c9a 100644
--- a/ariba/report.py
+++ b/ariba/report.py
@@ -137,8 +137,9 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
lines = []
contig_length = len(cluster.assembly.sequences[contig_name])
assert contig_length != 0
+
if contig_name in ref_cov_per_contig:
- if contig_name == cluster.assembly_compare.scaff_name_matching_ref:
+ if contig_name == cluster.assembly_compare.scaff_name_matching_ref and cluster.assembly_compare.gene_matching_ref_type == 'GENE_FOUND':
ref_cov = len(cluster.ref_sequence)
else:
ref_cov = ref_cov_per_contig[contig_name]
diff --git a/ariba/summary.py b/ariba/summary.py
index f1b767f..d0e920c 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -18,9 +18,12 @@ class Summary:
filter_rows=True,
filter_columns=True,
min_id=90.0,
- show_known_het=False,
cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var',
- variant_cols='groups,grouped,ungrouped,novel',
+ make_phandango_tree=True,
+ only_clusters=None,
+ show_var_groups=False,
+ show_known_vars=False,
+ show_novel_vars=False,
verbose=False,
):
if filenames is None and fofn is None:
@@ -34,13 +37,16 @@ class Summary:
if fofn is not None:
self.filenames.extend(self._load_fofn(fofn))
- self.show_known_het = show_known_het
self.cluster_columns = self._determine_cluster_cols(cluster_cols)
- self.var_columns = self._determine_var_cols(variant_cols)
self.filter_rows = filter_rows
self.filter_columns = filter_columns
self.min_id = min_id
self.outprefix = outprefix
+ self.make_phandango_tree = make_phandango_tree
+ self.only_clusters = only_clusters
+ self.show_var_groups = show_var_groups
+ self.show_known_vars = show_known_vars
+ self.show_novel_vars = show_novel_vars
self.verbose = verbose
@@ -60,12 +66,6 @@ class Summary:
return Summary._determine_cols(cols_string, allowed_cols, 'cluster columns')
- @staticmethod
- def _determine_var_cols(cols_string):
- allowed_cols = {'groups', 'grouped', 'ungrouped', 'novel'}
- return Summary._determine_cols(cols_string, allowed_cols, 'variant columns')
-
-
def _load_fofn(self, fofn):
f = pyfastaq.utils.open_file_read(fofn)
filenames = [x.rstrip() for x in f.readlines()]
@@ -80,172 +80,127 @@ class Summary:
@classmethod
- def _load_input_files(cls, filenames, min_id, verbose=False):
+ def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None):
samples = {}
for filename in filenames:
- samples[filename] = summary_sample.SummarySample(filename, min_pc_id=min_id)
+ samples[filename] = summary_sample.SummarySample(filename, min_pc_id=min_id, only_clusters=only_clusters)
samples[filename].run()
if verbose:
print('Loaded file', filename, flush=True)
return samples
- @classmethod
- def _get_all_cluster_names(cls, samples_dict):
- '''Input should be output of _load_input_files'''
- cluster_names = set()
- for filename, sample in samples_dict.items():
- cluster_names.update(set(sample.clusters.keys()))
- return cluster_names
-
+ def _gather_unfiltered_output_data(self):
+ self.all_potential_columns = {}
+ self.all_data = {}
- @classmethod
- def _get_all_variant_columns(cls, samples_dict):
- '''Input should be output of _load_input_files'''
- columns = {}
- for filename, sample in samples_dict.items():
- for cluster in sample.column_summary_data:
- if sample.column_summary_data[cluster]['assembled'] == 'yes':
- for key, tuple_set in sample.variant_column_names_tuples.items():
- for t in tuple_set:
- if key not in columns:
- columns[key] = set()
- columns[key].add(t)
- return columns
-
-
- @classmethod
- def _get_all_het_snps(cls, samples_dict):
- snps = set()
- for filename, sample in samples_dict.items():
- for cluster, snp_dict in sample.het_snps.items():
- if len(snp_dict):
- for snp in snp_dict:
- snps.add((cluster, snp))
+ for filename in sorted(self.samples):
+ self.all_data[filename] = {}
+ for cluster in self.samples[filename].clusters.values():
+ self.all_data[filename][cluster.name] = {}
+ if cluster.name not in self.all_potential_columns:
+ self.all_potential_columns[cluster.name] = {'summary' : set(), 'groups': set(), 'vars': set()}
- return snps
+ this_cluster_dict = {'groups': {}, 'vars': {}}
- @classmethod
- def _get_all_var_groups(cls, samples_dict):
- groups = {}
- for filename, sample in samples_dict.items():
- for name, name_set in sample.var_groups.items():
- if name not in groups:
- groups[name] = set()
- groups[name].update(name_set)
- return groups
-
-
- def _gather_output_rows(self):
- all_cluster_names = Summary._get_all_cluster_names(self.samples)
- all_var_columns = Summary._get_all_variant_columns(self.samples)
- all_het_snps = Summary._get_all_het_snps(self.samples)
-
- if self.var_columns['groups']:
- var_groups = Summary._get_all_var_groups(self.samples)
- else:
- var_groups = set()
- rows = {}
+ if cluster.summary['assembled'] == 'no':
+ this_cluster_dict['summary'] = {
+ 'assembled': 'no',
+ 'known_var': 'NA',
+ 'match': 'no',
+ 'novel_var': 'NA',
+ 'pct_id': 'NA',
+ 'ref_seq': 'NA'
+ }
+ else:
+ this_cluster_dict['summary'] = copy.copy(cluster.summary)
+ seen_groups = {}
- for filename, sample in self.samples.items():
- rows[filename] = {}
+ for variant in cluster.variants:
+ if (self.show_known_vars and variant.known) or (self.show_novel_vars and not variant.known):
+ this_cluster_dict['vars'][variant.var_string] = 'yes' if variant.het_percent is None else 'het'
+ if variant.het_percent is not None:
+ this_cluster_dict['vars'][variant.var_string + '.%'] = variant.het_percent
- for cluster in all_cluster_names:
- rows[filename][cluster] = {}
+ if self.show_var_groups and variant.var_group != '.':
+ if variant.var_group not in seen_groups:
+ seen_groups[variant.var_group] = {'yes': 0, 'het': 0}
- if cluster in sample.column_summary_data and sample.column_summary_data[cluster]['assembled'].startswith('yes'):
- rows[filename][cluster] = sample.column_summary_data[cluster]
- else:
- rows[filename][cluster] = {
- 'assembled': 'no',
- 'match': 'no',
- 'ref_seq': 'NA',
- 'known_var': 'NA',
- 'novel_var': 'NA',
- 'pct_id': 'NA'
- }
+ if variant.het_percent is None:
+ seen_groups[variant.var_group]['yes'] += 1
+ this_cluster_dict['groups'][variant.var_group] = 'yes'
+ else:
+ seen_groups[variant.var_group]['het'] += 1
+ this_cluster_dict['groups'][variant.var_group] = 'het'
+ this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent
- if self.var_columns['groups']:
- for group_name in var_groups[cluster]:
- if cluster in sample.var_groups and group_name in sample.var_groups[cluster]:
- rows[filename][cluster]['vgroup.' + group_name] = 'yes'
- else:
- rows[filename][cluster]['vgroup.' + group_name] = 'no'
-
- if cluster in all_var_columns:
- for (ref_name, variant, grouped_or_novel, group_name) in all_var_columns[cluster]:
- if not self.var_columns[grouped_or_novel]:
- continue
-
- key = ref_name + '.' + variant
-
- if rows[filename][cluster]['assembled'] == 'no':
- rows[filename][cluster][key] = 'NA'
- elif cluster in sample.variant_column_names_tuples and (ref_name, variant, grouped_or_novel, group_name) in sample.variant_column_names_tuples[cluster]:
- rows[filename][cluster][key] = 'yes'
- if self.show_known_het:
- if cluster in sample.het_snps and variant in sample.het_snps[cluster]:
- rows[filename][cluster][key] = 'het'
- rows[filename][cluster][key + '.%'] = sample.het_snps[cluster][variant]
- else:
- rows[filename][cluster][key] = 'no'
- if self.show_known_het and (cluster, variant) in all_het_snps:
- rows[filename][cluster][key + '.%'] = 'NA'
+ for group, d in seen_groups.items():
+ if d['het'] > 0 and d['het'] + d['yes'] > 1:
+ this_cluster_dict['groups'][group] = 'yes_multi_het'
+ this_cluster_dict['groups'][group + '.%'] = 'NA'
- if self.show_known_het and (cluster, variant) in all_het_snps and key + '.%' not in rows[filename][cluster]:
- rows[filename][cluster][key + '.%'] = 'NA'
+ for x in this_cluster_dict:
+ self.all_potential_columns[cluster.name][x].update(set(this_cluster_dict[x].keys()))
- for key, wanted in self.cluster_columns.items():
- if not wanted:
- del rows[filename][cluster][key]
-
- return rows
+ self.all_data[filename][cluster.name] = this_cluster_dict
@classmethod
- def _to_matrix(cls, filenames, rows, cluster_cols):
- '''rows = output from _gather_output_rows().
- filenames = self.filenames
- cluster_cols = self.cluster_columns'''
+ def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols):
matrix = []
making_header_lines = True
phandango_header = ['name']
- phandago_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'}
+ phandango_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'}
ref_seq_counter = 2
csv_header = ['name']
- all_cluster_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']
- all_cluster_cols_in_order_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'])
- cluster_cols_in_order = [x for x in all_cluster_cols_in_order if cluster_cols[x]]
+ summary_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']
+ summary_cols_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'])
+ summary_cols_in_order = [x for x in summary_cols_in_order if cluster_cols[x]]
for filename in filenames:
- assert filename in rows
line = [filename]
- for cluster_name in sorted(rows[filename]):
- for col in cluster_cols_in_order:
+ for cluster_name in sorted(all_potential_columns):
+ group_cols = sorted(list(all_potential_columns[cluster_name]['groups']))
+ var_cols = sorted(list(all_potential_columns[cluster_name]['vars']))
+
+ for col in summary_cols_in_order + group_cols + var_cols:
if making_header_lines:
csv_header.append(cluster_name + '.' + col)
if col == 'ref_seq':
- phandago_suffixes[col] = ':o' + str(ref_seq_counter)
+ phandango_suffixes[col] = ':o' + str(ref_seq_counter)
ref_seq_counter += 1
- phandango_header.append(cluster_name + '.' + col + phandago_suffixes[col])
-
- line.append(rows[filename][cluster_name][col])
-
- for col in sorted(rows[filename][cluster_name]):
- if col in all_cluster_cols_in_order_set:
- continue
-
- if making_header_lines:
- csv_header.append(cluster_name + '.' + col)
- suffix = ':c2' if col.endswith('.%') else ':o1'
- phandango_header.append(cluster_name + '.' + col + suffix)
-
- line.append(rows[filename][cluster_name][col])
+ phandango_header.append(cluster_name + '.' + col + phandango_suffixes[col])
+ elif col in phandango_suffixes:
+ phandango_header.append(cluster_name + '.' + col + phandango_suffixes[col])
+ elif col.endswith('.%'):
+ phandango_header.append(cluster_name + '.' + col + ':c2')
+ else:
+ phandango_header.append(cluster_name + '.' + col + ':o1')
+
+ for col_type in ['summary', 'groups', 'vars']:
+ if cluster_name in all_data[filename] and col in all_data[filename][cluster_name][col_type]:
+ line.append(all_data[filename][cluster_name][col_type][col])
+ break
+ else:
+ if col in {'assembled', 'match'}:
+ line.append('no')
+ elif col in summary_cols_set:
+ line.append('NA')
+ elif cluster_name in all_data[filename] and all_data[filename][cluster_name]['summary'].get('assembled', 'no') != 'no':
+ if col.endswith('.%'):
+ line.append('NA')
+ else:
+ line.append('no')
+ else:
+ line.append('NA')
making_header_lines = False
matrix.append(line)
+ assert len(phandango_header) == len(csv_header)
+ for line in matrix:
+ assert len(line) == len(csv_header)
return phandango_header, csv_header, matrix
@@ -292,11 +247,13 @@ class Summary:
matrix = copy.deepcopy(matrix)
cols_to_add_colour_col = [i for i in range(len(header)) if header[i].endswith(':o1')]
field_to_col = {
- 'yes': '#1f78b4',
- 'yes_nonunique': '#a6cee3',
- 'no': '#33a02c',
- 'NA': '#b2df8a',
- 'het': '#fb9a99',
+ 'yes': '#33a02c',
+ 'yes_nonunique': '#b2df8a',
+ 'no': '#fb9a99',
+ 'NA': '#ffffff',
+ 'het': '#fdbf6f',
+ 'fragmented': '#1f78b4',
+ 'interrupted': '#a6cee3',
}
cols_to_add_colour_col.reverse()
@@ -313,11 +270,15 @@ class Summary:
@classmethod
- def _matrix_to_csv(cls, matrix, header, outfile):
+ def _matrix_to_csv(cls, matrix, header, outfile, remove_nas=False):
f = pyfastaq.utils.open_file_write(outfile)
print(*header, sep=',', file=f)
for line in matrix:
- print(*line, sep=',', file=f)
+ if remove_nas:
+ new_line = ['' if x=='NA' else x for x in line]
+ print(*new_line, sep=',', file=f)
+ else:
+ print(*line, sep=',', file=f)
pyfastaq.utils.close(f)
@@ -372,15 +333,14 @@ class Summary:
if self.verbose:
print('Loading input files...', flush=True)
self._check_files_exist()
- self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose)
+ self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose, only_clusters=self.only_clusters)
if self.verbose:
print('Generating output rows', flush=True)
- self.rows = self._gather_output_rows()
- phandango_header, csv_header, matrix = Summary._to_matrix(self.filenames, self.rows, self.cluster_columns)
+ self._gather_unfiltered_output_data()
+ phandango_header, csv_header, matrix = Summary._to_matrix(self.filenames, self.all_data, self.all_potential_columns, self.cluster_columns)
# sanity check same number of columns in headers and matrix
lengths = {len(x) for x in matrix}
- print(lengths, len(phandango_header), len(csv_header))
assert len(lengths) == 1
assert len(matrix[0]) == len(phandango_header) == len(csv_header)
@@ -416,20 +376,25 @@ class Summary:
print('Making Phandango csv file', csv_file, flush=True)
csv_file = self.outprefix + '.phandango.csv'
phandango_header, phandango_matrix = Summary._add_phandango_colour_columns(phandango_header, matrix)
- Summary._matrix_to_csv(phandango_matrix, phandango_header, csv_file)
- dist_matrix_file = self.outprefix + '.phandango.distance_matrix'
- tree_file = self.outprefix + '.phandango.tre'
-
- if self.verbose:
- print('Making Phandango distance matrix', dist_matrix_file, flush=True)
- Summary._write_distance_matrix(matrix, dist_matrix_file)
-
- if self.verbose:
- print('Making Phandango tree file', tree_file, flush=True)
- Summary._newick_from_dist_matrix(dist_matrix_file, tree_file)
- os.unlink(dist_matrix_file)
+ Summary._matrix_to_csv(phandango_matrix, phandango_header, csv_file, remove_nas=True)
+
+ if self.make_phandango_tree:
+ dist_matrix_file = self.outprefix + '.phandango.distance_matrix'
+ tree_file = self.outprefix + '.phandango.tre'
+
+ if self.verbose:
+ print('Making Phandango distance matrix', dist_matrix_file, flush=True)
+ Summary._write_distance_matrix(matrix, dist_matrix_file)
+
+ if self.verbose:
+ print('Making Phandango tree file', tree_file, flush=True)
+ Summary._newick_from_dist_matrix(dist_matrix_file, tree_file)
+ os.unlink(dist_matrix_file)
+ elif self.verbose:
+ print('Skipping making tree because you asked me not to make it', flush=True)
else:
print('Made csv file. Not making Phandango files because only one sample remains after filtering', file=sys.stderr)
if self.verbose:
print('Finished', flush=True)
+
diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py
index 7a53f55..efc4cf8 100644
--- a/ariba/summary_cluster.py
+++ b/ariba/summary_cluster.py
@@ -1,4 +1,4 @@
-from ariba import flag, report
+from ariba import flag, report, summary_cluster_variant
class Error (Exception): pass
@@ -56,7 +56,7 @@ class SummaryCluster:
d['var_group'] = '.'
else:
try:
- d['var_group'] = d['var_description'].split(':')[3]
+ d['var_group'] = d['var_description'].split(':')[4]
except:
raise Error('Error getting variant group from the following line:\n' + line)
@@ -118,6 +118,8 @@ class SummaryCluster:
return 'yes'
else:
return 'yes_nonunique'
+ elif self.flag.has('assembled_into_one_contig'):
+ return 'interrupted'
else:
return 'fragmented'
@@ -218,6 +220,7 @@ class SummaryCluster:
return None
+
@staticmethod
def _get_nonsynonymous_var(data_dict):
'''if data_dict has a non synonymous variant, return string:
@@ -295,5 +298,26 @@ class SummaryCluster:
for d in self.data:
snp_tuple = self._get_known_noncoding_het_snp(d)
if snp_tuple is not None:
- snps[snp_tuple[0]] = snp_tuple[1]
+ snp_id = d['var_description'].split(':')[4]
+ if snp_id not in snps:
+ snps[snp_id] = {}
+ snps[snp_id][snp_tuple[0]] = snp_tuple[1]
return snps
+
+
+ @classmethod
+ def _get_all_nonsynon_variants_set(cls, data_dicts):
+ variants = set()
+
+ for data_dict in data_dicts:
+ cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)
+ if cluster_var.has_nonsynon:
+ variants.add(cluster_var)
+
+ return variants
+
+
+ def gather_data(self):
+ self.summary = self.column_summary_data()
+ self.variants = self._get_all_nonsynon_variants_set(self.data)
+
diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py
new file mode 100644
index 0000000..51e00d9
--- /dev/null
+++ b/ariba/summary_cluster_variant.py
@@ -0,0 +1,83 @@
+from ariba import flag, report
+
+class Error (Exception): pass
+
+class SummaryClusterVariant:
+ def __init__(self, data_dict):
+ self._get_nonsynon_variant_data(data_dict)
+
+
+ def __eq__(self, other):
+ return type(other) is type(self) and self.__dict__ == other.__dict__
+
+
+ def __hash__(self):
+ return hash(tuple([self.__dict__[x] for x in sorted(self.__dict__.keys())]))
+
+
+ def __str__(self):
+ if self.has_nonsynon:
+ return ', '.join((str(self.known), self.var_group, str(self.coding), self.var_string, str(self.het_percent)))
+ else:
+ return 'None'
+
+
+ @classmethod
+ def _has_nonsynonymous(cls, data_dict):
+ return data_dict['ref_ctg_effect'] != 'SYN' and \
+ (
+ data_dict['has_known_var'] == '1' or \
+ (data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.'))
+ )
+
+
+ @classmethod
+ def _get_het_percent(cls, data_dict):
+ if data_dict['gene'] == '1' or data_dict['ref_ctg_effect'] != 'SNP' or data_dict['smtls_alt_nt'] == '.' or ';' in data_dict['smtls_alt_nt']:
+ return None
+ else:
+ nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',')
+ depths = data_dict['smtls_alt_depth'].split(',')
+
+ if len(nucleotides) != len(depths):
+ raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict))
+
+ try:
+ var_nucleotide = data_dict['known_var_change'][-1] if data_dict['known_var_change'] != '.' else data_dict['ref_ctg_change'][-1]
+ if var_nucleotide == '.':
+ return None
+ depths = [int(x) for x in depths]
+ nuc_to_depth = dict(zip(nucleotides, depths))
+ total_depth = sum(depths)
+ var_depth = nuc_to_depth.get(var_nucleotide, 0)
+ return round(100 * var_depth / total_depth, 1)
+ except:
+ return None
+
+
+ def _get_nonsynon_variant_data(self, data_dict):
+ if not SummaryClusterVariant._has_nonsynonymous(data_dict):
+ self.has_nonsynon = False
+ return
+
+ self.has_nonsynon = True
+
+ if data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']:
+ raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue')
+ elif '.' not in [data_dict['known_var_change'], data_dict['ref_ctg_change']] and \
+ data_dict['known_var_change'] != data_dict['ref_ctg_change']:
+ raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. Cannot continue')
+
+ self.known = data_dict['known_var'] == '1'
+ self.var_group = data_dict['var_group']
+ self.coding = data_dict['gene'] == '1'
+
+ if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.':
+ self.var_string = data_dict['known_var_change']
+ elif data_dict['ref_ctg_change'] != '.':
+ self.var_string = data_dict['ref_ctg_change']
+ else:
+ self.var_string = data_dict['ref_ctg_effect']
+
+ self.het_percent = SummaryClusterVariant._get_het_percent(data_dict)
+
diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py
index c5349f4..bc1ea25 100644
--- a/ariba/summary_sample.py
+++ b/ariba/summary_sample.py
@@ -4,9 +4,10 @@ from ariba import report, summary_cluster
class Error (Exception): pass
class SummarySample:
- def __init__(self, report_tsv, min_pc_id=90):
+ def __init__(self, report_tsv, min_pc_id=90, only_clusters=None):
self.report_tsv = report_tsv
self.min_pc_id = min_pc_id
+ self.only_clusters = only_clusters
self.clusters = {}
@@ -15,7 +16,7 @@ class SummarySample:
@staticmethod
- def _load_file(filename, min_pc_id):
+ def _load_file(filename, min_pc_id, only_clusters=None):
f = pyfastaq.utils.open_file_read(filename)
clusters = {}
@@ -28,11 +29,25 @@ class SummarySample:
data_dict = summary_cluster.SummaryCluster.line2dict(line)
cluster = data_dict['cluster']
+ if only_clusters is not None and cluster not in only_clusters:
+ continue
+
if cluster not in clusters:
clusters[cluster] = summary_cluster.SummaryCluster(min_pc_id=min_pc_id)
clusters[cluster].add_data_dict(data_dict)
pyfastaq.utils.close(f)
+
+ to_delete = set()
+
+ for cluster_name, cluster in clusters.items():
+ cluster.gather_data()
+ if cluster.name is None:
+ to_delete.add(cluster_name)
+
+ for name in to_delete:
+ del clusters[name]
+
return clusters
@@ -58,7 +73,7 @@ class SummarySample:
def run(self):
- self.clusters = self._load_file(self.report_tsv, self.min_pc_id)
+ self.clusters = self._load_file(self.report_tsv, self.min_pc_id, only_clusters=self.only_clusters)
self.column_summary_data = self._column_summary_data()
self.variant_column_names_tuples, self.het_snps = self._variant_column_names_tuples_and_het_snps()
self.var_groups = self._var_groups()
diff --git a/ariba/tasks/getref.py b/ariba/tasks/getref.py
index d83e028..a9292c1 100644
--- a/ariba/tasks/getref.py
+++ b/ariba/tasks/getref.py
@@ -3,6 +3,10 @@ from ariba import ref_genes_getter
def run(options):
- getter = ref_genes_getter.RefGenesGetter(options.db, genetic_code=options.genetic_code)
+ getter = ref_genes_getter.RefGenesGetter(
+ options.db,
+ genetic_code=options.genetic_code,
+ version=options.version
+ )
getter.run(options.outprefix)
diff --git a/ariba/tasks/prepareref.py b/ariba/tasks/prepareref.py
index 7a7591a..ef52684 100644
--- a/ariba/tasks/prepareref.py
+++ b/ariba/tasks/prepareref.py
@@ -12,8 +12,9 @@ def run(options):
preparer = ref_preparer.RefPreparer(
options.fasta_files,
- options.tsv_files,
extern_progs,
+ metadata_tsv_files=options.tsv_files,
+ all_coding=options.all_coding,
version_report_lines=version_report_lines,
min_gene_length=options.min_gene_length,
max_gene_length=options.max_gene_length,
diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
index 782c905..c91e982 100644
--- a/ariba/tasks/summary.py
+++ b/ariba/tasks/summary.py
@@ -9,66 +9,33 @@ def use_preset(options):
preset_to_vals = {
'minimal': {
'cluster_cols': 'match',
- 'variant_cols': '',
'col_filter': 'y',
'row_filter': 'y',
- 'var_groups': 'n',
- 'known_vars': 'n',
- 'novel_vars': 'n'
},
'cluster_small': {
'cluster_cols': 'assembled,match,ref_seq,known_var',
- 'variant_cols': '',
'col_filter': 'y',
'row_filter': 'y',
- 'var_groups': 'n',
- 'known_vars': 'n',
- 'novel_vars': 'n'
},
'cluster_all': {
'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
- 'variant_cols': '',
'col_filter': 'y',
'row_filter': 'y',
- 'var_groups': 'n',
- 'known_vars': 'n',
- 'novel_vars': 'n'
},
'cluster_var_groups': {
'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
- 'variant_cols': 'groups',
'col_filter': 'y',
'row_filter': 'y',
- 'var_groups': 'y',
- 'known_vars': 'n',
- 'novel_vars': 'n'
- },
- 'cluster_known_vars': {
- 'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
- 'variant_cols': 'groups,grouped,ungrouped',
- 'col_filter': 'y',
- 'row_filter': 'y',
- 'var_groups': 'y',
- 'known_vars': 'y',
- 'novel_vars': 'n'
},
'all': {
'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
- 'variant_cols': 'groups,grouped,ungrouped,novel',
'col_filter': 'y',
'row_filter': 'y',
- 'var_groups': 'y',
- 'known_vars': 'y',
- 'novel_vars': 'y'
},
'all_no_filter': {
'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
- 'variant_cols': 'groups,grouped,ungrouped,novel',
'col_filter': 'n',
'row_filter': 'n',
- 'var_groups': 'y',
- 'known_vars': 'y',
- 'novel_vars': 'y'
},
}
@@ -77,6 +44,13 @@ def use_preset(options):
for key, val in preset_to_vals[options.preset].items():
exec('options.' + key + ' = "' + val + '"')
+ if options.preset in {'cluster_var_groups', 'all', 'all_no_filter'}:
+ options.v_groups = True
+
+ if options.preset in {'all', 'all_no_filter'}:
+ options.known_variants = True
+ options.novel_variants = True
+
return options
@@ -93,9 +67,12 @@ def run(options):
filter_rows=options.col_filter == 'y',
filter_columns=options.row_filter == 'y',
min_id=options.min_id,
- show_known_het=options.het,
cluster_cols=options.cluster_cols,
- variant_cols=options.var_cols,
+ make_phandango_tree=(not options.no_tree),
+ only_clusters=None if options.only_cluster is None else {options.only_cluster},
+ show_var_groups=options.v_groups,
+ show_known_vars=options.known_variants,
+ show_novel_vars=options.novel_variants,
verbose=options.verbose
)
s.run()
diff --git a/ariba/tests/assembly_variants_test.py b/ariba/tests/assembly_variants_test.py
index 83d805f..898106b 100644
--- a/ariba/tests/assembly_variants_test.py
+++ b/ariba/tests/assembly_variants_test.py
@@ -102,10 +102,10 @@ class TestAssemblyVariants(unittest.TestCase):
self.assertEqual(expected, mummer_variants)
- def test_get_one_variant_for_one_contig_non_coding(self):
+ def test_one_var_one_ctg_noncdg(self):
'''test _get_one_variant_for_one_contig_non_coding'''
- fasta_in = os.path.join(data_dir, 'assembly_variants_test_get_variants_non_coding.fa')
- tsv_in = os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_noncdg.fa')
+ tsv_in = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_noncdg.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
ref_sequence_name = 'non_coding'
refdata_var_dict = refdata.metadata[ref_sequence_name]
@@ -144,10 +144,10 @@ class TestAssemblyVariants(unittest.TestCase):
self.assertEqual(expected_used_variants[i], got_used_variants)
- def test_get_one_variant_for_one_contig_coding(self):
+ def test_one_var_one_ctg_cdg(self):
'''test _get_one_variant_for_one_contig_coding'''
- fasta_in = os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa')
- tsv_in = os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_cdg.fa')
+ tsv_in = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_cdg.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
ref_sequence_name = 'presence_absence'
ref_sequence = refdata.sequence(ref_sequence_name)
diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py
index 7b7de6a..352b74e 100644
--- a/ariba/tests/cluster_test.py
+++ b/ariba/tests/cluster_test.py
@@ -216,9 +216,9 @@ class TestCluster(unittest.TestCase):
def test_full_run_ok_variants_only_variant_not_present_always_report(self):
'''test complete run of cluster on a variants only gene when variant not present but always report variant'''
fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv')
+ tsv_in = os.path.join(data_dir, 'cluster_full_run_varonly.not_present.always_report.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
- tmpdir = 'tmp.cluster_test_full_run_ok_variants_only.not_present.always_report'
+ tmpdir = 'tmp.cluster_full_run_varonly.not_present.always_report'
shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300)
@@ -265,13 +265,13 @@ class TestCluster(unittest.TestCase):
shutil.rmtree(tmpdir)
- def test_full_run_ok_samtools_snp_pres_abs_gene(self):
+ def test_full_run_smtls_snp_presabs_gene(self):
'''test complete run where samtools calls a snp in a presence/absence gene'''
- fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_gene.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_gene.metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_pres_abs_gene'
- shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_gene'), tmpdir)
+ shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
c.run()
expected = [
@@ -281,13 +281,15 @@ class TestCluster(unittest.TestCase):
shutil.rmtree(tmpdir)
- def test_full_run_ok_samtools_snp_var_only_gene(self):
+ def test_full_run_smtls_snp_varonly_gene_2(self):
'''test complete run where samtools calls a snp in a variant only gene'''
- fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_gene.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_gene.metadata.tsv')
+ # _2 because I think test_full_run_smtls_snp_varonly_gene tests the same functionality.
+ # ... but let's leave both tests in anyway
+ fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
- tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_var_only_gene'
- shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_gene'), tmpdir)
+ tmpdir = 'tmp.cluster_full_run_smtls_snp_varonly_gene_2'
+ shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
c.run()
expected = [
@@ -297,13 +299,13 @@ class TestCluster(unittest.TestCase):
shutil.rmtree(tmpdir)
- def test_full_run_ok_samtools_snp_known_position_pres_abs_gene(self):
+ def test_full_run_known_smtls_snp_presabs_gene(self):
'''test complete run where samtools calls a snp at a known snp location in a presence/absence gene'''
- fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene'
- shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene'), tmpdir)
+ shutil.copytree(os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
c.run()
@@ -316,13 +318,13 @@ class TestCluster(unittest.TestCase):
shutil.rmtree(tmpdir)
- def test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var(self):
+ def test_full_run_smtls_snp_varonly_gene_no_snp(self):
'''test complete run where samtools calls a snp at a known snp location in a variant only gene, gene does not have variant'''
- fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_no_snp.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_no_snp.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
- tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var'
- shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var'), tmpdir)
+ tmpdir = 'tmp.cluster_test_full_run_smtls_snp_varonly_gene_no_snp'
+ shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_no_snp'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
c.run()
@@ -335,13 +337,13 @@ class TestCluster(unittest.TestCase):
shutil.rmtree(tmpdir)
- def test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var(self):
+ def test_full_run_smtls_snp_varonly_gene(self):
'''test complete run where samtools calls a snp at a known snp location in a variant only gene, gene does have variant'''
- fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var'
- shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var'), tmpdir)
+ shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
c.run()
@@ -354,13 +356,13 @@ class TestCluster(unittest.TestCase):
shutil.rmtree(tmpdir)
- def test_full_run_ok_samtools_snp_pres_abs_noncoding(self):
+ def test_full_run_smtls_snp_presabs_nonc(self):
'''test complete run where samtools calls a snp in a presence/absence noncoding sequence'''
- fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_nonc.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_nonc.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
- tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding'
- shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding'), tmpdir)
+ tmpdir = 'tmp.cluster_test_full_run_smtls_snp_presabs_nonc'
+ shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_nonc'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
c.run()
expected = [
@@ -370,13 +372,13 @@ class TestCluster(unittest.TestCase):
shutil.rmtree(tmpdir)
- def test_full_run_ok_samtools_snp_var_only_noncoding(self):
+ def test_full_run_smtls_snp_varonly_nonc(self):
'''test complete run where samtools calls a snp in a presence/absence noncoding sequence'''
- fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_noncoding.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_noncoding.metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
- tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_var_only_noncoding'
- shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_noncoding'), tmpdir)
+ tmpdir = 'tmp.cluster_full_run_smtls_snp_varonly_nonc'
+ shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
c.run()
expected = [
@@ -386,13 +388,13 @@ class TestCluster(unittest.TestCase):
shutil.rmtree(tmpdir)
- def test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding(self):
+ def test_full_run_known_smtls_snp_presabs_nonc(self):
'''test complete run where samtools calls a snp at a known snp location in a presence/absence noncoding'''
- fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_nonc.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_nonc.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding'
- shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding'), tmpdir)
+ shutil.copytree(os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_nonc'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
c.run()
@@ -405,13 +407,13 @@ class TestCluster(unittest.TestCase):
shutil.rmtree(tmpdir)
- def test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var(self):
+ def test_full_run_smtls_snp_varonly_nonc_no_snp(self):
'''test complete run where samtools calls a snp at a known snp location in a presence/absence noncoding and sample does not have the var'''
- fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc_no_snp.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc_no_snp.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding'
- shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var'), tmpdir)
+ shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc_no_snp'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
c.run()
@@ -424,13 +426,13 @@ class TestCluster(unittest.TestCase):
shutil.rmtree(tmpdir)
- def test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var(self):
+ def test_full_run_cluster_test_full_run_smtls_snp_varonly_nonc(self):
'''test complete run where samtools calls a snp at a known snp location in a presence/absence noncoding and sample has the var'''
- fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.fa')
- tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.metadata.tsv')
+ fasta_in = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding'
- shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var'), tmpdir)
+ shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc'), tmpdir)
c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
c.run()
@@ -441,3 +443,20 @@ class TestCluster(unittest.TestCase):
]
self.assertEqual(expected, c.report_lines)
shutil.rmtree(tmpdir)
+
+
+ def test_full_run_partial_assembly(self):
+ '''Test complete run where only part of the ref gene is present in the reads'''
+ fasta_in = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.fa')
+ tsv_in = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.tsv')
+ refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
+ tmpdir = 'tmp.cluster_test_full_run_partial_assembly'
+ shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly'), tmpdir)
+ c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=278, total_reads_bases=15020)
+ c.run()
+
+ expected = [
+ 'presence_absence1\t1\t0\t19\t278\tcluster_name\t96\t77\t100.0\tcluster_name.scaffold.1\t949\t20.5\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of presence_absence1'
+ ]
+ self.assertEqual(expected, c.report_lines)
+ shutil.rmtree(tmpdir)
diff --git a/ariba/tests/clusters_test.py b/ariba/tests/clusters_test.py
index 38d1ae0..a107003 100644
--- a/ariba/tests/clusters_test.py
+++ b/ariba/tests/clusters_test.py
@@ -49,9 +49,9 @@ class TestClusters(unittest.TestCase):
self.assertEqual(expected, got)
- def test_load_reference_data_from_dir(self):
+ def test_load_ref_data_from_dir(self):
'''test _load_reference_data_from_dir'''
- indir = os.path.join(data_dir, 'clusters_test_load_reference_data_from_dir')
+ indir = os.path.join(data_dir, 'clusters_load_ref_data_from_dir')
got_refdata, got_clusters = clusters.Clusters._load_reference_data_from_dir(indir)
expected_seq_dict = {
'variants_only1': pyfastaq.sequences.Fasta('variants_only1', 'atggcgtgcgatgaataa'),
@@ -92,16 +92,16 @@ class TestClusters(unittest.TestCase):
def test_minimap_reads_to_all_ref_seqs(self):
'''test test_minimap_reads_to_all_ref_seqs'''
- clusters_tsv = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.clusters.tsv')
- ref_fasta = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.ref.fa')
- reads_1 = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.reads_1.fq')
- reads_2 = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.reads_2.fq')
+ clusters_tsv = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.clstrs.tsv')
+ ref_fasta = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.ref.fa')
+ reads_1 = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.reads_1.fq')
+ reads_2 = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.reads_2.fq')
tmp_outprefix = 'tmp.clusters_test_minimap_reads_to_all_ref_seqs'
clusters.Clusters._minimap_reads_to_all_ref_seqs(clusters_tsv, ref_fasta, reads_1, reads_2, tmp_outprefix)
- expected_cluster2rep = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.out.cluster2representative')
- expected_cluster_counts = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.out.clusterCounts')
- expected_proper_pairs = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.out.properPairs')
- expected_insert_hist = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.out.insertHistogram')
+ expected_cluster2rep = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.out.clstr2rep')
+ expected_cluster_counts = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.out.clstr_count')
+ expected_proper_pairs = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.out.pairs')
+ expected_insert_hist = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.out.hist')
# not sure that the reads order is preserved, so just check read store file exists
self.assertTrue(os.path.exists(os.path.join(tmp_outprefix + '.reads')))
@@ -258,7 +258,7 @@ class TestClusters(unittest.TestCase):
os.unlink(tmp_file)
- def test_write_catted_genes_matching_refs_fasta(self):
+ def test_cat_genes_match_ref(self):
'''test _write_catted_genes_matching_refs_fasta'''
seq1 = pyfastaq.sequences.Fasta('seq1', 'ACGT')
seq3 = pyfastaq.sequences.Fasta('seq3', 'AAAA')
@@ -281,7 +281,7 @@ class TestClusters(unittest.TestCase):
tmp_file = 'tmp.test_write_catted_genes_matching_refs_fasta.fa'
self.clusters._write_catted_genes_matching_refs_fasta(tmp_file)
- expected = os.path.join(data_dir, 'clusters_test_write_catted_genes_matching_refs_fasta.expected.out.fa')
+ expected = os.path.join(data_dir, 'clusters_cat_genes_match_ref.fa')
self.assertTrue(filecmp.cmp(expected, tmp_file, shallow=False))
os.unlink(tmp_file)
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa b/ariba/tests/data/assembly_variants_one_var_one_ctg_cdg.fa
similarity index 100%
rename from ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa
rename to ariba/tests/data/assembly_variants_one_var_one_ctg_cdg.fa
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv b/ariba/tests/data/assembly_variants_one_var_one_ctg_cdg.tsv
similarity index 100%
rename from ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv
rename to ariba/tests/data/assembly_variants_one_var_one_ctg_cdg.tsv
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_non_coding.fa b/ariba/tests/data/assembly_variants_one_var_one_ctg_noncdg.fa
similarity index 100%
rename from ariba/tests/data/assembly_variants_test_get_variants_non_coding.fa
rename to ariba/tests/data/assembly_variants_one_var_one_ctg_noncdg.fa
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv b/ariba/tests/data/assembly_variants_one_var_one_ctg_noncdg.tsv
similarity index 100%
rename from ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv
rename to ariba/tests/data/assembly_variants_one_var_one_ctg_noncdg.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.fa
similarity index 100%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
copy to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.ref_for_reads.fa b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.ref_for_reads.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.ref_for_reads.fa
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.ref_for_reads.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.metadata.tsv b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.metadata.tsv
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/reads_1.fq b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/reads_1.fq
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/reads_2.fq b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/reads_2.fq
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/references.fa b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/references.fa
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc.fa
similarity index 100%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa
copy to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.metadata.tsv b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.metadata.tsv
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/reads_1.fq b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/reads_1.fq
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/reads_2.fq b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/reads_2.fq
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/references.fa b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/references.fa
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.ref_for_reads.fa b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.ref_for_reads.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.ref_for_reads.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.ref_for_reads.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/references.fa
similarity index 100%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
copy to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/references.fa
similarity index 100%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
copy to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/references.fa
similarity index 100%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa
copy to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv b/ariba/tests/data/cluster_full_run_varonly.not_present.always_report.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv
rename to ariba/tests/data/cluster_full_run_varonly.not_present.always_report.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_gene_start_mismatch.ref_to_make_reads.fa b/ariba/tests/data/cluster_test_full_run_ok_gene_start_mismatch.ref_for_reads.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_gene_start_mismatch.ref_to_make_reads.fa
rename to ariba/tests/data/cluster_test_full_run_ok_gene_start_mismatch.ref_for_reads.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.fa b/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.fa
deleted file mode 100644
index c42d06b..0000000
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.fa
+++ /dev/null
@@ -1,3 +0,0 @@
->ref_gene
-ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
-ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.fa b/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.fa
deleted file mode 100644
index a23e635..0000000
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.fa
+++ /dev/null
@@ -1,3 +0,0 @@
->ref_seq
-ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
-ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene.fa b/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene.fa
deleted file mode 100644
index c42d06b..0000000
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene.fa
+++ /dev/null
@@ -1,3 +0,0 @@
->ref_gene
-ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
-ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding.fa b/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding.fa
deleted file mode 100644
index a23e635..0000000
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding.fa
+++ /dev/null
@@ -1,3 +0,0 @@
->ref_seq
-ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
-ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_test_full_run_partial_asmbly.fa
similarity index 83%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
copy to ariba/tests/data/cluster_test_full_run_partial_asmbly.fa
index c42d06b..d7a6970 100644
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
+++ b/ariba/tests/data/cluster_test_full_run_partial_asmbly.fa
@@ -1,3 +1,3 @@
->ref_gene
+>presence_absence1
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_partial_asmbly.tsv b/ariba/tests/data/cluster_test_full_run_partial_asmbly.tsv
new file mode 100644
index 0000000..da35140
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_partial_asmbly.tsv
@@ -0,0 +1 @@
+presence_absence1 1 0 . . Generic description of presence_absence1
diff --git a/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_1.fq b/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_1.fq
new file mode 100644
index 0000000..f53750a
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_1.fq
@@ -0,0 +1,432 @@
+ at presence_absence1:1:128:236/1
+CTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:2:637:746/1
+CCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:3:420:530/1
+TAAGGGTCTTGATTATTCTCCCTTGGTACAACTCTGCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:4:125:236/1
+GAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:5:297:405/1
+GTCAAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:6:649:758/1
+AGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:7:568:678/1
+TCGACTGACGGCTCACCCGCATCGGGACAGATAACGTCTTCCTCTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:8:494:606/1
+CGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:9:386:495/1
+CAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACAACTCTGCACTGTAACATTAGTTTACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:10:129:238/1
+TACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGGACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:11:735:846/1
+TCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:12:360:470/1
+TTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:13:730:840/1
+TCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:14:669:777/1
+GACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:15:657:766/1
+ACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:16:778:887/1
+ACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAGGATCCACTTGCTGCGACTTATTCCGTCTGTATCTGAAGATCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:17:340:450/1
+GCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:18:463:573/1
+ACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:19:371:481/1
+ACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACAACTCTGCACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:20:520:629/1
+ATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACGTCTTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:21:300:411/1
+AAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:22:316:426/1
+TGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:23:494:603/1
+CGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:24:507:619/1
+ACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:25:351:459/1
+GCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:26:709:818/1
+TATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:27:474:583/1
+CACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:28:181:292/1
+ATGTAGCGTTAGGGACACCGGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:29:200:308/1
+GGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:30:435:545/1
+TTCTCCCTTGGTACAACTCTGCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:31:212:323/1
+GGGACTGCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:32:134:244/1
+ATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGGACTGCACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:33:652:761/1
+CGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:34:200:311/1
+GGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:35:516:624/1
+AGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:36:667:778/1
+TTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:37:767:875/1
+AGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAGGATCCACTTGCTGCGACTTATTCCGTCTGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:38:78:188/1
+CCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:39:183:292/1
+GTAGCGTTAGGGACACCGGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:40:446:556/1
+TACAACTCTGCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:41:747:856/1
+CGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAGGATCCACTTGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:42:672:781/1
+AACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:43:103:214/1
+CAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:44:115:225/1
+CTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:45:615:725/1
+CGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:46:775:886/1
+AAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAGGATCCACTTGCTGCGACTTATTCCGTCTGTATCTGAAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:47:721:832/1
+ACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:48:503:614/1
+GAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:49:279:388/1
+CCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:50:691:802/1
+ATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:51:713:821/1
+CGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:52:626:736/1
+ATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:53:739:849/1
+TTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAGGATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:54:650:757/1
+GACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:55:544:654/1
+AAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACGTCTTCCTCTCGCGTCGTATCGAATGGAATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:56:75:184/1
+GGACCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:57:77:187/1
+ACCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:58:493:602/1
+CCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:59:250:361/1
+TCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:60:438:547/1
+TCCCTTGGTACAACTCTGCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:61:656:767/1
+AACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:62:674:784/1
+CGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:63:609:720/1
+CTCTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:64:459:570/1
+TGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:65:514:624/1
+TTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:66:628:736/1
+GGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:67:735:845/1
+TCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:68:273:385/1
+GGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:69:676:787/1
+ACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:70:28:137/1
+TCTACTTCCAGACCCGTCTCGATATCTCACCTTTGCCCCAACTCGGCGGACCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:71:634:744/1
+CACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:72:346:455/1
+TGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:73:153:263/1
+GAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:74:106:215/1
+TAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:75:219:329/1
+CACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:76:591:700/1
+GGGACAGATAACGTCTTCCTCTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:77:260:371/1
+CTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:78:60:170/1
+TTGCCCCAACTCGGCGGACCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:79:611:720/1
+CTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:80:688:797/1
+GATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:81:604:713/1
+TCTTCCTCTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:82:622:731/1
+TCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:83:72:182/1
+GGCGGACCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:84:327:438/1
+TAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:85:231:342/1
+CAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:86:733:843/1
+CCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:87:154:265/1
+AGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:88:218:328/1
+GCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:89:239:349/1
+TGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:90:315:422/1
+TTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:91:351:460/1
+GCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:92:224:333/1
+CACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:93:371:479/1
+ACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACAACTCTGCACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:94:543:653/1
+GAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACGTCTTCCTCTCGCGTCGTATCGAATGGAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:95:345:454/1
+TTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:96:88:196/1
+AGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:97:680:789/1
+CTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:98:439:548/1
+CCCTTGGTACAACTCTGCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:99:523:633/1
+TTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACGTCTTCCTCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:100:455:566/1
+GCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:101:365:474/1
+AGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACAACTCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:102:496:608/1
+AACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:103:392:502/1
+GTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACAACTCTGCACTGTAACATTAGTTTACACTAACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:104:345:455/1
+TTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:105:714:825/1
+GCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:106:589:699/1
+TCGGGACAGATAACGTCTTCCTCTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:107:237:346/1
+ACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:108:642:750/1
+TATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_2.fq b/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_2.fq
new file mode 100644
index 0000000..de62a9d
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_2.fq
@@ -0,0 +1,432 @@
+ at presence_absence1:1:128:236/2
+GCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:2:637:746/2
+GCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:3:420:530/2
+CGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATGCTTTCCCACGCGCTAATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:4:125:236/2
+GCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:5:297:405/2
+GGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAGACCTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:6:649:758/2
+GAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:7:568:678/2
+TTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTTCGGAGCGGATATCGGATAGTTGTTACTATATCCGCGTTAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:8:494:606/2
+ACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:9:386:495/2
+GGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:10:129:238/2
+ATGCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:11:735:846/2
+AAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:12:360:470/2
+CTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:13:730:840/2
+CCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:14:669:777/2
+TGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:15:657:766/2
+TACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:16:778:887/2
+CCTTCCCGAGTGGTTAGATAGTAGTCACAGCGGCTTATGTAAAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:17:340:450/2
+CGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:18:463:573/2
+GAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:19:371:481/2
+CGACCAGCCAGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:20:520:629/2
+GGAGCGGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:21:300:411/2
+GCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:22:316:426/2
+AATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:23:494:603/2
+ATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:24:507:619/2
+TCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:25:351:459/2
+GCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:26:709:818/2
+CGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:27:474:583/2
+TGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:28:181:292/2
+TTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGATTAGACTCTTTGACAATCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:29:200:308/2
+CGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:30:435:545/2
+GGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATGCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:31:212:323/2
+AGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCTATGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:32:134:244/2
+CCCGCTATGCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:33:652:761/2
+ACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:34:200:311/2
+TACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:35:516:624/2
+GGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:36:667:778/2
+TTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:37:767:875/2
+GTTAGATAGTAGTCACAGCGGCTTATGTAAAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:38:78:188/2
+AAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:39:183:292/2
+TTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGATTAGACTCTTTGACAATCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:40:446:556/2
+GATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:41:747:856/2
+GGCTTATGTAAAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:42:672:781/2
+ATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:43:103:214/2
+CTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:44:115:225/2
+TGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:45:615:725/2
+GAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:46:775:886/2
+CTTCCCGAGTGGTTAGATAGTAGTCACAGCGGCTTATGTAAAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:47:721:832/2
+ATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:48:503:614/2
+TAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:49:279:388/2
+AGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:50:691:802/2
+CGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:51:713:821/2
+GCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:52:626:736/2
+CCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:53:739:849/2
+GTAAAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:54:650:757/2
+AATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:55:544:654/2
+ATAAATTGAGGCGACTCGAGAGTTCGGAGCGGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:56:75:184/2
+CTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAACGCTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:57:77:187/2
+AGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:58:493:602/2
+TATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:59:250:361/2
+TTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:60:438:547/2
+GTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:61:656:767/2
+ATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:62:674:784/2
+GGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:63:609:720/2
+GAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:64:459:570/2
+GGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:65:514:624/2
+GGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:66:628:736/2
+CCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:67:735:845/2
+AAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:68:273:385/2
+GTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:69:676:787/2
+ATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:70:28:137/2
+GTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAACGCTACATGTGGAGCGCTCTACATGATGTTTCACTCAACAAACGGGGATACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:71:634:744/2
+AAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:72:346:455/2
+TCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:73:153:263/2
+GCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:74:106:215/2
+TCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:75:219:329/2
+GGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:76:591:700/2
+TATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTTCGGAGCGGATATCGGATAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:77:260:371/2
+CAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:78:60:170/2
+CAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAACGCTACATGTGGAGCGCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:79:611:720/2
+GAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:80:688:797/2
+ACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:81:604:713/2
+GTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTTCGGAGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:82:622:731/2
+TAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:83:72:182/2
+CTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAACGCTACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:84:327:438/2
+GTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:85:231:342/2
+TCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:86:733:843/2
+AGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:87:154:265/2
+AAGCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:88:218:328/2
+GTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:89:239:349/2
+AGAATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:90:315:422/2
+TTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:91:351:460/2
+TGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:92:224:333/2
+TTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:93:371:479/2
+ACCAGCCAGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:94:543:653/2
+TAAATTGAGGCGACTCGAGAGTTCGGAGCGGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:95:345:454/2
+CCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:96:88:196/2
+GGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:97:680:789/2
+AGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:98:439:548/2
+GGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:99:523:633/2
+GTTCGGAGCGGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:100:455:566/2
+AGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:101:365:474/2
+CCAGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:102:496:608/2
+TTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:103:392:502/2
+CGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:104:345:455/2
+TCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:105:714:825/2
+GCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:106:589:699/2
+ATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTTCGGAGCGGATATCGGATAGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:107:237:346/2
+ATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:108:642:750/2
+CGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_test_full_run_partial_asmbly/references.fa
similarity index 83%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
rename to ariba/tests/data/cluster_test_full_run_partial_asmbly/references.fa
index c42d06b..d7a6970 100644
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
+++ b/ariba/tests/data/cluster_test_full_run_partial_asmbly/references.fa
@@ -1,3 +1,3 @@
->ref_gene
+>presence_absence1
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.fa b/ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.fa
rename to ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.metadata.tsv b/ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.metadata.tsv
rename to ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/reads_1.fq b/ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/reads_1.fq
rename to ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/reads_2.fq b/ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/reads_2.fq
rename to ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa b/ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa
rename to ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/references.fa
diff --git a/ariba/tests/data/clusters_test_write_catted_genes_matching_refs_fasta.expected.out.fa b/ariba/tests/data/clusters_cat_genes_match_ref.fa
similarity index 100%
rename from ariba/tests/data/clusters_test_write_catted_genes_matching_refs_fasta.expected.out.fa
rename to ariba/tests/data/clusters_cat_genes_match_ref.fa
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/00.info.txt b/ariba/tests/data/clusters_load_ref_data_from_dir/00.info.txt
similarity index 100%
rename from ariba/tests/data/clusters_test_load_reference_data_from_dir/00.info.txt
rename to ariba/tests/data/clusters_load_ref_data_from_dir/00.info.txt
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/01.filter.check_metadata.tsv b/ariba/tests/data/clusters_load_ref_data_from_dir/01.filter.check_metadata.tsv
similarity index 100%
rename from ariba/tests/data/clusters_test_load_reference_data_from_dir/01.filter.check_metadata.tsv
rename to ariba/tests/data/clusters_load_ref_data_from_dir/01.filter.check_metadata.tsv
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/02.cdhit.all.fa b/ariba/tests/data/clusters_load_ref_data_from_dir/02.cdhit.all.fa
similarity index 100%
rename from ariba/tests/data/clusters_test_load_reference_data_from_dir/02.cdhit.all.fa
rename to ariba/tests/data/clusters_load_ref_data_from_dir/02.cdhit.all.fa
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/02.cdhit.clusters.pickle b/ariba/tests/data/clusters_load_ref_data_from_dir/02.cdhit.clusters.pickle
similarity index 100%
rename from ariba/tests/data/clusters_test_load_reference_data_from_dir/02.cdhit.clusters.pickle
rename to ariba/tests/data/clusters_load_ref_data_from_dir/02.cdhit.clusters.pickle
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.clusters.tsv b/ariba/tests/data/clusters_minimap_reads_to_all_refs.clstrs.tsv
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.clusters.tsv
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.clstrs.tsv
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.cluster2representative b/ariba/tests/data/clusters_minimap_reads_to_all_refs.out.clstr2rep
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.cluster2representative
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.out.clstr2rep
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.clusterCounts b/ariba/tests/data/clusters_minimap_reads_to_all_refs.out.clstr_count
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.clusterCounts
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.out.clstr_count
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.insertHistogram b/ariba/tests/data/clusters_minimap_reads_to_all_refs.out.hist
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.insertHistogram
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.out.hist
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.properPairs b/ariba/tests/data/clusters_minimap_reads_to_all_refs.out.pairs
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.properPairs
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.out.pairs
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.reads_1.fq b/ariba/tests/data/clusters_minimap_reads_to_all_refs.reads_1.fq
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.reads_1.fq
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.reads_1.fq
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.reads_2.fq b/ariba/tests/data/clusters_minimap_reads_to_all_refs.reads_2.fq
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.reads_2.fq
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.reads_2.fq
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.ref.fa b/ariba/tests/data/clusters_minimap_reads_to_all_refs.ref.fa
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.ref.fa
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.ref.fa
diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv
new file mode 100644
index 0000000..71dc3e2
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv
@@ -0,0 +1,3 @@
+seq1 1 0 . . .
+seq2 1 0 . . Original name: seq2 spam eggs
+seq3 1 0 . . Original name: seq3 hello dave
diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa
new file mode 100644
index 0000000..8acb693
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa
@@ -0,0 +1,6 @@
+>seq1
+CACTACAT
+>seq2 spam eggs
+AAAA
+>seq3 hello dave
+GGGGG
diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv
new file mode 100644
index 0000000..5ad3200
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv
@@ -0,0 +1,3 @@
+seq1 0 0 . . .
+seq2 0 0 . . Original name: seq2 spam eggs
+seq3 0 0 . . Original name: seq3 hello dave
diff --git a/ariba/tests/data/ref_preparer_test_run.out/00.info.txt b/ariba/tests/data/ref_preparer_test_run.out/00.info.txt
index c05ab65..52774fd 100644
--- a/ariba/tests/data/ref_preparer_test_run.out/00.info.txt
+++ b/ariba/tests/data/ref_preparer_test_run.out/00.info.txt
@@ -1,6 +1,6 @@
-input fasta file: /home/ubuntu/git/ariba/ariba/tests/data/ref_preparer_test_run.in.1.fa
-input fasta file: /home/ubuntu/git/ariba/ariba/tests/data/ref_preparer_test_run.in.2.fa
-input fasta file: /home/ubuntu/git/ariba/ariba/tests/data/ref_preparer_test_run.in.3.fa
-input tsv file: /home/ubuntu/git/ariba/ariba/tests/data/ref_preparer_test_run.in.1.tsv
-input tsv file: /home/ubuntu/git/ariba/ariba/tests/data/ref_preparer_test_run.in.2.tsv
+input fasta file: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.1.fa
+input fasta file: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.2.fa
+input fasta file: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.3.fa
+input tsv file: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.1.tsv
+input tsv file: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.2.tsv
genetic_code 1
diff --git a/ariba/tests/data/ref_preparer_test_run.out/00.version_info.txt b/ariba/tests/data/ref_preparer_test_run.out/00.version_info.txt
index 0373c3d..fe7f1bb 100644
--- a/ariba/tests/data/ref_preparer_test_run.out/00.version_info.txt
+++ b/ariba/tests/data/ref_preparer_test_run.out/00.version_info.txt
@@ -1,5 +1,5 @@
ARIBA run with this command:
-python3 -m unittest prepareref ariba.tests.ref_preparer_test
-from this directory: /home/ubuntu/git/ariba
+python3 -m unittest prepareref ariba.tests.ref_preparer_test.TestRefPreparer
+from this directory: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba
diff --git a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.pickle b/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.pickle
index 6545b8b..a113521 100644
Binary files a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.pickle and b/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.pickle differ
diff --git a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv b/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
index 217fd98..bd9af57 100644
--- a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
+++ b/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
@@ -1,5 +1,5 @@
-cluster_1 gene1 gene2
-cluster_2 gene3
+cluster gene1 gene2
+cluster_1 gene3
gene4 gene4.var_only
noncoding- noncoding1 noncoding2.var_only noncoding3.var_only
noncoding4 noncoding4.var_only
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv
new file mode 100644
index 0000000..d2dc5ff
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv
@@ -0,0 +1,9 @@
+gene1 0 0 . . .
+gene2 0 0 . . .
+gene3 0 0 . . .
+gene4.var_only 0 0 . . .
+noncoding1 0 0 . . .
+noncoding2.var_only 0 0 . . .
+noncoding3.var_only 0 0 . . .
+noncoding4.var_only 0 0 . . .
+cannot_make_into_a_gene 0 0 . . .
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt
new file mode 100644
index 0000000..2a0b555
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt
@@ -0,0 +1,5 @@
+input fasta file: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.1.fa
+input fasta file: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.2.fa
+input fasta file: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.3.fa
+input tsv file: tmp.ref_preparer_test_run/00.auto_metadata.tsv
+genetic_code 1
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt
new file mode 100644
index 0000000..fe7f1bb
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt
@@ -0,0 +1,5 @@
+ARIBA run with this command:
+python3 -m unittest prepareref ariba.tests.ref_preparer_test.TestRefPreparer
+from this directory: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba
+
+
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_genes.log b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_genes.log
new file mode 100644
index 0000000..e69de29
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.log b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.log
new file mode 100644
index 0000000..e69de29
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv
new file mode 100644
index 0000000..626e9a3
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv
@@ -0,0 +1,9 @@
+cannot_make_into_a_gene 0 0 . . .
+gene1 0 0 . . .
+gene2 0 0 . . .
+gene3 0 0 . . .
+gene4.var_only 0 0 . . .
+noncoding1 0 0 . . .
+noncoding2.var_only 0 0 . . .
+noncoding3.var_only 0 0 . . .
+noncoding4.var_only 0 0 . . .
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa
new file mode 100644
index 0000000..339405a
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa
@@ -0,0 +1,18 @@
+>cannot_make_into_a_gene
+AAAAAAAAAAAAAAAA
+>gene1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
+>gene2
+ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
+>gene3
+ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
+>gene4.var_only
+ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
+>noncoding1
+CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding2.var_only
+CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding3.var_only
+CTACTGATTATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding4.var_only
+CAACCACATGCAGTCATGCAACCAACACTCTCATCTAA
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle
new file mode 100644
index 0000000..c48f0d8
Binary files /dev/null and b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle differ
diff --git a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.tsv
similarity index 53%
copy from ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
copy to ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.tsv
index 217fd98..da82b78 100644
--- a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.tsv
@@ -1,5 +1,5 @@
-cluster_1 gene1 gene2
-cluster_2 gene3
-gene4 gene4.var_only
+cluster gene1 gene2
+cluster_1 cannot_make_into_a_gene
+gene4+ gene3 gene4.var_only
noncoding- noncoding1 noncoding2.var_only noncoding3.var_only
noncoding4 noncoding4.var_only
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.fa
new file mode 100644
index 0000000..e69de29
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.varonly.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.varonly.fa
new file mode 100644
index 0000000..e69de29
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.fa
new file mode 100644
index 0000000..339405a
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.fa
@@ -0,0 +1,18 @@
+>cannot_make_into_a_gene
+AAAAAAAAAAAAAAAA
+>gene1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
+>gene2
+ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
+>gene3
+ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
+>gene4.var_only
+ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
+>noncoding1
+CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding2.var_only
+CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding3.var_only
+CTACTGATTATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding4.var_only
+CAACCACATGCAGTCATGCAACCAACACTCTCATCTAA
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.varonly.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.varonly.fa
new file mode 100644
index 0000000..e69de29
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.expected.clusters.tsv b/ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.expect.clstrs.tsv
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.expected.clusters.tsv
rename to ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.expect.clstrs.tsv
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.clusters.tsv b/ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.clstrs.tsv
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.clusters.tsv
rename to ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.clstrs.tsv
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.fa b/ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.fa
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.fa
rename to ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.fa
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.meta.tsv b/ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.meta.tsv
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.meta.tsv
rename to ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.meta.tsv
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.expected.clusters.tsv b/ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.expect.tsv
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.expected.clusters.tsv
rename to ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.expect.tsv
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.in.fa b/ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.in.fa
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.in.fa
rename to ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.in.fa
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.in.tsv b/ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.in.tsv
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.in.tsv
rename to ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.in.tsv
diff --git a/ariba/tests/data/reference_data_load_fasta_file.fa b/ariba/tests/data/reference_data_load_fasta_file.fa
index 6b27dae..e99410d 100644
--- a/ariba/tests/data/reference_data_load_fasta_file.fa
+++ b/ariba/tests/data/reference_data_load_fasta_file.fa
@@ -1,2 +1,2 @@
->seq1
+>seq1 foo
ACGT
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.csv.1 b/ariba/tests/data/reference_data_load_input_check_seq_names.bad.csv.1
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.csv.1
rename to ariba/tests/data/reference_data_load_input_check_seq_names.bad.csv.1
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.csv.2 b/ariba/tests/data/reference_data_load_input_check_seq_names.bad.csv.2
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.csv.2
rename to ariba/tests/data/reference_data_load_input_check_seq_names.bad.csv.2
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.fa.1 b/ariba/tests/data/reference_data_load_input_check_seq_names.bad.fa.1
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.fa.1
rename to ariba/tests/data/reference_data_load_input_check_seq_names.bad.fa.1
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.fa.2 b/ariba/tests/data/reference_data_load_input_check_seq_names.bad.fa.2
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.fa.2
rename to ariba/tests/data/reference_data_load_input_check_seq_names.bad.fa.2
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.csv.1 b/ariba/tests/data/reference_data_load_input_check_seq_names.good.csv.1
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.csv.1
rename to ariba/tests/data/reference_data_load_input_check_seq_names.good.csv.1
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.csv.2 b/ariba/tests/data/reference_data_load_input_check_seq_names.good.csv.2
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.csv.2
rename to ariba/tests/data/reference_data_load_input_check_seq_names.good.csv.2
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.fa.1 b/ariba/tests/data/reference_data_load_input_check_seq_names.good.fa.1
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.fa.1
rename to ariba/tests/data/reference_data_load_input_check_seq_names.good.fa.1
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.fa.2 b/ariba/tests/data/reference_data_load_input_check_seq_names.good.fa.2
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.fa.2
rename to ariba/tests/data/reference_data_load_input_check_seq_names.good.fa.2
diff --git a/ariba/tests/data/reference_data_rename_sequences.fa b/ariba/tests/data/reference_data_rename_sequences.fa
index e45e17b..60c820e 100644
--- a/ariba/tests/data/reference_data_rename_sequences.fa
+++ b/ariba/tests/data/reference_data_rename_sequences.fa
@@ -1,7 +1,5 @@
>pres_abs1 foo bar spam eggs
ACGT
->pres_abs1 blah
-AAAA
>pres'abs1
CCCC
>pres_abs2
@@ -12,11 +10,9 @@ GGGG
AAAA
>var:only1 boo
CCCC
->var_only1
+>var,only1
GGGG
>var_only2
TTTT
->noncoding1
-AAAA
>noncoding1 blah
-CCCC
+AAAA
diff --git a/ariba/tests/data/reference_data_rename_sequences_metadata.tsv b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
index 462c736..41c6e8c 100644
--- a/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
+++ b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
@@ -1,11 +1,9 @@
-noncoding1 0 0 . . original name "noncoding1"
-noncoding1 blah 0 0 . . original name "noncoding1 blah"
-pres_abs1 foo bar spam eggs 0 0 . . original name "pres_abs1 foo bar spam eggs"
-pres_abs1 blah 0 0 . . original name "pres_abs1 blah"
+noncoding1 0 0 . . original name "noncoding1 blah"
+pres_abs1 0 0 . . original name "pres_abs1 foo bar spam eggs"
pres'abs1 0 0 . . original name "pres'abs1"
pres_abs2 0 0 . . original name "pres_abs2"
pres!abs3 0 0 . . original name "pres!abs3"
-var_only1 hello 0 0 . . original name "var_only1 hello"
-var:only1 boo 0 0 . . original name "var:only1 boo"
-var_only1 0 0 . . original name "var_only1"
+var_only1 0 0 . . original name "var_only1 hello"
+var:only1 0 0 . . original name "var:only1 boo"
+var,only1 0 0 . . original name "var,only1"
var_only2 0 0 . . original name "var_only2"
diff --git a/ariba/tests/data/reference_data_test_rename_sequences.out b/ariba/tests/data/reference_data_test_rename_sequences.out
index d47d87c..e48b0d5 100644
--- a/ariba/tests/data/reference_data_test_rename_sequences.out
+++ b/ariba/tests/data/reference_data_test_rename_sequences.out
@@ -1,8 +1,6 @@
-noncoding1 blah noncoding1_1
pres!abs3 pres_abs3
pres'abs1 pres_abs1
-pres_abs1 blah pres_abs1_1
-pres_abs1 foo bar spam eggs pres_abs1_2
-var:only1 boo var_only1
-var_only1 var_only1_1
-var_only1 hello var_only1_2
+pres_abs1 pres_abs1_1
+var,only1 var_only1
+var:only1 var_only1_1
+var_only1 var_only1_2
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.asmbly.fa
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.asmbly.fa
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa.fai b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.asmbly.fa.fai
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa.fai
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.asmbly.fa.fai
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.bam b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.bam
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.bam
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.bam
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.depths.gz
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.depths.gz
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz.tbi b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.depths.gz.tbi
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz.tbi
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.depths.gz.tbi
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.vcf
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.vcf
diff --git a/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv b/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv
new file mode 100644
index 0000000..c652f1c
--- /dev/null
+++ b/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv
@@ -0,0 +1,6 @@
+#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
+noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding_ref1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
+noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A42T 1 A42T SNP 42 42 A 84 84 T 17 . 17 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1
+noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1
+presence_absence_ref1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence_ref1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1
+presence_absence_ref2 1 0 528 232 presence_absence2 1005 554 99.1 presence_absence2.scaffold.1 1032 22.3 0 . p . 0 V175L NONSYN 522 522 G 265 265 C 36 . 36 . Description foo bar
diff --git a/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv b/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv
new file mode 100644
index 0000000..4a23ebc
--- /dev/null
+++ b/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv
@@ -0,0 +1,6 @@
+#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
+noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
+noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1
+noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1
+presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1
+variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . .
diff --git a/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv b/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv
index 5e12e4a..159949c 100644
--- a/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv
+++ b/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv
@@ -1,8 +1,8 @@
#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:n:A14T:.:ref has wild type, reads have variant so should report generic description of noncoding1
+noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:.:ref has wild type, reads have variant so should report generic description of noncoding1
noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 0 SNP . . . G15T SNP 15 15 G 85 85 T 17 . 17 . generic description of noncoding1
-noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1
+noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1
noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2
-presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1
+presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1
presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2
-variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:.:Ref and reads have variant so report Generic description of variants_only1
+variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:1:0:S5T:.:Ref and reads have variant so report Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_column_summary_data.tsv b/ariba/tests/data/summary_sample_test_column_summary_data.tsv
index 22a42b5..9c495ec 100644
--- a/ariba/tests/data/summary_sample_test_column_summary_data.tsv
+++ b/ariba/tests/data/summary_sample_test_column_summary_data.tsv
@@ -1,8 +1,8 @@
#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1_n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
+noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 0 SNP . . . G15T SNP 15 15 G 85 85 T 17 . 17 . generic description of noncoding1
-noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1_n:A6G:id2:variant in ref and reads so should report generic description of noncoding1
+noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1
noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2
-presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1
+presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1
presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2
-variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:id4:Ref and reads have variant so report Generic description of variants_only1
+variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:1:0:S5T:id4:Ref and reads have variant so report Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_var_groups.tsv b/ariba/tests/data/summary_sample_test_var_groups.tsv
index a125211..3352660 100644
--- a/ariba/tests/data/summary_sample_test_var_groups.tsv
+++ b/ariba/tests/data/summary_sample_test_var_groups.tsv
@@ -1,7 +1,7 @@
#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
-noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1
+noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
+noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding1.scaffold.1 279 35.4 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id2:variant in ref and reads so should report generic description of noncoding1
noncoding1 0 0 19 78 cluster.n 120 120 98.33 noncoding2.scaffold.1 279 35.4 . . . . . . . . . . . . . . . . . generic description of noncoding2
-presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1
+presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.1 267 35.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1
presence_absence1 1 0 27 88 cluster.p 96 96 98.96 presence_absence1.scaffold.2 267 35.1 . . . . . . . . . . . . . . . . . Generic description of presence_absence2
-variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:p:S5T:id4:Ref and reads have variant so report Generic description of variants_only1
+variants_only1 1 1 27 64 cluster.v 90 90 100.0 variants_only1.scaffold.1 260 42.4 1 SNP p S5T 1 . . 13 15 A;C;C 96 98 A;C;C 12;13;13 .;.;. 12;13;13 variants_only1:1:0:S5T:id4:Ref and reads have variant so report Generic description of variants_only1
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
deleted file mode 100644
index d1f5f70..0000000
--- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
-presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
deleted file mode 100644
index 6507d5f..0000000
--- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
-noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id3:variant in ref and reads so should report generic description of noncoding1
-presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1
-variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . .
diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv
deleted file mode 100644
index 9e8e9a2..0000000
--- a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
-presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id2:Ref has wild, reads have variant so report Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv
deleted file mode 100644
index d4cd028..0000000
--- a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
-noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1
-presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1
-variants_only1 1 1 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . .
diff --git a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv b/ariba/tests/data/summary_test_get_all_het_snps.1.tsv
deleted file mode 100644
index d1f5f70..0000000
--- a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
-presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv b/ariba/tests/data/summary_test_get_all_het_snps.2.tsv
deleted file mode 100644
index 6507d5f..0000000
--- a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
-noncoding1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id3:variant in ref and reads so should report generic description of noncoding1
-presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1
-variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . .
diff --git a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv
deleted file mode 100644
index 62394c0..0000000
--- a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
-presence_absence1 1 0 27 88 cluster.p.1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id4:Ref has wild, reads have variant so report Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv
deleted file mode 100644
index d4cd028..0000000
--- a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
-noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
-noncoding1 0 0 19 78 cluster.n.1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:n:A6G:id2:variant in ref and reads so should report generic description of noncoding1
-presence_absence1 1 0 27 88 cluster.p.2 96 96 98.96 presence_absence1.scaffold.1 267 51.1 1 SNP p A10V 1 A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report Generic description of presence_absence1
-variants_only1 1 1 64 12 cluster.v.1 90 . . . . . . . . . . . . . . . . . . . . . . .
diff --git a/ariba/tests/data/summary_to_matrix.1.tsv b/ariba/tests/data/summary_to_matrix.1.tsv
new file mode 100644
index 0000000..1957349
--- /dev/null
+++ b/ariba/tests/data/summary_to_matrix.1.tsv
@@ -0,0 +1,5 @@
+#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
+noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 10.0 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 17 . 17 noncoding_ref1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
+noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A42T 1 A42T SNP 42 42 A 84 84 T 17 . 17 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1
+noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1
+presence_absence_ref1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 20.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence_ref1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_to_matrix.2.tsv b/ariba/tests/data/summary_to_matrix.2.tsv
new file mode 100644
index 0000000..4a23ebc
--- /dev/null
+++ b/ariba/tests/data/summary_to_matrix.2.tsv
@@ -0,0 +1,6 @@
+#ref_name gene var_only flag reads cluster ref_len ref_base_assembled pc_ident ctg ctg_len ctg_cov known_var var_type var_seq_type known_var_change has_known_var ref_ctg_change ref_ctg_effect ref_start ref_end ref_nt ctg_start ctg_end ctg_nt smtls_total_depth smtls_alt_nt smtls_alt_depth var_description free_text
+noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A14T 1 A14T SNP 13 13 A 84 84 T 50 G 40,10 noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report generic description of noncoding1
+noncoding_ref1 0 0 19 78 noncoding1 120 120 98.33 noncoding1.scaffold.1 279 50.1 1 SNP n A6G 1 . . 6 6 G 77 77 G 18 . 18 noncoding1:0:0:A6G:id3:variant in ref and reads so should report generic description of noncoding1
+noncoding_ref2 0 0 19 78 noncoding2 120 120 98.33 noncoding2.scaffold.1 279 10.0 1 SNP n A52T 1 A52T SNP 42 42 A 84 84 T 17 G 20,30 noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report generic description of noncoding1
+presence_absence1 1 0 27 88 presence_absence1 96 96 98.96 presence_absence1.scaffold.1 267 51.1 0 SNP p A10V . A10V NONSYN 28 28 C 113 113 T 29 . 29 presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report Generic description of presence_absence1
+variants_only1 1 1 64 12 variants_only1 90 . . . . . . . . . . . . . . . . . . . . . . .
diff --git a/ariba/tests/data/vfdb_parser_test_run.out.fa b/ariba/tests/data/vfdb_parser_test_run.out.fa
index 720ef52..e00ddf9 100644
--- a/ariba/tests/data/vfdb_parser_test_run.out.fa
+++ b/ariba/tests/data/vfdb_parser_test_run.out.fa
@@ -2,5 +2,5 @@
AAAA
>efgH.VF234(gi:2345).genus2_species2
CCCC
->seq1 blah
+>seq1
ACGT
diff --git a/ariba/tests/data/vfdb_parser_test_run.out.tsv b/ariba/tests/data/vfdb_parser_test_run.out.tsv
index 253ce52..c853bcf 100644
--- a/ariba/tests/data/vfdb_parser_test_run.out.tsv
+++ b/ariba/tests/data/vfdb_parser_test_run.out.tsv
@@ -1,3 +1,3 @@
-abcD.VF123(gi:1234).genus1_species1 1 0 . . foobar description1 [abc]
-efgH.VF234(gi:2345).genus2_species2 1 0 . . spam eggs description2 [abc]
-seq1 blah 1 0 . . .
+abcD.VF123(gi:1234).genus1_species1 1 0 . . Original name: VF123(gi:1234) (abcD) foobar description1 [abc] [genus1 species1]
+efgH.VF234(gi:2345).genus2_species2 1 0 . . Original name: VF234(gi:2345) (efgH) spam eggs description2 [abc] [genus2 species2]
+seq1 1 0 . . Original name: seq1 blah
diff --git a/ariba/tests/ref_preparer_test.py b/ariba/tests/ref_preparer_test.py
index 604395e..00b979d 100644
--- a/ariba/tests/ref_preparer_test.py
+++ b/ariba/tests/ref_preparer_test.py
@@ -9,6 +9,24 @@ data_dir = os.path.join(modules_dir, 'tests', 'data')
class TestRefPreparer(unittest.TestCase):
+ def test_fasta_to_metadata(self):
+ '''test _fasta_to_metadata'''
+ infile = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.fa')
+ tmp_out = 'tmp.test_fasta_to_metadata.tsv'
+ expected_coding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.coding.tsv')
+ expected_noncoding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.noncoding.tsv')
+
+ with open(tmp_out, 'w') as f:
+ ref_preparer.RefPreparer._fasta_to_metadata(infile, f, True)
+ self.assertTrue(filecmp.cmp(expected_coding, tmp_out, shallow=False))
+
+ with open(tmp_out, 'w') as f:
+ ref_preparer.RefPreparer._fasta_to_metadata(infile, f, False)
+ self.assertTrue(filecmp.cmp(expected_noncoding, tmp_out, shallow=False))
+
+ os.unlink(tmp_out)
+
+
def test_rename_clusters(self):
'''test _rename_clusters'''
clusters_in = {
@@ -31,28 +49,48 @@ class TestRefPreparer(unittest.TestCase):
'16': {'def_2.3'},
'17': {'def.4'},
'18': {'def.5'},
+ '19': {'x_1.foo'},
+ '20': {'x_1.bar'},
+ '21': {'x_1.baz'},
+ '22': {'x_1_2.abc'},
+ '23': {'x_1_2.def'},
+ '24': {'y_1.foo'},
+ '25': {'y_1_2.def'},
+ '26': {'y_1.bar'},
+ '27': {'y_1.baz'},
+ '28': {'y_1_2.abc'},
}
expected = {
- 'cluster_1': {'no_dot_in_name'},
- 'cluster_2': {'another_no_dot_in_name'},
- 'foo_1': {'foo.blah_blah_blah', 'foo.xyz'},
- 'foo_2': {'foo.abc', 'foo.def'},
- 'pre-_1': {'pre1.abc', 'pre2.abc'},
- 'pre-_2': {'pre1.def', 'pre2.pqr', 'pre2.zxy'},
- 'prefix1+_1': {'prefix1.abc', 'prefix1.def', 'something_else.abc'},
- 'prefix1+_2': {'prefix1.fgh', 'prefix1.ijk', 'something_else_again.abc'},
+ 'cluster': {'no_dot_in_name'},
+ 'cluster_1': {'another_no_dot_in_name'},
+ 'foo': {'foo.blah_blah_blah', 'foo.xyz'},
+ 'foo_1': {'foo.abc', 'foo.def'},
+ 'pre-': {'pre1.abc', 'pre2.abc'},
+ 'pre-_1': {'pre1.def', 'pre2.pqr', 'pre2.zxy'},
+ 'prefix1+': {'prefix1.abc', 'prefix1.def', 'something_else.abc'},
+ 'prefix1+_1': {'prefix1.fgh', 'prefix1.ijk', 'something_else_again.abc'},
'xyz+': {'xyz.1', 'xyz.2', 'abcdefgh'},
- 'cluster_3': {'a.foo', 'a.bar'},
+ 'cluster_2': {'a.foo', 'a.bar'},
'abc_1': {'abc_1.1'},
- 'abc_2': {'abc.2'},
- 'abc_3': {'abc.3'},
- 'abc_4': {'abc.4'},
+ 'abc': {'abc.2'},
+ 'abc_2': {'abc.3'},
+ 'abc_3': {'abc.4'},
'def_1': {'def_1.2'},
'def_2': {'def_2.3'},
- 'def_3': {'def.1'},
- 'def_4': {'def.4'},
- 'def_5': {'def.5'},
+ 'def': {'def.1'},
+ 'def_3': {'def.4'},
+ 'def_4': {'def.5'},
+ 'x_1': {'x_1.foo'},
+ 'x_1_1': {'x_1.bar'},
+ 'x_1_2_1': {'x_1_2.abc'},
+ 'x_1_2_2': {'x_1_2.def'},
+ 'x_1_2': {'x_1.baz'},
+ 'y_1': {'y_1.foo'},
+ 'y_1_1': {'y_1.bar'},
+ 'y_1_2_1': {'y_1_2.abc'},
+ 'y_1_2': {'y_1_2.def'},
+ 'y_1_3': {'y_1.baz'},
}
got = ref_preparer.RefPreparer._rename_clusters(clusters_in)
@@ -72,7 +110,7 @@ class TestRefPreparer(unittest.TestCase):
]
extern_progs = external_progs.ExternalProgs()
- refprep = ref_preparer.RefPreparer(fasta_in, tsv_in, extern_progs, genetic_code=1)
+ refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, metadata_tsv_files=tsv_in, genetic_code=1)
tmp_out = 'tmp.ref_preparer_test_run'
refprep.run(tmp_out)
expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run.out')
@@ -96,3 +134,41 @@ class TestRefPreparer(unittest.TestCase):
shutil.rmtree(tmp_out)
+
+ def test_run_all_noncoding(self):
+ '''test run with no metadata input, all sequences are noncoding'''
+ fasta_in = [
+ os.path.join(data_dir, 'ref_preparer_test_run.in.1.fa'),
+ os.path.join(data_dir, 'ref_preparer_test_run.in.2.fa'),
+ os.path.join(data_dir, 'ref_preparer_test_run.in.3.fa'),
+ ]
+
+ extern_progs = external_progs.ExternalProgs()
+ refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, all_coding='no', genetic_code=1)
+ tmp_out = 'tmp.ref_preparer_test_run'
+ refprep.run(tmp_out)
+ expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_all_noncoding.out')
+
+ test_files = [
+ '00.auto_metadata.tsv',
+ '01.filter.check_metadata.tsv',
+ '01.filter.check_genes.log',
+ '01.filter.check_metadata.log',
+ '02.cdhit.all.fa',
+ '02.cdhit.clusters.tsv',
+ '02.cdhit.gene.fa',
+ '02.cdhit.gene.varonly.fa',
+ '02.cdhit.noncoding.fa',
+ '02.cdhit.noncoding.varonly.fa',
+ ]
+
+ for filename in test_files:
+ expected = os.path.join(expected_outdir, filename)
+ got = os.path.join(tmp_out, filename)
+ self.assertTrue(filecmp.cmp(expected, got, shallow=False))
+
+ shutil.rmtree(tmp_out)
+
+
+
+
diff --git a/ariba/tests/reference_data_test.py b/ariba/tests/reference_data_test.py
index 4714e29..f23d4cc 100644
--- a/ariba/tests/reference_data_test.py
+++ b/ariba/tests/reference_data_test.py
@@ -129,10 +129,10 @@ class TestReferenceData(unittest.TestCase):
self.assertEqual(expected, got)
- def test_load_input_files_and_check_seq_names_ok(self):
+ def test_load_input_check_seq_names_ok(self):
'''Test _load_input_files_and_check_seq_names with good input'''
- fasta_files = [os.path.join(data_dir, 'reference_data_load_input_files_and_check_seq_names.good.in.fa.' + x) for x in ['1', '2']]
- metadata_files = [os.path.join(data_dir, 'reference_data_load_input_files_and_check_seq_names.good.in.csv.' + x) for x in ['1', '2']]
+ fasta_files = [os.path.join(data_dir, 'reference_data_load_input_check_seq_names.good.fa.' + x) for x in ['1', '2']]
+ metadata_files = [os.path.join(data_dir, 'reference_data_load_input_check_seq_names.good.csv.' + x) for x in ['1', '2']]
expected_seqs = {
'seq1': pyfastaq.sequences.Fasta('seq1', 'ACGT'),
'seq2': pyfastaq.sequences.Fasta('seq2', 'TTTT')
@@ -160,10 +160,10 @@ class TestReferenceData(unittest.TestCase):
self.assertEqual(expected_meta, got_meta)
- def test_load_input_files_and_check_seq_names_bad(self):
+ def test_load_input_check_seq_names_bad(self):
'''Test _load_input_files_and_check_seq_names with bad input'''
- fasta_files = [os.path.join(data_dir, 'reference_data_load_input_files_and_check_seq_names.bad.in.fa.' + x) for x in ['1', '2']]
- metadata_files = [os.path.join(data_dir, 'reference_data_load_input_files_and_check_seq_names.bad.in.csv.' + x) for x in ['1', '2']]
+ fasta_files = [os.path.join(data_dir, 'reference_data_load_input_check_seq_names.bad.fa.' + x) for x in ['1', '2']]
+ metadata_files = [os.path.join(data_dir, 'reference_data_load_input_check_seq_names.bad.csv.' + x) for x in ['1', '2']]
with self.assertRaises(reference_data.Error):
reference_data.ReferenceData._load_input_files_and_check_seq_names(fasta_files, metadata_files)
@@ -264,12 +264,9 @@ class TestReferenceData(unittest.TestCase):
'''Test _new_seq_name'''
tests = [
('name', 'name'),
- ('name ', 'name'),
- ('name xyz', 'name'),
('name_a', 'name_a'),
('name.a', 'name.a'),
- ('name-a', 'name-a'),
- ('name spam eggs foo', 'name'),
+ ('name-a', 'name_a'),
('name!', 'name_'),
('name:foo', 'name_foo'),
('name:!@foo', 'name___foo'),
@@ -281,15 +278,15 @@ class TestReferenceData(unittest.TestCase):
def test_seq_names_to_rename_dict(self):
'''Test _seq_names_to_rename_dict'''
- names = {'foo', 'foo abc', 'foo xyz', 'bar!', 'bar:', 'spam abc', 'eggs'}
+ names = {'foo', 'bar!', 'bar:', 'bar,', 'spam', 'eggs,123'}
got = reference_data.ReferenceData._seq_names_to_rename_dict(names)
expected = {
- 'foo abc': 'foo_1',
- 'foo xyz': 'foo_2',
'bar!': 'bar_',
- 'bar:': 'bar__1',
- 'spam abc': 'spam'
+ 'bar,': 'bar__1',
+ 'bar:': 'bar__2',
+ 'eggs,123': 'eggs_123'
}
+
self.assertEqual(expected, got)
@@ -386,23 +383,19 @@ class TestReferenceData(unittest.TestCase):
self.assertTrue(filecmp.cmp(expected_file, tmp_out, shallow=False))
os.unlink(tmp_out)
- meta1 = sequence_metadata.SequenceMetadata('noncoding1\t0\t0\t.\t.\toriginal name "noncoding1"')
- meta2 = sequence_metadata.SequenceMetadata('noncoding1_1\t0\t0\t.\t.\toriginal name "noncoding1 blah"')
- meta3 = sequence_metadata.SequenceMetadata('pres_abs1_2\t0\t0\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"')
- meta4 = sequence_metadata.SequenceMetadata('pres_abs1_1\t0\t0\t.\t.\toriginal name "pres_abs1 blah"')
+ meta1 = sequence_metadata.SequenceMetadata('noncoding1\t0\t0\t.\t.\toriginal name "noncoding1 blah"')
+ meta3 = sequence_metadata.SequenceMetadata('pres_abs1_1\t0\t0\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"')
meta5 = sequence_metadata.SequenceMetadata('pres_abs1\t0\t0\t.\t.\toriginal name "pres\'abs1"')
meta6 = sequence_metadata.SequenceMetadata('pres_abs2\t0\t0\t.\t.\toriginal name "pres_abs2"')
meta7 = sequence_metadata.SequenceMetadata('pres_abs3\t0\t0\t.\t.\toriginal name "pres!abs3"')
meta8 = sequence_metadata.SequenceMetadata('var_only1_2\t0\t0\t.\t.\toriginal name "var_only1 hello"')
- meta9 = sequence_metadata.SequenceMetadata('var_only1\t0\t0\t.\t.\toriginal name "var:only1 boo"')
- meta10 = sequence_metadata.SequenceMetadata('var_only1_1\t0\t0\t.\t.\toriginal name "var_only1"')
+ meta9 = sequence_metadata.SequenceMetadata('var_only1\t0\t0\t.\t.\toriginal name "var,only1"')
+ meta10 = sequence_metadata.SequenceMetadata('var_only1_1\t0\t0\t.\t.\toriginal name "var:only1 boo"')
meta11 = sequence_metadata.SequenceMetadata('var_only2\t0\t0\t.\t.\toriginal name "var_only2"')
expected_meta = {
'noncoding1': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta1}},
- 'noncoding1_1': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta2}},
- 'pres_abs1_2': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta3}},
- 'pres_abs1_1': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta4}},
+ 'pres_abs1_1': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta3}},
'pres_abs1': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta5}},
'pres_abs2': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta6}},
'pres_abs3': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta7}},
@@ -412,19 +405,19 @@ class TestReferenceData(unittest.TestCase):
'var_only2': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta11}},
}
+ self.maxDiff = None
+ self.assertEqual(set(expected_meta.keys()), set(refdata.metadata.keys()))
self.assertEqual(expected_meta, refdata.metadata)
expected_seqs_dict = {
'noncoding1': pyfastaq.sequences.Fasta('noncoding1', 'AAAA'),
- 'noncoding1_1': pyfastaq.sequences.Fasta('noncoding1_1', 'CCCC'),
- 'pres_abs1_2': pyfastaq.sequences.Fasta('pres_abs1_2', 'ACGT'),
- 'pres_abs1_1': pyfastaq.sequences.Fasta('pres_abs1_1', 'AAAA'),
+ 'pres_abs1_1': pyfastaq.sequences.Fasta('pres_abs1_1', 'ACGT'),
'pres_abs1': pyfastaq.sequences.Fasta('pres_abs1', 'CCCC'),
'pres_abs2': pyfastaq.sequences.Fasta('pres_abs2', 'TTTT'),
'pres_abs3': pyfastaq.sequences.Fasta('pres_abs3', 'GGGG'),
'var_only1_2': pyfastaq.sequences.Fasta('var_only1_2', 'AAAA'),
- 'var_only1': pyfastaq.sequences.Fasta('var_only1', 'CCCC'),
- 'var_only1_1': pyfastaq.sequences.Fasta('var_only1_1', 'GGGG'),
+ 'var_only1': pyfastaq.sequences.Fasta('var_only1', 'GGGG'),
+ 'var_only1_1': pyfastaq.sequences.Fasta('var_only1_1', 'CCCC'),
'var_only2': pyfastaq.sequences.Fasta('var_only2', 'TTTT'),
}
@@ -537,11 +530,11 @@ class TestReferenceData(unittest.TestCase):
os.unlink(outprefix + '.noncoding.varonly.fa')
- def test_cluster_with_cdhit_clusters_in_file(self):
+ def test_cluster_w_cdhit_clstrs_file(self):
'''Test cluster_with_cd_hit clusters from file'''
- fasta_in = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_clusters_in_file.in.fa')
- meta_tsv_in = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_clusters_in_file.in.meta.tsv')
- cluster_tsv_in = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_clusters_in_file.in.clusters.tsv')
+ fasta_in = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_clstrs_file.in.fa')
+ meta_tsv_in = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_clstrs_file.in.meta.tsv')
+ cluster_tsv_in = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_clstrs_file.in.clstrs.tsv')
refdata = reference_data.ReferenceData([fasta_in], [meta_tsv_in])
outprefix = 'tmp.test_cluster_with_cdhit_clusters_in_file'
@@ -555,7 +548,7 @@ class TestReferenceData(unittest.TestCase):
got_clusters = refdata.cluster_with_cdhit(outprefix, clusters_file=cluster_tsv_in)
self.assertEqual(expected_clusters, got_clusters)
- expected_clusters_file = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_clusters_in_file.expected.clusters.tsv')
+ expected_clusters_file = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_clstrs_file.expect.clstrs.tsv')
got_clusters_file = outprefix + '.clusters.tsv'
self.assertTrue(filecmp.cmp(expected_clusters_file, got_clusters_file, shallow=False))
@@ -567,10 +560,10 @@ class TestReferenceData(unittest.TestCase):
os.unlink(outprefix + '.noncoding.varonly.fa')
- def test_cluster_with_cdhit_nocluster(self):
+ def test_cluster_w_cdhit_nocluster(self):
'''Test cluster_with_cd_hit do not run cdhit'''
- fasta_in = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_nocluster.in.fa')
- tsv_in = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_nocluster.in.tsv')
+ fasta_in = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_nocluster.in.fa')
+ tsv_in = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_nocluster.in.tsv')
refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
outprefix = 'tmp.test_cluster_with_cdhit_nocluster'
@@ -586,7 +579,7 @@ class TestReferenceData(unittest.TestCase):
got_clusters = refdata.cluster_with_cdhit(outprefix, nocluster=True)
self.assertEqual(expected_clusters, got_clusters)
- expected_clusters_file = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_nocluster.expected.clusters.tsv')
+ expected_clusters_file = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_nocluster.expect.tsv')
got_clusters_file = outprefix + '.clusters.tsv'
self.assertTrue(filecmp.cmp(expected_clusters_file, got_clusters_file, shallow=False))
diff --git a/ariba/tests/samtools_variants_test.py b/ariba/tests/samtools_variants_test.py
index 873e840..d0417be 100644
--- a/ariba/tests/samtools_variants_test.py
+++ b/ariba/tests/samtools_variants_test.py
@@ -16,13 +16,13 @@ def file2lines(filename):
class TestSamtoolsVariants(unittest.TestCase):
- def test_make_vcf_and_read_depths_files(self):
+ def test_make_vcf_and_depths_files(self):
'''test _make_vcf_and_read_depths_files'''
- ref = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa')
- bam = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.bam')
- expected_vcf = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf')
- expected_depths = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz')
- tmp_prefix = 'tmp.test_make_vcf_and_read_depths_files'
+ ref = os.path.join(data_dir, 'samtools_variants_make_vcf_and_depths_files.asmbly.fa')
+ bam = os.path.join(data_dir, 'samtools_variants_make_vcf_and_depths_files.bam')
+ expected_vcf = os.path.join(data_dir, 'samtools_variants_make_vcf_and_depths_files.expect.vcf')
+ expected_depths = os.path.join(data_dir, 'samtools_variants_make_vcf_and_depths_files.expect.depths.gz')
+ tmp_prefix = 'tmp.test_make_vcf_and_depths_files'
sv = samtools_variants.SamtoolsVariants(
ref,
bam,
diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py
index 6220dcf..f5022fc 100644
--- a/ariba/tests/summary_cluster_test.py
+++ b/ariba/tests/summary_cluster_test.py
@@ -1,6 +1,6 @@
import unittest
import os
-from ariba import flag, summary_cluster
+from ariba import flag, summary_cluster, summary_cluster_variant
modules_dir = os.path.dirname(os.path.abspath(summary_cluster.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
@@ -8,7 +8,7 @@ data_dir = os.path.join(modules_dir, 'tests', 'data')
class TestSummaryCluster(unittest.TestCase):
def test_line2dict(self):
'''Test _line2dict'''
- line = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:var_group1:ref has wild type, foo bar\tsome free text'
+ line = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:var_group1:ref has wild type, foo bar\tsome free text'
expected = {
'ref_name': 'refname',
@@ -39,7 +39,7 @@ class TestSummaryCluster(unittest.TestCase):
'smtls_total_depth': '17',
'smtls_alt_nt': '.',
'smtls_alt_depth': '17',
- 'var_description': 'noncoding1:n:A14T:var_group1:ref has wild type, foo bar',
+ 'var_description': 'noncoding1:1:0:A14T:var_group1:ref has wild type, foo bar',
'var_group': 'var_group1',
'free_text': 'some free text'
}
@@ -51,9 +51,9 @@ class TestSummaryCluster(unittest.TestCase):
'''Test add_data_dict'''
cluster = summary_cluster.SummaryCluster()
self.assertTrue(cluster.name is None)
- line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
- line2 = 'refname\t1\t0\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text'
- line3 = 'refname2\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text'
+ line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
+ line2 = 'refname\t1\t0\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id2:ref has wild type, foo bar\tsome free text'
+ line3 = 'refname2\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id3:ref has wild type, foo bar\tsome free text'
data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
data_dict3 = summary_cluster.SummaryCluster.line2dict(line3)
@@ -71,9 +71,9 @@ class TestSummaryCluster(unittest.TestCase):
'''Test pc_id_of_longest'''
cluster = summary_cluster.SummaryCluster()
self.assertTrue(cluster.name is None)
- line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
- line2 = 'refname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
- line3 = 'refname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+ line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
+ line2 = 'refname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
+ line3 = 'refname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
data_dict3 = summary_cluster.SummaryCluster.line2dict(line3)
@@ -85,7 +85,7 @@ class TestSummaryCluster(unittest.TestCase):
def test_to_cluster_summary_number(self):
'''Test _to_cluster_summary_assembled'''
- line = 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+ line = 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text'
data_dict = summary_cluster.SummaryCluster.line2dict(line)
tests = [
@@ -122,9 +122,9 @@ class TestSummaryCluster(unittest.TestCase):
def test_has_known_variant(self):
'''Test _has_known_variant'''
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
]
@@ -139,9 +139,9 @@ class TestSummaryCluster(unittest.TestCase):
def test_has_any_known_variant(self):
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
]
@@ -159,10 +159,10 @@ class TestSummaryCluster(unittest.TestCase):
def test_has_nonsynonymous(self):
'''Test _has_nonsynonymous'''
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
]
@@ -178,11 +178,11 @@ class TestSummaryCluster(unittest.TestCase):
def test_has_any_nonsynonymous(self):
'''Test _has_any_nonsynonymous'''
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:N_ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:N_ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
]
expected = ['no', 'yes', 'no', 'yes', 'yes']
@@ -198,9 +198,9 @@ class TestSummaryCluster(unittest.TestCase):
def test_has_novel_nonsynonymous(self):
'''Test _has_novel_nonsynonymous'''
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
]
@@ -216,9 +216,9 @@ class TestSummaryCluster(unittest.TestCase):
def test_has_any_novel_nonsynonymous(self):
'''Test _has_any_novel_nonsynonymous'''
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
]
@@ -236,11 +236,11 @@ class TestSummaryCluster(unittest.TestCase):
def test_to_cluster_summary_has_known_nonsynonymous(self):
'''Test _to_cluster_summary_has_known_nonsynonymous'''
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
]
expected = ['yes', 'yes', 'no', 'no', 'no']
@@ -257,11 +257,11 @@ class TestSummaryCluster(unittest.TestCase):
def test_to_cluster_summary_has_novel_nonsynonymous(self):
'''Test _to_cluster_summary_has_novel_nonsynonymous'''
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
]
expected = ['no', 'no', 'no', 'yes', 'yes']
@@ -278,11 +278,11 @@ class TestSummaryCluster(unittest.TestCase):
def test_to_cluster_summary_has_nonsynonymous(self):
'''Test _to_cluster_summary_has_nonsynonymous'''
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
]
expected = ['no', 'yes', 'no', 'yes', 'yes']
@@ -369,16 +369,16 @@ class TestSummaryCluster(unittest.TestCase):
def test_has_match(self):
'''Test _has_match'''
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:1:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:1:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text',
]
expected = ['yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'no', 'no']
@@ -396,14 +396,14 @@ class TestSummaryCluster(unittest.TestCase):
def test_has_var_groups(self):
'''Test has_var_groups'''
lines = [
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
- 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id4:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id5:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id6:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text',
- 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id2:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id3:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id4:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id5:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id6:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id7:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id7:ref has wild type, foo bar\tsome free text',
]
dicts = [summary_cluster.SummaryCluster.line2dict(line) for line in lines]
cluster = summary_cluster.SummaryCluster()
@@ -438,7 +438,7 @@ class TestSummaryCluster(unittest.TestCase):
def test_non_synon_variants(self):
'''Test non_synon_variants'''
- line1 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs'
+ line1 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs'
line2 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text'
data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
@@ -454,16 +454,67 @@ class TestSummaryCluster(unittest.TestCase):
def test_known_noncoding_het_snps(self):
'''test known_noncoding_het_snps'''
lines = [
- 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs',
- 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs',
- 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs',
- 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs'
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs'
]
cluster = summary_cluster.SummaryCluster()
for line in lines:
cluster.add_data_dict(summary_cluster.SummaryCluster.line2dict(line))
got = cluster.known_noncoding_het_snps()
- expected = {'A42T': 25.0, 'A62T': 75.0, 'A82T': 40.0}
+ expected = {
+ '.': {'A82T': 40.0},
+ 'id1': {'A42T': 25.0},
+ 'id2': {'A62T': 75.0},
+ }
self.assertEqual(expected, got)
+
+ def test_get_all_nonsynon_variants_set(self):
+ '''test _get_all_nonsynon_variants_set'''
+ lines = [
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text',
+ 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+ ]
+
+ data_dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines]
+
+ cluster_vars = [summary_cluster_variant.SummaryClusterVariant(x) for x in data_dicts]
+ expected = {x for x in cluster_vars if x.has_nonsynon}
+ got = summary_cluster.SummaryCluster._get_all_nonsynon_variants_set(data_dicts)
+ self.assertEqual(expected, got)
+
+
+ def test_gather_data(self):
+ '''test gather_data'''
+ lines = [
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text',
+ 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+ ]
+
+ data_dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines]
+ cluster = summary_cluster.SummaryCluster()
+ for data_dict in data_dicts:
+ cluster.add_data_dict(data_dict)
+
+ cluster.gather_data()
+ expected_summary = {
+ 'assembled': 'yes',
+ 'match': 'yes',
+ 'ref_seq': 'ref1',
+ 'pct_id': '98.33',
+ 'known_var': 'yes',
+ 'novel_var': 'no',
+ }
+ self.assertEqual(expected_summary, cluster.summary)
+
+ cluster_vars = [summary_cluster_variant.SummaryClusterVariant(x) for x in data_dicts]
+ expected_variants = {x for x in cluster_vars if x.has_nonsynon}
+ self.assertEqual(expected_variants, cluster.variants)
+
diff --git a/ariba/tests/summary_cluster_variant_test.py b/ariba/tests/summary_cluster_variant_test.py
new file mode 100644
index 0000000..ec09942
--- /dev/null
+++ b/ariba/tests/summary_cluster_variant_test.py
@@ -0,0 +1,67 @@
+import unittest
+import os
+from ariba import summary_cluster, summary_cluster_variant
+
+
+class TestSummaryClusterVariant(unittest.TestCase):
+ def test_has_nonsynonymous(self):
+ '''Test _has_nonsynonymous'''
+ lines = [
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+ 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
+ 'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
+ ]
+
+ dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines]
+ expected = [False, True, False, True, True, True]
+ assert len(dicts) == len(expected)
+
+ for i in range(len(dicts)):
+ self.assertEqual(expected[i], summary_cluster_variant.SummaryClusterVariant._has_nonsynonymous(dicts[i]))
+
+
+ def test_get_het_percent(self):
+ '''test _get_het_percent'''
+ lines = [
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs'
+ ]
+
+ expected = [None, 25.0, 75.0, 40.0]
+ assert len(lines) == len(expected)
+
+ for i in range(len(lines)):
+ data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+ got = summary_cluster_variant.SummaryClusterVariant._get_het_percent(data_dict)
+ self.assertEqual(expected[i], got)
+
+
+ def test_init(self):
+ '''test __init__'''
+ lines = [
+ 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+ 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+ ]
+
+ expected = [
+ {'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None},
+ {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': None},
+ {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0},
+ {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0},
+ ]
+ assert len(lines) == len(expected)
+
+ for i in range(len(lines)):
+ data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+ cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)
+ for key in expected[i]:
+ got_value = eval('cluster_var.' + key)
+ self.assertEqual(expected[i][key], got_value)
+
diff --git a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py
index 091e8c0..67ca2bc 100644
--- a/ariba/tests/summary_sample_test.py
+++ b/ariba/tests/summary_sample_test.py
@@ -18,11 +18,14 @@ class TestSummarySample(unittest.TestCase):
cluster1.add_data_dict(dicts[0])
cluster1.add_data_dict(dicts[1])
cluster1.add_data_dict(dicts[2])
+ cluster1.gather_data()
cluster2 = summary_cluster.SummaryCluster()
cluster2.add_data_dict(dicts[3])
cluster2.add_data_dict(dicts[4])
+ cluster2.gather_data()
cluster3 = summary_cluster.SummaryCluster()
cluster3.add_data_dict(dicts[5])
+ cluster3.gather_data()
expected = {
'cluster.n': cluster1,
@@ -33,6 +36,9 @@ class TestSummarySample(unittest.TestCase):
got = summary_sample.SummarySample._load_file(infile, 90)
self.assertEqual(expected, got)
+ got = summary_sample.SummarySample._load_file(infile, 90, only_clusters={'cluster.n'})
+ expected = {'cluster.n': cluster1}
+ self.assertEqual(expected, got)
def test_column_summary_data(self):
'''Test _column_summary_data'''
@@ -104,7 +110,7 @@ class TestSummarySample(unittest.TestCase):
expected_het_snps = {
'cluster.v': {},
- 'cluster.n': {'A14T': 80.0},
+ 'cluster.n': {'.': {'A14T': 80.0}},
'cluster.p': {},
}
self.assertEqual(expected_het_snps, got_het_snps)
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py
index 6b615ee..542ca53 100644
--- a/ariba/tests/summary_test.py
+++ b/ariba/tests/summary_test.py
@@ -43,35 +43,6 @@ class TestSummary(unittest.TestCase):
self.assertEqual(expected[i], summary.Summary._determine_cluster_cols(col_strings[i]))
- def test_determine_var_cols(self):
- col_strings = [
- 'groups,grouped,ungrouped,novel',
- 'groups,grouped,ungrouped',
- 'grouped,novel',
- 'ungrouped,novel',
- 'grouped',
- 'ungrouped',
- 'novel',
- ''
- ]
-
- expected = [
- {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': True},
- {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': False},
- {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': True},
- {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': True},
- {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': False},
- {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': False},
- {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': True},
- {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': False},
- ]
-
- assert len(col_strings) == len(expected)
-
- for i in range(len(col_strings)):
- self.assertEqual(expected[i], summary.Summary._determine_var_cols(col_strings[i]))
-
-
def test_load_input_files(self):
'''Test _load_input_files'''
file1 = os.path.join(data_dir, 'summary_test_load_input_files.1.tsv')
@@ -84,239 +55,322 @@ class TestSummary(unittest.TestCase):
expected = {file1: sample1, file2: sample2}
self.assertEqual(expected, got)
-
- def test_get_all_cluster_names(self):
- '''Test _get_all_cluster_names'''
- file1 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.1.tsv')
- file2 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.2.tsv')
- samples = summary.Summary._load_input_files([file1, file2], 90)
- got = summary.Summary._get_all_cluster_names(samples)
- expected = {'cluster.n.1', 'cluster.v.1', 'cluster.p.1', 'cluster.p.2'}
- self.assertEqual(expected, got)
-
-
- def test_get_all_variant_columns(self):
- '''Test _get_all_variant_columns'''
- file1 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.1.tsv')
- file2 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.2.tsv')
- samples = summary.Summary._load_input_files([file1, file2], 90)
- got = summary.Summary._get_all_variant_columns(samples)
- expected = {
- 'cluster.p.2': {('presence_absence1', 'A10V', 'grouped', 'id3')},
- 'cluster.n.1': {('noncoding1', 'A6G', 'grouped', 'id2'), ('noncoding1', 'A14T', 'grouped', 'id1')},
- 'cluster.p.1': {('presence_absence1', 'A10V', 'grouped', 'id2')},
- }
- self.assertEqual(expected, got)
-
-
- def test_get_all_het_snps(self):
- '''test _get_all_het_snps'''
- file1 = os.path.join(data_dir, 'summary_test_get_all_het_snps.1.tsv')
- file2 = os.path.join(data_dir, 'summary_test_get_all_het_snps.2.tsv')
- samples = summary.Summary._load_input_files([file1, file2], 90)
- got = summary.Summary._get_all_het_snps(samples)
- expected = {('noncoding1', 'A14T')}
- self.assertEqual(expected, got)
-
-
- def test_get_all_var_groups(self):
- '''test _get_all_var_groups'''
- file1 = os.path.join(data_dir, 'summary_test_get_all_var_groups.1.tsv')
- file2 = os.path.join(data_dir, 'summary_test_get_all_var_groups.2.tsv')
- samples = summary.Summary._load_input_files([file1, file2], 90)
- got = summary.Summary._get_all_var_groups(samples)
- expected = {
- 'cluster.p.1': {'id4'},
- 'cluster.p.2': {'id3'},
- 'cluster.v.1': set(),
- 'cluster.n.1': {'id1', 'id2'}
- }
+ sample1 = summary_sample.SummarySample(file1, only_clusters={'noncoding1'})
+ sample2 = summary_sample.SummarySample(file2, only_clusters={'noncoding1'})
+ sample1.run()
+ sample2.run()
+ expected = {file1: sample1, file2: sample2}
+ got = summary.Summary._load_input_files([file1, file2], 90, only_clusters={'noncoding1'})
self.assertEqual(expected, got)
- def test_gather_output_rows(self):
- '''Test _gather_output_rows'''
+ def test_gather_unfiltered_output_data(self):
+ '''test _gather_unfiltered_output_data'''
infiles = [
- os.path.join(data_dir, 'summary_test_gather_output_rows.in.1.tsv'),
- os.path.join(data_dir, 'summary_test_gather_output_rows.in.2.tsv')
+ os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.1.tsv'),
+ os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.2.tsv')
]
- s = summary.Summary('out', filenames=infiles, variant_cols=None)
- s.samples = summary.Summary._load_input_files(infiles, 90)
- expected = {
+
+ expected_all = {
infiles[0]: {
'noncoding1': {
- 'assembled': 'yes',
- 'match': 'yes',
- 'ref_seq': 'noncoding1',
- 'known_var': 'yes',
- 'novel_var': 'no',
- 'pct_id': '98.33',
+ 'summary': {
+ 'assembled': 'yes',
+ 'known_var': 'yes',
+ 'match': 'yes',
+ 'novel_var': 'no',
+ 'pct_id': '98.33',
+ 'ref_seq': 'noncoding_ref1'
+ },
+ 'groups': {},
+ 'vars': {},
+ },
+ 'noncoding2': {
+ 'summary': {
+ 'assembled': 'yes',
+ 'known_var': 'yes',
+ 'match': 'yes',
+ 'novel_var': 'no',
+ 'pct_id': '98.33',
+ 'ref_seq': 'noncoding_ref2'
+ },
+ 'groups': {},
+ 'vars': {},
},
'presence_absence1': {
- 'assembled': 'yes',
- 'match': 'yes',
- 'ref_seq': 'presence_absence1',
- 'known_var': 'no',
- 'novel_var': 'yes',
- 'pct_id': '98.96',
+ 'summary': {
+ 'assembled': 'yes',
+ 'known_var': 'no',
+ 'match': 'yes',
+ 'novel_var': 'yes',
+ 'pct_id': '98.96',
+ 'ref_seq': 'presence_absence_ref1'
+ },
+ 'groups': {},
+ 'vars': {},
},
- 'variants_only1': {
- 'assembled': 'no',
- 'match': 'no',
- 'ref_seq': 'NA',
- 'known_var': 'NA',
- 'novel_var': 'NA',
- 'pct_id': 'NA',
+ 'presence_absence2': {
+ 'summary': {
+ 'assembled': 'no',
+ 'known_var': 'NA',
+ 'match': 'no',
+ 'novel_var': 'NA',
+ 'pct_id': 'NA',
+ 'ref_seq': 'NA'
+ },
+ 'groups': {},
+ 'vars': {}
}
},
infiles[1]: {
'noncoding1': {
- 'assembled': 'yes',
- 'match': 'yes',
- 'ref_seq': 'noncoding1',
- 'known_var': 'yes',
- 'novel_var': 'no',
- 'pct_id': '98.33',
+ 'summary': {'assembled': 'yes',
+ 'known_var': 'yes',
+ 'match': 'yes',
+ 'novel_var': 'no',
+ 'pct_id': '98.33',
+ 'ref_seq': 'noncoding_ref1'
+ },
+ 'groups': {},
+ 'vars': {},
+ },
+ 'noncoding2': {
+ 'summary': {
+ 'assembled': 'yes',
+ 'known_var': 'yes',
+ 'match': 'yes',
+ 'novel_var': 'no',
+ 'pct_id': '98.33',
+ 'ref_seq': 'noncoding_ref2'
+ },
+ 'groups': {},
+ 'vars': {},
},
'presence_absence1': {
- 'assembled': 'yes',
- 'match': 'yes',
- 'ref_seq': 'presence_absence1',
- 'pct_id': '98.96',
- 'known_var': 'no',
- 'novel_var': 'yes',
+ 'summary': {
+ 'assembled': 'yes',
+ 'known_var': 'no',
+ 'match': 'yes',
+ 'novel_var': 'yes',
+ 'pct_id': '98.96',
+ 'ref_seq': 'presence_absence1'
+ },
+ 'groups': {},
+ 'vars': {}
},
- 'variants_only1': {
- 'assembled': 'no',
- 'match': 'no',
- 'ref_seq': 'NA',
- 'known_var': 'NA',
- 'novel_var': 'NA',
- 'pct_id': 'NA',
- }
+ }
+ }
+
+ expected_potential_cols = {
+ 'noncoding1': {
+ 'summary': {
+ 'assembled',
+ 'known_var',
+ 'match',
+ 'novel_var',
+ 'pct_id',
+ 'ref_seq'
+ },
+ 'groups': set(),
+ 'vars': set()
+ },
+ 'noncoding2': {
+ 'summary': {
+ 'assembled',
+ 'known_var',
+ 'match',
+ 'novel_var',
+ 'pct_id',
+ 'ref_seq'
+ },
+ 'groups': set(),
+ 'vars': set()
+ },
+ 'presence_absence1': {
+ 'summary': {
+ 'assembled',
+ 'known_var',
+ 'match',
+ 'novel_var',
+ 'pct_id',
+ 'ref_seq'
+ },
+ 'groups': set(),
+ 'vars': set()
},
+ 'presence_absence2': {
+ 'summary': {
+ 'assembled',
+ 'known_var',
+ 'match',
+ 'novel_var',
+ 'pct_id',
+ 'ref_seq'
+ },
+ 'groups': set(),
+ 'vars': set()
+ }
}
- got = s._gather_output_rows()
- self.assertEqual(expected, got)
- s.var_columns['groups'] = True
- expected[infiles[0]]['noncoding1']['vgroup.id1'] = 'yes'
- expected[infiles[0]]['noncoding1']['vgroup.id3'] = 'no'
- expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'yes'
- expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes'
- got = s._gather_output_rows()
- self.assertEqual(expected, got)
+ s = summary.Summary('out', filenames=infiles)
+ s.samples = summary.Summary._load_input_files(infiles, 90)
+ s._gather_unfiltered_output_data()
+ self.assertEqual(expected_potential_cols, s.all_potential_columns)
+ self.assertEqual(expected_all, s.all_data)
+
+ expected_potential_cols['noncoding1']['groups'] = {'id3', 'id1', 'id1.%'}
+ expected_potential_cols['noncoding2']['groups'] = {'id2.%', 'id2'}
+ expected_all[infiles[0]]['noncoding1']['groups'] = {'id1': 'yes'}
+ expected_all[infiles[0]]['noncoding2']['groups'] = {'id2': 'yes_multi_het', 'id2.%': 'NA'}
+ expected_all[infiles[1]]['noncoding1']['groups'] = {'id1': 'het', 'id1.%': 80.0, 'id3': 'yes'}
+ expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0}
+ s = summary.Summary('out', filenames=infiles, show_var_groups=True)
+ s.samples = summary.Summary._load_input_files(infiles, 90)
+ s._gather_unfiltered_output_data()
+ self.assertEqual(expected_potential_cols, s.all_potential_columns)
+ self.assertEqual(expected_all, s.all_data)
+
+ expected_potential_cols['noncoding1']['vars'] = {'A14T.%', 'A6G', 'A14T'}
+ expected_potential_cols['noncoding2']['vars'] = {'A52T', 'A52T.%', 'A42T'}
+
+ expected_all[infiles[0]]['noncoding1']['vars'] = {'A14T': 'yes'}
+ expected_all[infiles[0]]['noncoding2']['vars'] = {'A42T': 'yes', 'A52T': 'het', 'A52T.%': 40.0}
+ expected_all[infiles[1]]['noncoding1']['vars'] = {'A14T': 'het', 'A14T.%': 80.0, 'A6G': 'yes'}
+ expected_all[infiles[1]]['noncoding2']['vars'] = {'A52T': 'het', 'A52T.%': 40.0}
+ s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True)
+ s.samples = summary.Summary._load_input_files(infiles, 90)
+ s._gather_unfiltered_output_data()
+ self.assertEqual(expected_potential_cols, s.all_potential_columns)
+ self.assertEqual(expected_all, s.all_data)
+
+ expected_potential_cols['presence_absence1']['vars'] = {'A10V'}
+ expected_all[infiles[0]]['presence_absence1']['vars'] = {'A10V': 'yes'}
+ expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'}
+ s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True, show_novel_vars=True)
+ s.samples = summary.Summary._load_input_files(infiles, 90)
+ s._gather_unfiltered_output_data()
+ self.assertEqual(expected_potential_cols, s.all_potential_columns)
+ self.assertEqual(expected_all, s.all_data)
- s.var_columns['grouped'] = True
- s.var_columns['ungrouped'] = True
- expected[infiles[0]]['noncoding1']['noncoding1.A14T'] = 'yes'
- expected[infiles[0]]['noncoding1']['noncoding1.A6G'] = 'no'
- expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'yes'
- expected[infiles[1]]['noncoding1']['noncoding1.A6G'] = 'yes'
- self.maxDiff = None
- got = s._gather_output_rows()
- self.assertEqual(expected, got)
+ def test_to_matrix_all_cols(self):
+ '''Test _to_matrix all columns'''
+ infiles = [
+ os.path.join(data_dir, 'summary_to_matrix.1.tsv'),
+ os.path.join(data_dir, 'summary_to_matrix.2.tsv')
+ ]
- s.var_columns['novel'] = True
- expected[infiles[0]]['presence_absence1']['presence_absence1.A10V'] = 'yes'
- expected[infiles[1]]['presence_absence1']['presence_absence1.A10V'] = 'yes'
- got = s._gather_output_rows()
- self.assertEqual(expected, got)
+ s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True, show_novel_vars=True)
+ s.samples = summary.Summary._load_input_files(infiles, 90)
+ s._gather_unfiltered_output_data()
+ got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
- s.show_known_het = True
- expected[infiles[0]]['noncoding1']['noncoding1.A14T.%'] = 'NA'
- expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'het'
- expected[infiles[1]]['noncoding1']['noncoding1.A14T.%'] = 80.0
- got = s._gather_output_rows()
- self.assertEqual(expected, got)
+ expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncodin [...]
+ expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'noncoding2.A42T', 'noncoding2. [...]
+ expected_matrix = [
+ [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'],
+ [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes']
+ ]
+
+ self.assertEqual(expected_phandango_header, got_phandango_header)
+ self.assertEqual(expected_csv_header, got_csv_header)
+ self.assertEqual(expected_matrix, got_matrix)
- for filename in expected:
- del expected[filename]['noncoding1']['vgroup.id1']
- del expected[filename]['noncoding1']['vgroup.id3']
- for gene_type in expected[filename]:
- del expected[filename][gene_type]['ref_seq']
- s = summary.Summary('out', filenames=infiles, cluster_cols='assembled,match,pct_id,known_var,novel_var', variant_cols='ungrouped,grouped,novel')
+ def test_to_matrix_with_groups(self):
+ '''Test _to_matrix with groups'''
+ infiles = [
+ os.path.join(data_dir, 'summary_to_matrix.1.tsv'),
+ os.path.join(data_dir, 'summary_to_matrix.2.tsv')
+ ]
+
+ s = summary.Summary('out', filenames=infiles, show_var_groups=True)
s.samples = summary.Summary._load_input_files(infiles, 90)
- s.include_all_variant_columns = True
- s.show_known_het = True
- got = s._gather_output_rows()
- self.assertEqual(expected, got)
+ s._gather_unfiltered_output_data()
+ got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+ expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'presence_absence1.assembled:o1' [...]
+ expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'p [...]
+ expected_matrix = [
+ [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes'],
+ [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes']
+ ]
- def test_to_matrix(self):
- '''Test _to_matrix'''
- rows = {
- 'file1': {
- 'cluster.n.1': {
- 'assembled': 'yes',
- 'match': 'yes',
- 'ref_seq': 'noncoding1',
- 'known_var': 'yes',
- 'novel_var': 'no',
- 'pct_id': '98.33',
- 'noncoding1.A14T': 'yes'
- },
- 'cluster.p.1': {
- 'assembled': 'yes',
- 'match': 'yes',
- 'ref_seq': 'presence_absence1',
- 'known_var': 'yes',
- 'novel_var': 'no',
- 'pct_id': '98.96',
- 'presence_absence1.I42L': 'yes'
- },
- 'cluster.v.1': {
- 'assembled': 'yes',
- 'match': 'yes',
- 'ref_seq': 'varonly1',
- 'known_var': 'no',
- 'novel_var': 'no',
- 'pct_id': '99.42',
- }
- },
- 'file2': {
- 'cluster.n.1': {
- 'assembled': 'yes',
- 'match': 'yes',
- 'ref_seq': 'noncoding1',
- 'known_var': 'no',
- 'novel_var': 'no',
- 'pct_id': '98.33',
- 'noncoding1.A14T': 'no'
- },
- 'cluster.p.1': {
- 'assembled': 'yes',
- 'match': 'yes',
- 'ref_seq': 'presence_absence1',
- 'pct_id': '98.96',
- 'known_var': 'no',
- 'novel_var': 'no',
- 'presence_absence1.I42L': 'no'
- },
- 'cluster.v.1': {
- 'assembled': 'no',
- 'match': 'NA',
- 'ref_seq': 'NA',
- 'known_var': 'NA',
- 'novel_var': 'NA',
- 'pct_id': 'NA',
- }
- },
- }
- filenames = ['file1', 'file2']
- cluster_cols = {'assembled': True, 'match': True, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False}
- got_phandago_header, got_csv_header, got_lines = summary.Summary._to_matrix(filenames, rows, cluster_cols)
- expected_phandango_header = ['name', 'cluster.n.1.assembled:o1', 'cluster.n.1.match:o1', 'cluster.n.1.noncoding1.A14T:o1', 'cluster.p.1.assembled:o1', 'cluster.p.1.match:o1', 'cluster.p.1.presence_absence1.I42L:o1', 'cluster.v.1.assembled:o1', 'cluster.v.1.match:o1']
- expected_csv_header = ['name', 'cluster.n.1.assembled', 'cluster.n.1.match', 'cluster.n.1.noncoding1.A14T', 'cluster.p.1.assembled', 'cluster.p.1.match', 'cluster.p.1.presence_absence1.I42L', 'cluster.v.1.assembled', 'cluster.v.1.match']
- expected_lines = [
- ['file1', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes'],
- ['file2', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'no', 'NA']
+ self.assertEqual(expected_phandango_header, got_phandango_header)
+ self.assertEqual(expected_csv_header, got_csv_header)
+ self.assertEqual(expected_matrix, got_matrix)
+
+
+ def test_to_matrix_with_vars(self):
+ '''Test _to_matrix with vars'''
+ infiles = [
+ os.path.join(data_dir, 'summary_to_matrix.1.tsv'),
+ os.path.join(data_dir, 'summary_to_matrix.2.tsv')
]
- self.assertEqual(expected_phandango_header, got_phandago_header)
+
+ s = summary.Summary('out', filenames=infiles, show_known_vars=True, show_novel_vars=True)
+ s.samples = summary.Summary._load_input_files(infiles, 90)
+ s._gather_unfiltered_output_data()
+ got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+
+ expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.A42T:o1', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'prese [...]
+ expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.A42T', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presenc [...]
+ expected_matrix = [
+ [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'],
+ [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes']
+ ]
+
+ self.assertEqual(expected_phandango_header, got_phandango_header)
+ self.assertEqual(expected_csv_header, got_csv_header)
+ self.assertEqual(expected_matrix, got_matrix)
+
+
+ def test_to_matrix_cluster_only(self):
+ '''Test _to_matrix with cluster columns only'''
+ infiles = [
+ os.path.join(data_dir, 'summary_to_matrix.1.tsv'),
+ os.path.join(data_dir, 'summary_to_matrix.2.tsv')
+ ]
+
+ s = summary.Summary('out', filenames=infiles)
+ s.samples = summary.Summary._load_input_files(infiles, 90)
+ s._gather_unfiltered_output_data()
+ got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+
+ expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_abse [...]
+ expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var']
+ expected_matrix = [
+ [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes'],
+ [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes']
+ ]
+
+ self.assertEqual(expected_phandango_header, got_phandango_header)
self.assertEqual(expected_csv_header, got_csv_header)
- self.assertEqual(expected_lines, got_lines)
+ self.assertEqual(expected_matrix, got_matrix)
+
+
+ def test_to_matrix_assembled_only(self):
+ '''Test _to_matrix with assembled column only'''
+ infiles = [
+ os.path.join(data_dir, 'summary_to_matrix.1.tsv'),
+ os.path.join(data_dir, 'summary_to_matrix.2.tsv')
+ ]
+
+ s = summary.Summary('out', filenames=infiles, cluster_cols='assembled')
+ s.samples = summary.Summary._load_input_files(infiles, 90)
+ s._gather_unfiltered_output_data()
+ got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+
+ expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding2.assembled:o1', 'presence_absence1.assembled:o1']
+ expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding2.assembled', 'presence_absence1.assembled']
+ expected_matrix = [
+ [infiles[0], 'yes', 'yes', 'yes'],
+ [infiles[1], 'yes', 'yes', 'yes']
+ ]
+
+ self.assertEqual(expected_phandango_header, got_phandango_header)
+ self.assertEqual(expected_csv_header, got_csv_header)
+ self.assertEqual(expected_matrix, got_matrix)
def test_filter_matrix_rows(self):
@@ -373,10 +427,10 @@ class TestSummary(unittest.TestCase):
expected_header = ['head1', 'head2', 'head2:colour', 'head3', 'head3:colour', 'head4', 'head5', 'head5:colour']
expected_matrix = [
- ['yes', 'yes', '#1f78b4', 'yes_nonunique', '#a6cee3', 'yes', 'no', '#33a02c'],
- ['yes', 'yes_nonunique', '#a6cee3', 'no', '#33a02c', 'yes', 'NA', '#b2df8a'],
- ['yes', 'no', '#33a02c', 'NA', '#b2df8a', 'yes', 'yes', '#1f78b4'],
- ['yes', 'NA', '#b2df8a', 'yes', '#1f78b4', 'yes', 'yes_nonunique', '#a6cee3'],
+ ['yes', 'yes', '#33a02c', 'yes_nonunique', '#b2df8a', 'yes', 'no', '#fb9a99'],
+ ['yes', 'yes_nonunique', '#b2df8a', 'no', '#fb9a99', 'yes', 'NA', '#ffffff'],
+ ['yes', 'no', '#fb9a99', 'NA', '#ffffff', 'yes', 'yes', '#33a02c'],
+ ['yes', 'NA', '#ffffff', 'yes', '#33a02c', 'yes', 'yes_nonunique', '#b2df8a']
]
got_header, got_matrix = summary.Summary._add_phandango_colour_columns(header, matrix)
self.assertEqual(expected_header, got_header)
@@ -400,6 +454,23 @@ class TestSummary(unittest.TestCase):
os.unlink(tmpfile)
+ def test_matrix_to_csv_remove_nas(self):
+ '''Test _matrix_to_csv with remove_nas '''
+ matrix = [
+ ['line1_1', 'line1_2', 'NA', 'foo'],
+ ['NA', 'NA', 'bar', 'NA'],
+ ]
+ header = ['head1', 'head2', 'head3', 'head4']
+ tmpfile = 'tmp.test.matrix_to_csv_remove_nas.csv'
+ summary.Summary._matrix_to_csv(matrix, header, tmpfile, remove_nas=True)
+ with open(tmpfile) as f:
+ got = f.read()
+
+ expected = 'head1,head2,head3,head4\nline1_1,line1_2,,foo\n,,bar,\n'
+ self.assertEqual(expected, got)
+ os.unlink(tmpfile)
+
+
def test_distance_score_bewteen_values(self):
'''Test _distance_score_bewteen_values'''
tests = [
diff --git a/ariba/vfdb_parser.py b/ariba/vfdb_parser.py
index 3b9e2a4..9af1ff8 100644
--- a/ariba/vfdb_parser.py
+++ b/ariba/vfdb_parser.py
@@ -36,8 +36,11 @@ class VfdbParser:
tsv_out = pyfastaq.utils.open_file_write(self.outprefix + '.tsv')
for seq in file_reader:
+ original_id = seq.id
seq.id, description = self._fa_header_to_name_and_metadata(seq.id)
- print(seq.id, '1', '0', '.', '.', description, sep='\t', file=tsv_out)
+ if description == '.':
+ seq.id = original_id.split()[0]
+ print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=tsv_out)
print(seq, file=fa_out)
pyfastaq.utils.close(fa_out)
diff --git a/scripts/ariba b/scripts/ariba
index 5140aed..5c70787 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -42,7 +42,7 @@ subparser_flag.set_defaults(func=ariba.tasks.flag.run)
#---------------------------- getref ------------------------------------
-allowed_dbs = ['argannot', 'card', 'plasmidfinder', 'resfinder','vfdb']
+allowed_dbs = sorted(list(ariba.ref_genes_getter.allowed_ref_dbs))
subparser_getref = subparsers.add_parser(
'getref',
help='Download reference data',
@@ -50,6 +50,7 @@ subparser_getref = subparsers.add_parser(
description='Download reference data from one of a few supported public resources',
)
subparser_getref.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
+subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Only applies to card')
subparser_getref.add_argument('db', help='Database to download. Must be one of: ' + ' '.join(allowed_dbs), choices=allowed_dbs, metavar="DB name")
subparser_getref.add_argument('outprefix', help='Prefix of output filenames')
subparser_getref.set_defaults(func=ariba.tasks.getref.run)
@@ -61,11 +62,13 @@ subparser_prepareref = subparsers.add_parser(
help='Prepare reference data for input to "run"',
usage='ariba prepareref [options] <outdir>',
description='Prepare reference data for running the pipeline with "ariba run"',
- epilog='REQUIRED: -f and -m must each be used at least once',
+ epilog='REQUIRED: -f/--fasta, and also either -m/--metadata or --all_coding must be used',
)
input_group = subparser_prepareref.add_argument_group('input files options')
input_group.add_argument('-f', '--fasta', action='append', dest='fasta_files', required=True, help='REQUIRED. Name of fasta file. Can be used more than once if your sequences are spread over more than on file', metavar='FILENAME')
-input_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', required=True, help='REQUIRED. Name of tsv file of metadata about the input sequences. Can be used more than once if your metadata is spread over more than one file', metavar='FILENAME')
+meta_group = input_group.add_mutually_exclusive_group(required=True)
+meta_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', help='Name of tsv file of metadata about the input sequences. Can be used more than once if your metadata is spread over more than one file. Incompatible with --all_coding', metavar='FILENAME')
+meta_group.add_argument('--all_coding', choices=['yes', 'no'], help='Use this if you only have a fasta of presence absence sequences as input, and no metadata. Use "yes" if all sequences are coding, or "no" if they are all non-coding. Incompatible with -m/--metadata')
cdhit_group = subparser_prepareref.add_argument_group('cd-hit options')
cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
@@ -138,7 +141,8 @@ assembly_group.add_argument('--assembly_cov', type=int, help='Target read covera
assembly_group.add_argument('--min_scaff_depth', type=int, help='Minimum number of read pairs needed as evidence for scaffold link between two contigs [%(default)s]', default=10, metavar='INT')
other_group = subparser_run.add_argument_group('Other options')
-other_group.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+#other_group.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+other_group.add_argument('--threads', type=int, help=argparse.SUPPRESS, default=1, metavar='INT')
other_group.add_argument('--assembled_threshold', type=float, help='If proportion of gene assembled (regardless of into how many contigs) is at least this value then the flag gene_assembled is set [%(default)s]', default=0.95, metavar='FLOAT (between 0 and 1)')
other_group.add_argument('--gene_nt_extend', type=int, help='Max number of nucleotides to extend ends of gene matches to look for start/stop codons [%(default)s]', default=30, metavar='INT')
other_group.add_argument('--unique_threshold', type=float, help='If proportion of bases in gene assembled more than once is <= this value, then the flag unique_contig is set [%(default)s]', default=0.03, metavar='FLOAT (between 0 and 1)')
@@ -149,7 +153,7 @@ subparser_run.set_defaults(func=ariba.tasks.run.run)
#----------------------------- summary -------------------------------
-summary_presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'cluster_known_vars', 'all', 'all_no_filter']
+summary_presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'all', 'all_no_filter']
subparser_summary = subparsers.add_parser(
'summary',
help='Summarise multiple reports made by "run"',
@@ -159,13 +163,16 @@ subparser_summary = subparsers.add_parser(
)
subparser_summary.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME')
-subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--known_vars,--novel_vars. Using this overrides those options', metavar='|'.join(summary_presets))
+subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--v_groups,--variants. Using this overrides those options', metavar='|'.join(summary_presets))
subparser_summary.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, match, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='match', metavar='col1,col2,...')
subparser_summary.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
-subparser_summary.add_argument('--het', action='store_true', help='For known noncoding SNPs, report if they are heterozygous or not, and the percent of reads supporting the variant type')
+subparser_summary.add_argument('--no_tree', action='store_true', help='Do not make phandango tree')
subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
-subparser_summary.add_argument('--var_cols', help='Comma separated list of variant columns to include. Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='')
subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
+subparser_summary.add_argument('--only_cluster', help='Only report data for the given cluster name', metavar='Cluster_name')
+subparser_summary.add_argument('--v_groups', action='store_true', help='Show a group column for each group of variants')
+subparser_summary.add_argument('--known_variants', action='store_true', help='Report all known variants')
+subparser_summary.add_argument('--novel_variants', action='store_true', help='Report all novel variants')
subparser_summary.add_argument('--verbose', action='store_true', help='Be verbose')
subparser_summary.add_argument('outprefix', help='Prefix of output files')
subparser_summary.add_argument('infiles', nargs='*', help='Files to be summarised')
@@ -179,7 +186,8 @@ subparser_test = subparsers.add_parser(
description='Run ARIBA on a small made up built-in test dataset'
)
-subparser_test.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+#subparser_test.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+subparser_test.add_argument('--threads', type=int, help=argparse.SUPPRESS, default=1, metavar='INT')
subparser_test.add_argument('outdir', help='Name of output directory')
subparser_test.set_defaults(func=ariba.tasks.test.run)
diff --git a/setup.py b/setup.py
index 77ba935..0d421cf 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@ fermilite_mod = Extension(
setup(
ext_modules=[minimap_mod, fermilite_mod],
name='ariba',
- version='2.1.0',
+ version='2.2.0',
description='ARIBA: Antibiotic Resistance Identification By Assembly',
packages = find_packages(),
package_data={'ariba': ['test_run_data/*']},
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ariba.git
More information about the debian-med-commit
mailing list