[med-svn] [ariba] 01/03: Imported Upstream version 2.2.0+ds

Sascha Steinbiss satta at debian.org
Thu Aug 18 15:09:52 UTC 2016


This is an automated email from the git hooks/post-receive script.

satta pushed a commit to branch master
in repository ariba.

commit 61985a317010528545f5f75742e7c0d10eb652cc
Author: Sascha Steinbiss <satta at debian.org>
Date:   Thu Aug 18 15:02:10 2016 +0000

    Imported Upstream version 2.2.0+ds
---
 ariba/__init__.py                                  |   1 +
 ariba/ref_genes_getter.py                          | 116 ++++-
 ariba/ref_preparer.py                              |  41 +-
 ariba/reference_data.py                            |   7 +-
 ariba/report.py                                    |   3 +-
 ariba/summary.py                                   | 289 +++++------
 ariba/summary_cluster.py                           |  30 +-
 ariba/summary_cluster_variant.py                   |  83 ++++
 ariba/summary_sample.py                            |  21 +-
 ariba/tasks/getref.py                              |   6 +-
 ariba/tasks/prepareref.py                          |   3 +-
 ariba/tasks/summary.py                             |  47 +-
 ariba/tests/assembly_variants_test.py              |  12 +-
 ariba/tests/cluster_test.py                        | 111 +++--
 ariba/tests/clusters_test.py                       |  24 +-
 ...fa => assembly_variants_one_var_one_ctg_cdg.fa} |   0
 ...v => assembly_variants_one_var_one_ctg_cdg.tsv} |   0
 ...=> assembly_variants_one_var_one_ctg_noncdg.fa} |   0
 ...> assembly_variants_one_var_one_ctg_noncdg.tsv} |   0
 ...uster_full_run_known_smtls_snp_presabs_gene.fa} |   0
 ..._known_smtls_snp_presabs_gene.ref_for_reads.fa} |   0
 ...ster_full_run_known_smtls_snp_presabs_gene.tsv} |   0
 .../reads_1.fq                                     |   0
 .../reads_2.fq                                     |   0
 .../references.fa                                  |   0
 ...uster_full_run_known_smtls_snp_presabs_nonc.fa} |   0
 ...ster_full_run_known_smtls_snp_presabs_nonc.tsv} |   0
 .../reads_1.fq                                     |   0
 .../reads_2.fq                                     |   0
 .../references.fa                                  |   0
 ... => cluster_full_run_smtls_snp_presabs_gene.fa} |   0
 ...ll_run_smtls_snp_presabs_gene.ref_for_reads.fa} |   0
 ...=> cluster_full_run_smtls_snp_presabs_gene.tsv} |   0
 .../reads_1.fq                                     |   0
 .../reads_2.fq                                     |   0
 .../references.fa                                  |   0
 ... => cluster_full_run_smtls_snp_presabs_nonc.fa} |   0
 ...=> cluster_full_run_smtls_snp_presabs_nonc.tsv} |   0
 .../reads_1.fq                                     |   0
 .../reads_2.fq                                     |   0
 .../references.fa                                  |   0
 ... => cluster_full_run_smtls_snp_varonly_gene.fa} |   0
 ...=> cluster_full_run_smtls_snp_varonly_gene.tsv} |   0
 .../reads_1.fq                                     |   0
 .../reads_2.fq                                     |   0
 .../references.fa                                  |   0
 ...> cluster_full_run_smtls_snp_varonly_gene_2.fa} |   0
 ... cluster_full_run_smtls_snp_varonly_gene_2.tsv} |   0
 .../reads_1.fq                                     |   0
 .../reads_2.fq                                     |   0
 .../references.fa                                  |   0
 ...ster_full_run_smtls_snp_varonly_gene_no_snp.fa} |   0
 ...ter_full_run_smtls_snp_varonly_gene_no_snp.tsv} |   0
 .../reads_1.fq                                     |   0
 .../reads_2.fq                                     |   0
 .../references.fa                                  |   0
 ... => cluster_full_run_smtls_snp_varonly_nonc.fa} |   0
 ...=> cluster_full_run_smtls_snp_varonly_nonc.tsv} |   0
 .../reads_1.fq                                     |   0
 .../reads_2.fq                                     |   0
 .../references.fa                                  |   0
 ...ster_full_run_smtls_snp_varonly_nonc_no_snp.fa} |   0
 ...ter_full_run_smtls_snp_varonly_nonc_no_snp.tsv} |   0
 .../reads_1.fq                                     |   0
 .../reads_2.fq                                     |   0
 .../references.fa                                  |   0
 ...full_run_varonly.not_present.always_report.tsv} |   0
 ...ll_run_ok_gene_start_mismatch.ref_for_reads.fa} |   0
 ..._test_full_run_ok_samtools_snp_pres_abs_gene.fa |   3 -
 ..._full_run_ok_samtools_snp_pres_abs_noncoding.fa |   3 -
 ..._test_full_run_ok_samtools_snp_var_only_gene.fa |   3 -
 ..._full_run_ok_samtools_snp_var_only_noncoding.fa |   3 -
 ....fa => cluster_test_full_run_partial_asmbly.fa} |   2 +-
 .../data/cluster_test_full_run_partial_asmbly.tsv  |   1 +
 .../reads_1.fq                                     | 432 ++++++++++++++++
 .../reads_2.fq                                     | 432 ++++++++++++++++
 .../references.fa                                  |   2 +-
 ...luster_test_full_run_smtls_snp_varonly_nonc.fa} |   0
 ...uster_test_full_run_smtls_snp_varonly_nonc.tsv} |   0
 .../reads_1.fq                                     |   0
 .../reads_2.fq                                     |   0
 .../references.fa                                  |   0
 ...cted.out.fa => clusters_cat_genes_match_ref.fa} |   0
 .../00.info.txt                                    |   0
 .../01.filter.check_metadata.tsv                   |   0
 .../02.cdhit.all.fa                                |   0
 .../02.cdhit.clusters.pickle                       | Bin
 ... clusters_minimap_reads_to_all_refs.clstrs.tsv} |   0
 ...usters_minimap_reads_to_all_refs.out.clstr2rep} |   0
 ...ters_minimap_reads_to_all_refs.out.clstr_count} |   0
 ...=> clusters_minimap_reads_to_all_refs.out.hist} |   0
 ...> clusters_minimap_reads_to_all_refs.out.pairs} |   0
 ... clusters_minimap_reads_to_all_refs.reads_1.fq} |   0
 ... clusters_minimap_reads_to_all_refs.reads_2.fq} |   0
 ...a => clusters_minimap_reads_to_all_refs.ref.fa} |   0
 .../ref_preparer_test_fasta_to_metadata.coding.tsv |   3 +
 .../data/ref_preparer_test_fasta_to_metadata.fa    |   6 +
 ...f_preparer_test_fasta_to_metadata.noncoding.tsv |   3 +
 .../data/ref_preparer_test_run.out/00.info.txt     |  10 +-
 .../ref_preparer_test_run.out/00.version_info.txt  |   4 +-
 .../02.cdhit.clusters.pickle                       | Bin 276 -> 312 bytes
 .../02.cdhit.clusters.tsv                          |   4 +-
 .../00.auto_metadata.tsv                           |   9 +
 .../00.info.txt                                    |   5 +
 .../00.version_info.txt                            |   5 +
 .../01.filter.check_genes.log                      |   0
 .../01.filter.check_metadata.log                   |   0
 .../01.filter.check_metadata.tsv                   |   9 +
 .../02.cdhit.all.fa                                |  18 +
 .../02.cdhit.clusters.pickle                       | Bin 0 -> 344 bytes
 .../02.cdhit.clusters.tsv                          |   6 +-
 .../02.cdhit.gene.fa                               |   0
 .../02.cdhit.gene.varonly.fa                       |   0
 .../02.cdhit.noncoding.fa                          |  18 +
 .../02.cdhit.noncoding.varonly.fa                  |   0
 ..._cluster_w_cdhit_clstrs_file.expect.clstrs.tsv} |   0
 ...data_cluster_w_cdhit_clstrs_file.in.clstrs.tsv} |   0
 ...ference_data_cluster_w_cdhit_clstrs_file.in.fa} |   0
 ...e_data_cluster_w_cdhit_clstrs_file.in.meta.tsv} |   0
 ...ence_data_cluster_w_cdhit_nocluster.expect.tsv} |   0
 ...reference_data_cluster_w_cdhit_nocluster.in.fa} |   0
 ...eference_data_cluster_w_cdhit_nocluster.in.tsv} |   0
 ariba/tests/data/reference_data_load_fasta_file.fa |   2 +-
 ...ence_data_load_input_check_seq_names.bad.csv.1} |   0
 ...ence_data_load_input_check_seq_names.bad.csv.2} |   0
 ...rence_data_load_input_check_seq_names.bad.fa.1} |   0
 ...rence_data_load_input_check_seq_names.bad.fa.2} |   0
 ...nce_data_load_input_check_seq_names.good.csv.1} |   0
 ...nce_data_load_input_check_seq_names.good.csv.2} |   0
 ...ence_data_load_input_check_seq_names.good.fa.1} |   0
 ...ence_data_load_input_check_seq_names.good.fa.2} |   0
 .../tests/data/reference_data_rename_sequences.fa  |   8 +-
 .../reference_data_rename_sequences_metadata.tsv   |  12 +-
 .../data/reference_data_test_rename_sequences.out  |  10 +-
 ...s_variants_make_vcf_and_depths_files.asmbly.fa} |   0
 ...riants_make_vcf_and_depths_files.asmbly.fa.fai} |   0
 ...amtools_variants_make_vcf_and_depths_files.bam} | Bin
 ...nts_make_vcf_and_depths_files.expect.depths.gz} | Bin
 ...make_vcf_and_depths_files.expect.depths.gz.tbi} | Bin
 ..._variants_make_vcf_and_depths_files.expect.vcf} |   0
 .../summary_gather_unfiltered_output_data.in.1.tsv |   6 +
 .../summary_gather_unfiltered_output_data.in.2.tsv |   6 +
 ...ample_test_column_names_tuples_and_het_snps.tsv |   8 +-
 .../summary_sample_test_column_summary_data.tsv    |   8 +-
 .../tests/data/summary_sample_test_var_groups.tsv  |   8 +-
 .../data/summary_test_gather_output_rows.in.1.tsv  |   3 -
 .../data/summary_test_gather_output_rows.in.2.tsv  |   5 -
 .../data/summary_test_get_all_cluster_names.1.tsv  |   3 -
 .../data/summary_test_get_all_cluster_names.2.tsv  |   5 -
 .../tests/data/summary_test_get_all_het_snps.1.tsv |   3 -
 .../tests/data/summary_test_get_all_het_snps.2.tsv |   5 -
 .../data/summary_test_get_all_var_groups.1.tsv     |   3 -
 .../data/summary_test_get_all_var_groups.2.tsv     |   5 -
 ariba/tests/data/summary_to_matrix.1.tsv           |   5 +
 ariba/tests/data/summary_to_matrix.2.tsv           |   6 +
 ariba/tests/data/vfdb_parser_test_run.out.fa       |   2 +-
 ariba/tests/data/vfdb_parser_test_run.out.tsv      |   6 +-
 ariba/tests/ref_preparer_test.py                   | 108 +++-
 ariba/tests/reference_data_test.py                 |  69 ++-
 ariba/tests/samtools_variants_test.py              |  12 +-
 ariba/tests/summary_cluster_test.py                | 191 +++++---
 ariba/tests/summary_cluster_variant_test.py        |  67 +++
 ariba/tests/summary_sample_test.py                 |   8 +-
 ariba/tests/summary_test.py                        | 541 ++++++++++++---------
 ariba/vfdb_parser.py                               |   5 +-
 scripts/ariba                                      |  26 +-
 setup.py                                           |   2 +-
 167 files changed, 2165 insertions(+), 758 deletions(-)

diff --git a/ariba/__init__.py b/ariba/__init__.py
index 0c36b1a..1d589dc 100644
--- a/ariba/__init__.py
+++ b/ariba/__init__.py
@@ -39,6 +39,7 @@ __all__ = [
     'sequence_variant',
     'summary',
     'summary_cluster',
+    'summary_cluster_variant',
     'summary_sample',
     'tasks',
     'versions',
diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py
index 1ae8d39..541f0c4 100644
--- a/ariba/ref_genes_getter.py
+++ b/ariba/ref_genes_getter.py
@@ -12,20 +12,33 @@ import json
 from ariba import common, card_record, vfdb_parser
 
 
+allowed_ref_dbs = {
+    'argannot',
+    'card',
+    'plasmidfinder',
+    'resfinder',
+    'srst2_argannot',
+    'vfdb_core',
+    'vfdb_full',
+}
+
+argannot_ref = '"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n'
+
+
 class RefGenesGetter:
-    def __init__(self, ref_db, genetic_code=11):
-        allowed_ref_dbs = {'card', 'argannot', 'plasmidfinder', 'resfinder','vfdb'}
+    def __init__(self, ref_db, genetic_code=11, version=None):
         if ref_db not in allowed_ref_dbs:
             raise Error('Error in RefGenesGetter. ref_db must be one of: ' + str(allowed_ref_dbs) + ', but I got "' + ref_db)
         self.ref_db=ref_db
         self.genetic_code = genetic_code
         self.max_download_attempts = 3
         self.sleep_time = 2
+        self.version = version
         pyfastaq.sequences.genetic_code = self.genetic_code
 
 
     def _download_file(self, url, outfile):
-        print('Downloading "', url, '" and saving as "', outfile, '" ...', end='', sep='')
+        print('Downloading "', url, '" and saving as "', outfile, '" ...', end='', sep='', flush=True)
         for i in range(self.max_download_attempts):
             time.sleep(self.sleep_time)
             try:
@@ -38,6 +51,30 @@ class RefGenesGetter:
         print(' done', flush=True)
 
 
+    def _get_card_versions(self, tmp_file):
+        print('Getting available CARD versions')
+        self._download_file('https://card.mcmaster.ca/download', tmp_file)
+        p = re.compile(r'''href="(/download/.*?broad.*?v([0-9]+\.[0-9]+\.[0-9]+)\.tar\.gz)"''')
+        versions = {}
+
+        with open(tmp_file) as f:
+            for line in f:
+                got = p.findall(line)
+                for match in got:
+                    key = tuple([int(x) for x in match[1].split('.')])
+                    versions[key] = 'https://card.mcmaster.ca' + match[0]
+
+        if len(versions) == 0:
+            raise Error('Error getting CARD versions. Cannot continue')
+
+        print('Found versions:')
+
+        for key, url in sorted(versions.items()):
+            print('.'.join([str(x) for x in key]), url, sep='\t')
+
+        os.unlink(tmp_file)
+        return versions
+
 
     def _get_from_card(self, outprefix):
         outprefix = os.path.abspath(outprefix)
@@ -50,8 +87,17 @@ class RefGenesGetter:
         except:
             raise Error('Error mkdir/chdir ' + tmpdir)
 
-        card_version = '1.0.9'
-        card_tarball_url = 'https://card.mcmaster.ca/download/0/broadstreet-v' + card_version + '.tar.gz'
+        versions = self._get_card_versions('download.html')
+        if self.version is not None:
+            key = tuple([int(x) for x in self.version.split('.')])
+            if key not in versions:
+                raise Error('Error! Did not find requested version ' + self.version)
+        else:
+            key = sorted(list(versions.keys()))[-1]
+            self.version = '.'.join([str(x) for x in key])
+
+        print('Getting version', self.version)
+        card_tarball_url = versions[key]
         card_tarball = 'card.tar.gz'
         print('Working in temporary directory', tmpdir)
         print('Downloading data from card:', card_tarball_url, flush=True)
@@ -146,7 +192,7 @@ class RefGenesGetter:
         print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
         print('If you use this downloaded data, please cite:')
         print('"The Comprehensive Antibiotic Resistance Database", McArthur et al 2013, PMID: 23650175')
-        print('and in your methods say that version', card_version, 'of the database was used')
+        print('and in your methods say that version', self.version, 'of the database was used')
 
 
     def _get_from_resfinder(self, outprefix):
@@ -221,9 +267,9 @@ class RefGenesGetter:
 
         for seq in seq_reader:
             original_id = seq.id
-            seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id)
+            seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
             print(seq, file=f_out_fa)
-            print(seq.id, '1', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=f_out_tsv)
+            print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_tsv)
 
 
         pyfastaq.utils.close(f_out_tsv)
@@ -234,7 +280,7 @@ class RefGenesGetter:
         print('You can use them with ARIBA like this:')
         print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
         print('If you use this downloaded data, please cite:')
-        print('"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n')
+        print(argannot_ref)
 
 
     def _get_from_plasmidfinder(self, outprefix):
@@ -289,7 +335,49 @@ class RefGenesGetter:
         print('"PlasmidFinder and pMLST: in silico detection and typing of plasmids", Carattoli et al 2014, PMID: 24777092\n')
 
 
-    def _get_from_vfdb(self, outprefix):
+    def _get_from_srst2_argannot(self, outprefix):
+        srst2_version = '0.2.0'
+        srst2_url = 'https://github.com/katholt/srst2/raw/v' + srst2_version + '/data/ARGannot.r1.fasta'
+        srst2_fa = outprefix + '.original.fa'
+        command = 'wget -O ' + srst2_fa + ' ' + srst2_url
+        common.syscall(command, verbose=True)
+
+        final_fasta = outprefix + '.fa'
+        final_tsv = outprefix + '.tsv'
+
+        f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
+        f_out_meta = pyfastaq.utils.open_file_write(final_tsv)
+        seq_reader = pyfastaq.sequences.file_reader(srst2_fa)
+
+        for seq in seq_reader:
+            original_id = seq.id
+            name, extra = seq.id.split()
+            cluster_id, cluster_name, allele_name, allele_id = name.split('__')
+            seq.id = cluster_name + '.' + name
+            print(seq, file=f_out_fa)
+            print(seq.id, 1, 0, '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_meta)
+
+        pyfastaq.utils.close(f_out_fa)
+        pyfastaq.utils.close(f_out_meta)
+
+        print('Finished downloading and converting data. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
+        print('You can use them with ARIBA like this:')
+        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
+        print('If you use this downloaded data, please cite:')
+        print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
+        print(argannot_ref)
+        print('and in your methods say that the ARG-ANNOT sequences were used from version', srst2_version, 'of SRST2.')
+
+
+    def _get_from_vfdb_core(self, outprefix):
+        self._get_from_vfdb_common(outprefix, 'VFDB_setA_nt.fas.gz','core')
+
+
+    def _get_from_vfdb_full(self, outprefix):
+         self._get_from_vfdb_common(outprefix, 'VFDB_setB_nt.fas.gz','full')
+
+
+    def _get_from_vfdb_common(self, outprefix, filename, info_text):
         outprefix = os.path.abspath(outprefix)
         tmpdir = outprefix + '.tmp.download'
 
@@ -298,12 +386,13 @@ class RefGenesGetter:
         except:
             raise Error('Error mkdir ' + tmpdir)
 
-        zipfile = os.path.join(tmpdir, 'VFDB_setA_nt.fas.gz')
-        self._download_file('http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz', zipfile)
+        zipfile = os.path.join(tmpdir, filename)
+        self._download_file('http://www.mgc.ac.cn/VFs/Down/' + filename, zipfile)
+        print('Extracting files ... ', end='', flush=True)
         vparser = vfdb_parser.VfdbParser(zipfile, outprefix)
         vparser.run()
         shutil.rmtree(tmpdir)
-        print('Extracted files.')
+        print('done')
         final_fasta = outprefix + '.fa'
         final_tsv = outprefix + '.tsv'
 
@@ -313,6 +402,7 @@ class RefGenesGetter:
         print('If you use this downloaded data, please cite:')
         print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
 
+
     def run(self, outprefix):
         exec('self._get_from_' + self.ref_db + '(outprefix)')
 
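A note on the CARD changes above, for readers skimming the diff: _get_card_versions now keys each broadstreet tarball URL by an integer version tuple, and _get_from_card picks the highest tuple when no explicit version is requested. A minimal, self-contained sketch of that selection idea (the HTML snippet is invented, not CARD's real download page):

    import re

    html = ('<a href="/download/0/broadstreet-v1.0.9.tar.gz">v1.0.9</a> '
            '<a href="/download/0/broadstreet-v1.1.0.tar.gz">v1.1.0</a>')

    pattern = re.compile(r'''href="(/download/.*?broad.*?v([0-9]+\.[0-9]+\.[0-9]+)\.tar\.gz)"''')
    versions = {}
    for path, version_string in pattern.findall(html):
        # integer tuples sort correctly, so (1, 0, 9) < (1, 1, 0)
        key = tuple(int(x) for x in version_string.split('.'))
        versions[key] = 'https://card.mcmaster.ca' + path

    latest = sorted(versions)[-1]
    print('.'.join(str(x) for x in latest), versions[latest])
    # 1.1.0 https://card.mcmaster.ca/download/0/broadstreet-v1.1.0.tar.gz
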
diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py
index 9fee5fd..c2cb310 100644
--- a/ariba/ref_preparer.py
+++ b/ariba/ref_preparer.py
@@ -1,6 +1,7 @@
 import sys
 import os
 import pickle
+import pyfastaq
 from ariba import reference_data
 
 class Error (Exception): pass
@@ -9,8 +10,9 @@ class Error (Exception): pass
 class RefPreparer:
     def __init__(self,
         fasta_files,
-        metadata_tsv_files,
         extern_progs,
+        metadata_tsv_files=None,
+        all_coding=None,
         version_report_lines=None,
         min_gene_length=6,
         max_gene_length=10000,
@@ -30,7 +32,8 @@ class RefPreparer:
             self.version_report_lines = version_report_lines
 
         self.fasta_files = fasta_files
-        self.metadata_tsv_files = metadata_tsv_files
+        self.metadata_tsv_files = [] if metadata_tsv_files is None else metadata_tsv_files
+        self.all_coding = all_coding
         self.min_gene_length = min_gene_length
         self.max_gene_length = max_gene_length
         self.genetic_code = genetic_code
@@ -42,6 +45,21 @@ class RefPreparer:
         self.verbose = verbose
 
 
+    @classmethod
+    def _fasta_to_metadata(cls, infile, out_fh, all_coding):
+       seq_reader = pyfastaq.sequences.file_reader(infile)
+       coding = '1' if all_coding else '0'
+
+       for seq in seq_reader:
+           fields = seq.id.split(maxsplit=1)
+           if len(fields) > 1:
+               info_column = 'Original name: ' + seq.id
+               seq.id = fields[0]
+           else:
+               info_column = '.'
+           print(seq.id, coding, 0, '.', '.', info_column, sep='\t', file=out_fh)
+
+
     def _write_info_file(self, outfile):
         with open(outfile, 'w') as fout:
             for filename in self.fasta_files:
@@ -83,6 +101,13 @@ class RefPreparer:
                 else:
                     new_key = common_prefix + '-'
 
+            i = 1
+            new_new_key = new_key
+            while new_new_key in new_clusters:
+                new_new_key = new_key + '_' + str(i)
+                i += 1
+            new_key = new_new_key
+
             if new_key in key_count:
                 if new_key in new_clusters:
                     assert key_count[new_key] == 1
@@ -126,6 +151,18 @@ class RefPreparer:
             print(file=f)
             print(*self.version_report_lines, sep='\n', file=f)
 
+        if self.all_coding is not None:
+            assert len(self.metadata_tsv_files) == 0
+            assert self.all_coding in {'yes', 'no'}
+            self.metadata_tsv_files = [os.path.join(outdir, '00.auto_metadata.tsv')]
+            f_out = pyfastaq.utils.open_file_write(self.metadata_tsv_files[0])
+            for fasta_file in self.fasta_files:
+                RefPreparer._fasta_to_metadata(fasta_file, f_out, self.all_coding=='yes')
+            pyfastaq.utils.close(f_out)
+        else:
+            assert self.all_coding is None
+            assert len(self.metadata_tsv_files) > 0
+
         self._write_info_file(os.path.join(outdir, '00.info.txt'))
 
         self.refdata = reference_data.ReferenceData(
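
Aside on the new all_coding path above: when no metadata TSVs are given, RefPreparer now writes 00.auto_metadata.tsv itself, one row per input sequence. A rough illustration of the rows it produces (the sequence names below are invented, and the helper is a stand-alone rewrite for demonstration, not the class method itself):

    def fasta_to_metadata_row(seq_id, all_coding):
        # mirrors RefPreparer._fasta_to_metadata: column 2 is the coding flag,
        # and headers containing spaces keep their full text in the info column
        coding = '1' if all_coding else '0'
        fields = seq_id.split(maxsplit=1)
        if len(fields) > 1:
            info_column = 'Original name: ' + seq_id
            seq_id = fields[0]
        else:
            info_column = '.'
        return '\t'.join([seq_id, coding, '0', '.', '.', info_column])

    print(fasta_to_metadata_row('gene1 beta-lactamase, made-up header', all_coding=True))
    # -> 'gene1\t1\t0\t.\t.\tOriginal name: gene1 beta-lactamase, made-up header'
    print(fasta_to_metadata_row('seq2', all_coding=False))
    # -> 'seq2\t0\t0\t.\t.\t.'
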
diff --git a/ariba/reference_data.py b/ariba/reference_data.py
index ed5ef8b..8d25e92 100644
--- a/ariba/reference_data.py
+++ b/ariba/reference_data.py
@@ -8,7 +8,7 @@ from ariba import sequence_metadata, cdhit
 
 class Error (Exception): pass
 
-rename_sub_regex = re.compile(r'[^\w.-]')
+rename_sub_regex = re.compile(r'''[':!@,-]''')
 
 
 class ReferenceData:
@@ -83,6 +83,8 @@ class ReferenceData:
         if filename is not None:
             seq_reader = pyfastaq.sequences.file_reader(filename)
             for seq in seq_reader:
+                seq.id = seq.id.split()[0]
+
                 if seq.id in seq_dict:
                     raise Error('Duplicate name "' + seq.id + '" found in file ' + filename + '. Cannot continue)')
                 seq_dict[seq.id] = copy.copy(seq)
@@ -281,7 +283,7 @@ class ReferenceData:
 
     @classmethod
     def _new_seq_name(cls, name):
-        name = name.split()[0]
+        assert len(name.split()) == 1 and name.strip() == name
         return re.sub(rename_sub_regex, '_', name)
 
 
@@ -291,6 +293,7 @@ class ReferenceData:
         old_name_to_new = {}
 
         for old_name in sorted(names):
+            assert len(old_name.split()) == 1 and old_name.strip() == old_name
             new_name = ReferenceData._new_seq_name(old_name)
             if new_name in used_names:
                 i = 1
diff --git a/ariba/report.py b/ariba/report.py
index fdbe50f..1884c9a 100644
--- a/ariba/report.py
+++ b/ariba/report.py
@@ -137,8 +137,9 @@ def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymum
     lines = []
     contig_length = len(cluster.assembly.sequences[contig_name])
     assert contig_length != 0
+
     if contig_name in ref_cov_per_contig:
-        if contig_name == cluster.assembly_compare.scaff_name_matching_ref:
+        if contig_name == cluster.assembly_compare.scaff_name_matching_ref and cluster.assembly_compare.gene_matching_ref_type == 'GENE_FOUND':
             ref_cov = len(cluster.ref_sequence)
         else:
             ref_cov = ref_cov_per_contig[contig_name]
diff --git a/ariba/summary.py b/ariba/summary.py
index f1b767f..d0e920c 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -18,9 +18,12 @@ class Summary:
       filter_rows=True,
       filter_columns=True,
       min_id=90.0,
-      show_known_het=False,
       cluster_cols='assembled,match,ref_seq,pct_id,known_var,novel_var',
-      variant_cols='groups,grouped,ungrouped,novel',
+      make_phandango_tree=True,
+      only_clusters=None,
+      show_var_groups=False,
+      show_known_vars=False,
+      show_novel_vars=False,
       verbose=False,
     ):
         if filenames is None and fofn is None:
@@ -34,13 +37,16 @@ class Summary:
         if fofn is not None:
             self.filenames.extend(self._load_fofn(fofn))
 
-        self.show_known_het = show_known_het
         self.cluster_columns = self._determine_cluster_cols(cluster_cols)
-        self.var_columns = self._determine_var_cols(variant_cols)
         self.filter_rows = filter_rows
         self.filter_columns = filter_columns
         self.min_id = min_id
         self.outprefix = outprefix
+        self.make_phandango_tree = make_phandango_tree
+        self.only_clusters = only_clusters
+        self.show_var_groups = show_var_groups
+        self.show_known_vars = show_known_vars
+        self.show_novel_vars = show_novel_vars
         self.verbose = verbose
 
 
@@ -60,12 +66,6 @@ class Summary:
         return Summary._determine_cols(cols_string, allowed_cols, 'cluster columns')
 
 
-    @staticmethod
-    def _determine_var_cols(cols_string):
-        allowed_cols = {'groups', 'grouped', 'ungrouped', 'novel'}
-        return Summary._determine_cols(cols_string, allowed_cols, 'variant columns')
-
-
     def _load_fofn(self, fofn):
         f = pyfastaq.utils.open_file_read(fofn)
         filenames = [x.rstrip() for x in f.readlines()]
@@ -80,172 +80,127 @@ class Summary:
 
 
     @classmethod
-    def _load_input_files(cls, filenames, min_id, verbose=False):
+    def _load_input_files(cls, filenames, min_id, verbose=False, only_clusters=None):
         samples = {}
         for filename in filenames:
-            samples[filename] = summary_sample.SummarySample(filename, min_pc_id=min_id)
+            samples[filename] = summary_sample.SummarySample(filename, min_pc_id=min_id, only_clusters=only_clusters)
             samples[filename].run()
             if verbose:
                 print('Loaded file', filename, flush=True)
         return samples
 
 
-    @classmethod
-    def _get_all_cluster_names(cls, samples_dict):
-        '''Input should be output of _load_input_files'''
-        cluster_names = set()
-        for filename, sample in samples_dict.items():
-            cluster_names.update(set(sample.clusters.keys()))
-        return cluster_names
-
+    def _gather_unfiltered_output_data(self):
+        self.all_potential_columns = {}
+        self.all_data = {}
 
-    @classmethod
-    def _get_all_variant_columns(cls, samples_dict):
-        '''Input should be output of _load_input_files'''
-        columns = {}
-        for filename, sample in samples_dict.items():
-            for cluster in sample.column_summary_data:
-                if sample.column_summary_data[cluster]['assembled'] == 'yes':
-                    for key, tuple_set in sample.variant_column_names_tuples.items():
-                        for t in tuple_set:
-                            if key not in columns:
-                                columns[key] = set()
-                            columns[key].add(t)
-        return columns
-
-
-    @classmethod
-    def _get_all_het_snps(cls, samples_dict):
-        snps = set()
-        for filename, sample in samples_dict.items():
-            for cluster, snp_dict in sample.het_snps.items():
-                if len(snp_dict):
-                    for snp in snp_dict:
-                        snps.add((cluster, snp))
+        for filename in sorted(self.samples):
+            self.all_data[filename] = {}
+            for cluster in self.samples[filename].clusters.values():
+                self.all_data[filename][cluster.name] = {}
+                if cluster.name not in self.all_potential_columns:
+                    self.all_potential_columns[cluster.name] = {'summary' : set(), 'groups': set(), 'vars': set()}
 
-        return snps
+                this_cluster_dict = {'groups': {}, 'vars': {}}
 
-    @classmethod
-    def _get_all_var_groups(cls, samples_dict):
-        groups = {}
-        for filename, sample in samples_dict.items():
-            for name, name_set in sample.var_groups.items():
-                if name not in groups:
-                    groups[name] = set()
-                groups[name].update(name_set)
-        return groups
-
-
-    def _gather_output_rows(self):
-        all_cluster_names = Summary._get_all_cluster_names(self.samples)
-        all_var_columns = Summary._get_all_variant_columns(self.samples)
-        all_het_snps = Summary._get_all_het_snps(self.samples)
-
-        if self.var_columns['groups']:
-            var_groups = Summary._get_all_var_groups(self.samples)
-        else:
-            var_groups = set()
-        rows = {}
+                if cluster.summary['assembled'] == 'no':
+                    this_cluster_dict['summary'] = {
+                            'assembled': 'no',
+                            'known_var': 'NA',
+                            'match': 'no',
+                            'novel_var': 'NA',
+                            'pct_id': 'NA',
+                            'ref_seq': 'NA'
+                    }
+                else:
+                    this_cluster_dict['summary'] = copy.copy(cluster.summary)
+                    seen_groups = {}
 
-        for filename, sample in self.samples.items():
-            rows[filename] = {}
+                    for variant in cluster.variants:
+                        if (self.show_known_vars and variant.known) or (self.show_novel_vars and not variant.known):
+                            this_cluster_dict['vars'][variant.var_string] = 'yes' if variant.het_percent is None else 'het'
+                            if variant.het_percent is not None:
+                                this_cluster_dict['vars'][variant.var_string + '.%'] = variant.het_percent
 
-            for cluster in all_cluster_names:
-                rows[filename][cluster] = {}
+                        if self.show_var_groups and variant.var_group != '.':
+                            if variant.var_group not in seen_groups:
+                                seen_groups[variant.var_group] = {'yes': 0, 'het': 0}
 
-                if cluster in sample.column_summary_data and sample.column_summary_data[cluster]['assembled'].startswith('yes'):
-                    rows[filename][cluster] = sample.column_summary_data[cluster]
-                else:
-                    rows[filename][cluster] = {
-                        'assembled': 'no',
-                        'match': 'no',
-                        'ref_seq': 'NA',
-                        'known_var': 'NA',
-                        'novel_var': 'NA',
-                        'pct_id': 'NA'
-                    }
+                            if variant.het_percent is None:
+                                seen_groups[variant.var_group]['yes'] += 1
+                                this_cluster_dict['groups'][variant.var_group] = 'yes'
+                            else:
+                                seen_groups[variant.var_group]['het'] += 1
+                                this_cluster_dict['groups'][variant.var_group] = 'het'
+                                this_cluster_dict['groups'][variant.var_group + '.%'] = variant.het_percent
 
-                if self.var_columns['groups']:
-                    for group_name in var_groups[cluster]:
-                        if cluster in sample.var_groups and group_name in sample.var_groups[cluster]:
-                            rows[filename][cluster]['vgroup.' + group_name] = 'yes'
-                        else:
-                            rows[filename][cluster]['vgroup.' + group_name] = 'no'
-
-                if cluster in all_var_columns:
-                    for (ref_name, variant, grouped_or_novel, group_name) in all_var_columns[cluster]:
-                        if not self.var_columns[grouped_or_novel]:
-                            continue
-
-                        key = ref_name + '.' + variant
-
-                        if rows[filename][cluster]['assembled'] == 'no':
-                            rows[filename][cluster][key] = 'NA'
-                        elif cluster in sample.variant_column_names_tuples and (ref_name, variant, grouped_or_novel, group_name) in sample.variant_column_names_tuples[cluster]:
-                            rows[filename][cluster][key] = 'yes'
-                            if self.show_known_het:
-                                if cluster in sample.het_snps and variant in sample.het_snps[cluster]:
-                                    rows[filename][cluster][key] = 'het'
-                                    rows[filename][cluster][key + '.%'] = sample.het_snps[cluster][variant]
-                        else:
-                            rows[filename][cluster][key] = 'no'
-                            if self.show_known_het and (cluster, variant) in all_het_snps:
-                                rows[filename][cluster][key + '.%'] = 'NA'
+                    for group, d in seen_groups.items():
+                        if d['het'] > 0 and d['het'] + d['yes'] > 1:
+                            this_cluster_dict['groups'][group] = 'yes_multi_het'
+                            this_cluster_dict['groups'][group + '.%'] = 'NA'
 
-                        if self.show_known_het and (cluster, variant) in all_het_snps and key + '.%' not in rows[filename][cluster]:
-                            rows[filename][cluster][key + '.%'] = 'NA'
+                for x in this_cluster_dict:
+                    self.all_potential_columns[cluster.name][x].update(set(this_cluster_dict[x].keys()))
 
-                for key, wanted in self.cluster_columns.items():
-                    if not wanted:
-                        del rows[filename][cluster][key]
-
-        return rows
+                self.all_data[filename][cluster.name] = this_cluster_dict
 
 
     @classmethod
-    def _to_matrix(cls, filenames, rows, cluster_cols):
-        '''rows = output from _gather_output_rows().
-           filenames = self.filenames
-           cluster_cols = self.cluster_columns'''
+    def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols):
         matrix = []
         making_header_lines = True
         phandango_header = ['name']
-        phandago_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'}
+        phandango_suffixes = {'assembled': ':o1', 'match': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'}
         ref_seq_counter = 2
         csv_header = ['name']
-        all_cluster_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']
-        all_cluster_cols_in_order_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'])
-        cluster_cols_in_order = [x for x in all_cluster_cols_in_order if cluster_cols[x]]
+        summary_cols_in_order = ['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']
+        summary_cols_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var'])
+        summary_cols_in_order = [x for x in summary_cols_in_order if cluster_cols[x]]
 
         for filename in filenames:
-            assert filename in rows
             line = [filename]
 
-            for cluster_name in sorted(rows[filename]):
-                for col in cluster_cols_in_order:
+            for cluster_name in sorted(all_potential_columns):
+                group_cols = sorted(list(all_potential_columns[cluster_name]['groups']))
+                var_cols = sorted(list(all_potential_columns[cluster_name]['vars']))
+
+                for col in summary_cols_in_order + group_cols + var_cols:
                     if making_header_lines:
                         csv_header.append(cluster_name + '.' + col)
                         if col == 'ref_seq':
-                            phandago_suffixes[col] = ':o' + str(ref_seq_counter)
+                            phandango_suffixes[col] = ':o' + str(ref_seq_counter)
                             ref_seq_counter += 1
-                        phandango_header.append(cluster_name + '.' + col + phandago_suffixes[col])
-
-                    line.append(rows[filename][cluster_name][col])
-
-                for col in sorted(rows[filename][cluster_name]):
-                    if col in all_cluster_cols_in_order_set:
-                        continue
-
-                    if making_header_lines:
-                        csv_header.append(cluster_name + '.' + col)
-                        suffix = ':c2' if col.endswith('.%') else ':o1'
-                        phandango_header.append(cluster_name + '.' + col + suffix)
-
-                    line.append(rows[filename][cluster_name][col])
+                            phandango_header.append(cluster_name + '.' + col + phandango_suffixes[col])
+                        elif col in phandango_suffixes:
+                            phandango_header.append(cluster_name + '.' + col + phandango_suffixes[col])
+                        elif col.endswith('.%'):
+                            phandango_header.append(cluster_name + '.' + col + ':c2')
+                        else:
+                            phandango_header.append(cluster_name + '.' + col + ':o1')
+
+                    for col_type in ['summary', 'groups', 'vars']:
+                        if cluster_name in all_data[filename] and col in all_data[filename][cluster_name][col_type]:
+                            line.append(all_data[filename][cluster_name][col_type][col])
+                            break
+                    else:
+                        if col in {'assembled', 'match'}:
+                            line.append('no')
+                        elif col in summary_cols_set:
+                            line.append('NA')
+                        elif cluster_name in all_data[filename] and all_data[filename][cluster_name]['summary'].get('assembled', 'no')  != 'no':
+                            if col.endswith('.%'):
+                                line.append('NA')
+                            else:
+                                line.append('no')
+                        else:
+                            line.append('NA')
 
             making_header_lines = False
             matrix.append(line)
 
+        assert len(phandango_header) == len(csv_header)
+        for line in matrix:
+            assert len(line) == len(csv_header)
         return phandango_header, csv_header, matrix
 
 
@@ -292,11 +247,13 @@ class Summary:
         matrix = copy.deepcopy(matrix)
         cols_to_add_colour_col = [i for i in range(len(header)) if header[i].endswith(':o1')]
         field_to_col = {
-            'yes': '#1f78b4',
-            'yes_nonunique': '#a6cee3',
-            'no': '#33a02c',
-            'NA': '#b2df8a',
-            'het': '#fb9a99',
+            'yes': '#33a02c',
+            'yes_nonunique': '#b2df8a',
+            'no': '#fb9a99',
+            'NA': '#ffffff',
+            'het': '#fdbf6f',
+            'fragmented': '#1f78b4',
+            'interrupted': '#a6cee3',
         }
 
         cols_to_add_colour_col.reverse()
@@ -313,11 +270,15 @@ class Summary:
 
 
     @classmethod
-    def _matrix_to_csv(cls, matrix, header, outfile):
+    def _matrix_to_csv(cls, matrix, header, outfile, remove_nas=False):
         f = pyfastaq.utils.open_file_write(outfile)
         print(*header, sep=',', file=f)
         for line in matrix:
-            print(*line, sep=',', file=f)
+            if remove_nas:
+                new_line = ['' if x=='NA' else x for x in line]
+                print(*new_line, sep=',', file=f)
+            else:
+                print(*line, sep=',', file=f)
         pyfastaq.utils.close(f)
 
 
@@ -372,15 +333,14 @@ class Summary:
         if self.verbose:
             print('Loading input files...', flush=True)
         self._check_files_exist()
-        self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose)
+        self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose, only_clusters=self.only_clusters)
         if self.verbose:
             print('Generating output rows', flush=True)
-        self.rows = self._gather_output_rows()
-        phandango_header, csv_header, matrix = Summary._to_matrix(self.filenames, self.rows, self.cluster_columns)
+        self._gather_unfiltered_output_data()
+        phandango_header, csv_header, matrix = Summary._to_matrix(self.filenames, self.all_data, self.all_potential_columns, self.cluster_columns)
 
         # sanity check same number of columns in headers and matrix
         lengths = {len(x) for x in matrix}
-        print(lengths, len(phandango_header), len(csv_header))
         assert len(lengths) == 1
         assert len(matrix[0]) == len(phandango_header) == len(csv_header)
 
@@ -416,20 +376,25 @@ class Summary:
                 print('Making Phandango csv file', csv_file, flush=True)
             csv_file = self.outprefix + '.phandango.csv'
             phandango_header, phandango_matrix = Summary._add_phandango_colour_columns(phandango_header, matrix)
-            Summary._matrix_to_csv(phandango_matrix, phandango_header, csv_file)
-            dist_matrix_file = self.outprefix + '.phandango.distance_matrix'
-            tree_file = self.outprefix + '.phandango.tre'
-
-            if self.verbose:
-                print('Making Phandango distance matrix', dist_matrix_file, flush=True)
-            Summary._write_distance_matrix(matrix, dist_matrix_file)
-
-            if self.verbose:
-                print('Making Phandango tree file', tree_file, flush=True)
-            Summary._newick_from_dist_matrix(dist_matrix_file, tree_file)
-            os.unlink(dist_matrix_file)
+            Summary._matrix_to_csv(phandango_matrix, phandango_header, csv_file, remove_nas=True)
+
+            if self.make_phandango_tree:
+                dist_matrix_file = self.outprefix + '.phandango.distance_matrix'
+                tree_file = self.outprefix + '.phandango.tre'
+
+                if self.verbose:
+                    print('Making Phandango distance matrix', dist_matrix_file, flush=True)
+                Summary._write_distance_matrix(matrix, dist_matrix_file)
+
+                if self.verbose:
+                    print('Making Phandango tree file', tree_file, flush=True)
+                Summary._newick_from_dist_matrix(dist_matrix_file, tree_file)
+                os.unlink(dist_matrix_file)
+            elif self.verbose:
+                print('Skipping making tree because you asked me not to make it', flush=True)
         else:
             print('Made csv file. Not making Phandango files because only one sample remains after filtering', file=sys.stderr)
 
         if self.verbose:
             print('Finished', flush=True)
+
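
One behaviour worth calling out in the rewritten summary.py above is the variant-group roll-up in _gather_unfiltered_output_data: a group that collects a het call alongside any other call is reported as 'yes_multi_het' and its percentage column becomes 'NA'. A small sketch of that rule, assuming the per-group counts have already been tallied (the helper and counts below are illustrative, not the commit's code):

    def group_status(counts):
        # counts for one variant group that had at least one hit, e.g. {'yes': 1, 'het': 1}
        if counts['het'] > 0 and counts['het'] + counts['yes'] > 1:
            return 'yes_multi_het'   # and the group's '.%' column becomes 'NA'
        return 'het' if counts['het'] > 0 else 'yes'

    print(group_status({'yes': 1, 'het': 1}))  # yes_multi_het
    print(group_status({'yes': 0, 'het': 1}))  # het
    print(group_status({'yes': 2, 'het': 0}))  # yes
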
diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py
index 7a53f55..efc4cf8 100644
--- a/ariba/summary_cluster.py
+++ b/ariba/summary_cluster.py
@@ -1,4 +1,4 @@
-from ariba import flag, report
+from ariba import flag, report, summary_cluster_variant
 
 class Error (Exception): pass
 
@@ -56,7 +56,7 @@ class SummaryCluster:
             d['var_group'] = '.'
         else:
             try:
-                d['var_group'] = d['var_description'].split(':')[3]
+                d['var_group'] = d['var_description'].split(':')[4]
             except:
                 raise Error('Error getting variant group from the following line:\n' + line)
 
@@ -118,6 +118,8 @@ class SummaryCluster:
                 return 'yes'
             else:
                 return 'yes_nonunique'
+        elif self.flag.has('assembled_into_one_contig'):
+            return 'interrupted'
         else:
             return 'fragmented'
 
@@ -218,6 +220,7 @@ class SummaryCluster:
             return None
 
 
+
     @staticmethod
     def _get_nonsynonymous_var(data_dict):
         '''if data_dict has a non synonymous variant, return string:
@@ -295,5 +298,26 @@ class SummaryCluster:
         for d in self.data:
             snp_tuple = self._get_known_noncoding_het_snp(d)
             if snp_tuple is not None:
-                snps[snp_tuple[0]] = snp_tuple[1]
+                snp_id = d['var_description'].split(':')[4]
+                if snp_id not in snps:
+                    snps[snp_id] = {}
+                snps[snp_id][snp_tuple[0]] = snp_tuple[1]
         return snps
+
+
+    @classmethod
+    def _get_all_nonsynon_variants_set(cls, data_dicts):
+        variants = set()
+
+        for data_dict in data_dicts:
+            cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)
+            if cluster_var.has_nonsynon:
+                variants.add(cluster_var)
+
+        return variants
+
+
+    def gather_data(self):
+        self.summary = self.column_summary_data()
+        self.variants = self._get_all_nonsynon_variants_set(self.data)
+
diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py
new file mode 100644
index 0000000..51e00d9
--- /dev/null
+++ b/ariba/summary_cluster_variant.py
@@ -0,0 +1,83 @@
+from ariba import flag, report
+
+class Error (Exception): pass
+
+class SummaryClusterVariant:
+    def __init__(self, data_dict):
+        self._get_nonsynon_variant_data(data_dict)
+
+
+    def __eq__(self, other):
+       return type(other) is type(self) and self.__dict__ == other.__dict__
+
+
+    def __hash__(self):
+        return hash(tuple([self.__dict__[x] for x in sorted(self.__dict__.keys())]))
+
+
+    def __str__(self):
+        if self.has_nonsynon:
+            return ', '.join((str(self.known), self.var_group, str(self.coding), self.var_string, str(self.het_percent)))
+        else:
+            return 'None'
+
+
+    @classmethod
+    def _has_nonsynonymous(cls, data_dict):
+        return data_dict['ref_ctg_effect'] != 'SYN' and \
+          (
+              data_dict['has_known_var'] == '1' or \
+              (data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.'))
+          )
+
+
+    @classmethod
+    def _get_het_percent(cls, data_dict):
+        if data_dict['gene'] == '1' or data_dict['ref_ctg_effect'] != 'SNP' or data_dict['smtls_alt_nt'] == '.' or ';' in data_dict['smtls_alt_nt']:
+            return None
+        else:
+            nucleotides = [data_dict['ctg_nt']] + data_dict['smtls_alt_nt'].split(',')
+            depths = data_dict['smtls_alt_depth'].split(',')
+
+            if len(nucleotides) != len(depths):
+                raise Error('Mismatch in number of inferred nucleotides from ctg_nt, smtls_alt_nt, smtls_alt_depth columns. Cannot continue\n' + str(data_dict))
+
+            try:
+                var_nucleotide = data_dict['known_var_change'][-1] if data_dict['known_var_change'] != '.' else data_dict['ref_ctg_change'][-1]
+                if var_nucleotide == '.':
+                    return None
+                depths = [int(x) for x in depths]
+                nuc_to_depth = dict(zip(nucleotides, depths))
+                total_depth = sum(depths)
+                var_depth = nuc_to_depth.get(var_nucleotide, 0)
+                return round(100 * var_depth / total_depth, 1)
+            except:
+                return None
+
+
+    def _get_nonsynon_variant_data(self, data_dict):
+        if not SummaryClusterVariant._has_nonsynonymous(data_dict):
+            self.has_nonsynon = False
+            return
+
+        self.has_nonsynon = True
+
+        if data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']:
+            raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue')
+        elif '.' not in [data_dict['known_var_change'], data_dict['ref_ctg_change']] and \
+          data_dict['known_var_change'] != data_dict['ref_ctg_change']:
+            raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. Cannot continue')
+
+        self.known = data_dict['known_var'] == '1'
+        self.var_group = data_dict['var_group']
+        self.coding = data_dict['gene'] == '1'
+
+        if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.':
+            self.var_string = data_dict['known_var_change']
+        elif data_dict['ref_ctg_change'] != '.':
+            self.var_string = data_dict['ref_ctg_change']
+        else:
+            self.var_string = data_dict['ref_ctg_effect']
+
+        self.het_percent = SummaryClusterVariant._get_het_percent(data_dict)
+
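
The het percentage in the new SummaryClusterVariant class is straightforward depth arithmetic, which a worked example may make easier to follow: the contig nucleotide plus the samtools alternative nucleotides are paired with their depths, and the reported value is the variant nucleotide's depth over the total. The column values here are invented, not taken from a real report:

    ctg_nt = 'A'                # report column ctg_nt
    smtls_alt_nt = 'T'          # report column smtls_alt_nt
    smtls_alt_depth = '30,10'   # report column smtls_alt_depth
    known_var_change = 'A14T'   # variant nucleotide is the last character

    nucleotides = [ctg_nt] + smtls_alt_nt.split(',')
    depths = [int(x) for x in smtls_alt_depth.split(',')]
    var_depth = dict(zip(nucleotides, depths)).get(known_var_change[-1], 0)
    print(round(100 * var_depth / sum(depths), 1))   # 25.0
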
diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py
index c5349f4..bc1ea25 100644
--- a/ariba/summary_sample.py
+++ b/ariba/summary_sample.py
@@ -4,9 +4,10 @@ from ariba import report, summary_cluster
 class Error (Exception): pass
 
 class SummarySample:
-    def __init__(self, report_tsv, min_pc_id=90):
+    def __init__(self, report_tsv, min_pc_id=90, only_clusters=None):
         self.report_tsv = report_tsv
         self.min_pc_id = min_pc_id
+        self.only_clusters = only_clusters
         self.clusters = {}
 
 
@@ -15,7 +16,7 @@ class SummarySample:
 
 
     @staticmethod
-    def _load_file(filename, min_pc_id):
+    def _load_file(filename, min_pc_id, only_clusters=None):
         f = pyfastaq.utils.open_file_read(filename)
         clusters = {}
 
@@ -28,11 +29,25 @@ class SummarySample:
 
             data_dict = summary_cluster.SummaryCluster.line2dict(line)
             cluster = data_dict['cluster']
+            if only_clusters is not None and cluster not in only_clusters:
+                continue
+
             if cluster not in clusters:
                 clusters[cluster] = summary_cluster.SummaryCluster(min_pc_id=min_pc_id)
             clusters[cluster].add_data_dict(data_dict)
 
         pyfastaq.utils.close(f)
+
+        to_delete = set()
+
+        for cluster_name, cluster in clusters.items():
+            cluster.gather_data()
+            if cluster.name is None:
+                to_delete.add(cluster_name)
+
+        for name in to_delete:
+            del clusters[name]
+
         return clusters
 
 
@@ -58,7 +73,7 @@ class SummarySample:
 
 
     def run(self):
-        self.clusters = self._load_file(self.report_tsv, self.min_pc_id)
+        self.clusters = self._load_file(self.report_tsv, self.min_pc_id, only_clusters=self.only_clusters)
         self.column_summary_data = self._column_summary_data()
         self.variant_column_names_tuples, self.het_snps = self._variant_column_names_tuples_and_het_snps()
         self.var_groups = self._var_groups()
diff --git a/ariba/tasks/getref.py b/ariba/tasks/getref.py
index d83e028..a9292c1 100644
--- a/ariba/tasks/getref.py
+++ b/ariba/tasks/getref.py
@@ -3,6 +3,10 @@ from ariba import ref_genes_getter
 
 
 def run(options):
-    getter = ref_genes_getter.RefGenesGetter(options.db, genetic_code=options.genetic_code)
+    getter = ref_genes_getter.RefGenesGetter(
+        options.db,
+        genetic_code=options.genetic_code,
+        version=options.version
+    )
     getter.run(options.outprefix)
 
diff --git a/ariba/tasks/prepareref.py b/ariba/tasks/prepareref.py
index 7a7591a..ef52684 100644
--- a/ariba/tasks/prepareref.py
+++ b/ariba/tasks/prepareref.py
@@ -12,8 +12,9 @@ def run(options):
 
     preparer = ref_preparer.RefPreparer(
         options.fasta_files,
-        options.tsv_files,
         extern_progs,
+        metadata_tsv_files=options.tsv_files,
+        all_coding=options.all_coding,
         version_report_lines=version_report_lines,
         min_gene_length=options.min_gene_length,
         max_gene_length=options.max_gene_length,
diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
index 782c905..c91e982 100644
--- a/ariba/tasks/summary.py
+++ b/ariba/tasks/summary.py
@@ -9,66 +9,33 @@ def use_preset(options):
     preset_to_vals = {
         'minimal': {
             'cluster_cols': 'match',
-            'variant_cols': '',
             'col_filter': 'y',
             'row_filter': 'y',
-            'var_groups': 'n',
-            'known_vars': 'n',
-            'novel_vars': 'n'
         },
         'cluster_small': {
             'cluster_cols': 'assembled,match,ref_seq,known_var',
-            'variant_cols': '',
             'col_filter': 'y',
             'row_filter': 'y',
-            'var_groups': 'n',
-            'known_vars': 'n',
-            'novel_vars': 'n'
         },
         'cluster_all': {
             'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
-            'variant_cols': '',
             'col_filter': 'y',
             'row_filter': 'y',
-            'var_groups': 'n',
-            'known_vars': 'n',
-            'novel_vars': 'n'
         },
         'cluster_var_groups': {
             'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
-            'variant_cols': 'groups',
             'col_filter': 'y',
             'row_filter': 'y',
-            'var_groups': 'y',
-            'known_vars': 'n',
-            'novel_vars': 'n'
-        },
-        'cluster_known_vars': {
-            'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
-            'variant_cols': 'groups,grouped,ungrouped',
-            'col_filter': 'y',
-            'row_filter': 'y',
-            'var_groups': 'y',
-            'known_vars': 'y',
-            'novel_vars': 'n'
         },
         'all': {
             'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
-            'variant_cols': 'groups,grouped,ungrouped,novel',
             'col_filter': 'y',
             'row_filter': 'y',
-            'var_groups': 'y',
-            'known_vars': 'y',
-            'novel_vars': 'y'
         },
         'all_no_filter': {
             'cluster_cols': 'assembled,match,ref_seq,pct_id,known_var,novel_var',
-            'variant_cols': 'groups,grouped,ungrouped,novel',
             'col_filter': 'n',
             'row_filter': 'n',
-            'var_groups': 'y',
-            'known_vars': 'y',
-            'novel_vars': 'y'
         },
     }
 
@@ -77,6 +44,13 @@ def use_preset(options):
     for key, val in preset_to_vals[options.preset].items():
         exec('options.' + key + ' = "' + val + '"')
 
+    if options.preset in {'cluster_var_groups', 'all', 'all_no_filter'}:
+        options.v_groups = True
+
+    if options.preset in {'all', 'all_no_filter'}:
+        options.known_variants = True
+        options.novel_variants = True
+
     return options
 
 
@@ -93,9 +67,12 @@ def run(options):
         filter_rows=options.col_filter == 'y',
         filter_columns=options.row_filter == 'y',
         min_id=options.min_id,
-        show_known_het=options.het,
         cluster_cols=options.cluster_cols,
-        variant_cols=options.var_cols,
+        make_phandango_tree=(not options.no_tree),
+        only_clusters=None if options.only_cluster is None else {options.only_cluster},
+        show_var_groups=options.v_groups,
+        show_known_vars=options.known_variants,
+        show_novel_vars=options.novel_variants,
         verbose=options.verbose
     )
     s.run()
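
To summarise the preset rework in tasks/summary.py above: the variant_cols strings are gone, and presets now toggle the boolean options that feed Summary's show_var_groups/show_known_vars/show_novel_vars. A hedged sketch of that mapping (the helper below is illustrative, not the module's code):

    def preset_variant_flags(preset):
        v_groups = preset in {'cluster_var_groups', 'all', 'all_no_filter'}
        known = novel = preset in {'all', 'all_no_filter'}
        return {'show_var_groups': v_groups,
                'show_known_vars': known,
                'show_novel_vars': novel}

    print(preset_variant_flags('cluster_var_groups'))
    # {'show_var_groups': True, 'show_known_vars': False, 'show_novel_vars': False}
    print(preset_variant_flags('all'))
    # {'show_var_groups': True, 'show_known_vars': True, 'show_novel_vars': True}
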
diff --git a/ariba/tests/assembly_variants_test.py b/ariba/tests/assembly_variants_test.py
index 83d805f..898106b 100644
--- a/ariba/tests/assembly_variants_test.py
+++ b/ariba/tests/assembly_variants_test.py
@@ -102,10 +102,10 @@ class TestAssemblyVariants(unittest.TestCase):
         self.assertEqual(expected, mummer_variants)
 
 
-    def test_get_one_variant_for_one_contig_non_coding(self):
+    def test_one_var_one_ctg_noncdg(self):
         '''test _get_one_variant_for_one_contig_non_coding'''
-        fasta_in = os.path.join(data_dir, 'assembly_variants_test_get_variants_non_coding.fa')
-        tsv_in = os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_noncdg.fa')
+        tsv_in = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_noncdg.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
         ref_sequence_name = 'non_coding'
         refdata_var_dict = refdata.metadata[ref_sequence_name]
@@ -144,10 +144,10 @@ class TestAssemblyVariants(unittest.TestCase):
             self.assertEqual(expected_used_variants[i], got_used_variants)
 
 
-    def test_get_one_variant_for_one_contig_coding(self):
+    def test_one_var_one_ctg_cdg(self):
         '''test _get_one_variant_for_one_contig_coding'''
-        fasta_in = os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa')
-        tsv_in = os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_cdg.fa')
+        tsv_in = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_cdg.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
         ref_sequence_name = 'presence_absence'
         ref_sequence = refdata.sequence(ref_sequence_name)
diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py
index 7b7de6a..352b74e 100644
--- a/ariba/tests/cluster_test.py
+++ b/ariba/tests/cluster_test.py
@@ -216,9 +216,9 @@ class TestCluster(unittest.TestCase):
     def test_full_run_ok_variants_only_variant_not_present_always_report(self):
         '''test complete run of cluster on a variants only gene when variant not present but always report variant'''
         fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv')
+        tsv_in = os.path.join(data_dir, 'cluster_full_run_varonly.not_present.always_report.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
-        tmpdir = 'tmp.cluster_test_full_run_ok_variants_only.not_present.always_report'
+        tmpdir = 'tmp.cluster_full_run_varonly.not_present.always_report'
         shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir)
 
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300)
@@ -265,13 +265,13 @@ class TestCluster(unittest.TestCase):
         shutil.rmtree(tmpdir)
 
 
-    def test_full_run_ok_samtools_snp_pres_abs_gene(self):
+    def test_full_run_smtls_snp_presabs_gene(self):
         '''test complete run where samtools calls a snp in a presence/absence gene'''
-        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_gene.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_gene.metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
         tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_pres_abs_gene'
-        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_gene'), tmpdir)
+        shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene'), tmpdir)
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
         c.run()
         expected = [
@@ -281,13 +281,15 @@ class TestCluster(unittest.TestCase):
         shutil.rmtree(tmpdir)
 
 
-    def test_full_run_ok_samtools_snp_var_only_gene(self):
+    def test_full_run_smtls_snp_varonly_gene_2(self):
         '''test complete run where samtools calls a snp in a variant only gene'''
-        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_gene.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_gene.metadata.tsv')
+        # _2 because I think test_full_run_smtls_snp_varonly_gene tests the same functionality.
+        # ... but let's leave both tests in anyway
+        fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
-        tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_var_only_gene'
-        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_gene'), tmpdir)
+        tmpdir = 'tmp.cluster_full_run_smtls_snp_varonly_gene_2'
+        shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2'), tmpdir)
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
         c.run()
         expected = [
@@ -297,13 +299,13 @@ class TestCluster(unittest.TestCase):
         shutil.rmtree(tmpdir)
 
 
-    def test_full_run_ok_samtools_snp_known_position_pres_abs_gene(self):
+    def test_full_run_known_smtls_snp_presabs_gene(self):
         '''test complete run where samtools calls a snp at a known snp location in a presence/absence gene'''
-        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
         tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene'
-        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene'), tmpdir)
+        shutil.copytree(os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene'), tmpdir)
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
         c.run()
 
@@ -316,13 +318,13 @@ class TestCluster(unittest.TestCase):
         shutil.rmtree(tmpdir)
 
 
-    def test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var(self):
+    def test_full_run_smtls_snp_varonly_gene_no_snp(self):
         '''test complete run where samtools calls a snp at a known snp location in a variant only gene, gene does not have variant'''
-        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_no_snp.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_no_snp.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
-        tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var'
-        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var'), tmpdir)
+        tmpdir = 'tmp.cluster_test_full_run_smtls_snp_varonly_gene_no_snp'
+        shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_no_snp'), tmpdir)
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
         c.run()
 
@@ -335,13 +337,13 @@ class TestCluster(unittest.TestCase):
         shutil.rmtree(tmpdir)
 
 
-    def test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var(self):
+    def test_full_run_smtls_snp_varonly_gene(self):
         '''test complete run where samtools calls a snp at a known snp location in a variant only gene, gene does have variant'''
-        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
         tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var'
-        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var'), tmpdir)
+        shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene'), tmpdir)
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
         c.run()
 
@@ -354,13 +356,13 @@ class TestCluster(unittest.TestCase):
         shutil.rmtree(tmpdir)
 
 
-    def test_full_run_ok_samtools_snp_pres_abs_noncoding(self):
+    def test_full_run_smtls_snp_presabs_nonc(self):
         '''test complete run where samtools calls a snp in a presence/absence noncoding sequence'''
-        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_nonc.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_nonc.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
-        tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding'
-        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding'), tmpdir)
+        tmpdir = 'tmp.cluster_test_full_run_smtls_snp_presabs_nonc'
+        shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_nonc'), tmpdir)
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
         c.run()
         expected = [
@@ -370,13 +372,13 @@ class TestCluster(unittest.TestCase):
         shutil.rmtree(tmpdir)
 
 
-    def test_full_run_ok_samtools_snp_var_only_noncoding(self):
+    def test_full_run_smtls_snp_varonly_nonc(self):
         '''test complete run where samtools calls a snp in a variant only noncoding sequence'''
-        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_noncoding.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_noncoding.metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
-        tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_var_only_noncoding'
-        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_var_only_noncoding'), tmpdir)
+        tmpdir = 'tmp.cluster_full_run_smtls_snp_varonly_nonc'
+        shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc'), tmpdir)
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
         c.run()
         expected = [
@@ -386,13 +388,13 @@ class TestCluster(unittest.TestCase):
         shutil.rmtree(tmpdir)
 
 
-    def test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding(self):
+    def test_full_run_known_smtls_snp_presabs_nonc(self):
         '''test complete run where samtools calls a snp at a known snp location in a presence/absence noncoding'''
-        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_nonc.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_nonc.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
         tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding'
-        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding'), tmpdir)
+        shutil.copytree(os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_nonc'), tmpdir)
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
         c.run()
 
@@ -405,13 +407,13 @@ class TestCluster(unittest.TestCase):
         shutil.rmtree(tmpdir)
 
 
-    def test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var(self):
+    def test_full_run_smtls_snp_varonly_nonc_no_snp(self):
         '''test complete run where samtools calls a snp at a known snp location in a variant only noncoding and the sample does not have the var'''
-        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc_no_snp.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc_no_snp.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
         tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding'
-        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var'), tmpdir)
+        shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc_no_snp'), tmpdir)
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
         c.run()
 
@@ -424,13 +426,13 @@ class TestCluster(unittest.TestCase):
         shutil.rmtree(tmpdir)
 
 
-    def test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var(self):
+    def test_full_run_cluster_test_full_run_smtls_snp_varonly_nonc(self):
         '''test complete run where samtools calls a snp at a known snp location in a variant only noncoding and the sample has the var'''
-        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.fa')
-        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.metadata.tsv')
+        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
         tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding'
-        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var'), tmpdir)
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc'), tmpdir)
         c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
         c.run()
 
@@ -441,3 +443,20 @@ class TestCluster(unittest.TestCase):
         ]
         self.assertEqual(expected, c.report_lines)
         shutil.rmtree(tmpdir)
+
+
+    def test_full_run_partial_assembly(self):
+        '''Test complete run where only part of the ref gene is present in the reads'''
+        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.fa')
+        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.tsv')
+        refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
+        tmpdir = 'tmp.cluster_test_full_run_partial_assembly'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly'), tmpdir)
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=278, total_reads_bases=15020)
+        c.run()
+
+        expected = [
+            'presence_absence1\t1\t0\t19\t278\tcluster_name\t96\t77\t100.0\tcluster_name.scaffold.1\t949\t20.5\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of presence_absence1'
+        ]
+        self.assertEqual(expected, c.report_lines)
+        shutil.rmtree(tmpdir)
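
The renamed full-run cluster tests above all follow the same pattern, condensed in this sketch; the helper name run_cluster_case and its parameters are placeholders, and only the ReferenceData and Cluster calls mirror the diff. Each test then compares the returned report lines against its expected list.

    import os
    import shutil

    from ariba import cluster, reference_data

    def run_cluster_case(data_dir, prefix, total_reads, total_reads_bases):
        # Build reference data from the <prefix>.fa / <prefix>.tsv pair.
        refdata = reference_data.ReferenceData(
            [os.path.join(data_dir, prefix + '.fa')],
            [os.path.join(data_dir, prefix + '.tsv')],
        )
        # Copy the directory holding reads_1.fq, reads_2.fq and references.fa
        # into a scratch dir, run the cluster, then clean up.
        tmpdir = 'tmp.' + prefix
        shutil.copytree(os.path.join(data_dir, prefix), tmpdir)
        c = cluster.Cluster(tmpdir, 'cluster_name', refdata,
                            spades_other_options='--only-assembler',
                            total_reads=total_reads,
                            total_reads_bases=total_reads_bases)
        c.run()
        report = c.report_lines
        shutil.rmtree(tmpdir)
        return report
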
diff --git a/ariba/tests/clusters_test.py b/ariba/tests/clusters_test.py
index 38d1ae0..a107003 100644
--- a/ariba/tests/clusters_test.py
+++ b/ariba/tests/clusters_test.py
@@ -49,9 +49,9 @@ class TestClusters(unittest.TestCase):
         self.assertEqual(expected, got)
 
 
-    def test_load_reference_data_from_dir(self):
+    def test_load_ref_data_from_dir(self):
         '''test _load_reference_data_from_dir'''
-        indir = os.path.join(data_dir, 'clusters_test_load_reference_data_from_dir')
+        indir = os.path.join(data_dir, 'clusters_load_ref_data_from_dir')
         got_refdata, got_clusters = clusters.Clusters._load_reference_data_from_dir(indir)
         expected_seq_dict = {
             'variants_only1': pyfastaq.sequences.Fasta('variants_only1', 'atggcgtgcgatgaataa'),
@@ -92,16 +92,16 @@ class TestClusters(unittest.TestCase):
 
     def test_minimap_reads_to_all_ref_seqs(self):
         '''test _minimap_reads_to_all_ref_seqs'''
-        clusters_tsv = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.clusters.tsv')
-        ref_fasta = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.ref.fa')
-        reads_1 = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.reads_1.fq')
-        reads_2 = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.reads_2.fq')
+        clusters_tsv = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.clstrs.tsv')
+        ref_fasta = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.ref.fa')
+        reads_1 = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.reads_1.fq')
+        reads_2 = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.reads_2.fq')
         tmp_outprefix = 'tmp.clusters_test_minimap_reads_to_all_ref_seqs'
         clusters.Clusters._minimap_reads_to_all_ref_seqs(clusters_tsv, ref_fasta, reads_1, reads_2, tmp_outprefix)
-        expected_cluster2rep = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.out.cluster2representative')
-        expected_cluster_counts = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.out.clusterCounts')
-        expected_proper_pairs = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.out.properPairs')
-        expected_insert_hist = os.path.join(data_dir, 'clusters_test_minimap_reads_to_all_ref_seqs.out.insertHistogram')
+        expected_cluster2rep = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.out.clstr2rep')
+        expected_cluster_counts = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.out.clstr_count')
+        expected_proper_pairs = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.out.pairs')
+        expected_insert_hist = os.path.join(data_dir, 'clusters_minimap_reads_to_all_refs.out.hist')
 
         # not sure that the reads order is preserved, so just check read store file exists
         self.assertTrue(os.path.exists(os.path.join(tmp_outprefix + '.reads')))
@@ -258,7 +258,7 @@ class TestClusters(unittest.TestCase):
         os.unlink(tmp_file)
 
 
-    def test_write_catted_genes_matching_refs_fasta(self):
+    def test_cat_genes_match_ref(self):
         '''test _write_catted_genes_matching_refs_fasta'''
         seq1 = pyfastaq.sequences.Fasta('seq1', 'ACGT')
         seq3 = pyfastaq.sequences.Fasta('seq3', 'AAAA')
@@ -281,7 +281,7 @@ class TestClusters(unittest.TestCase):
 
         tmp_file = 'tmp.test_write_catted_genes_matching_refs_fasta.fa'
         self.clusters._write_catted_genes_matching_refs_fasta(tmp_file)
-        expected = os.path.join(data_dir, 'clusters_test_write_catted_genes_matching_refs_fasta.expected.out.fa')
+        expected = os.path.join(data_dir, 'clusters_cat_genes_match_ref.fa')
         self.assertTrue(filecmp.cmp(expected, tmp_file, shallow=False))
         os.unlink(tmp_file)
 
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa b/ariba/tests/data/assembly_variants_one_var_one_ctg_cdg.fa
similarity index 100%
rename from ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa
rename to ariba/tests/data/assembly_variants_one_var_one_ctg_cdg.fa
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv b/ariba/tests/data/assembly_variants_one_var_one_ctg_cdg.tsv
similarity index 100%
rename from ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv
rename to ariba/tests/data/assembly_variants_one_var_one_ctg_cdg.tsv
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_non_coding.fa b/ariba/tests/data/assembly_variants_one_var_one_ctg_noncdg.fa
similarity index 100%
rename from ariba/tests/data/assembly_variants_test_get_variants_non_coding.fa
rename to ariba/tests/data/assembly_variants_one_var_one_ctg_noncdg.fa
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv b/ariba/tests/data/assembly_variants_one_var_one_ctg_noncdg.tsv
similarity index 100%
rename from ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv
rename to ariba/tests/data/assembly_variants_one_var_one_ctg_noncdg.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.fa
similarity index 100%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
copy to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.ref_for_reads.fa b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.ref_for_reads.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.ref_for_reads.fa
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.ref_for_reads.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.metadata.tsv b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.metadata.tsv
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/reads_1.fq b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/reads_1.fq
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/reads_2.fq b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/reads_2.fq
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/references.fa b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/references.fa
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_gene/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc.fa
similarity index 100%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa
copy to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.metadata.tsv b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.metadata.tsv
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/reads_1.fq b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/reads_1.fq
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/reads_2.fq b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/reads_2.fq
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/references.fa b/ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/references.fa
rename to ariba/tests/data/cluster_full_run_known_smtls_snp_presabs_nonc/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.ref_for_reads.fa b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.ref_for_reads.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.ref_for_reads.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.ref_for_reads.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_have_var/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_gene/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_gene_does_not_have_var/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_presabs_nonc/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_have_var/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_var_only_noncoding_does_not_have_var/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/references.fa
similarity index 100%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
copy to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_2/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_not_have_var.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/references.fa
similarity index 100%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
copy to ariba/tests/data/cluster_full_run_smtls_snp_varonly_gene_no_snp/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/references.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.fa
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.metadata.tsv b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.metadata.tsv
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/reads_1.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/reads_1.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/reads_2.fq b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene/reads_2.fq
rename to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa b/ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/references.fa
similarity index 100%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa
copy to ariba/tests/data/cluster_full_run_smtls_snp_varonly_nonc_no_snp/references.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv b/ariba/tests/data/cluster_full_run_varonly.not_present.always_report.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv
rename to ariba/tests/data/cluster_full_run_varonly.not_present.always_report.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_gene_start_mismatch.ref_to_make_reads.fa b/ariba/tests/data/cluster_test_full_run_ok_gene_start_mismatch.ref_for_reads.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_gene_start_mismatch.ref_to_make_reads.fa
rename to ariba/tests/data/cluster_test_full_run_ok_gene_start_mismatch.ref_for_reads.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.fa b/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.fa
deleted file mode 100644
index c42d06b..0000000
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_gene.fa
+++ /dev/null
@@ -1,3 +0,0 @@
->ref_gene
-ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
-ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.fa b/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.fa
deleted file mode 100644
index a23e635..0000000
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_pres_abs_noncoding.fa
+++ /dev/null
@@ -1,3 +0,0 @@
->ref_seq
-ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
-ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene.fa b/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene.fa
deleted file mode 100644
index c42d06b..0000000
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_gene.fa
+++ /dev/null
@@ -1,3 +0,0 @@
->ref_gene
-ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
-ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding.fa b/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding.fa
deleted file mode 100644
index a23e635..0000000
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding.fa
+++ /dev/null
@@ -1,3 +0,0 @@
->ref_seq
-ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
-ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_test_full_run_partial_asmbly.fa
similarity index 83%
copy from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
copy to ariba/tests/data/cluster_test_full_run_partial_asmbly.fa
index c42d06b..d7a6970 100644
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
+++ b/ariba/tests/data/cluster_test_full_run_partial_asmbly.fa
@@ -1,3 +1,3 @@
->ref_gene
+>presence_absence1
 ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
 ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_partial_asmbly.tsv b/ariba/tests/data/cluster_test_full_run_partial_asmbly.tsv
new file mode 100644
index 0000000..da35140
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_partial_asmbly.tsv
@@ -0,0 +1 @@
+presence_absence1	1	0	.	.	Generic description of presence_absence1
diff --git a/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_1.fq b/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_1.fq
new file mode 100644
index 0000000..f53750a
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_1.fq
@@ -0,0 +1,432 @@
+@presence_absence1:1:128:236/1
+CTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:2:637:746/1
+CCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:3:420:530/1
+TAAGGGTCTTGATTATTCTCCCTTGGTACAACTCTGCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:4:125:236/1
+GAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:5:297:405/1
+GTCAAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:6:649:758/1
+AGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:7:568:678/1
+TCGACTGACGGCTCACCCGCATCGGGACAGATAACGTCTTCCTCTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:8:494:606/1
+CGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:9:386:495/1
+CAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACAACTCTGCACTGTAACATTAGTTTACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:10:129:238/1
+TACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGGACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:11:735:846/1
+TCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:12:360:470/1
+TTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:13:730:840/1
+TCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:14:669:777/1
+GACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:15:657:766/1
+ACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:16:778:887/1
+ACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAGGATCCACTTGCTGCGACTTATTCCGTCTGTATCTGAAGATCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:17:340:450/1
+GCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:18:463:573/1
+ACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:19:371:481/1
+ACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACAACTCTGCACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:20:520:629/1
+ATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACGTCTTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:21:300:411/1
+AAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:22:316:426/1
+TGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:23:494:603/1
+CGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:24:507:619/1
+ACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:25:351:459/1
+GCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:26:709:818/1
+TATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:27:474:583/1
+CACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:28:181:292/1
+ATGTAGCGTTAGGGACACCGGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:29:200:308/1
+GGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:30:435:545/1
+TTCTCCCTTGGTACAACTCTGCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:31:212:323/1
+GGGACTGCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:32:134:244/1
+ATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGGACTGCACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:33:652:761/1
+CGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:34:200:311/1
+GGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:35:516:624/1
+AGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:36:667:778/1
+TTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:37:767:875/1
+AGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAGGATCCACTTGCTGCGACTTATTCCGTCTGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:38:78:188/1
+CCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:39:183:292/1
+GTAGCGTTAGGGACACCGGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:40:446:556/1
+TACAACTCTGCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:41:747:856/1
+CGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAGGATCCACTTGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:42:672:781/1
+AACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:43:103:214/1
+CAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:44:115:225/1
+CTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:45:615:725/1
+CGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:46:775:886/1
+AAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAGGATCCACTTGCTGCGACTTATTCCGTCTGTATCTGAAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:47:721:832/1
+ACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:48:503:614/1
+GAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:49:279:388/1
+CCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:50:691:802/1
+ATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:51:713:821/1
+CGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:52:626:736/1
+ATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:53:739:849/1
+TTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAGGATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:54:650:757/1
+GACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:55:544:654/1
+AAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACGTCTTCCTCTCGCGTCGTATCGAATGGAATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:56:75:184/1
+GGACCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:57:77:187/1
+ACCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:58:493:602/1
+CCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:59:250:361/1
+TCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:60:438:547/1
+TCCCTTGGTACAACTCTGCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:61:656:767/1
+AACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:62:674:784/1
+CGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:63:609:720/1
+CTCTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:64:459:570/1
+TGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:65:514:624/1
+TTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:66:628:736/1
+GGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:67:735:845/1
+TCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:68:273:385/1
+GGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:69:676:787/1
+ACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:70:28:137/1
+TCTACTTCCAGACCCGTCTCGATATCTCACCTTTGCCCCAACTCGGCGGACCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:71:634:744/1
+CACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:72:346:455/1
+TGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:73:153:263/1
+GAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:74:106:215/1
+TAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:75:219:329/1
+CACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:76:591:700/1
+GGGACAGATAACGTCTTCCTCTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:77:260:371/1
+CTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:78:60:170/1
+TTGCCCCAACTCGGCGGACCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:79:611:720/1
+CTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:80:688:797/1
+GATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:81:604:713/1
+TCTTCCTCTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:82:622:731/1
+TCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:83:72:182/1
+GGCGGACCCGCAGAACAGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:84:327:438/1
+TAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:85:231:342/1
+CAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:86:733:843/1
+CCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACACGTCTCAGTTCTGCTTATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:87:154:265/1
+AGTGAAACATCATGTAGAGCGCTCCACATGTAGCGTTAGGGACACCGGACGACATGAGGGGACTGCACTGCACGGCGCAGTGGACTGGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:88:218:328/1
+GCACTGCACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:89:239:349/1
+TGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:90:315:422/1
+TTGACGGCAGCATAGCGGGGTTAGTGCCAATTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:91:351:460/1
+GCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:92:224:333/1
+CACGGCGCAGTGGACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:93:371:479/1
+ACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACAACTCTGCACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:94:543:653/1
+GAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACGTCTTCCTCTCGCGTCGTATCGAATGGAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:95:345:454/1
+TTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:96:88:196/1
+AGCGTCCATGGCCCGCAGTAACACTAACTACATACTGGAGCTACACATTGGTATCCCCGTTTGTTGAGTGAAACATCATGTAGAGCGCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:97:680:789/1
+CTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:98:439:548/1
+CCCTTGGTACAACTCTGCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:99:523:633/1
+TTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCCGCATCGGGACAGATAACGTCTTCCTCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:100:455:566/1
+GCACTGTAACATTAGTTTACACTAACCCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:101:365:474/1
+AGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACAACTCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:102:496:608/1
+AACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAGCTGGCTGGTCGACTGACGGCTCACCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:103:392:502/1
+GTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTATTCTCCCTTGGTACAACTCTGCACTGTAACATTAGTTTACACTAACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:104:345:455/1
+TTGTACGCTTATCTTTTATAAGGCCGACGGGAGGTAACAGGCAGGCTGTTGCGGTAAGATTAAGGTCTTAGACCATAAGGGTCTTGATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:105:714:825/1
+GCTCCGAACTCTCGAGTCGCCTCAATTTATCTCCGACAAGGCTAATTAGCGTAAGTCAACAAAAACTTGGCTCATATCTTGCACATGACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:106:589:699/1
+TCGGGACAGATAACGTCTTCCTCTCGCGTCGTATCGAATGGAATCCACCCGTTTATCATCAGACGCTAACCCTCGGGTTTGACAACGACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:107:237:346/1
+ACTGGCTCGCCTGTCAAATGTTGCTTAAGGCCGAGAGGCTTGCCTATCCACGAGAAGATTGTCAAAGAGTCTAATCCATTGACGGCAGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:108:642:750/1
+TATCATCAGACGCTAACCCTCGGGTTTGACAACGACATCTAACGCGGATATAGTAACAACTATCCGATATCCGCTCCGAACTCTCGAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_2.fq b/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_2.fq
new file mode 100644
index 0000000..de62a9d
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_partial_asmbly/reads_2.fq
@@ -0,0 +1,432 @@
+@presence_absence1:1:128:236/2
+GCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:2:637:746/2
+GCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:3:420:530/2
+CGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATGCTTTCCCACGCGCTAATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:4:125:236/2
+GCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:5:297:405/2
+GGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAGACCTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:6:649:758/2
+GAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:7:568:678/2
+TTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTTCGGAGCGGATATCGGATAGTTGTTACTATATCCGCGTTAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:8:494:606/2
+ACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:9:386:495/2
+GGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:10:129:238/2
+ATGCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:11:735:846/2
+AAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:12:360:470/2
+CTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:13:730:840/2
+CCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:14:669:777/2
+TGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:15:657:766/2
+TACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:16:778:887/2
+CCTTCCCGAGTGGTTAGATAGTAGTCACAGCGGCTTATGTAAAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:17:340:450/2
+CGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:18:463:573/2
+GAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:19:371:481/2
+CGACCAGCCAGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:20:520:629/2
+GGAGCGGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:21:300:411/2
+GCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:22:316:426/2
+AATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:23:494:603/2
+ATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:24:507:619/2
+TCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:25:351:459/2
+GCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:26:709:818/2
+CGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:27:474:583/2
+TGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:28:181:292/2
+TTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGATTAGACTCTTTGACAATCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:29:200:308/2
+CGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:30:435:545/2
+GGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATGCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:31:212:323/2
+AGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCTATGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:32:134:244/2
+CCCGCTATGCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:33:652:761/2
+ACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:34:200:311/2
+TACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:35:516:624/2
+GGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:36:667:778/2
+TTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:37:767:875/2
+GTTAGATAGTAGTCACAGCGGCTTATGTAAAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:38:78:188/2
+AAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:39:183:292/2
+TTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGATTAGACTCTTTGACAATCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:40:446:556/2
+GATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:41:747:856/2
+GGCTTATGTAAAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:42:672:781/2
+ATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:43:103:214/2
+CTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:44:115:225/2
+TGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:45:615:725/2
+GAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:46:775:886/2
+CTTCCCGAGTGGTTAGATAGTAGTCACAGCGGCTTATGTAAAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:47:721:832/2
+ATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:48:503:614/2
+TAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:49:279:388/2
+AGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:50:691:802/2
+CGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:51:713:821/2
+GCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:52:626:736/2
+CCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:53:739:849/2
+GTAAAAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:54:650:757/2
+AATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:55:544:654/2
+ATAAATTGAGGCGACTCGAGAGTTCGGAGCGGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:56:75:184/2
+CTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAACGCTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:57:77:187/2
+AGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:58:493:602/2
+TATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:59:250:361/2
+TTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:60:438:547/2
+GTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:61:656:767/2
+ATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:62:674:784/2
+GGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:63:609:720/2
+GAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:64:459:570/2
+GGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:65:514:624/2
+GGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:66:628:736/2
+CCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:67:735:845/2
+AAAGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:68:273:385/2
+GTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:69:676:787/2
+ATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:70:28:137/2
+GTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAACGCTACATGTGGAGCGCTCTACATGATGTTTCACTCAACAAACGGGGATACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:71:634:744/2
+AAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:72:346:455/2
+TCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:73:153:263/2
+GCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:74:106:215/2
+TCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:75:219:329/2
+GGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:76:591:700/2
+TATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTTCGGAGCGGATATCGGATAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:77:260:371/2
+CAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:78:60:170/2
+CAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAACGCTACATGTGGAGCGCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:79:611:720/2
+GAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:80:688:797/2
+ACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:81:604:713/2
+GTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTTCGGAGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:82:622:731/2
+TAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:83:72:182/2
+CTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTGTCCCTAACGCTACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:84:327:438/2
+GTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:85:231:342/2
+TCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:86:733:843/2
+AGGCCTCGCACATGGAAGGCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:87:154:265/2
+AAGCGTACAATTGGCACTAACCCCGCTATGCTGCCGTCAATGGATTAGACTCTTTGACAATCTTCTCGTGGATAGGCAAGCCTCTCGGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:88:218:328/2
+GTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACCCCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:89:239:349/2
+AGAATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:90:315:422/2
+TTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCAGAGTTGTACCAAGGGAGAATAATCAAGACCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:91:351:460/2
+TGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:92:224:333/2
+TTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACAATTGGCACTAACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:93:371:479/2
+ACCAGCCAGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:94:543:653/2
+TAAATTGAGGCGACTCGAGAGTTCGGAGCGGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:95:345:454/2
+CCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:96:88:196/2
+GGATAGGCAAGCCTCTCGGCCTTAAGCAACATTTGACAGGCGAGCCAGTCCACTGCGCCGTGCAGTGCAGTCCCCTCATGTCGTCCGGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:97:680:789/2
+AGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:98:439:548/2
+GGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:99:523:633/2
+GTTCGGAGCGGATATCGGATAGTTGTTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:100:455:566/2
+AGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGGAAGACGTTATCTGTCCCGATGCGGGTGAGCCGTCAGTCGACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:101:365:474/2
+CCAGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:102:496:608/2
+TTACTATATCCGCGTTAGATGTCGTTGTCAAACCCGAGGGTTAGCGTCTGATGATAAACGGGTGGATTCCATTCGATACGACGCGAGAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:103:392:502/2
+CGATGCGGGTGAGCCGTCAGTCGACCAGCCAGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:104:345:455/2
+TCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGGTTAGTGTAAACTAATGTTACAGTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:105:714:825/2
+GCCCGCCCGTAGGGTTAGACTCACGTCCACGCTCGCAGATCGGTATCTTGATCTTCAGATACAGACGGAATAAGTCGCAGCAAGTGGATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:106:589:699/2
+ATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTGTCGGAGATAAATTGAGGCGACTCGAGAGTTCGGAGCGGATATCGGATAGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:107:237:346/2
+ATAATCAAGACCCTTATGGTCTAAGACCTTAATCTTACCGCAACAGCCTGCCTGTTACCTCCCGTCGGCCTTATAAAAGATAAGCGTACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:108:642:750/2
+CGCAGCAAGTGGATCCTAATAAGCAGAACTGAGACGTGTCATGTGCAAGATATGAGCCAAGTTTTTGTTGACTTACGCTAATTAGCCTTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa b/ariba/tests/data/cluster_test_full_run_partial_asmbly/references.fa
similarity index 83%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
rename to ariba/tests/data/cluster_test_full_run_partial_asmbly/references.fa
index c42d06b..d7a6970 100644
--- a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_gene/references.fa
+++ b/ariba/tests/data/cluster_test_full_run_partial_asmbly/references.fa
@@ -1,3 +1,3 @@
->ref_gene
+>presence_absence1
 ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
 ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.fa b/ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_not_have_var.fa
rename to ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc.fa
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.metadata.tsv b/ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc.tsv
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding_does_have_var.metadata.tsv
rename to ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc.tsv
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/reads_1.fq b/ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/reads_1.fq
rename to ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/reads_2.fq b/ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_snp_var_only_noncoding/reads_2.fq
rename to ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa b/ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/references.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_full_run_ok_samtools_known_position_snp_pres_abs_noncoding/references.fa
rename to ariba/tests/data/cluster_test_full_run_smtls_snp_varonly_nonc/references.fa
diff --git a/ariba/tests/data/clusters_test_write_catted_genes_matching_refs_fasta.expected.out.fa b/ariba/tests/data/clusters_cat_genes_match_ref.fa
similarity index 100%
rename from ariba/tests/data/clusters_test_write_catted_genes_matching_refs_fasta.expected.out.fa
rename to ariba/tests/data/clusters_cat_genes_match_ref.fa
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/00.info.txt b/ariba/tests/data/clusters_load_ref_data_from_dir/00.info.txt
similarity index 100%
rename from ariba/tests/data/clusters_test_load_reference_data_from_dir/00.info.txt
rename to ariba/tests/data/clusters_load_ref_data_from_dir/00.info.txt
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/01.filter.check_metadata.tsv b/ariba/tests/data/clusters_load_ref_data_from_dir/01.filter.check_metadata.tsv
similarity index 100%
rename from ariba/tests/data/clusters_test_load_reference_data_from_dir/01.filter.check_metadata.tsv
rename to ariba/tests/data/clusters_load_ref_data_from_dir/01.filter.check_metadata.tsv
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/02.cdhit.all.fa b/ariba/tests/data/clusters_load_ref_data_from_dir/02.cdhit.all.fa
similarity index 100%
rename from ariba/tests/data/clusters_test_load_reference_data_from_dir/02.cdhit.all.fa
rename to ariba/tests/data/clusters_load_ref_data_from_dir/02.cdhit.all.fa
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/02.cdhit.clusters.pickle b/ariba/tests/data/clusters_load_ref_data_from_dir/02.cdhit.clusters.pickle
similarity index 100%
rename from ariba/tests/data/clusters_test_load_reference_data_from_dir/02.cdhit.clusters.pickle
rename to ariba/tests/data/clusters_load_ref_data_from_dir/02.cdhit.clusters.pickle
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.clusters.tsv b/ariba/tests/data/clusters_minimap_reads_to_all_refs.clstrs.tsv
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.clusters.tsv
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.clstrs.tsv
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.cluster2representative b/ariba/tests/data/clusters_minimap_reads_to_all_refs.out.clstr2rep
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.cluster2representative
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.out.clstr2rep
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.clusterCounts b/ariba/tests/data/clusters_minimap_reads_to_all_refs.out.clstr_count
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.clusterCounts
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.out.clstr_count
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.insertHistogram b/ariba/tests/data/clusters_minimap_reads_to_all_refs.out.hist
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.insertHistogram
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.out.hist
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.properPairs b/ariba/tests/data/clusters_minimap_reads_to_all_refs.out.pairs
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.out.properPairs
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.out.pairs
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.reads_1.fq b/ariba/tests/data/clusters_minimap_reads_to_all_refs.reads_1.fq
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.reads_1.fq
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.reads_1.fq
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.reads_2.fq b/ariba/tests/data/clusters_minimap_reads_to_all_refs.reads_2.fq
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.reads_2.fq
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.reads_2.fq
diff --git a/ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.ref.fa b/ariba/tests/data/clusters_minimap_reads_to_all_refs.ref.fa
similarity index 100%
rename from ariba/tests/data/clusters_test_minimap_reads_to_all_ref_seqs.ref.fa
rename to ariba/tests/data/clusters_minimap_reads_to_all_refs.ref.fa
diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv
new file mode 100644
index 0000000..71dc3e2
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv
@@ -0,0 +1,3 @@
+seq1	1	0	.	.	.
+seq2	1	0	.	.	Original name: seq2 spam eggs
+seq3	1	0	.	.	Original name: seq3 hello dave
diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa
new file mode 100644
index 0000000..8acb693
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa
@@ -0,0 +1,6 @@
+>seq1
+CACTACAT
+>seq2 spam eggs
+AAAA
+>seq3 hello dave
+GGGGG
diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv
new file mode 100644
index 0000000..5ad3200
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv
@@ -0,0 +1,3 @@
+seq1	0	0	.	.	.
+seq2	0	0	.	.	Original name: seq2 spam eggs
+seq3	0	0	.	.	Original name: seq3 hello dave
diff --git a/ariba/tests/data/ref_preparer_test_run.out/00.info.txt b/ariba/tests/data/ref_preparer_test_run.out/00.info.txt
index c05ab65..52774fd 100644
--- a/ariba/tests/data/ref_preparer_test_run.out/00.info.txt
+++ b/ariba/tests/data/ref_preparer_test_run.out/00.info.txt
@@ -1,6 +1,6 @@
-input fasta file:	/home/ubuntu/git/ariba/ariba/tests/data/ref_preparer_test_run.in.1.fa
-input fasta file:	/home/ubuntu/git/ariba/ariba/tests/data/ref_preparer_test_run.in.2.fa
-input fasta file:	/home/ubuntu/git/ariba/ariba/tests/data/ref_preparer_test_run.in.3.fa
-input tsv file:	/home/ubuntu/git/ariba/ariba/tests/data/ref_preparer_test_run.in.1.tsv
-input tsv file:	/home/ubuntu/git/ariba/ariba/tests/data/ref_preparer_test_run.in.2.tsv
+input fasta file:	/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.1.fa
+input fasta file:	/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.2.fa
+input fasta file:	/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.3.fa
+input tsv file:	/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.1.tsv
+input tsv file:	/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.2.tsv
 genetic_code	1
diff --git a/ariba/tests/data/ref_preparer_test_run.out/00.version_info.txt b/ariba/tests/data/ref_preparer_test_run.out/00.version_info.txt
index 0373c3d..fe7f1bb 100644
--- a/ariba/tests/data/ref_preparer_test_run.out/00.version_info.txt
+++ b/ariba/tests/data/ref_preparer_test_run.out/00.version_info.txt
@@ -1,5 +1,5 @@
 ARIBA run with this command:
-python3 -m unittest prepareref ariba.tests.ref_preparer_test
-from this directory: /home/ubuntu/git/ariba
+python3 -m unittest prepareref ariba.tests.ref_preparer_test.TestRefPreparer
+from this directory: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba
 
 
diff --git a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.pickle b/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.pickle
index 6545b8b..a113521 100644
Binary files a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.pickle and b/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.pickle differ
diff --git a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv b/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
index 217fd98..bd9af57 100644
--- a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
+++ b/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
@@ -1,5 +1,5 @@
-cluster_1	gene1	gene2
-cluster_2	gene3
+cluster	gene1	gene2
+cluster_1	gene3
 gene4	gene4.var_only
 noncoding-	noncoding1	noncoding2.var_only	noncoding3.var_only
 noncoding4	noncoding4.var_only
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv
new file mode 100644
index 0000000..d2dc5ff
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv
@@ -0,0 +1,9 @@
+gene1	0	0	.	.	.
+gene2	0	0	.	.	.
+gene3	0	0	.	.	.
+gene4.var_only	0	0	.	.	.
+noncoding1	0	0	.	.	.
+noncoding2.var_only	0	0	.	.	.
+noncoding3.var_only	0	0	.	.	.
+noncoding4.var_only	0	0	.	.	.
+cannot_make_into_a_gene	0	0	.	.	.
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt
new file mode 100644
index 0000000..2a0b555
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt
@@ -0,0 +1,5 @@
+input fasta file:	/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.1.fa
+input fasta file:	/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.2.fa
+input fasta file:	/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.3.fa
+input tsv file:	tmp.ref_preparer_test_run/00.auto_metadata.tsv
+genetic_code	1
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt
new file mode 100644
index 0000000..fe7f1bb
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt
@@ -0,0 +1,5 @@
+ARIBA run with this command:
+python3 -m unittest prepareref ariba.tests.ref_preparer_test.TestRefPreparer
+from this directory: /nfs/users/nfs_m/mh12/sanger-pathogens/ariba
+
+
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_genes.log b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_genes.log
new file mode 100644
index 0000000..e69de29
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.log b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.log
new file mode 100644
index 0000000..e69de29
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv
new file mode 100644
index 0000000..626e9a3
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv
@@ -0,0 +1,9 @@
+cannot_make_into_a_gene	0	0	.	.	.
+gene1	0	0	.	.	.
+gene2	0	0	.	.	.
+gene3	0	0	.	.	.
+gene4.var_only	0	0	.	.	.
+noncoding1	0	0	.	.	.
+noncoding2.var_only	0	0	.	.	.
+noncoding3.var_only	0	0	.	.	.
+noncoding4.var_only	0	0	.	.	.
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa
new file mode 100644
index 0000000..339405a
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa
@@ -0,0 +1,18 @@
+>cannot_make_into_a_gene
+AAAAAAAAAAAAAAAA
+>gene1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
+>gene2
+ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
+>gene3
+ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
+>gene4.var_only
+ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
+>noncoding1
+CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding2.var_only
+CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding3.var_only
+CTACTGATTATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding4.var_only
+CAACCACATGCAGTCATGCAACCAACACTCTCATCTAA
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle
new file mode 100644
index 0000000..c48f0d8
Binary files /dev/null and b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle differ
diff --git a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.tsv
similarity index 53%
copy from ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
copy to ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.tsv
index 217fd98..da82b78 100644
--- a/ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.tsv
@@ -1,5 +1,5 @@
-cluster_1	gene1	gene2
-cluster_2	gene3
-gene4	gene4.var_only
+cluster	gene1	gene2
+cluster_1	cannot_make_into_a_gene
+gene4+	gene3	gene4.var_only
 noncoding-	noncoding1	noncoding2.var_only	noncoding3.var_only
 noncoding4	noncoding4.var_only
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.fa
new file mode 100644
index 0000000..e69de29
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.varonly.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.varonly.fa
new file mode 100644
index 0000000..e69de29
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.fa
new file mode 100644
index 0000000..339405a
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.fa
@@ -0,0 +1,18 @@
+>cannot_make_into_a_gene
+AAAAAAAAAAAAAAAA
+>gene1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
+>gene2
+ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
+>gene3
+ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
+>gene4.var_only
+ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA
+>noncoding1
+CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding2.var_only
+CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding3.var_only
+CTACTGATTATCTACTATCTGCATCGATGCCTGATCTA
+>noncoding4.var_only
+CAACCACATGCAGTCATGCAACCAACACTCTCATCTAA
diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.varonly.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.varonly.fa
new file mode 100644
index 0000000..e69de29
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.expected.clusters.tsv b/ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.expect.clstrs.tsv
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.expected.clusters.tsv
rename to ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.expect.clstrs.tsv
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.clusters.tsv b/ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.clstrs.tsv
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.clusters.tsv
rename to ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.clstrs.tsv
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.fa b/ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.fa
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.fa
rename to ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.fa
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.meta.tsv b/ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.meta.tsv
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.in.meta.tsv
rename to ariba/tests/data/reference_data_cluster_w_cdhit_clstrs_file.in.meta.tsv
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.expected.clusters.tsv b/ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.expect.tsv
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.expected.clusters.tsv
rename to ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.expect.tsv
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.in.fa b/ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.in.fa
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.in.fa
rename to ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.in.fa
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.in.tsv b/ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.in.tsv
similarity index 100%
rename from ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.in.tsv
rename to ariba/tests/data/reference_data_cluster_w_cdhit_nocluster.in.tsv
diff --git a/ariba/tests/data/reference_data_load_fasta_file.fa b/ariba/tests/data/reference_data_load_fasta_file.fa
index 6b27dae..e99410d 100644
--- a/ariba/tests/data/reference_data_load_fasta_file.fa
+++ b/ariba/tests/data/reference_data_load_fasta_file.fa
@@ -1,2 +1,2 @@
->seq1
+>seq1 foo
 ACGT
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.csv.1 b/ariba/tests/data/reference_data_load_input_check_seq_names.bad.csv.1
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.csv.1
rename to ariba/tests/data/reference_data_load_input_check_seq_names.bad.csv.1
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.csv.2 b/ariba/tests/data/reference_data_load_input_check_seq_names.bad.csv.2
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.csv.2
rename to ariba/tests/data/reference_data_load_input_check_seq_names.bad.csv.2
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.fa.1 b/ariba/tests/data/reference_data_load_input_check_seq_names.bad.fa.1
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.fa.1
rename to ariba/tests/data/reference_data_load_input_check_seq_names.bad.fa.1
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.fa.2 b/ariba/tests/data/reference_data_load_input_check_seq_names.bad.fa.2
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.bad.in.fa.2
rename to ariba/tests/data/reference_data_load_input_check_seq_names.bad.fa.2
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.csv.1 b/ariba/tests/data/reference_data_load_input_check_seq_names.good.csv.1
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.csv.1
rename to ariba/tests/data/reference_data_load_input_check_seq_names.good.csv.1
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.csv.2 b/ariba/tests/data/reference_data_load_input_check_seq_names.good.csv.2
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.csv.2
rename to ariba/tests/data/reference_data_load_input_check_seq_names.good.csv.2
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.fa.1 b/ariba/tests/data/reference_data_load_input_check_seq_names.good.fa.1
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.fa.1
rename to ariba/tests/data/reference_data_load_input_check_seq_names.good.fa.1
diff --git a/ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.fa.2 b/ariba/tests/data/reference_data_load_input_check_seq_names.good.fa.2
similarity index 100%
rename from ariba/tests/data/reference_data_load_input_files_and_check_seq_names.good.in.fa.2
rename to ariba/tests/data/reference_data_load_input_check_seq_names.good.fa.2
diff --git a/ariba/tests/data/reference_data_rename_sequences.fa b/ariba/tests/data/reference_data_rename_sequences.fa
index e45e17b..60c820e 100644
--- a/ariba/tests/data/reference_data_rename_sequences.fa
+++ b/ariba/tests/data/reference_data_rename_sequences.fa
@@ -1,7 +1,5 @@
 >pres_abs1 foo bar spam eggs
 ACGT
->pres_abs1 blah
-AAAA
 >pres'abs1
 CCCC
 >pres_abs2
@@ -12,11 +10,9 @@ GGGG
 AAAA
 >var:only1 boo
 CCCC
->var_only1
+>var,only1
 GGGG
 >var_only2
 TTTT
->noncoding1
-AAAA
 >noncoding1 blah
-CCCC
+AAAA
diff --git a/ariba/tests/data/reference_data_rename_sequences_metadata.tsv b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
index 462c736..41c6e8c 100644
--- a/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
+++ b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
@@ -1,11 +1,9 @@
-noncoding1	0	0	.	.	original name "noncoding1"
-noncoding1 blah	0	0	.	.	original name "noncoding1 blah"
-pres_abs1 foo bar spam eggs	0	0	.	.	original name "pres_abs1 foo bar spam eggs"
-pres_abs1 blah	0	0	.	.	original name "pres_abs1 blah"
+noncoding1	0	0	.	.	original name "noncoding1 blah"
+pres_abs1	0	0	.	.	original name "pres_abs1 foo bar spam eggs"
 pres'abs1	0	0	.	.	original name "pres'abs1"
 pres_abs2	0	0	.	.	original name "pres_abs2"
 pres!abs3	0	0	.	.	original name "pres!abs3"
-var_only1 hello	0	0	.	.	original name "var_only1 hello"
-var:only1 boo	0	0	.	.	original name "var:only1 boo"
-var_only1	0	0	.	.	original name "var_only1"
+var_only1	0	0	.	.	original name "var_only1 hello"
+var:only1	0	0	.	.	original name "var:only1 boo"
+var,only1	0	0	.	.	original name "var,only1"
 var_only2	0	0	.	.	original name "var_only2"
diff --git a/ariba/tests/data/reference_data_test_rename_sequences.out b/ariba/tests/data/reference_data_test_rename_sequences.out
index d47d87c..e48b0d5 100644
--- a/ariba/tests/data/reference_data_test_rename_sequences.out
+++ b/ariba/tests/data/reference_data_test_rename_sequences.out
@@ -1,8 +1,6 @@
-noncoding1 blah	noncoding1_1
 pres!abs3	pres_abs3
 pres'abs1	pres_abs1
-pres_abs1 blah	pres_abs1_1
-pres_abs1 foo bar spam eggs	pres_abs1_2
-var:only1 boo	var_only1
-var_only1	var_only1_1
-var_only1 hello	var_only1_2
+pres_abs1	pres_abs1_1
+var,only1	var_only1
+var:only1	var_only1_1
+var_only1	var_only1_2
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.asmbly.fa
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.asmbly.fa
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa.fai b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.asmbly.fa.fai
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa.fai
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.asmbly.fa.fai
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.bam b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.bam
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.bam
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.bam
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.depths.gz
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.depths.gz
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz.tbi b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.depths.gz.tbi
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz.tbi
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.depths.gz.tbi
diff --git a/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf b/ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.vcf
similarity index 100%
rename from ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf
rename to ariba/tests/data/samtools_variants_make_vcf_and_depths_files.expect.vcf
diff --git a/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv b/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv
new file mode 100644
index 0000000..c652f1c
--- /dev/null
+++ b/ariba/tests/data/summary_gather_unfiltered_output_data.in.1.tsv
@@ -0,0 +1,6 @@
+#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding_ref1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding_ref1:0:0:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding_ref2	0	0	19	78	noncoding2	120	120	98.33	noncoding2.scaffold.1	279	10.0	1	SNP	n	A42T	1	A42T	SNP	42	42	A	84	84	T	17	.	17	noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding_ref2	0	0	19	78	noncoding2	120	120	98.33	noncoding2.scaffold.1	279	10.0	1	SNP	n	A52T	1	A52T	SNP	42	42	A	84	84	T	17	G	20,30	noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence_ref1	1	0	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence_ref1:1:0:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence_ref2	1	0	528	232	presence_absence2	1005	554	99.1	presence_absence2.scaffold.1	1032	22.3	0	.	p	.	0	V175L	NONSYN	522	522	G	265	265	C	36	.	36	.	Description foo bar
diff --git a/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv b/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv
new file mode 100644
index 0000000..4a23ebc
--- /dev/null
+++ b/ariba/tests/data/summary_gather_unfiltered_output_data.in.2.tsv
@@ -0,0 +1,6 @@
+#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding_ref1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	50	G	40,10	noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding_ref1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:0:0:A6G:id3:variant in ref and reads so should report	generic description of noncoding1
+noncoding_ref2	0	0	19	78	noncoding2	120	120	98.33	noncoding2.scaffold.1	279	10.0	1	SNP	n	A52T	1	A52T	SNP	42	42	A	84	84	T	17	G	20,30	noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	1	0	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	51.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
+variants_only1	1	1	64	12	variants_only1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv b/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv
index 5e12e4a..159949c 100644
--- a/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv
+++ b/ariba/tests/data/summary_sample_test_column_names_tuples_and_het_snps.tsv
@@ -1,8 +1,8 @@
 #ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	50	G	40,10	noncoding1:n:A14T:.:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	50	G	40,10	noncoding1:0:0:A14T:.:ref has wild type, reads have variant so should report	generic description of noncoding1
 noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	0	SNP	.	.	.	G15T	SNP	15	15	G	85	85	T	17	.	17	.	generic description of noncoding1
-noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:0:0:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
 noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
-presence_absence1	1	0	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	1	0	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
 presence_absence1	1	0	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
-variants_only1	1	1	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:.:Ref and reads have variant so report	Generic description of variants_only1
+variants_only1	1	1	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:1:0:S5T:.:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_column_summary_data.tsv b/ariba/tests/data/summary_sample_test_column_summary_data.tsv
index 22a42b5..9c495ec 100644
--- a/ariba/tests/data/summary_sample_test_column_summary_data.tsv
+++ b/ariba/tests/data/summary_sample_test_column_summary_data.tsv
@@ -1,8 +1,8 @@
 #ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
 noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	0	SNP	.	.	.	G15T	SNP	15	15	G	85	85	T	17	.	17	.	generic description of noncoding1
-noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:0:0:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
 noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
-presence_absence1	1	0	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	1	0	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
 presence_absence1	1	0	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
-variants_only1	1	1	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
+variants_only1	1	1	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:1:0:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_var_groups.tsv b/ariba/tests/data/summary_sample_test_var_groups.tsv
index a125211..3352660 100644
--- a/ariba/tests/data/summary_sample_test_var_groups.tsv
+++ b/ariba/tests/data/summary_sample_test_var_groups.tsv
@@ -1,7 +1,7 @@
 #ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:0:0:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
 noncoding1	0	0	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
-presence_absence1	1	0	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	1	0	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:1:0:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
 presence_absence1	1	0	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
-variants_only1	1	1	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
+variants_only1	1	1	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:1:0:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
deleted file mode 100644
index d1f5f70..0000000
--- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
-presence_absence1	1	0	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
deleted file mode 100644
index 6507d5f..0000000
--- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	50	G	40,10	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id3:variant in ref and reads so should report	generic description of noncoding1
-presence_absence1	1	0	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	51.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
-variants_only1	1	1	64	12	variants_only1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv
deleted file mode 100644
index 9e8e9a2..0000000
--- a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
-presence_absence1	1	0	27	88	cluster.p.1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id2:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv
deleted file mode 100644
index d4cd028..0000000
--- a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	0	0	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
-presence_absence1	1	0	27	88	cluster.p.2	96	96	98.96	presence_absence1.scaffold.1	267	51.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
-variants_only1	1	1	64	12	cluster.v.1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv b/ariba/tests/data/summary_test_get_all_het_snps.1.tsv
deleted file mode 100644
index d1f5f70..0000000
--- a/ariba/tests/data/summary_test_get_all_het_snps.1.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
-presence_absence1	1	0	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv b/ariba/tests/data/summary_test_get_all_het_snps.2.tsv
deleted file mode 100644
index 6507d5f..0000000
--- a/ariba/tests/data/summary_test_get_all_het_snps.2.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	50	G	40,10	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id3:variant in ref and reads so should report	generic description of noncoding1
-presence_absence1	1	0	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	51.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
-variants_only1	1	1	64	12	variants_only1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv
deleted file mode 100644
index 62394c0..0000000
--- a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
-presence_absence1	1	0	27	88	cluster.p.1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id4:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv
deleted file mode 100644
index d4cd028..0000000
--- a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
-noncoding1	0	0	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
-noncoding1	0	0	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
-presence_absence1	1	0	27	88	cluster.p.2	96	96	98.96	presence_absence1.scaffold.1	267	51.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
-variants_only1	1	1	64	12	cluster.v.1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_to_matrix.1.tsv b/ariba/tests/data/summary_to_matrix.1.tsv
new file mode 100644
index 0000000..1957349
--- /dev/null
+++ b/ariba/tests/data/summary_to_matrix.1.tsv
@@ -0,0 +1,5 @@
+#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding_ref1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding_ref1:0:0:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding_ref2	0	0	19	78	noncoding2	120	120	98.33	noncoding2.scaffold.1	279	10.0	1	SNP	n	A42T	1	A42T	SNP	42	42	A	84	84	T	17	.	17	noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding_ref2	0	0	19	78	noncoding2	120	120	98.33	noncoding2.scaffold.1	279	10.0	1	SNP	n	A52T	1	A52T	SNP	42	42	A	84	84	T	17	G	20,30	noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence_ref1	1	0	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence_ref1:1:0:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_to_matrix.2.tsv b/ariba/tests/data/summary_to_matrix.2.tsv
new file mode 100644
index 0000000..4a23ebc
--- /dev/null
+++ b/ariba/tests/data/summary_to_matrix.2.tsv
@@ -0,0 +1,6 @@
+#ref_name	gene	var_only	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding_ref1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	50	G	40,10	noncoding1:0:0:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding_ref1	0	0	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:0:0:A6G:id3:variant in ref and reads so should report	generic description of noncoding1
+noncoding_ref2	0	0	19	78	noncoding2	120	120	98.33	noncoding2.scaffold.1	279	10.0	1	SNP	n	A52T	1	A52T	SNP	42	42	A	84	84	T	17	G	20,30	noncoding_ref2:0:0:A42T:id2:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	1	0	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	51.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
+variants_only1	1	1	64	12	variants_only1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/vfdb_parser_test_run.out.fa b/ariba/tests/data/vfdb_parser_test_run.out.fa
index 720ef52..e00ddf9 100644
--- a/ariba/tests/data/vfdb_parser_test_run.out.fa
+++ b/ariba/tests/data/vfdb_parser_test_run.out.fa
@@ -2,5 +2,5 @@
 AAAA
 >efgH.VF234(gi:2345).genus2_species2
 CCCC
->seq1 blah
+>seq1
 ACGT
diff --git a/ariba/tests/data/vfdb_parser_test_run.out.tsv b/ariba/tests/data/vfdb_parser_test_run.out.tsv
index 253ce52..c853bcf 100644
--- a/ariba/tests/data/vfdb_parser_test_run.out.tsv
+++ b/ariba/tests/data/vfdb_parser_test_run.out.tsv
@@ -1,3 +1,3 @@
-abcD.VF123(gi:1234).genus1_species1	1	0	.	.	foobar description1 [abc]
-efgH.VF234(gi:2345).genus2_species2	1	0	.	.	spam eggs description2 [abc]
-seq1 blah	1	0	.	.	.
+abcD.VF123(gi:1234).genus1_species1	1	0	.	.	Original name: VF123(gi:1234) (abcD) foobar description1 [abc] [genus1 species1]
+efgH.VF234(gi:2345).genus2_species2	1	0	.	.	Original name: VF234(gi:2345) (efgH) spam eggs description2 [abc] [genus2 species2]
+seq1	1	0	.	.	Original name: seq1 blah
diff --git a/ariba/tests/ref_preparer_test.py b/ariba/tests/ref_preparer_test.py
index 604395e..00b979d 100644
--- a/ariba/tests/ref_preparer_test.py
+++ b/ariba/tests/ref_preparer_test.py
@@ -9,6 +9,24 @@ data_dir = os.path.join(modules_dir, 'tests', 'data')
 
 
 class TestRefPreparer(unittest.TestCase):
+    def test_fasta_to_metadata(self):
+        '''test _fasta_to_metadata'''
+        infile = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.fa')
+        tmp_out = 'tmp.test_fasta_to_metadata.tsv'
+        expected_coding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.coding.tsv')
+        expected_noncoding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.noncoding.tsv')
+
+        with open(tmp_out, 'w') as f:
+            ref_preparer.RefPreparer._fasta_to_metadata(infile, f, True)
+        self.assertTrue(filecmp.cmp(expected_coding, tmp_out, shallow=False))
+
+        with open(tmp_out, 'w') as f:
+            ref_preparer.RefPreparer._fasta_to_metadata(infile, f, False)
+        self.assertTrue(filecmp.cmp(expected_noncoding, tmp_out, shallow=False))
+
+        os.unlink(tmp_out)
+
+
     def test_rename_clusters(self):
         '''test _rename_clusters'''
         clusters_in = {
@@ -31,28 +49,48 @@ class TestRefPreparer(unittest.TestCase):
            '16': {'def_2.3'},
            '17': {'def.4'},
            '18': {'def.5'},
+           '19': {'x_1.foo'},
+           '20': {'x_1.bar'},
+           '21': {'x_1.baz'},
+           '22': {'x_1_2.abc'},
+           '23': {'x_1_2.def'},
+           '24': {'y_1.foo'},
+           '25': {'y_1_2.def'},
+           '26': {'y_1.bar'},
+           '27': {'y_1.baz'},
+           '28': {'y_1_2.abc'},
         }
 
         expected = {
-            'cluster_1': {'no_dot_in_name'},
-            'cluster_2': {'another_no_dot_in_name'},
-            'foo_1': {'foo.blah_blah_blah', 'foo.xyz'},
-            'foo_2': {'foo.abc', 'foo.def'},
-            'pre-_1': {'pre1.abc', 'pre2.abc'},
-            'pre-_2': {'pre1.def', 'pre2.pqr', 'pre2.zxy'},
-            'prefix1+_1': {'prefix1.abc', 'prefix1.def', 'something_else.abc'},
-            'prefix1+_2': {'prefix1.fgh', 'prefix1.ijk', 'something_else_again.abc'},
+            'cluster': {'no_dot_in_name'},
+            'cluster_1': {'another_no_dot_in_name'},
+            'foo': {'foo.blah_blah_blah', 'foo.xyz'},
+            'foo_1': {'foo.abc', 'foo.def'},
+            'pre-': {'pre1.abc', 'pre2.abc'},
+            'pre-_1': {'pre1.def', 'pre2.pqr', 'pre2.zxy'},
+            'prefix1+': {'prefix1.abc', 'prefix1.def', 'something_else.abc'},
+            'prefix1+_1': {'prefix1.fgh', 'prefix1.ijk', 'something_else_again.abc'},
             'xyz+': {'xyz.1', 'xyz.2', 'abcdefgh'},
-            'cluster_3': {'a.foo', 'a.bar'},
+            'cluster_2': {'a.foo', 'a.bar'},
             'abc_1': {'abc_1.1'},
-            'abc_2': {'abc.2'},
-            'abc_3': {'abc.3'},
-            'abc_4': {'abc.4'},
+            'abc': {'abc.2'},
+            'abc_2': {'abc.3'},
+            'abc_3': {'abc.4'},
             'def_1': {'def_1.2'},
             'def_2': {'def_2.3'},
-            'def_3': {'def.1'},
-            'def_4': {'def.4'},
-            'def_5': {'def.5'},
+            'def': {'def.1'},
+            'def_3': {'def.4'},
+            'def_4': {'def.5'},
+            'x_1': {'x_1.foo'},
+            'x_1_1': {'x_1.bar'},
+            'x_1_2_1': {'x_1_2.abc'},
+            'x_1_2_2': {'x_1_2.def'},
+            'x_1_2': {'x_1.baz'},
+            'y_1': {'y_1.foo'},
+            'y_1_1': {'y_1.bar'},
+            'y_1_2_1': {'y_1_2.abc'},
+            'y_1_2': {'y_1_2.def'},
+            'y_1_3': {'y_1.baz'},
         }
 
         got = ref_preparer.RefPreparer._rename_clusters(clusters_in)
@@ -72,7 +110,7 @@ class TestRefPreparer(unittest.TestCase):
         ]
 
         extern_progs = external_progs.ExternalProgs()
-        refprep = ref_preparer.RefPreparer(fasta_in, tsv_in, extern_progs, genetic_code=1)
+        refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, metadata_tsv_files=tsv_in, genetic_code=1)
         tmp_out = 'tmp.ref_preparer_test_run'
         refprep.run(tmp_out)
         expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run.out')
@@ -96,3 +134,41 @@ class TestRefPreparer(unittest.TestCase):
 
         shutil.rmtree(tmp_out)
 
+
+    def test_run_all_noncoding(self):
+        '''test run with no metadata input, all sequences are noncoding'''
+        fasta_in = [
+            os.path.join(data_dir, 'ref_preparer_test_run.in.1.fa'),
+            os.path.join(data_dir, 'ref_preparer_test_run.in.2.fa'),
+            os.path.join(data_dir, 'ref_preparer_test_run.in.3.fa'),
+        ]
+
+        extern_progs = external_progs.ExternalProgs()
+        refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, all_coding='no', genetic_code=1)
+        tmp_out = 'tmp.ref_preparer_test_run'
+        refprep.run(tmp_out)
+        expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_all_noncoding.out')
+
+        test_files = [
+            '00.auto_metadata.tsv',
+            '01.filter.check_metadata.tsv',
+            '01.filter.check_genes.log',
+            '01.filter.check_metadata.log',
+            '02.cdhit.all.fa',
+            '02.cdhit.clusters.tsv',
+            '02.cdhit.gene.fa',
+            '02.cdhit.gene.varonly.fa',
+            '02.cdhit.noncoding.fa',
+            '02.cdhit.noncoding.varonly.fa',
+        ]
+
+        for filename in test_files:
+            expected = os.path.join(expected_outdir, filename)
+            got = os.path.join(tmp_out, filename)
+            self.assertTrue(filecmp.cmp(expected, got, shallow=False))
+
+        shutil.rmtree(tmp_out)
+
+
+
+
diff --git a/ariba/tests/reference_data_test.py b/ariba/tests/reference_data_test.py
index 4714e29..f23d4cc 100644
--- a/ariba/tests/reference_data_test.py
+++ b/ariba/tests/reference_data_test.py
@@ -129,10 +129,10 @@ class TestReferenceData(unittest.TestCase):
         self.assertEqual(expected, got)
 
 
-    def test_load_input_files_and_check_seq_names_ok(self):
+    def test_load_input_check_seq_names_ok(self):
         '''Test _load_input_files_and_check_seq_names with good input'''
-        fasta_files = [os.path.join(data_dir, 'reference_data_load_input_files_and_check_seq_names.good.in.fa.' + x) for x in ['1', '2']]
-        metadata_files = [os.path.join(data_dir, 'reference_data_load_input_files_and_check_seq_names.good.in.csv.' + x) for x in ['1', '2']]
+        fasta_files = [os.path.join(data_dir, 'reference_data_load_input_check_seq_names.good.fa.' + x) for x in ['1', '2']]
+        metadata_files = [os.path.join(data_dir, 'reference_data_load_input_check_seq_names.good.csv.' + x) for x in ['1', '2']]
         expected_seqs = {
              'seq1': pyfastaq.sequences.Fasta('seq1', 'ACGT'),
              'seq2': pyfastaq.sequences.Fasta('seq2', 'TTTT')
@@ -160,10 +160,10 @@ class TestReferenceData(unittest.TestCase):
         self.assertEqual(expected_meta, got_meta)
 
 
-    def test_load_input_files_and_check_seq_names_bad(self):
+    def test_load_input_check_seq_names_bad(self):
         '''Test _load_input_files_and_check_seq_names with bad input'''
-        fasta_files = [os.path.join(data_dir, 'reference_data_load_input_files_and_check_seq_names.bad.in.fa.' + x) for x in ['1', '2']]
-        metadata_files = [os.path.join(data_dir, 'reference_data_load_input_files_and_check_seq_names.bad.in.csv.' + x) for x in ['1', '2']]
+        fasta_files = [os.path.join(data_dir, 'reference_data_load_input_check_seq_names.bad.fa.' + x) for x in ['1', '2']]
+        metadata_files = [os.path.join(data_dir, 'reference_data_load_input_check_seq_names.bad.csv.' + x) for x in ['1', '2']]
         with self.assertRaises(reference_data.Error):
             reference_data.ReferenceData._load_input_files_and_check_seq_names(fasta_files, metadata_files)
 
@@ -264,12 +264,9 @@ class TestReferenceData(unittest.TestCase):
         '''Test _new_seq_name'''
         tests = [
             ('name', 'name'),
-            ('name ', 'name'),
-            ('name xyz', 'name'),
             ('name_a', 'name_a'),
             ('name.a', 'name.a'),
-            ('name-a', 'name-a'),
-            ('name spam eggs foo', 'name'),
+            ('name-a', 'name_a'),
             ('name!', 'name_'),
             ('name:foo', 'name_foo'),
             ('name:!@foo', 'name___foo'),
@@ -281,15 +278,15 @@ class TestReferenceData(unittest.TestCase):
 
     def test_seq_names_to_rename_dict(self):
         '''Test _seq_names_to_rename_dict'''
-        names = {'foo', 'foo abc', 'foo xyz', 'bar!', 'bar:', 'spam abc', 'eggs'}
+        names = {'foo', 'bar!', 'bar:', 'bar,', 'spam', 'eggs,123'}
         got = reference_data.ReferenceData._seq_names_to_rename_dict(names)
         expected = {
-            'foo abc': 'foo_1',
-            'foo xyz': 'foo_2',
             'bar!': 'bar_',
-            'bar:': 'bar__1',
-            'spam abc': 'spam'
+            'bar,': 'bar__1',
+            'bar:': 'bar__2',
+            'eggs,123': 'eggs_123'
         }
+
         self.assertEqual(expected, got)
 
 
@@ -386,23 +383,19 @@ class TestReferenceData(unittest.TestCase):
         self.assertTrue(filecmp.cmp(expected_file, tmp_out, shallow=False))
         os.unlink(tmp_out)
 
-        meta1 = sequence_metadata.SequenceMetadata('noncoding1\t0\t0\t.\t.\toriginal name "noncoding1"')
-        meta2 = sequence_metadata.SequenceMetadata('noncoding1_1\t0\t0\t.\t.\toriginal name "noncoding1 blah"')
-        meta3 = sequence_metadata.SequenceMetadata('pres_abs1_2\t0\t0\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"')
-        meta4 = sequence_metadata.SequenceMetadata('pres_abs1_1\t0\t0\t.\t.\toriginal name "pres_abs1 blah"')
+        meta1 = sequence_metadata.SequenceMetadata('noncoding1\t0\t0\t.\t.\toriginal name "noncoding1 blah"')
+        meta3 = sequence_metadata.SequenceMetadata('pres_abs1_1\t0\t0\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"')
         meta5 = sequence_metadata.SequenceMetadata('pres_abs1\t0\t0\t.\t.\toriginal name "pres\'abs1"')
         meta6 = sequence_metadata.SequenceMetadata('pres_abs2\t0\t0\t.\t.\toriginal name "pres_abs2"')
         meta7 = sequence_metadata.SequenceMetadata('pres_abs3\t0\t0\t.\t.\toriginal name "pres!abs3"')
         meta8 = sequence_metadata.SequenceMetadata('var_only1_2\t0\t0\t.\t.\toriginal name "var_only1 hello"')
-        meta9 = sequence_metadata.SequenceMetadata('var_only1\t0\t0\t.\t.\toriginal name "var:only1 boo"')
-        meta10 = sequence_metadata.SequenceMetadata('var_only1_1\t0\t0\t.\t.\toriginal name "var_only1"')
+        meta9 = sequence_metadata.SequenceMetadata('var_only1\t0\t0\t.\t.\toriginal name "var,only1"')
+        meta10 = sequence_metadata.SequenceMetadata('var_only1_1\t0\t0\t.\t.\toriginal name "var:only1 boo"')
         meta11 = sequence_metadata.SequenceMetadata('var_only2\t0\t0\t.\t.\toriginal name "var_only2"')
 
         expected_meta = {
             'noncoding1': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta1}},
-            'noncoding1_1': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta2}},
-            'pres_abs1_2': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta3}},
-            'pres_abs1_1': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta4}},
+            'pres_abs1_1': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta3}},
             'pres_abs1': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta5}},
             'pres_abs2': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta6}},
             'pres_abs3': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta7}},
@@ -412,19 +405,19 @@ class TestReferenceData(unittest.TestCase):
             'var_only2': {'seq_type': 'n', 'variant_only': False, 'n': {}, 'p': {}, '.': {meta11}},
         }
 
+        self.maxDiff = None
+        self.assertEqual(set(expected_meta.keys()), set(refdata.metadata.keys()))
         self.assertEqual(expected_meta, refdata.metadata)
 
         expected_seqs_dict = {
             'noncoding1': pyfastaq.sequences.Fasta('noncoding1', 'AAAA'),
-            'noncoding1_1': pyfastaq.sequences.Fasta('noncoding1_1', 'CCCC'),
-            'pres_abs1_2': pyfastaq.sequences.Fasta('pres_abs1_2', 'ACGT'),
-            'pres_abs1_1': pyfastaq.sequences.Fasta('pres_abs1_1', 'AAAA'),
+            'pres_abs1_1': pyfastaq.sequences.Fasta('pres_abs1_1', 'ACGT'),
             'pres_abs1': pyfastaq.sequences.Fasta('pres_abs1', 'CCCC'),
             'pres_abs2': pyfastaq.sequences.Fasta('pres_abs2', 'TTTT'),
             'pres_abs3': pyfastaq.sequences.Fasta('pres_abs3', 'GGGG'),
             'var_only1_2': pyfastaq.sequences.Fasta('var_only1_2', 'AAAA'),
-            'var_only1': pyfastaq.sequences.Fasta('var_only1', 'CCCC'),
-            'var_only1_1': pyfastaq.sequences.Fasta('var_only1_1', 'GGGG'),
+            'var_only1': pyfastaq.sequences.Fasta('var_only1', 'GGGG'),
+            'var_only1_1': pyfastaq.sequences.Fasta('var_only1_1', 'CCCC'),
             'var_only2': pyfastaq.sequences.Fasta('var_only2', 'TTTT'),
         }
 
@@ -537,11 +530,11 @@ class TestReferenceData(unittest.TestCase):
         os.unlink(outprefix + '.noncoding.varonly.fa')
 
 
-    def test_cluster_with_cdhit_clusters_in_file(self):
+    def test_cluster_w_cdhit_clstrs_file(self):
         '''Test cluster_with_cd_hit clusters from file'''
-        fasta_in = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_clusters_in_file.in.fa')
-        meta_tsv_in = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_clusters_in_file.in.meta.tsv')
-        cluster_tsv_in = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_clusters_in_file.in.clusters.tsv')
+        fasta_in = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_clstrs_file.in.fa')
+        meta_tsv_in = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_clstrs_file.in.meta.tsv')
+        cluster_tsv_in = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_clstrs_file.in.clstrs.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [meta_tsv_in])
         outprefix = 'tmp.test_cluster_with_cdhit_clusters_in_file'
 
@@ -555,7 +548,7 @@ class TestReferenceData(unittest.TestCase):
         got_clusters = refdata.cluster_with_cdhit(outprefix, clusters_file=cluster_tsv_in)
         self.assertEqual(expected_clusters, got_clusters)
 
-        expected_clusters_file = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_clusters_in_file.expected.clusters.tsv')
+        expected_clusters_file = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_clstrs_file.expect.clstrs.tsv')
         got_clusters_file = outprefix + '.clusters.tsv'
         self.assertTrue(filecmp.cmp(expected_clusters_file, got_clusters_file, shallow=False))
 
@@ -567,10 +560,10 @@ class TestReferenceData(unittest.TestCase):
         os.unlink(outprefix + '.noncoding.varonly.fa')
 
 
-    def test_cluster_with_cdhit_nocluster(self):
+    def test_cluster_w_cdhit_nocluster(self):
         '''Test cluster_with_cd_hit do not run cdhit'''
-        fasta_in = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_nocluster.in.fa')
-        tsv_in = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_nocluster.in.tsv')
+        fasta_in = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_nocluster.in.fa')
+        tsv_in = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_nocluster.in.tsv')
         refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
         outprefix = 'tmp.test_cluster_with_cdhit_nocluster'
 
@@ -586,7 +579,7 @@ class TestReferenceData(unittest.TestCase):
         got_clusters = refdata.cluster_with_cdhit(outprefix, nocluster=True)
         self.assertEqual(expected_clusters, got_clusters)
 
-        expected_clusters_file = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_nocluster.expected.clusters.tsv')
+        expected_clusters_file = os.path.join(data_dir, 'reference_data_cluster_w_cdhit_nocluster.expect.tsv')
         got_clusters_file = outprefix + '.clusters.tsv'
         self.assertTrue(filecmp.cmp(expected_clusters_file, got_clusters_file, shallow=False))
 
diff --git a/ariba/tests/samtools_variants_test.py b/ariba/tests/samtools_variants_test.py
index 873e840..d0417be 100644
--- a/ariba/tests/samtools_variants_test.py
+++ b/ariba/tests/samtools_variants_test.py
@@ -16,13 +16,13 @@ def file2lines(filename):
 
 
 class TestSamtoolsVariants(unittest.TestCase):
-    def test_make_vcf_and_read_depths_files(self):
+    def test_make_vcf_and_depths_files(self):
         '''test _make_vcf_and_read_depths_files'''
-        ref = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa')
-        bam = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.bam')
-        expected_vcf = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf')
-        expected_depths = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz')
-        tmp_prefix = 'tmp.test_make_vcf_and_read_depths_files'
+        ref = os.path.join(data_dir, 'samtools_variants_make_vcf_and_depths_files.asmbly.fa')
+        bam = os.path.join(data_dir, 'samtools_variants_make_vcf_and_depths_files.bam')
+        expected_vcf = os.path.join(data_dir, 'samtools_variants_make_vcf_and_depths_files.expect.vcf')
+        expected_depths = os.path.join(data_dir, 'samtools_variants_make_vcf_and_depths_files.expect.depths.gz')
+        tmp_prefix = 'tmp.test_make_vcf_and_depths_files'
         sv = samtools_variants.SamtoolsVariants(
             ref,
             bam,
diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py
index 6220dcf..f5022fc 100644
--- a/ariba/tests/summary_cluster_test.py
+++ b/ariba/tests/summary_cluster_test.py
@@ -1,6 +1,6 @@
 import unittest
 import os
-from ariba import flag, summary_cluster
+from ariba import flag, summary_cluster, summary_cluster_variant
 
 modules_dir = os.path.dirname(os.path.abspath(summary_cluster.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
@@ -8,7 +8,7 @@ data_dir = os.path.join(modules_dir, 'tests', 'data')
 class TestSummaryCluster(unittest.TestCase):
     def test_line2dict(self):
         '''Test _line2dict'''
-        line = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:var_group1:ref has wild type, foo bar\tsome free text'
+        line = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:var_group1:ref has wild type, foo bar\tsome free text'
 
         expected = {
             'ref_name': 'refname',
@@ -39,7 +39,7 @@ class TestSummaryCluster(unittest.TestCase):
             'smtls_total_depth': '17',
             'smtls_alt_nt': '.',
             'smtls_alt_depth': '17',
-            'var_description': 'noncoding1:n:A14T:var_group1:ref has wild type, foo bar',
+            'var_description': 'noncoding1:1:0:A14T:var_group1:ref has wild type, foo bar',
             'var_group': 'var_group1',
             'free_text': 'some free text'
         }
@@ -51,9 +51,9 @@ class TestSummaryCluster(unittest.TestCase):
         '''Test add_data_dict'''
         cluster = summary_cluster.SummaryCluster()
         self.assertTrue(cluster.name is None)
-        line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
-        line2 = 'refname\t1\t0\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text'
-        line3 = 'refname2\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text'
+        line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line2 = 'refname\t1\t0\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id2:ref has wild type, foo bar\tsome free text'
+        line3 = 'refname2\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id3:ref has wild type, foo bar\tsome free text'
         data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
         data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
         data_dict3 = summary_cluster.SummaryCluster.line2dict(line3)
@@ -71,9 +71,9 @@ class TestSummaryCluster(unittest.TestCase):
         '''Test pc_id_of_longest'''
         cluster = summary_cluster.SummaryCluster()
         self.assertTrue(cluster.name is None)
-        line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
-        line2 = 'refname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
-        line3 = 'refname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line1 = 'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line2 = 'refname\t1\t0\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line3 = 'refname\t1\t0\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text'
         data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
         data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
         data_dict3 = summary_cluster.SummaryCluster.line2dict(line3)
@@ -85,7 +85,7 @@ class TestSummaryCluster(unittest.TestCase):
 
     def test_to_cluster_summary_number(self):
         '''Test _to_cluster_summary_assembled'''
-        line = 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line = 'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text'
         data_dict = summary_cluster.SummaryCluster.line2dict(line)
 
         tests = [
@@ -122,9 +122,9 @@ class TestSummaryCluster(unittest.TestCase):
     def test_has_known_variant(self):
         '''Test _has_known_variant'''
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
             'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
             'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
         ]
@@ -139,9 +139,9 @@ class TestSummaryCluster(unittest.TestCase):
 
     def test_has_any_known_variant(self):
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
             'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
             'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
         ]
@@ -159,10 +159,10 @@ class TestSummaryCluster(unittest.TestCase):
     def test_has_nonsynonymous(self):
         '''Test _has_nonsynonymous'''
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
             'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
             'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
         ]
@@ -178,11 +178,11 @@ class TestSummaryCluster(unittest.TestCase):
     def test_has_any_nonsynonymous(self):
         '''Test _has_any_nonsynonymous'''
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:N_ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:N_ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
         ]
 
         expected = ['no', 'yes', 'no', 'yes', 'yes']
@@ -198,9 +198,9 @@ class TestSummaryCluster(unittest.TestCase):
     def test_has_novel_nonsynonymous(self):
         '''Test _has_novel_nonsynonymous'''
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
             'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
             'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
         ]
@@ -216,9 +216,9 @@ class TestSummaryCluster(unittest.TestCase):
     def test_has_any_novel_nonsynonymous(self):
         '''Test _has_any_novel_nonsynonymous'''
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
             'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
             'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
         ]
@@ -236,11 +236,11 @@ class TestSummaryCluster(unittest.TestCase):
     def test_to_cluster_summary_has_known_nonsynonymous(self):
         '''Test _to_cluster_summary_has_known_nonsynonymous'''
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
         ]
 
         expected = ['yes', 'yes', 'no', 'no', 'no']
@@ -257,11 +257,11 @@ class TestSummaryCluster(unittest.TestCase):
     def test_to_cluster_summary_has_novel_nonsynonymous(self):
         '''Test _to_cluster_summary_has_novel_nonsynonymous'''
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
         ]
 
         expected = ['no', 'no', 'no', 'yes', 'yes']
@@ -278,11 +278,11 @@ class TestSummaryCluster(unittest.TestCase):
     def test_to_cluster_summary_has_nonsynonymous(self):
         '''Test _to_cluster_summary_has_nonsynonymous'''
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
         ]
 
         expected = ['no', 'yes', 'no', 'yes', 'yes']
@@ -369,16 +369,16 @@ class TestSummaryCluster(unittest.TestCase):
     def test_has_match(self):
         '''Test _has_match'''
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:1:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:1:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id1:ref has wild type, foo bar\tsome free text',
         ]
 
         expected = ['yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'no', 'no']
@@ -396,14 +396,14 @@ class TestSummaryCluster(unittest.TestCase):
     def test_has_var_groups(self):
         '''Test has_var_groups'''
         lines = [
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
-            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id4:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id5:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id6:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text',
-            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id2:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id3:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id4:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:0:A14T:id5:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id6:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id7:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t1\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:1:1:A14T:id7:ref has wild type, foo bar\tsome free text',
         ]
         dicts = [summary_cluster.SummaryCluster.line2dict(line) for line in lines]
         cluster = summary_cluster.SummaryCluster()
@@ -438,7 +438,7 @@ class TestSummaryCluster(unittest.TestCase):
 
     def test_non_synon_variants(self):
         '''Test non_synon_variants'''
-        line1 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs'
+        line1 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs'
         line2 = 'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text'
 
         data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
@@ -454,16 +454,67 @@ class TestSummaryCluster(unittest.TestCase):
     def test_known_noncoding_het_snps(self):
         '''test known_noncoding_het_snps'''
         lines = [
-            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs',
-            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs',
-            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs',
-            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs'
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs'
         ]
 
         cluster = summary_cluster.SummaryCluster()
         for line in lines:
             cluster.add_data_dict(summary_cluster.SummaryCluster.line2dict(line))
         got = cluster.known_noncoding_het_snps()
-        expected = {'A42T': 25.0, 'A62T': 75.0, 'A82T': 40.0}
+        expected = {
+            '.': {'A82T': 40.0},
+            'id1': {'A42T': 25.0},
+            'id2': {'A62T': 75.0},
+        }
         self.assertEqual(expected, got)
 
+
+    def test_get_all_nonsynon_variants_set(self):
+        '''test _get_all_nonsynon_variants_set'''
+        lines = [
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text',
+            'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+        ]
+
+        data_dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines]
+
+        cluster_vars = [summary_cluster_variant.SummaryClusterVariant(x) for x in data_dicts]
+        expected = {x for x in cluster_vars if x.has_nonsynon}
+        got = summary_cluster.SummaryCluster._get_all_nonsynon_variants_set(data_dicts)
+        self.assertEqual(expected, got)
+
+
+    def test_gather_data(self):
+        '''test gather_data'''
+        lines = [
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text',
+            'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+        ]
+
+        data_dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines]
+        cluster = summary_cluster.SummaryCluster()
+        for data_dict in data_dicts:
+            cluster.add_data_dict(data_dict)
+
+        cluster.gather_data()
+        expected_summary = {
+            'assembled': 'yes',
+            'match': 'yes',
+            'ref_seq': 'ref1',
+            'pct_id': '98.33',
+            'known_var': 'yes',
+            'novel_var': 'no',
+        }
+        self.assertEqual(expected_summary, cluster.summary)
+
+        cluster_vars = [summary_cluster_variant.SummaryClusterVariant(x) for x in data_dicts]
+        expected_variants = {x for x in cluster_vars if x.has_nonsynon}
+        self.assertEqual(expected_variants, cluster.variants)
+
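
The test_gather_data case added above is the clearest picture of the reworked SummaryCluster workflow: report lines are parsed with line2dict(), accumulated with add_data_dict(), and a final gather_data() call fills cluster.summary and cluster.variants. A minimal sketch of that flow, reusing one of the report lines from the test and assuming the ariba package from this commit is importable:

    from ariba import summary_cluster

    # Tab-separated report line copied verbatim from test_gather_data above.
    line = 'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs'

    cluster = summary_cluster.SummaryCluster()
    cluster.add_data_dict(summary_cluster.SummaryCluster.line2dict(line))
    cluster.gather_data()

    # cluster.summary is a dict with the keys checked in the test
    # ('assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var');
    # cluster.variants is a set of SummaryClusterVariant objects for the
    # non-synonymous variants seen in the cluster.
    print(cluster.summary)
    print(cluster.variants)
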
diff --git a/ariba/tests/summary_cluster_variant_test.py b/ariba/tests/summary_cluster_variant_test.py
new file mode 100644
index 0000000..ec09942
--- /dev/null
+++ b/ariba/tests/summary_cluster_variant_test.py
@@ -0,0 +1,67 @@
+import unittest
+import os
+from ariba import summary_cluster, summary_cluster_variant
+
+
+class TestSummaryClusterVariant(unittest.TestCase):
+    def test_has_nonsynonymous(self):
+        '''Test _has_nonsynonymous'''
+        lines = [
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
+            'refname\t1\t0\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
+        ]
+
+        dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines]
+        expected = [False, True, False, True, True, True]
+        assert len(dicts) == len(expected)
+
+        for i in range(len(dicts)):
+            self.assertEqual(expected[i], summary_cluster_variant.SummaryClusterVariant._has_nonsynonymous(dicts[i]))
+
+
+    def test_get_het_percent(self):
+        '''test _get_het_percent'''
+        lines = [
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tT\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tT,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs'
+        ]
+
+        expected = [None, 25.0, 75.0, 40.0]
+        assert len(lines) == len(expected)
+
+        for i in range(len(lines)):
+            data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+            got = summary_cluster_variant.SummaryClusterVariant._get_het_percent(data_dict)
+            self.assertEqual(expected[i], got)
+
+
+    def test_init(self):
+        '''test __init__'''
+        lines = [
+            'ref1\t1\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tI14L\t1\tI14L\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:I14L:.:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+            'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA,G\t20,10,10\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs',
+        ]
+
+        expected = [
+            {'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None},
+            {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': None},
+            {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0},
+            {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0},
+        ]
+        assert len(lines) == len(expected)
+
+        for i in range(len(lines)):
+            data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+            cluster_var = summary_cluster_variant.SummaryClusterVariant(data_dict)
+            for key in expected[i]:
+                got_value = eval('cluster_var.' + key)
+                self.assertEqual(expected[i][key], got_value)
+
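
The new test module above pins down the public surface of summary_cluster_variant.SummaryClusterVariant: it is built from a line2dict() dictionary and exposes coding, known, var_string, var_group and het_percent. A minimal sketch of that usage, reusing a report line from test_init and assuming the package from this commit is importable:

    from ariba import summary_cluster, summary_cluster_variant

    # Report line copied verbatim from test_init above: known non-coding SNP A14T
    # in group id1, with a 10,30 allele depth split at the variant position.
    line = 'ref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t40\tA\t10,30\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs'

    data_dict = summary_cluster.SummaryCluster.line2dict(line)
    variant = summary_cluster_variant.SummaryClusterVariant(data_dict)

    # Per the expected values in test_init: coding=False, known=True,
    # var_string='A14T', var_group='id1', het_percent=25.0.
    print(variant.coding, variant.known, variant.var_string, variant.var_group, variant.het_percent)
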
diff --git a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py
index 091e8c0..67ca2bc 100644
--- a/ariba/tests/summary_sample_test.py
+++ b/ariba/tests/summary_sample_test.py
@@ -18,11 +18,14 @@ class TestSummarySample(unittest.TestCase):
         cluster1.add_data_dict(dicts[0])
         cluster1.add_data_dict(dicts[1])
         cluster1.add_data_dict(dicts[2])
+        cluster1.gather_data()
         cluster2 = summary_cluster.SummaryCluster()
         cluster2.add_data_dict(dicts[3])
         cluster2.add_data_dict(dicts[4])
+        cluster2.gather_data()
         cluster3 = summary_cluster.SummaryCluster()
         cluster3.add_data_dict(dicts[5])
+        cluster3.gather_data()
 
         expected = {
             'cluster.n': cluster1,
@@ -33,6 +36,9 @@ class TestSummarySample(unittest.TestCase):
         got = summary_sample.SummarySample._load_file(infile, 90)
         self.assertEqual(expected, got)
 
+        got = summary_sample.SummarySample._load_file(infile, 90, only_clusters={'cluster.n'})
+        expected = {'cluster.n': cluster1}
+        self.assertEqual(expected, got)
 
     def test_column_summary_data(self):
         '''Test _column_summary_data'''
@@ -104,7 +110,7 @@ class TestSummarySample(unittest.TestCase):
 
         expected_het_snps = {
             'cluster.v': {},
-            'cluster.n': {'A14T': 80.0},
+            'cluster.n': {'.': {'A14T': 80.0}},
             'cluster.p': {},
         }
         self.assertEqual(expected_het_snps, got_het_snps)
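
Taken together, the summary_sample changes above mean that each SummaryCluster now needs an explicit gather_data() call after loading, that parsing can be restricted to a subset of clusters via only_clusters, and that per-cluster het-SNP results are keyed by variant group (with '.' holding ungrouped variants). A minimal sketch of the filtered load; 'report.tsv' is a hypothetical path, the tests load files from their data directory:

    from ariba import summary_sample

    # only_clusters restricts loading to the named clusters, matching the
    # _load_file(infile, 90, only_clusters={'cluster.n'}) call in the test above.
    sample = summary_sample.SummarySample('report.tsv', only_clusters={'cluster.n'})
    sample.run()

    # Per-cluster het SNP results are now nested by variant group, e.g.
    # {'cluster.n': {'.': {'A14T': 80.0}}} as in expected_het_snps above.
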
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py
index 6b615ee..542ca53 100644
--- a/ariba/tests/summary_test.py
+++ b/ariba/tests/summary_test.py
@@ -43,35 +43,6 @@ class TestSummary(unittest.TestCase):
             self.assertEqual(expected[i], summary.Summary._determine_cluster_cols(col_strings[i]))
 
 
-    def test_determine_var_cols(self):
-        col_strings = [
-            'groups,grouped,ungrouped,novel',
-            'groups,grouped,ungrouped',
-            'grouped,novel',
-            'ungrouped,novel',
-            'grouped',
-            'ungrouped',
-            'novel',
-            ''
-        ]
-
-        expected = [
-            {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': True},
-            {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': False},
-            {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': True},
-            {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': True},
-            {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': False},
-            {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': False},
-            {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': True},
-            {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': False},
-        ]
-
-        assert len(col_strings) == len(expected)
-
-        for i in range(len(col_strings)):
-            self.assertEqual(expected[i], summary.Summary._determine_var_cols(col_strings[i]))
-
-
     def test_load_input_files(self):
         '''Test _load_input_files'''
         file1 = os.path.join(data_dir, 'summary_test_load_input_files.1.tsv')
@@ -84,239 +55,322 @@ class TestSummary(unittest.TestCase):
         expected = {file1: sample1, file2: sample2}
         self.assertEqual(expected, got)
 
-
-    def test_get_all_cluster_names(self):
-        '''Test _get_all_cluster_names'''
-        file1 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.1.tsv')
-        file2 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.2.tsv')
-        samples = summary.Summary._load_input_files([file1, file2], 90)
-        got = summary.Summary._get_all_cluster_names(samples)
-        expected = {'cluster.n.1', 'cluster.v.1', 'cluster.p.1', 'cluster.p.2'}
-        self.assertEqual(expected, got)
-
-
-    def test_get_all_variant_columns(self):
-        '''Test _get_all_variant_columns'''
-        file1 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.1.tsv')
-        file2 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.2.tsv')
-        samples = summary.Summary._load_input_files([file1, file2], 90)
-        got = summary.Summary._get_all_variant_columns(samples)
-        expected = {
-            'cluster.p.2': {('presence_absence1', 'A10V', 'grouped', 'id3')},
-            'cluster.n.1': {('noncoding1', 'A6G', 'grouped', 'id2'), ('noncoding1', 'A14T', 'grouped', 'id1')},
-            'cluster.p.1': {('presence_absence1', 'A10V', 'grouped', 'id2')},
-        }
-        self.assertEqual(expected, got)
-
-
-    def test_get_all_het_snps(self):
-        '''test _get_all_het_snps'''
-        file1 = os.path.join(data_dir, 'summary_test_get_all_het_snps.1.tsv')
-        file2 = os.path.join(data_dir, 'summary_test_get_all_het_snps.2.tsv')
-        samples = summary.Summary._load_input_files([file1, file2], 90)
-        got = summary.Summary._get_all_het_snps(samples)
-        expected = {('noncoding1', 'A14T')}
-        self.assertEqual(expected, got)
-
-
-    def test_get_all_var_groups(self):
-        '''test _get_all_var_groups'''
-        file1 = os.path.join(data_dir, 'summary_test_get_all_var_groups.1.tsv')
-        file2 = os.path.join(data_dir, 'summary_test_get_all_var_groups.2.tsv')
-        samples = summary.Summary._load_input_files([file1, file2], 90)
-        got = summary.Summary._get_all_var_groups(samples)
-        expected = {
-            'cluster.p.1': {'id4'},
-            'cluster.p.2': {'id3'},
-            'cluster.v.1': set(),
-            'cluster.n.1': {'id1', 'id2'}
-        }
+        sample1 = summary_sample.SummarySample(file1, only_clusters={'noncoding1'})
+        sample2 = summary_sample.SummarySample(file2, only_clusters={'noncoding1'})
+        sample1.run()
+        sample2.run()
+        expected = {file1: sample1, file2: sample2}
+        got = summary.Summary._load_input_files([file1, file2], 90, only_clusters={'noncoding1'})
         self.assertEqual(expected, got)
 
 
-    def test_gather_output_rows(self):
-        '''Test _gather_output_rows'''
+    def test_gather_unfiltered_output_data(self):
+        '''test _gather_unfiltered_output_data'''
         infiles = [
-            os.path.join(data_dir, 'summary_test_gather_output_rows.in.1.tsv'),
-            os.path.join(data_dir, 'summary_test_gather_output_rows.in.2.tsv')
+            os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.1.tsv'),
+            os.path.join(data_dir, 'summary_gather_unfiltered_output_data.in.2.tsv')
         ]
-        s = summary.Summary('out', filenames=infiles, variant_cols=None)
-        s.samples = summary.Summary._load_input_files(infiles, 90)
-        expected = {
+
+        expected_all = {
             infiles[0]: {
                 'noncoding1': {
-                    'assembled': 'yes',
-                    'match': 'yes',
-                    'ref_seq': 'noncoding1',
-                    'known_var': 'yes',
-                    'novel_var': 'no',
-                    'pct_id': '98.33',
+                    'summary': {
+                        'assembled': 'yes',
+                        'known_var': 'yes',
+                        'match': 'yes',
+                        'novel_var': 'no',
+                        'pct_id': '98.33',
+                        'ref_seq': 'noncoding_ref1'
+                    },
+                    'groups': {},
+                    'vars': {},
+                },
+                'noncoding2': {
+                    'summary': {
+                        'assembled': 'yes',
+                        'known_var': 'yes',
+                        'match': 'yes',
+                        'novel_var': 'no',
+                        'pct_id': '98.33',
+                        'ref_seq': 'noncoding_ref2'
+                    },
+                    'groups': {},
+                    'vars': {},
                 },
                 'presence_absence1': {
-                    'assembled': 'yes',
-                    'match': 'yes',
-                    'ref_seq': 'presence_absence1',
-                    'known_var': 'no',
-                    'novel_var': 'yes',
-                    'pct_id': '98.96',
+                    'summary': {
+                        'assembled': 'yes',
+                        'known_var': 'no',
+                        'match': 'yes',
+                        'novel_var': 'yes',
+                        'pct_id': '98.96',
+                        'ref_seq': 'presence_absence_ref1'
+                    },
+                    'groups': {},
+                    'vars': {},
                 },
-                'variants_only1': {
-                    'assembled': 'no',
-                    'match': 'no',
-                    'ref_seq': 'NA',
-                    'known_var': 'NA',
-                    'novel_var': 'NA',
-                    'pct_id': 'NA',
+                'presence_absence2': {
+                    'summary': {
+                            'assembled': 'no',
+                            'known_var': 'NA',
+                            'match': 'no',
+                            'novel_var': 'NA',
+                            'pct_id': 'NA',
+                            'ref_seq': 'NA'
+                    },
+                    'groups': {},
+                    'vars': {}
                 }
             },
             infiles[1]: {
                 'noncoding1': {
-                    'assembled': 'yes',
-                    'match': 'yes',
-                    'ref_seq': 'noncoding1',
-                    'known_var': 'yes',
-                    'novel_var': 'no',
-                    'pct_id': '98.33',
+                    'summary': {'assembled': 'yes',
+                        'known_var': 'yes',
+                        'match': 'yes',
+                        'novel_var': 'no',
+                        'pct_id': '98.33',
+                        'ref_seq': 'noncoding_ref1'
+                     },
+                    'groups': {},
+                    'vars': {},
+                },
+                'noncoding2': {
+                    'summary': {
+                        'assembled': 'yes',
+                        'known_var': 'yes',
+                        'match': 'yes',
+                        'novel_var': 'no',
+                        'pct_id': '98.33',
+                        'ref_seq': 'noncoding_ref2'
+                    },
+                    'groups': {},
+                    'vars': {},
                 },
                 'presence_absence1': {
-                    'assembled': 'yes',
-                    'match': 'yes',
-                    'ref_seq': 'presence_absence1',
-                    'pct_id': '98.96',
-                    'known_var': 'no',
-                    'novel_var': 'yes',
+                    'summary': {
+                            'assembled': 'yes',
+                            'known_var': 'no',
+                            'match': 'yes',
+                            'novel_var': 'yes',
+                            'pct_id': '98.96',
+                            'ref_seq': 'presence_absence1'
+                    },
+                    'groups': {},
+                    'vars': {}
                 },
-                'variants_only1': {
-                    'assembled': 'no',
-                    'match': 'no',
-                    'ref_seq': 'NA',
-                    'known_var': 'NA',
-                    'novel_var': 'NA',
-                    'pct_id': 'NA',
-                }
+            }
+        }
+
+        expected_potential_cols = {
+            'noncoding1': {
+                'summary': {
+                    'assembled',
+                    'known_var',
+                    'match',
+                    'novel_var',
+                    'pct_id',
+                    'ref_seq'
+                },
+                'groups': set(),
+                'vars': set()
+            },
+            'noncoding2': {
+                'summary': {
+                    'assembled',
+                    'known_var',
+                    'match',
+                    'novel_var',
+                    'pct_id',
+                    'ref_seq'
+                },
+                'groups': set(),
+                'vars': set()
+            },
+            'presence_absence1': {
+                'summary': {
+                    'assembled',
+                    'known_var',
+                    'match',
+                    'novel_var',
+                    'pct_id',
+                    'ref_seq'
+                },
+                'groups': set(),
+                'vars': set()
             },
+            'presence_absence2': {
+                'summary': {
+                    'assembled',
+                    'known_var',
+                    'match',
+                    'novel_var',
+                    'pct_id',
+                    'ref_seq'
+                },
+                'groups': set(),
+                'vars': set()
+            }
         }
-        got = s._gather_output_rows()
-        self.assertEqual(expected, got)
 
-        s.var_columns['groups'] = True
-        expected[infiles[0]]['noncoding1']['vgroup.id1'] = 'yes'
-        expected[infiles[0]]['noncoding1']['vgroup.id3'] = 'no'
-        expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'yes'
-        expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes'
-        got = s._gather_output_rows()
-        self.assertEqual(expected, got)
+        s = summary.Summary('out', filenames=infiles)
+        s.samples = summary.Summary._load_input_files(infiles, 90)
+        s._gather_unfiltered_output_data()
+        self.assertEqual(expected_potential_cols, s.all_potential_columns)
+        self.assertEqual(expected_all, s.all_data)
+
+        expected_potential_cols['noncoding1']['groups'] = {'id3', 'id1', 'id1.%'}
+        expected_potential_cols['noncoding2']['groups'] = {'id2.%', 'id2'}
+        expected_all[infiles[0]]['noncoding1']['groups'] = {'id1': 'yes'}
+        expected_all[infiles[0]]['noncoding2']['groups'] = {'id2': 'yes_multi_het', 'id2.%': 'NA'}
+        expected_all[infiles[1]]['noncoding1']['groups'] = {'id1': 'het', 'id1.%': 80.0, 'id3': 'yes'}
+        expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0}
+        s = summary.Summary('out', filenames=infiles, show_var_groups=True)
+        s.samples = summary.Summary._load_input_files(infiles, 90)
+        s._gather_unfiltered_output_data()
+        self.assertEqual(expected_potential_cols, s.all_potential_columns)
+        self.assertEqual(expected_all, s.all_data)
+
+        expected_potential_cols['noncoding1']['vars'] = {'A14T.%', 'A6G', 'A14T'}
+        expected_potential_cols['noncoding2']['vars'] = {'A52T', 'A52T.%', 'A42T'}
+
+        expected_all[infiles[0]]['noncoding1']['vars'] = {'A14T': 'yes'}
+        expected_all[infiles[0]]['noncoding2']['vars'] = {'A42T': 'yes', 'A52T': 'het', 'A52T.%': 40.0}
+        expected_all[infiles[1]]['noncoding1']['vars'] = {'A14T': 'het', 'A14T.%': 80.0, 'A6G': 'yes'}
+        expected_all[infiles[1]]['noncoding2']['vars'] = {'A52T': 'het', 'A52T.%': 40.0}
+        s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True)
+        s.samples = summary.Summary._load_input_files(infiles, 90)
+        s._gather_unfiltered_output_data()
+        self.assertEqual(expected_potential_cols, s.all_potential_columns)
+        self.assertEqual(expected_all, s.all_data)
+
+        expected_potential_cols['presence_absence1']['vars'] = {'A10V'}
+        expected_all[infiles[0]]['presence_absence1']['vars'] = {'A10V': 'yes'}
+        expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'}
+        s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True, show_novel_vars=True)
+        s.samples = summary.Summary._load_input_files(infiles, 90)
+        s._gather_unfiltered_output_data()
+        self.assertEqual(expected_potential_cols, s.all_potential_columns)
+        self.assertEqual(expected_all, s.all_data)
 
 
-        s.var_columns['grouped'] = True
-        s.var_columns['ungrouped'] = True
-        expected[infiles[0]]['noncoding1']['noncoding1.A14T'] = 'yes'
-        expected[infiles[0]]['noncoding1']['noncoding1.A6G'] = 'no'
-        expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'yes'
-        expected[infiles[1]]['noncoding1']['noncoding1.A6G'] = 'yes'
-        self.maxDiff = None
-        got = s._gather_output_rows()
-        self.assertEqual(expected, got)
+    def test_to_matrix_all_cols(self):
+        '''Test _to_matrix all columns'''
+        infiles = [
+            os.path.join(data_dir, 'summary_to_matrix.1.tsv'),
+            os.path.join(data_dir, 'summary_to_matrix.2.tsv')
+        ]
 
-        s.var_columns['novel'] = True
-        expected[infiles[0]]['presence_absence1']['presence_absence1.A10V'] = 'yes'
-        expected[infiles[1]]['presence_absence1']['presence_absence1.A10V'] = 'yes'
-        got = s._gather_output_rows()
-        self.assertEqual(expected, got)
+        s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True, show_novel_vars=True)
+        s.samples = summary.Summary._load_input_files(infiles, 90)
+        s._gather_unfiltered_output_data()
+        got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
 
-        s.show_known_het = True
-        expected[infiles[0]]['noncoding1']['noncoding1.A14T.%'] = 'NA'
-        expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'het'
-        expected[infiles[1]]['noncoding1']['noncoding1.A14T.%'] = 80.0
-        got = s._gather_output_rows()
-        self.assertEqual(expected, got)
+        expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncodin [...]
+        expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'noncoding2.A42T', 'noncoding2. [...]
+        expected_matrix = [
+            [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'],
+            [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes']
+        ]
+
+        self.assertEqual(expected_phandango_header, got_phandango_header)
+        self.assertEqual(expected_csv_header, got_csv_header)
+        self.assertEqual(expected_matrix, got_matrix)
 
-        for filename in expected:
-            del expected[filename]['noncoding1']['vgroup.id1']
-            del expected[filename]['noncoding1']['vgroup.id3']
-            for gene_type in expected[filename]:
-                del expected[filename][gene_type]['ref_seq']
 
-        s = summary.Summary('out', filenames=infiles, cluster_cols='assembled,match,pct_id,known_var,novel_var', variant_cols='ungrouped,grouped,novel')
+    def test_to_matrix_with_groups(self):
+        '''Test _to_matrix with groups'''
+        infiles = [
+            os.path.join(data_dir, 'summary_to_matrix.1.tsv'),
+            os.path.join(data_dir, 'summary_to_matrix.2.tsv')
+        ]
+
+        s = summary.Summary('out', filenames=infiles, show_var_groups=True)
         s.samples = summary.Summary._load_input_files(infiles, 90)
-        s.include_all_variant_columns = True
-        s.show_known_het = True
-        got = s._gather_output_rows()
-        self.assertEqual(expected, got)
+        s._gather_unfiltered_output_data()
+        got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
 
+        expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'presence_absence1.assembled:o1' [...]
+        expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'p [...]
+        expected_matrix = [
+            [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes'],
+            [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes']
+        ]
 
-    def test_to_matrix(self):
-        '''Test _to_matrix'''
-        rows = {
-            'file1': {
-                'cluster.n.1': {
-                    'assembled': 'yes',
-                    'match': 'yes',
-                    'ref_seq': 'noncoding1',
-                    'known_var': 'yes',
-                    'novel_var': 'no',
-                    'pct_id': '98.33',
-                    'noncoding1.A14T': 'yes'
-                },
-                'cluster.p.1': {
-                    'assembled': 'yes',
-                    'match': 'yes',
-                    'ref_seq': 'presence_absence1',
-                    'known_var': 'yes',
-                    'novel_var': 'no',
-                    'pct_id': '98.96',
-                    'presence_absence1.I42L': 'yes'
-                },
-                'cluster.v.1': {
-                    'assembled': 'yes',
-                    'match': 'yes',
-                    'ref_seq': 'varonly1',
-                    'known_var': 'no',
-                    'novel_var': 'no',
-                    'pct_id': '99.42',
-                }
-            },
-            'file2': {
-                'cluster.n.1': {
-                    'assembled': 'yes',
-                    'match': 'yes',
-                    'ref_seq': 'noncoding1',
-                    'known_var': 'no',
-                    'novel_var': 'no',
-                    'pct_id': '98.33',
-                    'noncoding1.A14T': 'no'
-                },
-                'cluster.p.1': {
-                    'assembled': 'yes',
-                    'match': 'yes',
-                    'ref_seq': 'presence_absence1',
-                    'pct_id': '98.96',
-                    'known_var': 'no',
-                    'novel_var': 'no',
-                    'presence_absence1.I42L': 'no'
-                },
-                'cluster.v.1': {
-                    'assembled': 'no',
-                    'match': 'NA',
-                    'ref_seq': 'NA',
-                    'known_var': 'NA',
-                    'novel_var': 'NA',
-                    'pct_id': 'NA',
-                }
-            },
-        }
-        filenames = ['file1', 'file2']
-        cluster_cols = {'assembled': True, 'match': True, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False}
-        got_phandago_header, got_csv_header, got_lines  = summary.Summary._to_matrix(filenames, rows, cluster_cols)
-        expected_phandango_header = ['name', 'cluster.n.1.assembled:o1', 'cluster.n.1.match:o1', 'cluster.n.1.noncoding1.A14T:o1', 'cluster.p.1.assembled:o1', 'cluster.p.1.match:o1', 'cluster.p.1.presence_absence1.I42L:o1', 'cluster.v.1.assembled:o1', 'cluster.v.1.match:o1']
-        expected_csv_header = ['name', 'cluster.n.1.assembled', 'cluster.n.1.match', 'cluster.n.1.noncoding1.A14T', 'cluster.p.1.assembled', 'cluster.p.1.match', 'cluster.p.1.presence_absence1.I42L', 'cluster.v.1.assembled', 'cluster.v.1.match']
-        expected_lines = [
-            ['file1', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes'],
-            ['file2', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'no', 'NA']
+        self.assertEqual(expected_phandango_header, got_phandango_header)
+        self.assertEqual(expected_csv_header, got_csv_header)
+        self.assertEqual(expected_matrix, got_matrix)
+
+
+    def test_to_matrix_with_vars(self):
+        '''Test _to_matrix with vars'''
+        infiles = [
+            os.path.join(data_dir, 'summary_to_matrix.1.tsv'),
+            os.path.join(data_dir, 'summary_to_matrix.2.tsv')
         ]
-        self.assertEqual(expected_phandango_header, got_phandago_header)
+
+        s = summary.Summary('out', filenames=infiles, show_known_vars=True, show_novel_vars=True)
+        s.samples = summary.Summary._load_input_files(infiles, 90)
+        s._gather_unfiltered_output_data()
+        got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+
+        expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.A42T:o1', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'prese [...]
+        expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.A42T', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presenc [...]
+        expected_matrix = [
+            [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'NA', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'],
+            [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'no', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes']
+        ]
+
+        self.assertEqual(expected_phandango_header, got_phandango_header)
+        self.assertEqual(expected_csv_header, got_csv_header)
+        self.assertEqual(expected_matrix, got_matrix)
+
+
+    def test_to_matrix_cluster_only(self):
+        '''Test _to_matrix with cluster columns only'''
+        infiles = [
+            os.path.join(data_dir, 'summary_to_matrix.1.tsv'),
+            os.path.join(data_dir, 'summary_to_matrix.2.tsv')
+        ]
+
+        s = summary.Summary('out', filenames=infiles)
+        s.samples = summary.Summary._load_input_files(infiles, 90)
+        s._gather_unfiltered_output_data()
+        got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+
+        expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_abse [...]
+        expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var']
+        expected_matrix = [
+            [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes'],
+            [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes']
+        ]
+
+        self.assertEqual(expected_phandango_header, got_phandango_header)
         self.assertEqual(expected_csv_header, got_csv_header)
-        self.assertEqual(expected_lines, got_lines)
+        self.assertEqual(expected_matrix, got_matrix)
+
+
+    def test_to_matrix_assembled_only(self):
+        '''Test _to_matrix with assembled column only'''
+        infiles = [
+            os.path.join(data_dir, 'summary_to_matrix.1.tsv'),
+            os.path.join(data_dir, 'summary_to_matrix.2.tsv')
+        ]
+
+        s = summary.Summary('out', filenames=infiles, cluster_cols='assembled')
+        s.samples = summary.Summary._load_input_files(infiles, 90)
+        s._gather_unfiltered_output_data()
+        got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns)
+
+        expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding2.assembled:o1', 'presence_absence1.assembled:o1']
+        expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding2.assembled', 'presence_absence1.assembled']
+        expected_matrix = [
+            [infiles[0], 'yes', 'yes', 'yes'],
+            [infiles[1], 'yes', 'yes', 'yes']
+        ]
+
+        self.assertEqual(expected_phandango_header, got_phandango_header)
+        self.assertEqual(expected_csv_header, got_csv_header)
+        self.assertEqual(expected_matrix, got_matrix)
 
 
     def test_filter_matrix_rows(self):
@@ -373,10 +427,10 @@ class TestSummary(unittest.TestCase):
 
         expected_header = ['head1', 'head2', 'head2:colour', 'head3', 'head3:colour', 'head4', 'head5', 'head5:colour']
         expected_matrix = [
-            ['yes', 'yes', '#1f78b4', 'yes_nonunique', '#a6cee3', 'yes', 'no', '#33a02c'],
-            ['yes', 'yes_nonunique', '#a6cee3', 'no', '#33a02c', 'yes', 'NA', '#b2df8a'],
-            ['yes', 'no', '#33a02c', 'NA', '#b2df8a', 'yes', 'yes', '#1f78b4'],
-            ['yes', 'NA', '#b2df8a', 'yes', '#1f78b4', 'yes', 'yes_nonunique', '#a6cee3'],
+            ['yes', 'yes', '#33a02c', 'yes_nonunique', '#b2df8a', 'yes', 'no', '#fb9a99'],
+            ['yes', 'yes_nonunique', '#b2df8a', 'no', '#fb9a99', 'yes', 'NA', '#ffffff'],
+            ['yes', 'no', '#fb9a99', 'NA', '#ffffff', 'yes', 'yes', '#33a02c'],
+            ['yes', 'NA', '#ffffff', 'yes', '#33a02c', 'yes', 'yes_nonunique', '#b2df8a']
         ]
         got_header, got_matrix = summary.Summary._add_phandango_colour_columns(header, matrix)
         self.assertEqual(expected_header, got_header)
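
For readers scanning the diff, the value-to-colour mapping implied by the updated expectations above boils down to the table below (a hypothetical reconstruction for illustration only; the real assignment logic is in Summary._add_phandango_colour_columns in ariba/summary.py, which is changed in this commit but not reproduced here):

    # Colour palette inferred from the expected matrix in the test above.
    # Which columns receive a ":colour" companion is decided by the method
    # itself and is not shown here.
    PHANDANGO_COLOURS = {
        'yes': '#33a02c',
        'yes_nonunique': '#b2df8a',
        'no': '#fb9a99',
        'NA': '#ffffff',
    }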
@@ -400,6 +454,23 @@ class TestSummary(unittest.TestCase):
         os.unlink(tmpfile)
 
 
+    def test_matrix_to_csv_remove_nas(self):
+        '''Test _matrix_to_csv with remove_nas '''
+        matrix = [
+            ['line1_1', 'line1_2', 'NA', 'foo'],
+            ['NA', 'NA', 'bar', 'NA'],
+        ]
+        header = ['head1', 'head2', 'head3', 'head4']
+        tmpfile = 'tmp.test.matrix_to_csv_remove_nas.csv'
+        summary.Summary._matrix_to_csv(matrix, header, tmpfile, remove_nas=True)
+        with open(tmpfile) as f:
+            got = f.read()
+
+        expected = 'head1,head2,head3,head4\nline1_1,line1_2,,foo\n,,bar,\n'
+        self.assertEqual(expected, got)
+        os.unlink(tmpfile)
+
+
     def test_distance_score_bewteen_values(self):
         '''Test _distance_score_bewteen_values'''
         tests = [
diff --git a/ariba/vfdb_parser.py b/ariba/vfdb_parser.py
index 3b9e2a4..9af1ff8 100644
--- a/ariba/vfdb_parser.py
+++ b/ariba/vfdb_parser.py
@@ -36,8 +36,11 @@ class VfdbParser:
         tsv_out = pyfastaq.utils.open_file_write(self.outprefix + '.tsv')
 
         for seq in file_reader:
+            original_id = seq.id
             seq.id, description = self._fa_header_to_name_and_metadata(seq.id)
-            print(seq.id, '1', '0', '.', '.', description, sep='\t', file=tsv_out)
+            if description == '.':
+                seq.id = original_id.split()[0]
+            print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=tsv_out)
             print(seq, file=fa_out)
 
         pyfastaq.utils.close(fa_out)
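
In prose terms, the new branch above means: when _fa_header_to_name_and_metadata cannot extract a description (it returns '.'), the sequence falls back to the first whitespace-separated token of its original header as its name, and the free-text column of the tsv now always records the full original header. A minimal standalone sketch of that fallback, assuming a parse helper with the same (name, description) return convention:

    # Sketch only; mirrors the new branch in the loop above. "parse" stands in
    # for VfdbParser._fa_header_to_name_and_metadata, which is assumed to return
    # (name, '.') when the header is not in the expected VFDB format.
    def tsv_fields(fa_header, parse):
        name, description = parse(fa_header)
        if description == '.':
            name = fa_header.split()[0]  # fall back to the first header token
        return [name, '1', '0', '.', '.', 'Original name: ' + fa_header]

    # e.g. tsv_fields('gi|123 unparsed description', lambda h: (h, '.'))
    #   -> ['gi|123', '1', '0', '.', '.', 'Original name: gi|123 unparsed description']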
diff --git a/scripts/ariba b/scripts/ariba
index 5140aed..5c70787 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -42,7 +42,7 @@ subparser_flag.set_defaults(func=ariba.tasks.flag.run)
 
 
 #---------------------------- getref ------------------------------------
-allowed_dbs = ['argannot', 'card', 'plasmidfinder', 'resfinder','vfdb']
+allowed_dbs = sorted(list(ariba.ref_genes_getter.allowed_ref_dbs))
 subparser_getref = subparsers.add_parser(
     'getref',
     help='Download reference data',
@@ -50,6 +50,7 @@ subparser_getref = subparsers.add_parser(
     description='Download reference data from one of a few supported public resources',
 )
 subparser_getref.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
+subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Only applies to card')
 subparser_getref.add_argument('db', help='Database to download. Must be one of: ' + ' '.join(allowed_dbs), choices=allowed_dbs, metavar="DB name")
 subparser_getref.add_argument('outprefix', help='Prefix of output filenames')
 subparser_getref.set_defaults(func=ariba.tasks.getref.run)
@@ -61,11 +62,13 @@ subparser_prepareref = subparsers.add_parser(
     help='Prepare reference data for input to "run"',
     usage='ariba prepareref [options] <outdir>',
     description='Prepare reference data for running the pipeline with "ariba run"',
-    epilog='REQUIRED: -f and -m must each be used at least once',
+    epilog='REQUIRED: -f/--fasta, and also either -m/--metadata or --all_coding must be used',
 )
 input_group = subparser_prepareref.add_argument_group('input files options')
 input_group.add_argument('-f', '--fasta', action='append', dest='fasta_files', required=True, help='REQUIRED. Name of fasta file. Can be used more than once if your sequences are spread over more than one file', metavar='FILENAME')
-input_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', required=True, help='REQUIRED. Name of tsv file of metadata about the input sequences. Can be used more than once if your metadata is spread over more than one file', metavar='FILENAME')
+meta_group = input_group.add_mutually_exclusive_group(required=True)
+meta_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', help='Name of tsv file of metadata about the input sequences. Can be used more than once if your metadata is spread over more than one file. Incompatible with --all_coding', metavar='FILENAME')
+meta_group.add_argument('--all_coding', choices=['yes', 'no'], help='Use this if you only have a fasta of presence absence sequences as input, and no metadata. Use "yes" if all sequences are coding, or "no" if they are all non-coding. Incompatible with -m/--metadata')
 
 cdhit_group = subparser_prepareref.add_argument_group('cd-hit options')
 cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
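
The practical effect of the change above is that exactly one of -m/--metadata and --all_coding must now be given. A minimal self-contained sketch of that argument layout (option names taken from the diff; help text, defaults and the rest of the prepareref interface omitted):

    import argparse

    parser = argparse.ArgumentParser(prog='ariba prepareref')
    parser.add_argument('-f', '--fasta', action='append', dest='fasta_files', required=True, metavar='FILENAME')
    meta_group = parser.add_mutually_exclusive_group(required=True)
    meta_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', metavar='FILENAME')
    meta_group.add_argument('--all_coding', choices=['yes', 'no'])

    # accepted:  -f refs.fa --all_coding yes
    # accepted:  -f refs.fa -m meta.tsv
    # rejected:  -f refs.fa -m meta.tsv --all_coding no   (mutually exclusive)
    # rejected:  -f refs.fa                               (one of the two is required)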
@@ -138,7 +141,8 @@ assembly_group.add_argument('--assembly_cov', type=int, help='Target read covera
 assembly_group.add_argument('--min_scaff_depth', type=int, help='Minimum number of read pairs needed as evidence for scaffold link between two contigs [%(default)s]', default=10, metavar='INT')
 
 other_group = subparser_run.add_argument_group('Other options')
-other_group.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+#other_group.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+other_group.add_argument('--threads', type=int, help=argparse.SUPPRESS, default=1, metavar='INT')
 other_group.add_argument('--assembled_threshold', type=float, help='If proportion of gene assembled (regardless of into how many contigs) is at least this value then the flag gene_assembled is set [%(default)s]', default=0.95, metavar='FLOAT (between 0 and 1)')
 other_group.add_argument('--gene_nt_extend', type=int, help='Max number of nucleotides to extend ends of gene matches to look for start/stop codons [%(default)s]', default=30, metavar='INT')
 other_group.add_argument('--unique_threshold', type=float, help='If proportion of bases in gene assembled more than once is <= this value, then the flag unique_contig is set [%(default)s]', default=0.03, metavar='FLOAT (between 0 and 1)')
@@ -149,7 +153,7 @@ subparser_run.set_defaults(func=ariba.tasks.run.run)
 
 
 #----------------------------- summary -------------------------------
-summary_presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'cluster_known_vars', 'all', 'all_no_filter']
+summary_presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'all', 'all_no_filter']
 subparser_summary = subparsers.add_parser(
     'summary',
     help='Summarise multiple reports made by "run"',
@@ -159,13 +163,16 @@ subparser_summary = subparsers.add_parser(
 )
 
 subparser_summary.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME')
-subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--known_vars,--novel_vars. Using this overrides those options', metavar='|'.join(summary_presets))
+subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--v_groups,--variants. Using this overrides those options', metavar='|'.join(summary_presets))
 subparser_summary.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, match, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='match', metavar='col1,col2,...')
 subparser_summary.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
-subparser_summary.add_argument('--het', action='store_true', help='For known noncoding SNPs, report if they are heterozygous or not, and the percent of reads supporting the variant type')
+subparser_summary.add_argument('--no_tree', action='store_true', help='Do not make phandango tree')
 subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
-subparser_summary.add_argument('--var_cols', help='Comma separated list of variant columns to include. Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='')
 subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
+subparser_summary.add_argument('--only_cluster', help='Only report data for the given cluster name', metavar='Cluster_name')
+subparser_summary.add_argument('--v_groups', action='store_true', help='Show a group column for each group of variants')
+subparser_summary.add_argument('--known_variants', action='store_true', help='Report all known variants')
+subparser_summary.add_argument('--novel_variants', action='store_true', help='Report all novel variants')
 subparser_summary.add_argument('--verbose', action='store_true', help='Be verbose')
 subparser_summary.add_argument('outprefix', help='Prefix of output files')
 subparser_summary.add_argument('infiles', nargs='*', help='Files to be summarised')
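
The new --v_groups, --known_variants and --novel_variants flags replace the old --var_cols/--het interface. A hedged sketch of how they presumably map onto the summary.Summary keyword arguments exercised by the tests earlier in this commit (the real wiring is in ariba/tasks/summary.py, which is changed in this commit but not shown here; the option-to-keyword mapping below is an assumption):

    from ariba import summary

    def make_summary(options, infiles):
        # Assumed mapping: --v_groups -> show_var_groups,
        # --known_variants -> show_known_vars, --novel_variants -> show_novel_vars.
        return summary.Summary(
            options.outprefix,
            filenames=infiles,
            cluster_cols=options.cluster_cols,
            show_var_groups=options.v_groups,
            show_known_vars=options.known_variants,
            show_novel_vars=options.novel_variants,
        )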
@@ -179,7 +186,8 @@ subparser_test = subparsers.add_parser(
     description='Run ARIBA on a small made up built-in test dataset'
 )
 
-subparser_test.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+#subparser_test.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+subparser_test.add_argument('--threads', type=int, help=argparse.SUPPRESS, default=1, metavar='INT')
 subparser_test.add_argument('outdir', help='Name of output directory')
 subparser_test.set_defaults(func=ariba.tasks.test.run)
 
diff --git a/setup.py b/setup.py
index 77ba935..0d421cf 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@ fermilite_mod = Extension(
 setup(
     ext_modules=[minimap_mod, fermilite_mod],
     name='ariba',
-    version='2.1.0',
+    version='2.2.0',
     description='ARIBA: Antibiotic Resistance Identification By Assembly',
     packages = find_packages(),
     package_data={'ariba': ['test_run_data/*']},

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ariba.git


