[med-svn] [ariba] 02/05: Imported Upstream version 1.0.0

Sascha Steinbiss sascha at steinbiss.name
Fri May 27 14:13:03 UTC 2016


This is an automated email from the git hooks/post-receive script.

sascha-guest pushed a commit to branch master
in repository ariba.

commit 194d86ce833cf91cc5f28d20fb88491dfdb537cd
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date:   Fri May 27 11:01:28 2016 +0000

    Imported Upstream version 1.0.0
---
 .travis.yml                                        |   23 +
 README.md                                          |   67 +-
 ariba/__init__.py                                  |   28 +-
 ariba/aln_to_metadata.py                           |  269 ++++
 ariba/assembly.py                                  |  377 ++++++
 ariba/assembly_compare.py                          |  371 ++++++
 ariba/assembly_variants.py                         |  323 +++++
 ariba/best_seq_chooser.py                          |   98 ++
 ariba/card_record.py                               |  111 ++
 ariba/cdhit.py                                     |  199 +--
 ariba/cluster.py                                   | 1319 +++++---------------
 ariba/clusters.py                                  |  674 ++++++----
 ariba/common.py                                    |   31 +-
 ariba/external_progs.py                            |  224 ++--
 ariba/faidx.py                                     |    5 +-
 ariba/flag.py                                      |    9 +-
 ariba/mapping.py                                   |  196 +--
 ariba/read_store.py                                |   60 +
 ariba/ref_genes_getter.py                          |  267 ++++
 ariba/ref_preparer.py                              |  189 +++
 ariba/refcheck.py                                  |   90 --
 ariba/reference_data.py                            |  510 ++++++++
 ariba/report.py                                    |  268 ++++
 ariba/report_filter.py                             |  216 ++++
 ariba/samtools_variants.py                         |  194 +++
 ariba/sequence_metadata.py                         |   48 +
 ariba/sequence_variant.py                          |   70 ++
 ariba/summary.py                                   |  480 ++++---
 ariba/summary_cluster.py                           |  258 ++++
 ariba/summary_sample.py                            |   61 +
 ariba/tasks/aln2meta.py                            |   28 +
 ariba/tasks/getref.py                              |   19 +
 ariba/tasks/prepareref.py                          |   59 +
 ariba/tasks/refcheck.py                            |   31 -
 ariba/tasks/reportfilter.py                        |   35 +
 ariba/tasks/run.py                                 |   90 +-
 ariba/tasks/summary.py                             |  108 +-
 ariba/tasks/test.py                                |   94 ++
 ariba/tasks/version.py                             |    6 +-
 ariba/test_run_data/metadata.tsv                   |   14 +
 ariba/test_run_data/non_coding.fa                  |   10 +
 ariba/test_run_data/presence_absence.fa            |    5 +
 ariba/test_run_data/reads_1.fq                     |  908 ++++++++++++++
 ariba/test_run_data/reads_2.fq                     |  908 ++++++++++++++
 .../test_run_data/ref_fasta_to_make_reads_from.fa  |   36 +
 ariba/test_run_data/variants_only.fa               |    6 +
 ariba/tests/aln_to_metadata_test.py                |  411 ++++++
 ariba/tests/assembly_compare_test.py               |  381 ++++++
 ariba/tests/assembly_test.py                       |  135 ++
 ariba/tests/assembly_variants_test.py              |  385 ++++++
 ariba/tests/best_seq_chooser_test.py               |   69 +
 ariba/tests/card_record_test.py                    |  205 +++
 ariba/tests/cdhit_test.py                          |  130 +-
 ariba/tests/cluster_test.py                        |  967 +++-----------
 ariba/tests/clusters_test.py                       |  199 ++-
 ariba/tests/common_test.py                         |   22 +
 .../tests/data/aln_to_metadata_load_aln_file.in.fa |    4 +
 .../data/aln_to_metadata_load_vars_file_bad.1.tsv  |    2 +
 .../data/aln_to_metadata_load_vars_file_bad.2.tsv  |    2 +
 .../data/aln_to_metadata_load_vars_file_good.tsv   |    3 +
 .../data/aln_to_metadata_make_cluster_file.out     |    1 +
 ariba/tests/data/aln_to_metadata_run_coding.in.fa  |   10 +
 ariba/tests/data/aln_to_metadata_run_coding.in.tsv |    2 +
 .../data/aln_to_metadata_run_coding.out.cluster    |    1 +
 ariba/tests/data/aln_to_metadata_run_coding.out.fa |   10 +
 .../tests/data/aln_to_metadata_run_coding.out.tsv  |    7 +
 .../tests/data/aln_to_metadata_run_noncoding.in.fa |   10 +
 .../data/aln_to_metadata_run_noncoding.in.tsv      |    2 +
 .../data/aln_to_metadata_run_noncoding.out.cluster |    1 +
 .../data/aln_to_metadata_run_noncoding.out.fa      |   10 +
 .../data/aln_to_metadata_run_noncoding.out.tsv     |    7 +
 ...ssembly_compare_parse_nucmer_coords_file.coords |    7 +
 ... assembly_test_assemble_with_spades_reads_1.fq} |    0
 ... assembly_test_assemble_with_spades_reads_2.fq} |    0
 ...a => assembly_test_assemble_with_spades_ref.fa} |    0
 .../assembly_test_check_spades_log_file.log.bad    |    5 +
 .../assembly_test_check_spades_log_file.log.good   |    5 +
 ... => assembly_test_fix_contig_orientation.in.fa} |    7 +
 ...=> assembly_test_fix_contig_orientation.out.fa} |    7 +
 ...=> assembly_test_fix_contig_orientation.ref.fa} |    0
 ...st_gapfill_with_gapfiller.scaffolds_no_gaps.fa} |    0
 ..._gapfill_with_gapfiller.scaffolds_with_gaps.fa} |    0
 ...ssembly_test_gapfill_with_gapfiller_reads_1.fq} |    0
 ...ssembly_test_gapfill_with_gapfiller_reads_2.fq} |    0
 .../assembly_test_has_gaps_to_fill.has_gaps.fa     |    2 +
 .../data/assembly_test_has_gaps_to_fill.no_gaps.fa |    4 +
 ...> assembly_test_parse_assembly_bam.assembly.fa} |    0
 ...am.bam => assembly_test_parse_assembly_bam.bam} |  Bin
 ....in.fa => assembly_test_rename_scaffolds.in.fa} |    0
 .../data/assembly_test_rename_scaffolds.out.fa     |    6 +
 ... assembly_test_scaffold_with_sspace_contigs.fa} |    0
 ... assembly_test_scaffold_with_sspace_reads_1.fq} |    0
 ... assembly_test_scaffold_with_sspace_reads_2.fq} |    0
 ... => assembly_test_set_assembly_kmer_reads_1.fq} |    0
 ... => assembly_test_set_assembly_kmer_reads_2.fq} |    0
 ...ly_variants_test_get_mummer_variants.none.snps} |    0
 ...bly_variants_test_get_mummer_variants.snp.snps} |    0
 ..._one_variant_for_one_contig_coding_metadata.tsv |   11 +
 ...riant_for_one_contig_coding_presence_absence.fa |    2 +
 ..._variant_for_one_contig_non_coding.metadata.tsv |   11 +
 ...sembly_variants_test_get_variants_non_coding.fa |    2 +
 ..._variants_test_get_variants_presence_absence.fa |    2 +
 ...ariants_test_get_variants_presence_absence.snps |    3 +
 ...bly_variants_test_get_variants_variants_only.fa |    2 +
 ...y_variants_test_get_variants_variants_only.snps |    3 +
 ...s_1.fq => best_seq_chooser_best_seq_reads_1.fq} |    0
 ...s_2.fq => best_seq_chooser_best_seq_reads_2.fq} |    0
 .../genes.fa => best_seq_chooser_best_seq_ref.fa}  |    0
 .../data/best_seq_chooser_best_seq_ref.fa.fai      |    3 +
 ...ser_get_best_seq_by_alignment_score_reads_1.fq} |    0
 ...ser_get_best_seq_by_alignment_score_reads_2.fq} |    0
 ...chooser_get_best_seq_by_alignment_score_ref.fa} |    0
 ...oser_get_best_seq_by_alignment_score_ref.fa.fai |    3 +
 ...t_seq_chooser_total_alignment_score_reads_1.fq} |    0
 ...t_seq_chooser_total_alignment_score_reads_2.fq} |    0
 ..._seq_chooser_total_alignment_score_ref_seqs.fa} |    0
 ...q_chooser_total_alignment_score_ref_seqs.fa.fai |    3 +
 ariba/tests/data/cdhit_test_enumerate_fasta.in.fa  |    6 -
 ariba/tests/data/cdhit_test_enumerate_fasta.out.fa |    6 -
 ariba/tests/data/cdhit_test_fake_run.out.fa        |    8 +-
 .../data/cdhit_test_load_user_clusters_file.bad1   |    1 +
 .../data/cdhit_test_load_user_clusters_file.bad2   |    2 +
 .../data/cdhit_test_load_user_clusters_file.bad3   |    2 +
 .../data/cdhit_test_load_user_clusters_file.good   |    3 +
 .../data/cdhit_test_parse_cluster_info_file.in.fa  |   40 -
 ...dhit_test_parse_cluster_info_file.in.renamed.fa |   40 -
 .../data/cdhit_test_parse_cluster_info_file.infile |    6 +
 .../data/cdhit_test_parse_cluster_info_file.out.fa |   20 -
 ...t_test_parse_cluster_info_file.out.fa.bak.clstr |    4 -
 .../data/cdhit_test_rename_clusters.expected.fa    |    6 +
 ariba/tests/data/cdhit_test_rename_clusters.in.fa  |    6 +
 ariba/tests/data/cdhit_test_rename_fasta.in.fa     |    6 -
 ariba/tests/data/cdhit_test_run.out.fa             |    4 +-
 ...hit_test_run_get_clusters_from_dict.in.clusters |    2 +
 ...=> cdhit_test_run_get_clusters_from_dict.in.fa} |    6 +-
 .../cdhit_test_run_get_clusters_from_dict.out.fa   |    4 +
 .../cluster_test_assemble_with_spades/genes.fa     |    2 -
 .../data/cluster_test_choose_best_gene.gene.fa     |    6 -
 ariba/tests/data/cluster_test_count_reads_1.fq     |    8 +
 ariba/tests/data/cluster_test_count_reads_2.fq     |    8 +
 .../cluster_test_fix_contig_orientation/genes.fa   |   10 -
 ...luster_test_full_run_assembly_fail.noncoding.fa |    8 +
 .../cluster_test_full_run_assembly_fail/reads_1.fq |    8 +
 .../cluster_test_full_run_assembly_fail/reads_2.fq |    8 +
 .../references.fa                                  |    8 +
 ...st_full_run_choose_ref_fail.presence_absence.fa |    2 +
 .../reads_1.fq                                     |    4 +
 .../reads_2.fq                                     |    4 +
 .../references.fa                                  |    4 +
 .../data/cluster_test_full_run_ok_non_coding.fa    |    6 +
 ...luster_test_full_run_ok_non_coding.metadata.tsv |    5 +
 .../cluster_test_full_run_ok_non_coding/reads_1.fq |  144 +++
 .../cluster_test_full_run_ok_non_coding/reads_2.fq |  144 +++
 .../references.fa                                  |    6 +
 .../cluster_test_full_run_ok_presence_absence.fa   |    5 +
 ..._test_full_run_ok_presence_absence.metadata.tsv |    4 +
 .../reads_1.fq                                     |  128 ++
 .../reads_2.fq                                     |  128 ++
 .../references.fa                                  |    5 +
 .../data/cluster_test_full_run_ok_variants_only.fa |    5 +
 ...nts_only.not_present.always_report.metadata.tsv |    2 +
 ...l_run_ok_variants_only.not_present.metadata.tsv |    2 +
 ..._full_run_ok_variants_only.present.metadata.tsv |    3 +
 .../reads_1.fq                                     |  132 ++
 .../reads_2.fq                                     |  132 ++
 .../references.fa                                  |    5 +
 .../cluster_test_gapfill_with_gapfiller.gene.fa    |    2 -
 .../cluster_test_gapfill_with_gapfiller/genes.fa   |    2 -
 ariba/tests/data/cluster_test_generic/genes.fa     |    2 -
 ariba/tests/data/cluster_test_generic/reads_1.fq   |    4 -
 ariba/tests/data/cluster_test_generic/reads_2.fq   |    4 -
 .../data/cluster_test_get_read_counts/genes.fa     |    2 -
 .../data/cluster_test_get_read_counts/reads_1.fq   |    4 -
 .../data/cluster_test_get_read_counts/reads_2.fq   |    4 -
 .../cluster_test_get_read_counts_fail/genes.fa     |    2 -
 .../cluster_test_get_read_counts_fail/reads_1.fq   |    8 -
 .../cluster_test_get_read_counts_fail/reads_2.fq   |    4 -
 .../reads_1.fq                                     |    0
 .../reads_2.fq                                     |    0
 ariba/tests/data/cluster_test_init_refdata.fa      |    2 +
 .../cluster_test_load_final_contigs.contigs.fa     |    6 -
 .../data/cluster_test_load_final_contigs/genes.fa  |    2 -
 .../cluster_test_load_final_contigs/reads_1.fq     |    4 -
 .../cluster_test_load_final_contigs/reads_2.fq     |    4 -
 .../cluster_test_make_reads_for_assembly.in1.fq    |   40 +
 .../cluster_test_make_reads_for_assembly.in2.fq    |   40 +
 .../cluster_test_make_reads_for_assembly.out1.fq   |   28 +
 .../cluster_test_make_reads_for_assembly.out2.fq   |   28 +
 .../cluster_test_make_report_lines.read_depths.gz  |  Bin 78 -> 0 bytes
 ...uster_test_make_report_lines.read_depths.gz.tbi |  Bin 91 -> 0 bytes
 ...its_to_assembled_gene_sequences.expected.out.fa |    7 -
 ...luster_test_number_of_reads_for_assembly.ref.fa |    3 +
 .../data/cluster_test_parse_assembly_bam/genes.fa  |    2 -
 .../cluster_test_parse_assembly_bam/reads_1.fq     |    4 -
 .../cluster_test_parse_assembly_bam/reads_2.fq     |    4 -
 ...uster_test_parse_assembly_vs_gene_coords.coords |    7 -
 .../genes.fa                                       |    2 -
 .../reads_1.fq                                     |    4 -
 .../reads_2.fq                                     |    4 -
 .../data/cluster_test_rename_scaffolds.out.fa      |    6 -
 .../data/cluster_test_rename_scaffolds/genes.fa    |    2 -
 .../data/cluster_test_rename_scaffolds/reads_1.fq  |    4 -
 .../data/cluster_test_rename_scaffolds/reads_2.fq  |    4 -
 .../data/cluster_test_scaffold_with_sspace.gene.fa |    2 -
 .../cluster_test_scaffold_with_sspace/genes.fa     |    2 -
 .../data/cluster_test_set_assembly_kmer/genes.fa   |    2 -
 .../data/cluster_test_set_assembly_kmer/reads_1.fq |   12 -
 .../data/cluster_test_set_assembly_kmer/reads_2.fq |   12 -
 ...usters_test_bam_to_clusters.out.ref2.reads_1.fq |    4 -
 ...usters_test_bam_to_clusters.out.ref2.reads_2.fq |    4 -
 .../clusters_test_bam_to_clusters_reads.db.fa.fai  |    3 +
 ...usters_test_bam_to_clusters_reads.read_store.gz |  Bin 0 -> 213 bytes
 ...ers_test_bam_to_clusters_reads_no_reads_map.bam |  Bin 0 -> 364 bytes
 ...s_test_bam_to_clusters_reads_no_reads_map_1.fq} |    4 +-
 ...s_test_bam_to_clusters_reads_no_reads_map_2.fq} |    4 +-
 ariba/tests/data/clusters_test_dummy_db.fa         |    2 +
 ariba/tests/data/clusters_test_load_data_info_file |    5 +
 .../cdhit.clusters.pickle                          |  Bin 0 -> 34 bytes
 .../info.txt                                       |    5 +
 .../refcheck.01.check_variants.non_coding.fa       |    2 +
 .../refcheck.01.check_variants.presence_absence.fa |    2 +
 .../refcheck.01.check_variants.tsv                 |    2 +
 .../refcheck.01.check_variants.variants_only.fa    |    2 +
 ...te_catted_assembled_genes_fasta.expected.out.fa |   10 +-
 ..._write_catted_assembled_genes_fasta.in.gene1.fa |    4 -
 ..._write_catted_assembled_genes_fasta.in.gene2.fa |    2 -
 ...atted_genes_matching_refs_fasta.expected.out.fa |    4 +
 ariba/tests/data/clusters_test_write_report.tsv    |    6 +-
 ...ing_test_bowtie2_remove_both_unmapped_reads.bam |  Bin 0 -> 422 bytes
 ...ng_test_bowtie2_remove_both_unmapped_reads_1.fq |   16 +
 ...ng_test_bowtie2_remove_both_unmapped_reads_2.fq |   16 +
 ...ert.bam => mapping_test_sam_pair_to_insert.bam} |  Bin
 ..._to_fastq.bam => mapping_test_sam_to_fastq.bam} |  Bin
 ariba/tests/data/mapping_test_smalt_reads_1.fq     |   28 -
 ariba/tests/data/mapping_test_smalt_reads_2.fq     |   28 -
 ariba/tests/data/mapping_test_smalt_ref.fa         |    5 -
 ariba/tests/data/mapping_test_smalt_ref.fa.fai     |    1 -
 ariba/tests/data/mapping_test_smalt_sorted.bam     |  Bin 648 -> 0 bytes
 ariba/tests/data/mapping_test_smalt_unsorted.bam   |  Bin 656 -> 0 bytes
 ariba/tests/data/read_store_test_clean.in          |   10 +
 .../read_store_test_compress_and_index_file.in     |   10 +
 ariba/tests/data/read_store_test_get_reads.in      |   10 +
 .../data/read_store_test_get_reads.reads_1.fq      |   12 +
 .../data/read_store_test_get_reads.reads_2.fq      |   12 +
 ariba/tests/data/read_store_test_sort_file.in      |   10 +
 ariba/tests/data/read_store_test_sort_file.out     |   10 +
 .../data/ref_preparer_test_write_info_file.out     |    5 +
 ...ence_data_filter_bad_data_metadata.expected.tsv |    9 +
 .../reference_data_filter_bad_data_metadata.in.tsv |   20 +
 ...nce_data_filter_bad_data_non_coding.expected.fa |    2 +
 ...reference_data_filter_bad_data_non_coding.in.fa |    2 +
 ...ta_filter_bad_data_presence_absence.expected.fa |    6 +
 ...nce_data_filter_bad_data_presence_absence.in.fa |    6 +
 ..._data_filter_bad_data_variants_only.expected.fa |    2 +
 ...erence_data_filter_bad_data_variants_only.in.fa |    6 +
 ariba/tests/data/reference_data_get_filename       |    1 +
 ariba/tests/data/reference_data_init.tsv           |    4 +
 .../reads_1.fq => reference_data_init_empty.fa}    |    0
 .../data/reference_data_init_presence_absence.fa   |    4 +
 .../data/reference_data_keep_seqs_from_dict.fa     |    2 +
 .../data/reference_data_keep_seqs_from_dict.log    |    1 +
 ariba/tests/data/reference_data_load_fasta_file.fa |    2 +
 .../data/reference_data_load_metadata_tsv.tsv      |    3 +
 .../reference_data_make_catted_fasta.expected.fa   |    6 +
 .../reference_data_make_catted_fasta.noncoding.fa  |    2 +
 ...ence_data_make_catted_fasta.presence_absence.fa |    2 +
 ...ference_data_make_catted_fasta.variants_only.fa |    2 +
 .../data/reference_data_remove_bad_genes.in.fa     |   10 +
 .../reference_data_rename_sequences.noncoding.fa   |    4 +
 ...rence_data_rename_sequences.presence_absence.fa |   10 +
 ...eference_data_rename_sequences.variants_only.fa |    8 +
 .../reference_data_rename_sequences_metadata.tsv   |   11 +
 .../reference_data_sequence.presence_absence.fa    |    2 +
 ...erence_data_sequence_length.presence_absence.fa |    2 +
 .../data/reference_data_sequence_type.noncoding.fa |    2 +
 ...eference_data_sequence_type.presence_absence.fa |    2 +
 .../reference_data_sequence_type.variants_only.fa  |    2 +
 ...est_all_non_wild_type_variants.ref.noncoding.fa |    2 +
 ...test_all_non_wild_type_variants.ref.pres_abs.fa |    2 +
 ...test_all_non_wild_type_variants.ref.var_only.fa |    2 +
 ...erence_data_test_all_non_wild_type_variants.tsv |   12 +
 ...rence_data_test_cluster_with_cdhit.clusters.tsv |    3 +
 ..._cluster_with_cdhit.expected_representatives.fa |   10 +
 ...ence_data_test_cluster_with_cdhit.non_coding.fa |    5 +
 ...ata_test_cluster_with_cdhit.presence_absence.fa |   10 +
 ...st_cluster_with_cdhit_clusters_in_file.clusters |    4 +
 ...luster_with_cdhit_clusters_in_file.clusters.tsv |    4 +
 ...it_clusters_in_file.expected_representatives.fa |   16 +
 ...uster_with_cdhit_clusters_in_file.non_coding.fa |   10 +
 ...with_cdhit_clusters_in_file.presence_absence.fa |   10 +
 ..._test_cluster_with_cdhit_nocluster.clusters.tsv |    6 +
 ...ith_cdhit_nocluster.expected_representatives.fa |   20 +
 ...test_cluster_with_cdhit_nocluster.non_coding.fa |   10 +
 ...luster_with_cdhit_nocluster.presence_absence.fa |   10 +
 .../data/reference_data_test_remove_bad_genes.log  |    5 +
 .../data/reference_data_test_rename_sequences.out  |    8 +
 ...ata_test_write_cluster_allocation_file.expected |    4 +
 ...rence_data_test_write_seqs_to_fasta.expected.fa |    6 +
 .../reference_data_test_write_seqs_to_fasta.in.fa  |   10 +
 ...a => reference_data_write_dict_of_sequences.fa} |    6 +-
 .../reference_data_write_metadata_tsv.expected.tsv |    2 +
 .../data/reference_data_write_metadata_tsv.tsv     |    2 +
 ...nce_data_write_metadata_tsv_presence_absence.fa |    4 +
 ariba/tests/data/report_filter_test_init_bad.tsv   |    4 +
 ariba/tests/data/report_filter_test_init_good.tsv  |    5 +
 .../data/report_filter_test_load_report_bad.tsv    |    4 +
 .../data/report_filter_test_load_report_good.tsv   |    5 +
 .../tests/data/report_filter_test_run.expected.tsv |    6 +
 ariba/tests/data/report_filter_test_run.in.tsv     |    9 +
 .../tests/data/report_filter_test_write_report.tsv |    4 +
 ...mtools_variants_test_get_depths_at_position.bam |  Bin 0 -> 4744 bytes
 ...ols_variants_test_get_depths_at_position.ref.fa |   18 +
 ...variants_test_get_depths_at_position.ref.fa.fai |    1 +
 ...z => samtools_variants_test_get_read_depths.gz} |  Bin
 ... samtools_variants_test_get_read_depths.gz.tbi} |  Bin
 ...riants_test_get_variant_positions_from_vcf.vcf} |    0
 ...ools_variants_test_get_variants.read_depths.gz} |  Bin
 ..._variants_test_get_variants.read_depths.gz.tbi} |  Bin
 ...vcf => samtools_variants_test_get_variants.vcf} |    0
 ...est_make_vcf_and_read_depths_files.assembly.fa} |    0
 ...make_vcf_and_read_depths_files.assembly.fa.fai} |    0
 ...riants_test_make_vcf_and_read_depths_files.bam} |  Bin
 ..._and_read_depths_files.expected.read_depths.gz} |  Bin
 ..._read_depths_files.expected.read_depths.gz.tbi} |  Bin
 ...st_make_vcf_and_read_depths_files.expected.vcf} |    0
 .../samtools_variants_test_total_depth_per_contig  |    7 +
 ... samtools_variants_test_variants_in_coords.vcf} |    0
 .../summary_sample_test_column_names_tuples.tsv    |    8 +
 .../summary_sample_test_column_summary_data.tsv    |    8 +
 .../data/summary_sample_test_load_file.in.tsv      |    7 +
 .../summary_sample_test_non_synon_variants.tsv     |    8 +
 .../tests/data/summary_sample_test_var_groups.tsv  |    7 +
 .../data/summary_test_gather_output_rows.in.1.tsv  |    7 +-
 .../data/summary_test_gather_output_rows.in.2.tsv  |    8 +-
 .../data/summary_test_get_all_cluster_names.1.tsv  |    3 +
 .../data/summary_test_get_all_cluster_names.2.tsv  |    5 +
 .../data/summary_test_get_all_var_groups.1.tsv     |    3 +
 .../data/summary_test_get_all_var_groups.2.tsv     |    5 +
 ariba/tests/data/summary_test_load_file.in.tsv     |    5 -
 .../tests/data/summary_test_load_input_files.1.tsv |    3 +
 .../tests/data/summary_test_load_input_files.2.tsv |    5 +
 .../summary_test_newick_from_dist_matrix.distances |    4 +
 .../data/summary_test_newick_from_dist_matrix.tre  |    1 +
 .../summary_test_write_distance_matrix.distances   |    4 +
 ariba/tests/data/summary_test_write_tsv.out.tsv    |    3 -
 ariba/tests/data/test_common_cat_files.in.1        |    2 +
 ariba/tests/data/test_common_cat_files.in.2        |    3 +
 ariba/tests/data/test_common_cat_files.in.3        |    1 +
 ariba/tests/data/test_common_cat_files.out         |    6 +
 ariba/tests/data/vfdb_parser_test_run.in.fa        |    6 +
 ariba/tests/data/vfdb_parser_test_run.out.fa       |    6 +
 ariba/tests/data/vfdb_parser_test_run.out.tsv      |    2 +
 ariba/tests/external_progs_test.py                 |    9 +
 ariba/tests/faidx_test.py                          |    5 +-
 ariba/tests/flag_test.py                           |   11 +-
 ariba/tests/mapping_test.py                        |  134 +-
 ariba/tests/read_store_test.py                     |   80 ++
 ariba/tests/ref_preparer_test.py                   |  121 ++
 ariba/tests/refcheck_test.py                       |   69 -
 ariba/tests/reference_data_test.py                 |  678 ++++++++++
 ariba/tests/report_filter_test.py                  |  335 +++++
 ariba/tests/samtools_variants_test.py              |  173 +++
 ariba/tests/sequence_metadata_test.py              |   89 ++
 ariba/tests/sequence_variant_test.py               |  100 ++
 ariba/tests/summary_cluster_test.py                |  424 +++++++
 ariba/tests/summary_sample_test.py                 |  104 ++
 ariba/tests/summary_test.py                        |  503 ++++++--
 ariba/tests/versions_test.py                       |    8 +
 ariba/tests/vfdb_parser_test.py                    |   60 +
 ariba/versions.py                                  |   66 +
 ariba/vfdb_parser.py                               |   46 +
 install_dependencies.sh                            |  115 ++
 scripts/ariba                                      |   14 +-
 setup.py                                           |   11 +-
 374 files changed, 14344 insertions(+), 3389 deletions(-)

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..d2e1f1a
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,23 @@
+language: python
+addons:
+  apt:
+    packages:
+    - zlib1g-dev
+    - libblas-dev
+    - liblapack-dev
+    - libgfortran3
+    - libncurses5-dev
+    - r-base
+    - r-base-dev
+    - r-base-core
+cache:
+  directories:
+  - "build"
+  - "$HOME/.cache/pip"
+python:
+  - "3.4"
+sudo: false
+install:
+  - "source ./install_dependencies.sh"
+script:
+  - "python setup.py test"
diff --git a/README.md b/README.md
index 481f873..697c2ac 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ ARIBA
 
 Antibiotic Resistance Identification By Assembly
 
-For how to use ARIBA, please see the [ARIBA wiki page] [ARIBA wiki].
+For how to use ARIBA, please see the [ARIBA wiki page][ARIBA wiki].
 
 
 
@@ -11,18 +11,22 @@ Installation
 ------------
 
 ARIBA has the following dependencies, which need to be installed:
-  * [bowtie2] [Bowtie2] version >= 2.1.0
-  * [cd-hit] [cdhit] version >= 4.6
-  * [samtools and bcftools] [samtools]  version >= 1.2
-  * [MUMmer] [mummer] version >= 3.23
-  * Either [SPAdes] [spades] version >= 3.5.0 or [Velvet] [velvet] version >= 1.2.07
-    (SPAdes is recommended)
+  * [Python3][python] version >= 3.4
+  * [R][r] version >= 2.14.0
+  * The R package [ape][ape] version >= 3.1
+  * [Bowtie2][bowtie2] version >= 2.1.0
+  * [CD-HIT][cdhit] version >= 4.6
+  * [Samtools and BCFtools][samtools] version >= 1.2
+  * [MUMmer][mummer] version >= 3.23
+  * [SPAdes][spades] version >= 3.5.0
+  * [Python2][python] version >= 2.7 (SPAdes needs Python 2)
+
 
 ARIBA has the following optional dependencies. If they are installed,
 they will be used. Otherwise scaffolding and gap filling will be
 skipped.
-  * [SSPACE-basic scaffolder] [sspace]
-  * [GapFiller] [gapfiller]
+  * [SSPACE-basic scaffolder][sspace]
+  * [GapFiller][gapfiller]
 
 Once the dependencies are installed, install ARIBA using pip:
 
@@ -38,10 +42,49 @@ If the tests all pass, install:
     python3 setup.py install
 
 
+### Dependencies and environment variables
+
+By default, ARIBA will look for the dependencies in your `$PATH`, using
+the names in the table below. This behaviour can be overridden to
+point ARIBA at a specific program using environment variables.
+The environment variable is checked first and is used if it is set.
+Otherwise ARIBA looks in your `$PATH` for the default name. This applies
+to the following dependencies.
+
+| Dependency     |  Default               | Environment variable name |
+|----------------|------------------------|---------------------------|
+| BCFtools       | `bcftools`             | `$ARIBA_BCFTOOLS`         |
+| Bowtie2        | `bowtie2`              | `$ARIBA_BOWTIE2`          |
+| CD-HIT         | `cd-hit-est`           | `$ARIBA_CDHIT`            |
+| GapFiller      | `GapFiller.pl`         | `$ARIBA_GAPFILLER`        |
+| R              | `Rscript`              | `$ARIBA_R`                |
+| Samtools       | `samtools`             | `$ARIBA_SAMTOOLS`         |
+| SPAdes         | `spades.py`            | `$ARIBA_SPADES`           |
+| SSPACE         | `SSPACE_Basic_v2.0.pl` | `$ARIBA_SSPACE`           |
+
+
+For example, you could specify an exact version of Samtools like this
+(assuming a bash shell):
+
+    export ARIBA_SAMTOOLS=/path/to/samtools
+
+The path need not be absolute. ARIBA looks for the value of the variable
+in your `$PATH`. For example, suppose you have `samtools-0.1.19` and
+`samtools-1.3` installed. You could use this:
+
+    export ARIBA_SAMTOOLS=samtools-1.3
+
+
+
+
 Usage
 -----
 
-Please read the [ARIBA wiki page] [ARIBA wiki] for usage instructions.
+Please read the [ARIBA wiki page][ARIBA wiki] for usage instructions.
+
+
+
+Build status: [![Build Status](https://travis-ci.org/sanger-pathogens/ariba.svg?branch=master)](https://travis-ci.org/sanger-pathogens/ariba)
 
 
   [bowtie2]: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
@@ -52,6 +95,8 @@ Please read the [ARIBA wiki page] [ARIBA wiki] for usage instructions.
   [samtools]: http://www.htslib.org/
   [spades]: http://bioinf.spbau.ru/spades
   [sspace]: http://www.baseclear.com/genomics/bioinformatics/basetools/SSPACE
-  [velvet]: http://www.ebi.ac.uk/~zerbino/velvet/
+  [ape]: https://cran.r-project.org/web/packages/ape/index.html
+  [r]: https://www.r-project.org/
+  [python]: https://www.python.org/
 
 
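As a sketch of the dependency lookup order the README describes (environment variable first, then `$PATH`), assuming `shutil.which`-based resolution; the function name below is illustrative and not necessarily how ARIBA's `external_progs.py` (not shown in full here) implements it:

    import os
    import shutil

    def find_executable(default_name, env_var):
        # Use the environment variable if it is set; its value may itself be
        # a program name that is searched for in $PATH, so it need not be an
        # absolute path.
        name = os.environ.get(env_var, default_name)
        path = shutil.which(name)
        if path is None:
            raise RuntimeError(name + ' not found in $PATH')
        return path

    # e.g. find_executable('samtools', 'ARIBA_SAMTOOLS')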
diff --git a/ariba/__init__.py b/ariba/__init__.py
index 4446c00..7c85be2 100644
--- a/ariba/__init__.py
+++ b/ariba/__init__.py
@@ -1,5 +1,19 @@
+from pkg_resources import get_distribution
+
+try:
+    __version__ = get_distribution('ariba').version
+except:
+    __version__ = 'local'
+
+
 __all__ = [
+    'aln_to_metadata',
+    'assembly',
+    'assembly_compare',
+    'assembly_variants',
     'bam_parse',
+    'best_seq_chooser',
+    'card_record',
     'cdhit',
     'cluster',
     'clusters',
@@ -10,10 +24,22 @@ __all__ = [
     'histogram',
     'link',
     'mapping',
-    'refcheck',
+    'read_store',
+    'reference_data',
+    'ref_genes_getter',
+    'ref_preparer',
+    'report',
+    'report_filter',
     'scaffold_graph',
+    'samtools_variants',
+    'sequence_metadata',
+    'sequence_variant',
     'summary',
+    'summary_cluster',
+    'summary_sample',
     'tasks',
+    'versions',
+    'vfdb_parser',
 ]
 
 from ariba import *
diff --git a/ariba/aln_to_metadata.py b/ariba/aln_to_metadata.py
new file mode 100644
index 0000000..555b69d
--- /dev/null
+++ b/ariba/aln_to_metadata.py
@@ -0,0 +1,269 @@
+import os
+import re
+import sys
+import shutil
+import pyfastaq
+from ariba import sequence_variant
+
+class Error (Exception): pass
+
+class AlnToMetadata:
+    def __init__(self,
+      aln_file,
+      vars_file,
+      refs_are_coding,
+      cluster_rep_name,
+      genetic_code=11,
+    ):
+        self.padded_seqs = AlnToMetadata._load_aln_file(aln_file)
+        self.refs_are_coding = refs_are_coding
+        self.variants = AlnToMetadata._load_vars_file(vars_file, self.refs_are_coding)
+        self.genetic_code = genetic_code
+        self.cluster_rep_name = cluster_rep_name
+
+
+    @classmethod
+    def _load_aln_file(cls, aln_file):
+        seqs = {}
+        pyfastaq.tasks.file_to_dict(aln_file, seqs)
+        return seqs
+
+
+    @classmethod
+    def _load_vars_file(cls, vars_file, refs_are_coding):
+        var_type = 'p' if refs_are_coding else 'n'
+        f = pyfastaq.utils.open_file_read(vars_file)
+        variants = {}
+
+        for line in f:
+            try:
+                ref_name, variant, identifier, description = line.rstrip().split('\t')
+                variant = sequence_variant.Variant(var_type, variant, identifier)
+            except:
+                pyfastaq.utils.close(f)
+                raise Error('Error in this line of variants file:\n' + line)
+
+            if ref_name not in variants:
+                variants[ref_name] = []
+
+            variants[ref_name].append((variant, description))
+
+        pyfastaq.utils.close(f)
+        return variants
+
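+    # Editorial note (not upstream code): _load_vars_file expects one variant
+    # per line, with four tab-separated columns: ref_name, variant, identifier,
+    # description. With refs_are_coding=True the variant is parsed as a protein
+    # change (type 'p'), otherwise as a nucleotide change (type 'n'). An
+    # illustrative line (\t = tab):
+    #   seq1\tA42T\tid1\tconfers resistance to drug X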
+
+    @classmethod
+    def _make_unpadded_seqs(cls, padded_seqs):
+        unpadded_seqs = {}
+        for seq in padded_seqs.values():
+            unpadded_seqs[seq.id] = pyfastaq.sequences.Fasta(seq.id, seq.seq.replace('-', ''))
+        return unpadded_seqs
+
+
+    @classmethod
+    def _check_seq_lengths_same(cls, seqs):
+        sequence_lengths = set([len(x) for x in seqs.values()])
+        if len(sequence_lengths) > 1:
+            raise Error('Input sequences must all be the same length. Cannot continue. Lengths found: ' + ','.join([str(x) for x in sequence_lengths]))
+        return len(sequence_lengths) == 1
+
+
+    @classmethod
+    def _insertion_coords(cls, sequence):
+        insertions = []
+        regex = re.compile('-+')
+        for m in regex.finditer(sequence.seq):
+             insertions.append(pyfastaq.intervals.Interval(m.span()[0], m.span()[1] - 1))
+        return insertions
+
+
+    @classmethod
+    def _make_unpadded_insertion_coords(cls, unpadded_sequences):
+        return {x.id: AlnToMetadata._insertion_coords(x) for x in unpadded_sequences.values()}
+
+
+    @classmethod
+    def _check_insertion_coords(cls, sequence):
+        insertions = AlnToMetadata._insertion_coords(sequence)
+        for coords in insertions:
+            if coords.start % 3 !=0:
+                raise Error('Insertion does not start in frame in sequence "' + sequence.id + '". Cannot continue')
+            elif len(coords) % 3 != 0:
+                raise Error('Insertion of length not a mulitple of 3 in sequence "' + sequence.id + '". Cannot continue')
+
+        return True
+
+
+    @classmethod
+    def _check_coding_seq(cls, sequence, genetic_code=11):
+        if len(sequence) % 3 != 0:
+            raise Error('Length of sequence ' + sequence.id + ' is ' + str(len(sequence)) + ', which is not a multiple of 3. Cannot continue')
+
+        original_code = pyfastaq.sequences.genetic_code
+        pyfastaq.sequences.genetic_code = genetic_code
+        protein_seq = sequence.translate()
+        start_ok = sequence.seq[0:3].upper() in pyfastaq.genetic_codes.starts[genetic_code]
+        pyfastaq.sequences.genetic_code = original_code
+
+        if not start_ok:
+            raise Error('Sequence "' + sequence.id + '" does not start with a start codon. Cannot continue')
+        elif protein_seq[-1] != '*':
+            raise Error('Sequence "' + sequence.id + '" does not end with a stop codon. Cannot continue')
+        elif '*' in protein_seq[:-1]:
+            raise Error('Sequence "' + sequence.id + '" has an internal stop codon. Cannot continue')
+
+        return True
+
+
+    @classmethod
+    def _check_sequences(cls, padded_sequences, unpadded_sequences, seqs_are_coding, genetic_code=11):
+        AlnToMetadata._check_seq_lengths_same(padded_sequences)
+
+        if seqs_are_coding:
+            for sequence in unpadded_sequences.values():
+                AlnToMetadata._check_insertion_coords(sequence)
+                AlnToMetadata._check_coding_seq(sequence, genetic_code=genetic_code)
+
+        return True
+
+
+    @classmethod
+    def _check_variants_match_sequences(cls, unpadded_sequences, variants, seqs_are_coding, genetic_code=11):
+        original_code = pyfastaq.sequences.genetic_code
+        pyfastaq.sequences.genetic_code = genetic_code
+        for seqname, variant_list in variants.items():
+            if seqname not in unpadded_sequences:
+                pyfastaq.sequences.genetic_code = original_code
+                raise Error('Sequence name "' + seqname + '" given in variants file, but sequence not found')
+            for variant, description in variant_list:
+                if not variant.sanity_check_against_seq(unpadded_sequences[seqname], translate_seq=seqs_are_coding):
+                    pyfastaq.sequences.genetic_code = original_code
+                    raise Error('Variant "' + str(variant) + '" for sequence "' + seqname + '" does not match sequence. Cannot continue')
+
+        pyfastaq.sequences.genetic_code = original_code
+        return True
+
+
+    @classmethod
+    def _variant_ids_are_unique(cls, variants):
+        seen_variants = set()
+        for variants_list in variants.values():
+            for variant, description in variants_list:
+                if variant.identifier in seen_variants:
+                    raise Error('Variant identifier "' + variant.identifier + '" found more than once. Cannot continue')
+                else:
+                    seen_variants.add(variant.identifier)
+
+        return True
+
+
+    @classmethod
+    def _unpadded_to_padded_nt_position(cls, position, insertions):
+        if len(insertions) == 0:
+            return position
+
+        i = 0
+        while i < len(insertions) and insertions[i].start <= position:
+            position += len(insertions[i])
+            i += 1
+
+        return position
+
+
+    @classmethod
+    def _padded_to_unpadded_nt_position(cls, position, insertions):
+        if len(insertions) == 0:
+            return position
+
+        i = 0
+        total_gap_length = 0
+        while i < len(insertions) and insertions[i].end < position:
+            total_gap_length += len(insertions[i])
+            i += 1
+
+        if i < len(insertions) and insertions[i].distance_to_point(position) == 0:
+            return None
+        else:
+            return position - total_gap_length
+
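+    # Worked example (editorial, not upstream code). With one alignment gap run
+    # spanning padded positions 3..5, i.e. insertions = [Interval(3, 5)]:
+    #   _unpadded_to_padded_nt_position(3, insertions) returns 6
+    #     (unpadded position 3 is shifted right by the 3-base insertion);
+    #   _padded_to_unpadded_nt_position(4, insertions) returns None
+    #     (padded position 4 falls inside the gap, so has no unpadded equivalent);
+    #   _padded_to_unpadded_nt_position(6, insertions) returns 6 - 3 = 3.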
+
+    @classmethod
+    def _variants_to_tsv_lines(cls, variants, unpadded_sequences, padded_sequences, insertions, seqs_are_coding):
+        if seqs_are_coding:
+            unpadded_aa_sequences = {x: unpadded_sequences[x].translate() for x in unpadded_sequences}
+
+        lines = []
+        for refname in sorted(variants):
+            for variant, description in variants[refname]:
+                if seqs_are_coding:
+                    ref_unpadded_nt_position = 3 * variant.position
+                else:
+                    ref_unpadded_nt_position = variant.position
+
+                padded_nt_position = AlnToMetadata._unpadded_to_padded_nt_position(ref_unpadded_nt_position, insertions[refname])
+                lines.append('\t'.join([refname, variant.variant_type, str(variant), variant.identifier, description]))
+
+                for seqname, seq in sorted(padded_sequences.items()):
+                    if seqname == refname:
+                        continue
+
+                    if seq[padded_nt_position] == '-':
+                        print('Warning: position has a gap in sequence', seqname, 'corresponding to variant', variant, '(' + variant.identifier + ') in sequence', refname, '... Ignoring for ' + seqname, file=sys.stderr)
+                        continue
+
+                    unpadded_nt_position = AlnToMetadata._padded_to_unpadded_nt_position(padded_nt_position, insertions[seqname])
+                    assert unpadded_nt_position is not None
+
+                    if seqs_are_coding:
+                        assert unpadded_nt_position % 3 == 0
+                        unpadded_aa_position = unpadded_nt_position // 3
+                        pos_string = str(unpadded_aa_position)
+                        if unpadded_aa_sequences[seqname][unpadded_aa_position] in {variant.wild_value, variant.variant_value}:
+                            variant_string = variant.wild_value
+                        else:
+                            variant_string = unpadded_aa_sequences[seqname][unpadded_aa_position]
+                        variant_string += str(unpadded_aa_position + 1) + variant.variant_value
+                    else:
+                        pos_string = str(unpadded_nt_position)
+                        if unpadded_sequences[seqname][unpadded_nt_position] in {variant.wild_value, variant.variant_value}:
+                            variant_string = variant.wild_value
+                        else:
+                            variant_string = unpadded_sequences[seqname][unpadded_nt_position]
+                        variant_string += str(unpadded_nt_position + 1) + variant.variant_value
+
+                    lines.append('\t'.join([seqname, variant.variant_type, variant_string, variant.identifier, description]))
+
+        return lines
+
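+    # Editorial note: each line emitted by _variants_to_tsv_lines has five
+    # tab-separated columns: sequence name, variant type ('p' or 'n'), variant,
+    # identifier, description. An illustrative line (\t = tab):
+    #   seq2\tp\tA43T\tid1\tconfers resistance to drug X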
+
+    @classmethod
+    def _make_cluster_file(cls, cluster_name, sequences, filename):
+        if cluster_name not in sequences:
+            raise Error('Sequence name "' + cluster_name + '" to be used as cluster representative not found. Cannot continue')
+        names = [x for x in sequences.keys() if x != cluster_name]
+        names.sort()
+        with open(filename, 'w') as f:
+            print(cluster_name, *names, sep='\t', file=f)
+
+
+    def run(self, outprefix):
+        if self.cluster_rep_name not in self.padded_seqs:
+            raise Error('Sequence name "' + self.cluster_rep_name + '" to be used as cluster representative not found. Cannot continue')
+        original_code = pyfastaq.sequences.genetic_code
+        pyfastaq.sequences.genetic_code = self.genetic_code
+        unpadded_seqs = AlnToMetadata._make_unpadded_seqs(self.padded_seqs)
+        insertions = AlnToMetadata._make_unpadded_insertion_coords(self.padded_seqs)
+        AlnToMetadata._check_sequences(self.padded_seqs, unpadded_seqs, self.refs_are_coding, genetic_code=self.genetic_code)
+        AlnToMetadata._variant_ids_are_unique(self.variants)
+        AlnToMetadata._check_variants_match_sequences(unpadded_seqs, self.variants, self.refs_are_coding, genetic_code=self.genetic_code)
+
+        tsv_lines = AlnToMetadata._variants_to_tsv_lines(self.variants, unpadded_seqs, self.padded_seqs, insertions, self.refs_are_coding)
+        with open(outprefix + '.tsv', 'w') as f:
+            print(*tsv_lines, sep='\n', file=f)
+
+        with open(outprefix + '.fa', 'w') as f:
+            for seqname in sorted(unpadded_seqs):
+                print(unpadded_seqs[seqname], sep='\n', file=f)
+
+        AlnToMetadata._make_cluster_file(self.cluster_rep_name, unpadded_seqs, outprefix + '.cluster')
+        pyfastaq.sequences.genetic_code = original_code
diff --git a/ariba/assembly.py b/ariba/assembly.py
new file mode 100644
index 0000000..40aa2ed
--- /dev/null
+++ b/ariba/assembly.py
@@ -0,0 +1,377 @@
+import os
+import sys
+import shutil
+import pyfastaq
+import pymummer
+from ariba import common, mapping, bam_parse, external_progs
+
+class Error (Exception): pass
+
+class Assembly:
+    def __init__(self,
+      reads1,
+      reads2,
+      ref_fasta,
+      working_dir,
+      final_assembly_fa,
+      final_assembly_bam,
+      log_fh,
+      scaff_name_prefix='scaffold',
+      kmer=0,
+      assembler='spades',
+      bowtie2_preset='very-sensitive-local',
+      max_insert=1000,
+      min_scaff_depth=10,
+      min_scaff_length=50,
+      nucmer_min_id=90,
+      nucmer_min_len=20,
+      nucmer_breaklen=200,
+      spades_other_options=None,
+      sspace_k=20,
+      sspace_sd=0.4,
+      reads_insert=500,
+      extern_progs=None,
+      clean=True,
+    ):
+        self.reads1 = os.path.abspath(reads1)
+        self.reads2 = os.path.abspath(reads2)
+        self.ref_fasta = os.path.abspath(ref_fasta)
+        self.working_dir = os.path.abspath(working_dir)
+        self.final_assembly_fa = os.path.abspath(final_assembly_fa)
+        self.final_assembly_bam = os.path.abspath(final_assembly_bam)
+        self.log_fh = log_fh
+        self.scaff_name_prefix = scaff_name_prefix
+
+        self.assembly_kmer = self._get_assembly_kmer(kmer, reads1, reads2)
+        self.assembler = assembler
+        self.bowtie2_preset = bowtie2_preset
+        self.max_insert = max_insert
+        self.min_scaff_depth = min_scaff_depth
+        self.min_scaff_length = min_scaff_length
+        self.nucmer_min_id = nucmer_min_id
+        self.nucmer_min_len = nucmer_min_len
+        self.nucmer_breaklen = nucmer_breaklen
+        self.spades_other_options = spades_other_options
+        self.sspace_k = sspace_k
+        self.sspace_sd = sspace_sd
+        self.reads_insert = reads_insert
+        self.clean = clean
+
+        if extern_progs is None:
+            self.extern_progs = external_progs.ExternalProgs()
+        else:
+            self.extern_progs = extern_progs
+
+        try:
+            os.mkdir(self.working_dir)
+        except:
+            raise Error('Error mkdir ' + self.working_dir)
+
+        self.assembler_dir = os.path.join(self.working_dir, 'Assemble')
+        self.assembly_contigs = os.path.join(self.working_dir, 'contigs.fa')
+        self.scaffold_dir = os.path.join(self.working_dir, 'Scaffold')
+        self.scaffolder_scaffolds = os.path.join(self.working_dir, 'scaffolds.fa')
+        self.gapfill_dir = os.path.join(self.working_dir, 'Gapfill')
+        self.gapfilled_scaffolds = os.path.join(self.working_dir, 'scaffolds.gapfilled.fa')
+        self.gapfilled_length_filtered = os.path.join(self.working_dir, 'scaffolds.gapfilled.length_filtered.fa')
+
+
+    @staticmethod
+    def _get_assembly_kmer(k, reads1, reads2):
+        '''If the kmer is not given, uses 2/3 of the mean read length (using the first 1000 forward and first 1000 reverse reads)'''
+        if k == 0:
+            read_length1 = pyfastaq.tasks.mean_length(reads1, limit=1000)
+            read_length2 = pyfastaq.tasks.mean_length(reads2, limit=1000)
+            assembly_kmer = round((read_length1 + read_length2) / 3)
+            if assembly_kmer % 2 == 0:
+                assembly_kmer += 1
+        else:
+            assembly_kmer = k
+
+        return assembly_kmer
+
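+    # Worked example (editorial): with mean read lengths of 100 in both files,
+    # round((100 + 100) / 3) = 67, which is odd, so k = 67. With mean lengths
+    # of 99, round(198 / 3) = 66 is even, so 1 is added, again giving k = 67.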
+
+    @staticmethod
+    def _check_spades_log_file(logfile):
+        '''SPAdes can fail with a strange error. Stop everything if this happens'''
+        f = pyfastaq.utils.open_file_read(logfile)
+
+        for line in f:
+            if line.startswith('== Error ==  system call for:') and line.rstrip().endswith('finished abnormally, err code: -7'):
+                pyfastaq.utils.close(f)
+                print('Error running SPAdes. Cannot continue. This is the error from the log file', logfile, '...', file=sys.stderr)
+                print(line, file=sys.stderr)
+                raise Error('Fatal error ("err code: -7") running spades. Cannot continue')
+
+        pyfastaq.utils.close(f)
+        return True
+
+
+    def _assemble_with_spades(self, unittest=False):
+        cmd = ' '.join([
+            self.extern_progs.exe('spades'),
+            '-1', self.reads1,
+            '-2', self.reads2,
+            '-o', self.assembler_dir,
+            '-k', str(self.assembly_kmer),
+            '--threads 1', # otherwise defaults to 16!
+            '--untrusted-contigs', self.ref_fasta,
+        ])
+        if self.spades_other_options is not None:
+            cmd += ' ' + self.spades_other_options
+
+        cwd = os.getcwd()
+        try:
+            os.chdir(self.working_dir)
+        except:
+            raise Error('Error chdir ' + self.working_dir)
+        spades_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'scaffolds.fasta')
+
+        if unittest:
+            os.mkdir(self.assembler_dir)
+            open(spades_contigs, 'w').close()
+            self.assembled_ok = True
+        else:
+            self.assembled_ok, err = common.syscall(cmd, verbose=True, allow_fail=True, verbose_filehandle=self.log_fh, print_errors=False)
+        if self.assembled_ok:
+            os.rename(spades_contigs, os.path.basename(self.assembly_contigs))
+        else:
+            print('Assembly finished with errors. These are the errors:', file=self.log_fh)
+            print(err, file=self.log_fh)
+            print('\nEnd of spades errors\n', file=self.log_fh)
+
+        spades_log = os.path.join(self.assembler_dir, 'spades.log')
+        if os.path.exists(spades_log):
+            self._check_spades_log_file(spades_log)
+
+            with open(spades_log) as f:
+                print('\n______________ SPAdes log ___________________\n', file=self.log_fh)
+                for line in f:
+                    print(line.rstrip(), file=self.log_fh)
+                print('\n______________ End of SPAdes log _________________\n', file=self.log_fh)
+
+
+        spades_warnings = os.path.join(self.assembler_dir, 'warnings.log')
+        if os.path.exists(spades_warnings):
+            with open(spades_warnings) as f:
+                print('\n______________ SPAdes warnings ___________________\n', file=self.log_fh)
+                for line in f:
+                    print(line.rstrip(), file=self.log_fh)
+                print('\n______________ End of SPAdes warnings _________________\n', file=self.log_fh)
+
+        os.chdir(cwd)
+
+        if self.clean:
+            print('Deleting assembly directory', self.assembler_dir, file=self.log_fh)
+            shutil.rmtree(self.assembler_dir)
+
+
+    def _scaffold_with_sspace(self):
+        if not os.path.exists(self.assembly_contigs):
+            raise Error('Cannot scaffold because contigs file not found: ' + self.assembly_contigs)
+
+        try:
+            os.mkdir(self.scaffold_dir)
+        except:
+            raise Error('Error mkdir ' + self.scaffold_dir)
+
+        cwd = os.getcwd()
+
+        if self.extern_progs.exe('sspace') is None:
+            os.chdir(self.working_dir)
+            os.symlink(self.assembly_contigs, os.path.basename(self.scaffolder_scaffolds))
+            os.chdir(cwd)
+            return
+
+        os.chdir(self.scaffold_dir)
+        lib_file = 'lib'
+        with open(lib_file, 'w') as f:
+            print('LIB', self.reads1, self.reads2, int(self.reads_insert), self.sspace_sd, 'FR', file=f)
+
+        cmd = ' '.join([
+            'perl', self.extern_progs.exe('sspace'),
+            '-k', str(self.sspace_k),
+            '-l', lib_file,
+            '-s', self.assembly_contigs
+        ])
+
+        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
+        sspace_scaffolds = os.path.abspath('standard_output.final.scaffolds.fasta')
+        sspace_log = os.path.abspath('standard_output.logfile.txt')
+        with open(sspace_log) as f:
+            print('\n_______________ SSPACE log __________________\n', file=self.log_fh)
+            for line in f:
+                print(line.rstrip(), file=self.log_fh)
+            print('_______________ End of SSPACE log __________________\n', file=self.log_fh)
+
+        os.rename(sspace_scaffolds, self.scaffolder_scaffolds)
+        os.chdir(cwd)
+
+        if self.clean:
+            print('Deleting scaffolding directory', self.scaffold_dir, file=self.log_fh)
+            shutil.rmtree(self.scaffold_dir)
+
+
+    @staticmethod
+    def _has_gaps_to_fill(filename):
+        seq_reader = pyfastaq.sequences.file_reader(filename)
+        for seq in seq_reader:
+            if 'n' in seq.seq or 'N' in seq.seq:
+                return True
+        return False
+
+
+    @staticmethod
+    def _rename_scaffolds(infile, outfile, prefix):
+        freader = pyfastaq.sequences.file_reader(infile)
+        f_out = pyfastaq.utils.open_file_write(outfile)
+        i = 1
+        for scaff in freader:
+            scaff.id = prefix + '.scaffold.' + str(i)
+            i += 1
+            print(scaff, file=f_out)
+        pyfastaq.utils.close(f_out)
+
+
+    def _gap_fill_with_gapfiller(self):
+        if not os.path.exists(self.scaffolder_scaffolds):
+            raise Error('Cannot gap fill because scaffolds file not found: ' + self.scaffolder_scaffolds)
+
+        cwd = os.getcwd()
+
+        if self.extern_progs.exe('gapfiller') is None or not self._has_gaps_to_fill(self.scaffolder_scaffolds):
+            self._rename_scaffolds(self.scaffolder_scaffolds, self.gapfilled_scaffolds, self.scaff_name_prefix)
+            return
+
+        try:
+            os.mkdir(self.gapfill_dir)
+        except:
+            raise Error('Error mkdir ' + self.gapfill_dir)
+
+        os.chdir(self.gapfill_dir)
+        lib_file = 'lib'
+        with open(lib_file, 'w') as f:
+            print('LIB', 'bwa', self.reads1, self.reads2, self.reads_insert, self.sspace_sd, 'FR', file=f)
+
+        cmd = ' '.join([
+            'perl', self.extern_progs.exe('gapfiller'),
+            '-l', lib_file,
+            '-s', self.scaffolder_scaffolds
+        ])
+
+        gapfilled_scaffolds = os.path.join(self.gapfill_dir, 'standard_output', 'standard_output.gapfilled.final.fa')
+        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
+        self._rename_scaffolds(gapfilled_scaffolds, self.gapfilled_scaffolds, self.scaff_name_prefix)
+        os.chdir(cwd)
+        if self.clean:
+            print('Deleting GapFiller directory', self.gapfill_dir, file=self.log_fh)
+            shutil.rmtree(self.gapfill_dir)
+
+
+    @staticmethod
+    def _fix_contig_orientation(contigs_fa, ref_fa, outfile, min_id=90, min_length=20, breaklen=200):
+        '''Changes orientation of each contig to match the reference, when possible.
+           Returns a set of names of contigs that had hits in both orientations to the reference'''
+        if not os.path.exists(contigs_fa):
+            raise Error('Cannot fix orientation of assembly contigs because file not found: ' + contigs_fa)
+
+        tmp_coords = os.path.join(outfile + '.tmp.rename.coords')
+        pymummer.nucmer.Runner(
+            ref_fa,
+            contigs_fa,
+            tmp_coords,
+            min_id=min_id,
+            min_length=min_length,
+            breaklen=breaklen,
+            maxmatch=True,
+        ).run()
+
+        to_revcomp = set()
+        not_revcomp = set()
+        file_reader = pymummer.coords_file.reader(tmp_coords)
+        for hit in file_reader:
+            if hit.on_same_strand():
+                not_revcomp.add(hit.qry_name)
+            else:
+                to_revcomp.add(hit.qry_name)
+
+        os.unlink(tmp_coords)
+        in_both = to_revcomp.intersection(not_revcomp)
+
+        f = pyfastaq.utils.open_file_write(outfile)
+        seq_reader = pyfastaq.sequences.file_reader(contigs_fa)
+        for seq in seq_reader:
+            if seq.id in to_revcomp and seq.id not in in_both:
+                seq.revcomp()
+            print(seq, file=f)
+        pyfastaq.utils.close(f)
+
+        return in_both
+
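+    # Worked example (editorial): if nucmer reports contig1 only on the same
+    # strand, it is written unchanged; contig2 with reverse-strand hits only is
+    # reverse complemented; contig3 hit in both orientations is written
+    # unchanged and returned in the in_both set.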
+
+    @staticmethod
+    def _parse_bam(sequences, bam, min_scaff_depth, max_insert):
+        if not os.path.exists(bam):
+            raise Error('File not found: ' + bam)
+
+        bam_parser = bam_parse.Parser(bam, sequences)
+        bam_parser.parse()
+        bam_parser.write_files(bam)
+        return bam_parser.scaff_graph_is_consistent(min_scaff_depth, max_insert)
+
+
+    def run(self):
+        self._assemble_with_spades()
+        self.sequences = {}
+
+        # double-check we got some contigs
+        number_of_contigs = pyfastaq.tasks.count_sequences(self.assembly_contigs) if os.path.exists(self.assembly_contigs) else 0
+        if number_of_contigs == 0:
+            self.assembled_ok = False
+            # This is to make this object picklable, to keep multithreading happy
+            self.log_fh = None
+            return
+        else:
+            self.assembled_ok = True
+
+        if self.assembled_ok:
+            self._scaffold_with_sspace()
+            self._gap_fill_with_gapfiller()
+
+            pyfastaq.tasks.filter(self.gapfilled_scaffolds, self.gapfilled_length_filtered, minlength=self.min_scaff_length)
+            if pyfastaq.tasks.count_sequences(self.gapfilled_length_filtered) == 0:
+                self.assembled_ok = False
+                # This is to make this object picklable, to keep multithreading happy
+                self.log_fh = None
+                return
+
+            contigs_both_strands = self._fix_contig_orientation(self.gapfilled_length_filtered, self.ref_fasta, self.final_assembly_fa, min_id=self.nucmer_min_id, min_length=self.nucmer_min_len, breaklen=self.nucmer_breaklen)
+            self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
+            pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)
+
+            mapping.run_bowtie2(
+                self.reads1,
+                self.reads2,
+                self.final_assembly_fa,
+                self.final_assembly_bam[:-4],
+                threads=1,
+                sort=True,
+                samtools=self.extern_progs.exe('samtools'),
+                bowtie2=self.extern_progs.exe('bowtie2'),
+                bowtie2_preset=self.bowtie2_preset,
+                verbose=True,
+                verbose_filehandle=self.log_fh
+            )
+
+            self.scaff_graph_ok = self._parse_bam(self.sequences, self.final_assembly_bam, self.min_scaff_depth, self.max_insert)
+            print('Scaffolding graph is OK:', self.scaff_graph_ok, file=self.log_fh)
+
+            if self.clean:
+                for suffix in ['soft_clipped', 'unmapped_mates', 'scaff']:
+                    filename = self.final_assembly_bam + '.' + suffix
+                    print('Deleting file', filename, file=self.log_fh)
+                    os.unlink(filename)
+
+
+        # This is to make this object picklable, to keep multithreading happy
+        self.log_fh = None
diff --git a/ariba/assembly_compare.py b/ariba/assembly_compare.py
new file mode 100644
index 0000000..e2f7599
--- /dev/null
+++ b/ariba/assembly_compare.py
@@ -0,0 +1,371 @@
+import os
+import copy
+import pyfastaq
+import pymummer
+
+class Error (Exception): pass
+
+class AssemblyCompare:
+    def __init__(self,
+      assembly_fa,
+      assembly_sequences,
+      ref_fa,
+      ref_sequence,
+      outprefix,
+      refdata,
+      nucmer_min_id=90,
+      nucmer_min_len=20,
+      nucmer_breaklen=200,
+      assembled_threshold=0.95,
+      unique_threshold=0.03,
+      max_gene_nt_extend=30,
+    ):
+        self.assembly_fa = os.path.abspath(assembly_fa)
+        self.assembly_sequences = assembly_sequences
+        self.ref_fa = os.path.abspath(ref_fa)
+        self.ref_sequence = ref_sequence
+        self.outprefix = os.path.abspath(outprefix)
+        self.refdata = refdata
+
+        self.nucmer_min_id = nucmer_min_id
+        self.nucmer_min_len = nucmer_min_len
+        self.nucmer_breaklen = nucmer_breaklen
+        self.assembled_threshold = assembled_threshold
+        self.unique_threshold = unique_threshold
+        self.max_gene_nt_extend = max_gene_nt_extend
+        self.gene_matching_ref = None
+        self.gene_matching_ref_type = None
+        self.gene_start_bases_added = None
+        self.gene_end_bases_added = None
+
+        self.nucmer_coords_file = self.outprefix + '.nucmer.coords'
+        self.nucmer_snps_file = self.nucmer_coords_file + '.snps'
+
+
+    def _run_nucmer(self):
+        pymummer.nucmer.Runner(
+            self.ref_fa,
+            self.assembly_fa,
+            self.nucmer_coords_file,
+            min_id=self.nucmer_min_id,
+            min_length=self.nucmer_min_len,
+            breaklen=self.nucmer_breaklen,
+            maxmatch=True,
+            show_snps=True
+        ).run()
+
+
+    @staticmethod
+    def _parse_nucmer_coords_file(coords_file, ref_name):
+        '''Input is coords file made by self._run_nucmer. Reference should have one sequence only.
+           ref_name is the name of the reference sequence, used to sanity check the coords file.
+           Returns dictionary. Key = assembly contig name. Value = list of nucmer hits to that contig'''
+        file_reader = pymummer.coords_file.reader(coords_file)
+        nucmer_hits = {}
+        for hit in file_reader:
+            assert hit.ref_name == ref_name
+            contig = hit.qry_name
+            if contig not in nucmer_hits:
+                nucmer_hits[contig] = []
+            nucmer_hits[contig].append(copy.copy(hit))
+
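+        # Sketch of the returned structure (hypothetical names):
+        #   {'contig1': [hit1, hit2], 'contig2': [hit3]}
+        # where each hit is a pymummer alignment object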
+        return nucmer_hits
+
+
+    @staticmethod
+    def _nucmer_hits_to_percent_identity(nucmer_hits):
+        '''Input is hits made by self._parse_nucmer_coords_file.
+           Returns dictionary. key = contig name. Value = percent identity of hits to that contig'''
+        percent_identities = {}
+
+        for contig in nucmer_hits:
+            product_sum = 0
+            length_sum = 0
+            for hit in nucmer_hits[contig]:
+                product_sum += hit.hit_length_qry * hit.percent_identity
+                length_sum += hit.hit_length_qry
+            assert length_sum > 0
+            percent_identities[contig] = round(product_sum / length_sum, 2)
+
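+        # Worked example (hypothetical numbers): two hits to one contig with
+        # query lengths 100 and 50 and identities 95.0 and 90.0 give
+        # (100 * 95.0 + 50 * 90.0) / 150 = 93.33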
+        return percent_identities
+
+
+    @staticmethod
+    def _nucmer_hits_to_assembly_coords(nucmer_hits):
+        '''Input is hits made by self._parse_nucmer_coords_file.
+           Returns dictionary. key = contig name. Value = list of coords that match
+           to the reference gene'''
+        coords = {}
+        for l in nucmer_hits.values():
+            for hit in l:
+                if hit.qry_name not in coords:
+                    coords[hit.qry_name] = []
+                coords[hit.qry_name].append(hit.qry_coords())
+
+        for scaff in coords:
+            pyfastaq.intervals.merge_overlapping_in_list(coords[scaff])
+
+        return coords
+
+
+    def assembly_match_coords(self):
+        return self._nucmer_hits_to_assembly_coords(self.nucmer_hits)
+
+
+    @classmethod
+    def nucmer_hits_to_ref_coords(cls, nucmer_hits, contig=None):
+        '''Input is hits made by self._parse_nucmer_coords_file.
+           Returns dictionary. Key = contig name. Value = list of coords in the
+           reference sequence for that contig.
+           If contig=contig_name is given, then just gets the ref coords from
+           that contig, instead of using all the contigs'''
+        if contig is None:
+            coords = {key: [] for key in nucmer_hits.keys()}
+        else:
+            coords = {contig: []}
+
+        for key in coords:
+            coords[key] = [hit.ref_coords() for hit in nucmer_hits[key]]
+            pyfastaq.intervals.merge_overlapping_in_list(coords[key])
+
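+        # e.g. (hypothetical, pyfastaq Interval coords): hits covering reference
+        # intervals [0, 100] and [50, 150] are merged into [0, 150]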
+        return coords
+
+
+    @staticmethod
+    def ref_cov_per_contig(nucmer_hits):
+        '''Input is hits made by self._parse_nucmer_coords_file.
+           Returns dictionary. key = contig name. Value = number of bases that
+           match to the reference sequence.'''
+        coords = AssemblyCompare.nucmer_hits_to_ref_coords(nucmer_hits)
+        return {x: pyfastaq.intervals.length_sum_from_list(coords[x]) for x in coords}
+
+
+    @staticmethod
+    def _get_assembled_reference_sequences(nucmer_hits, ref_sequence, assembly):
+        '''nucmer_hits = hits made by self._parse_nucmer_coords_file.
+           ref_sequence = reference sequence (pyfastaq.sequences.Fasta object)
+           assembly = dictionary of contig name -> contig.
+           Returns a dictionary (key = new sequence name, value = Fasta) of each
+           piece of the assembly that corresponds to the reference sequence.'''
+        sequences = {}
+
+        for contig in sorted(nucmer_hits):
+            for hit in nucmer_hits[contig]:
+                qry_coords = hit.qry_coords()
+                fa = assembly[hit.qry_name].subseq(qry_coords.start, qry_coords.end + 1)
+                if hit.on_same_strand():
+                    strand = '+'
+                else:
+                    fa.revcomp()
+                    strand = '-'
+                ref_coords = hit.ref_coords()
+                fa.id = '.'.join([
+                    ref_sequence.id,
+                    str(ref_coords.start + 1),
+                    str(ref_coords.end + 1),
+                    contig,
+                    str(qry_coords.start + 1),
+                    str(qry_coords.end + 1),
+                    strand
+                ])
+
+                if hit.hit_length_ref == hit.ref_length:
+                    fa.id += '.complete'
+
+                sequences[fa.id] = fa
+
+        return sequences
+
+
+    @staticmethod
+    def _whole_gene_covered_by_nucmer_hits(nucmer_hits, ref_seq, percent_threshold, max_nt_extend):
+        '''Returns True iff the reference sequence is covered by nucmer hits.
+           nucmer_hits = hits made by self._parse_nucmer_coords_file.
+           Counts as covered if (2 * max_nt_extend + total ref bases covered) / len(ref_seq) >= percent_threshold'''
+        coords = AssemblyCompare.nucmer_hits_to_ref_coords(nucmer_hits)
+        covered = []
+        for coords_list in coords.values():
+            covered.extend(coords_list)
+        pyfastaq.intervals.merge_overlapping_in_list(covered)
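+        # Worked example (hypothetical numbers): with 920 of 1000 ref bases
+        # covered, max_nt_extend=30 and percent_threshold=0.95, this returns
+        # (2 * 30 + 920) / 1000 = 0.98 >= 0.95, i.e. True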
+        return (2 * max_nt_extend + pyfastaq.intervals.length_sum_from_list(covered)) / len(ref_seq) >= percent_threshold
+
+
+    @staticmethod
+    def _ref_has_region_assembled_twice(nucmer_hits, ref_seq, threshold):
+        '''Returns True iff there is a part of the reference that is assembled
+           more than once (i.e. covered by >1 nucmer hit).
+           Needs a minimum proportion of the ref to be assembled more than once,
+           determined by threshold.
+           nucmer_hits = hits made by self._parse_nucmer_coords_file.'''
+        coords = AssemblyCompare.nucmer_hits_to_ref_coords(nucmer_hits)
+        covered = []
+        for coords_list in coords.values():
+            covered.extend(coords_list)
+        covered.sort()
+
+        if len(covered) <= 1:
+            return False
+
+        coverage = {}
+        for i in covered:
+            for j in range(i.start, i.end + 1):
+                coverage[j] = coverage.get(j, 0) + 1
+
+        bases_depth_at_least_two = len([1 for x in coverage.values() if x > 1])
+        return bases_depth_at_least_two / len(ref_seq) >= threshold
+
+
+    @classmethod
+    def _longest_nucmer_hit_in_ref(cls, nucmer_hits):
+        max_length = None
+        max_hit = None
+
+        for l in nucmer_hits.values():
+            for hit in l:
+                if max_length is None or hit.hit_length_ref > max_length:
+                    max_length = hit.hit_length_ref
+                    max_hit = hit
+
+        return max_hit
+
+
+    @classmethod
+    def _find_previous_start_codon(cls, sequence, start_coord, min_start_coord):
+        for i in range(start_coord, min_start_coord - 1, -3):
+            codon = pyfastaq.sequences.Fasta('x', sequence[i:i+3])
+            aa = codon.translate()
+            if aa.seq == '*':
+                return None
+            elif codon.seq in pyfastaq.genetic_codes.starts[pyfastaq.sequences.genetic_code]:
+                return i
+
+        return None
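+
+    # Worked example for _find_previous_start_codon (hypothetical): with
+    # sequence 'TTTATGAAA', start_coord 6 and min_start_coord 0, the in-frame
+    # codons at positions 6 ('AAA') and 3 ('ATG') are checked; 3 is returned,
+    # assuming 'ATG' is listed as a start codon in the genetic code in use.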
+
+
+    @classmethod
+    def _find_next_stop_codon(cls, sequence, end_coord, max_end_coord):
+        final_i = min(len(sequence) - 3, max_end_coord - 2)
+        for i in range(end_coord, final_i + 1, 3):
+            codon = pyfastaq.sequences.Fasta('x', sequence[i:i+3])
+            aa = codon.translate()
+            if aa.seq == '*':
+                return i
+
+        return None
+
+
+    @classmethod
+    def _gene_from_nucmer_match(cls, nucmer_match, contig, max_end_nt_extend):
+        if nucmer_match.on_same_strand():
+            revcomp = False
+        else:
+            revcomp = True
+            nucmer_match = copy.copy(nucmer_match)
+            nucmer_match.reverse_query()
+            contig = copy.copy(contig)
+            contig.revcomp()
+
+        ref_hit_start = min(nucmer_match.ref_start, nucmer_match.ref_end)
+        contig_hit_start = min(nucmer_match.qry_start, nucmer_match.qry_end)
+        min_allowed_start = max(0, contig_hit_start - max_end_nt_extend)
+        if ref_hit_start % 3 != 0:
+            contig_hit_start += 3 - (ref_hit_start % 3)
+        contig_hit_end = max(nucmer_match.qry_start, nucmer_match.qry_end)
+        max_allowed_end = min(len(contig) - 1, contig_hit_end + max_end_nt_extend)
+        contig_hit_end -= (contig_hit_end - contig_hit_start + 1) % 3
+        assert contig_hit_start < contig_hit_end
+        gene_nt_name = nucmer_match.qry_name + '.' + str(contig_hit_start + 1) + '-' + str(contig_hit_end + 1)
+
+        gene_nt = pyfastaq.sequences.Fasta(gene_nt_name, contig[contig_hit_start:contig_hit_end+1])
+        assert len(gene_nt) % 3 == 0
+        gene_aa = gene_nt.translate()
+        if '*' in gene_aa[:-1]:
+            return gene_nt, 'HAS_STOP', None, None
+
+        extended_start_position = AssemblyCompare._find_previous_start_codon(contig, contig_hit_start, min_allowed_start)
+        extended_end_position = AssemblyCompare._find_next_stop_codon(contig, contig_hit_end - 2, max_allowed_end)
+        start = extended_start_position if extended_start_position is not None else contig_hit_start
+        end = extended_end_position + 2 if extended_end_position is not None else contig_hit_end
+
+        if revcomp:
+            gene_nt_name = nucmer_match.qry_name + '.' + str(end + 1) + '-' + str(start + 1)
+        else:
+            gene_nt_name = nucmer_match.qry_name + '.' + str(start + 1) + '-' + str(end + 1)
+
+        gene_nt = pyfastaq.sequences.Fasta(gene_nt_name, contig[start:end+1])
+        start_nt_added = None if extended_start_position is None else min(nucmer_match.qry_start, nucmer_match.qry_end) - start
+        end_nt_added = None if extended_end_position is None else end - max(nucmer_match.qry_start, nucmer_match.qry_end)
+
+        if None in [extended_start_position, extended_end_position]:
+            return gene_nt, 'START_OR_END_FAIL', start_nt_added, end_nt_added
+        else:
+            return gene_nt, 'GENE_FOUND', start_nt_added, end_nt_added
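+
+    # _gene_from_nucmer_match returns a tuple
+    # (gene sequence, status, start_nt_added, end_nt_added), where status is
+    # 'HAS_STOP' (in-frame stop codon within the hit), 'START_OR_END_FAIL' (no
+    # start and/or stop codon found within the allowed extension), or
+    # 'GENE_FOUND'.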
+
+
+    @staticmethod
+    def _get_gene_matching_ref(nucmer_hits, contigs, max_end_nt_extend):
+        longest_match = AssemblyCompare._longest_nucmer_hit_in_ref(nucmer_hits)
+        if longest_match is None:
+            return None, 'NO_MATCH', None, None
+        else:
+            return AssemblyCompare._gene_from_nucmer_match(longest_match, contigs[longest_match.qry_name], max_end_nt_extend)
+
+
+    @staticmethod
+    def _ref_covered_by_at_least_one_full_length_contig(nucmer_hits, percent_threshold, max_nt_extend):
+        '''Returns True iff there exists a contig whose hit (plus up to
+           2 * max_nt_extend bases) covers at least percent_threshold of the
+           reference sequence.
+           nucmer_hits = hits made by self._parse_nucmer_coords_file.'''
+        for l in nucmer_hits.values():
+            for hit in l:
+                if ( (2 * max_nt_extend) + len(hit.ref_coords()) ) / hit.ref_length >= percent_threshold:
+                    return True
+        return False
+
+
+    def update_flag(self, flag):
+        if self._whole_gene_covered_by_nucmer_hits(self.nucmer_hits, self.ref_sequence, self.assembled_threshold, self.max_gene_nt_extend):
+            flag.add('assembled')
+
+        if self.assembled_into_one_contig:
+            flag.add('assembled_into_one_contig')
+
+        if self._ref_has_region_assembled_twice(self.nucmer_hits, self.ref_sequence, self.unique_threshold):
+            flag.add('region_assembled_twice')
+
+        ref_seq_type = self.refdata.sequence_type(self.ref_sequence.id)
+        if ref_seq_type != 'non_coding' and self.gene_matching_ref_type == 'GENE_FOUND':
+            flag.add('complete_gene')
+
+        if len(self.nucmer_hits) == 1:
+            flag.add('unique_contig')
+
+        return flag
+
+
+    @staticmethod
+    def nucmer_hit_containing_reference_position(nucmer_hits, ref_name, ref_position):
+        '''Returns the first nucmer match found that contains the given
+           reference position. nucmer_hits = hits made by self._parse_nucmer_coords_file.
+           Returns None if no matching hit is found'''
+        for contig_name in nucmer_hits:
+            for hit in nucmer_hits[contig_name]:
+                if hit.ref_name == ref_name and hit.ref_coords().distance_to_point(ref_position) == 0:
+                    return hit
+
+        return None
+
+
+    def run(self):
+        self._run_nucmer()
+        self.nucmer_hits = self._parse_nucmer_coords_file(self.nucmer_coords_file, self.ref_sequence.id)
+        self.percent_identities = self._nucmer_hits_to_percent_identity(self.nucmer_hits)
+        self.assembled_reference_sequences = self._get_assembled_reference_sequences(self.nucmer_hits, self.ref_sequence, self.assembly_sequences)
+        ref_seq_type = self.refdata.sequence_type(self.ref_sequence.id)
+        if self._ref_covered_by_at_least_one_full_length_contig(self.nucmer_hits, self.assembled_threshold, self.max_gene_nt_extend):
+            self.assembled_into_one_contig = True
+            if ref_seq_type != 'non_coding':
+                self.gene_matching_ref, self.gene_matching_ref_type, self.gene_start_bases_added, self.gene_end_bases_added = self._get_gene_matching_ref(self.nucmer_hits, self.assembly_sequences, self.max_gene_nt_extend)
+        else:
+            self.assembled_into_one_contig = False
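+
+# A minimal usage sketch (hypothetical filenames; refdata is an ariba
+# reference_data object, ref_seq the matching pyfastaq.sequences.Fasta, and
+# assembly_seqs a dict of contig name -> Fasta):
+#     comparer = AssemblyCompare('assembly.fa', assembly_seqs, 'ref.fa', ref_seq,
+#                                'outprefix', refdata)
+#     comparer.run()
+#     print(comparer.percent_identities)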
diff --git a/ariba/assembly_variants.py b/ariba/assembly_variants.py
new file mode 100644
index 0000000..137cb23
--- /dev/null
+++ b/ariba/assembly_variants.py
@@ -0,0 +1,323 @@
+import sys
+import operator
+import pyfastaq
+import pymummer
+from ariba import sequence_variant
+from pyfastaq import intervals
+
+
+class Error (Exception): pass
+
+class AssemblyVariants:
+    def __init__(self,
+      refdata,
+      nucmer_snp_file,
+    ):
+        self.refdata = refdata
+        self.nucmer_snp_file = nucmer_snp_file
+
+
+    @classmethod
+    def _get_codon_start(cls, gene_start, position):
+        assert position >= gene_start
+        while (position - gene_start) % 3 != 0:
+            position -= 1
+        return position
+
+
+    @classmethod
+    def _get_mummer_variants(cls, snp_file):
+        variants = pymummer.snp_file.get_all_variants(snp_file)
+        mummer_variants = {}
+
+        if len(variants) == 0:
+            return {}
+
+        variants.sort(key=operator.attrgetter('qry_name'))
+        variants.sort(key=operator.attrgetter('ref_start'))
+
+        for v in variants:
+            if v.qry_name not in mummer_variants:
+                mummer_variants[v.qry_name] = []
+            mummer_variants[v.qry_name].append(v)
+
+        for contig in mummer_variants:
+            l = mummer_variants[contig]
+            if len(l) > 1:
+                new_l = [[l[0]]]
+                previous_codon_start = AssemblyVariants._get_codon_start(0, l[0].ref_start)
+                for variant in l[1:]:
+                    codon_start = AssemblyVariants._get_codon_start(0, variant.ref_start)
+                    if codon_start == previous_codon_start:
+                        new_l[-1].append(variant)
+                    else:
+                        new_l.append([variant])
+                        previous_codon_start = codon_start
+                mummer_variants[contig] = new_l
+            else:
+                mummer_variants[contig] = [l]
+
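+        # e.g. (hypothetical): variants at ref_start 10, 11 and 15 have codon
+        # starts 9, 9 and 15, so they are grouped as [[var10, var11], [var15]]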
+        return mummer_variants
+
+
+    @classmethod
+    def _get_variant_effect(cls, variants, ref_sequence):
+        '''variants = list of variants in the same codon.
+           Returns the type of variant (cannot handle more than one indel in the same codon).'''
+        assert len(variants) != 0
+
+        var_types = [x.var_type for x in variants]
+        if len(set(var_types)) != 1:
+            return 'MULTIPLE', '.', '.'
+
+        var_type = var_types[0]
+
+        assert set([x.ref_name for x in variants]) == set([ref_sequence.id])
+        codon_starts = [AssemblyVariants._get_codon_start(0, x.ref_start) for x in variants]
+        assert len(set(codon_starts)) == 1
+        codon_start = codon_starts[0]
+        aa_start = codon_start // 3
+        ref_codon = pyfastaq.sequences.Fasta('codon', ref_sequence[codon_start:codon_start+3])
+        ref_aa = ref_codon.translate()
+
+        if var_type == pymummer.variant.SNP:
+            new_codon = list(ref_codon.seq)
+            for v in variants:
+                new_codon[v.ref_start - codon_start] = v.qry_base
+            new_codon = pyfastaq.sequences.Fasta('new', ''.join(new_codon))
+            qry_aa = new_codon.translate()
+
+            if ref_aa.seq == qry_aa.seq:
+                return ('SYN', '.', aa_start)
+            elif qry_aa.seq == '*':
+                return ('TRUNC', ref_aa.seq + str(aa_start + 1) + 'trunc', aa_start)
+            else:
+                return ('NONSYN', ref_aa.seq + str(aa_start + 1) + qry_aa.seq, aa_start)
+        elif var_type in [pymummer.variant.INS, pymummer.variant.DEL]:
+            if len(variants) > 1:
+                return 'INDELS', '.', aa_start
+
+            var = variants[0]
+
+            if var_type == pymummer.variant.INS:
+                new_seq = pyfastaq.sequences.Fasta('seq', var.qry_base)
+            else:
+                new_seq = pyfastaq.sequences.Fasta('seq', var.ref_base)
+
+            if len(new_seq) % 3 != 0:
+                return ('FSHIFT', ref_aa.seq + str(aa_start + 1) + 'fs', aa_start)
+
+            new_seq_aa = new_seq.translate()
+            if '*' in new_seq_aa.seq:
+                return ('TRUNC', ref_aa.seq + str(aa_start + 1) + 'trunc', aa_start)
+            elif var_type == pymummer.variant.INS:
+                ref_codon_after_ins = pyfastaq.sequences.Fasta('codon', ref_sequence[codon_start+3:codon_start+6])
+                aa_after_ins = ref_codon_after_ins.translate()
+                return ('INS', ref_aa.seq + str(aa_start + 1) + '_' + aa_after_ins.seq + str(aa_start + 2) + 'ins' + new_seq_aa.seq, aa_start)
+            else:
+                if len(new_seq) == 3:
+                    return ('DEL', ref_aa.seq + str(aa_start + 1) + 'del', aa_start)
+                else:
+                    assert len(new_seq) % 3 == 0
+                    ref_codon_after_del = pyfastaq.sequences.Fasta('codon', ref_sequence[codon_start+3:codon_start+6])
+                    aa_after_del = ref_codon_after_del.translate()
+                    return ('DEL', ref_aa.seq + str(aa_start + 1) + '_' + aa_after_del.seq + str(aa_start + 2) + 'del', aa_start)
+
+        else:
+            return ('UNKNOWN', '.', aa_start)
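+
+    # Worked example for _get_variant_effect (hypothetical): a single SNP at
+    # ref_start 5 changing codon GAT (D, codon_start 3, aa_start 1) into GAA (E)
+    # returns ('NONSYN', 'D2E', 1).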
+
+
+    @staticmethod
+    def _filter_mummer_variants(mummer_variants, ref_sequence):
+        if len(mummer_variants) == 0:
+            return
+
+        for contig in mummer_variants:
+            variants = mummer_variants[contig]
+            for i in range(len(variants)):
+                t = AssemblyVariants._get_variant_effect(variants[i], ref_sequence)
+                if t is not None and t[0] in ['TRUNC', 'FSHIFT']:
+                    break
+            mummer_variants[contig] = variants[:i+1]
+
+
+    @staticmethod
+    def _get_one_variant_for_one_contig_non_coding(refdata_var_dict, mummer_variant):
+        var_tuple = None
+        used_known_variants = set()
+
+        # if the variant is at the same position as a known variant in the reference
+        if refdata_var_dict is not None and mummer_variant.ref_start in refdata_var_dict['n']:
+            if mummer_variant.var_type == pymummer.variant.SNP:
+                variants_at_this_position = {x for x in refdata_var_dict['n'][mummer_variant.ref_start]}
+                matching_variants = {x for x in variants_at_this_position if mummer_variant.qry_base == x.variant.variant_value}
+                not_interesting_variants = {x for x in variants_at_this_position if mummer_variant.qry_base == x.variant.wild_value}
+                variants_at_this_position = variants_at_this_position.difference(matching_variants)
+            else:
+                matching_variants = set()
+                variants_at_this_position = refdata_var_dict['n'][mummer_variant.ref_start]
+                not_interesting_variants = set()
+
+            if len(not_interesting_variants) == 0:
+                var_tuple = (
+                    mummer_variant.ref_start,
+                    'n',
+                    mummer_variant.ref_base + str(mummer_variant.ref_start + 1) + mummer_variant.qry_base,
+                    pymummer.variant.var_types[mummer_variant.var_type],
+                    [mummer_variant],
+                    matching_variants,
+                    variants_at_this_position
+                )
+
+            used_known_variants.update(matching_variants, variants_at_this_position)
+        else: # not at a known variant position in the reference
+            var_tuple = (
+                mummer_variant.ref_start,
+                'n',
+                mummer_variant.ref_base + str(mummer_variant.ref_start + 1) + mummer_variant.qry_base,
+                pymummer.variant.var_types[mummer_variant.var_type],
+                [mummer_variant],
+                set(),
+                set()
+            )
+
+        return var_tuple, used_known_variants
+
+
+    @staticmethod
+    def _get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mummer_variants_list):
+        aa_var_effect, aa_var_string, aa_var_position = AssemblyVariants._get_variant_effect(mummer_variants_list, ref_sequence)
+        var_tuple = None
+        used_known_variants = set()
+
+        # if this variant is at the same position as a known variant in the reference
+        if refdata_var_dict is not None and aa_var_position in refdata_var_dict['p']:
+            if aa_var_effect == 'NONSYN':
+                aa_variant = sequence_variant.Variant('p', aa_var_string, '.')
+                variants_at_this_position = {x for x in refdata_var_dict['p'][aa_variant.position]}
+                matching_variants = {x for x in variants_at_this_position if aa_variant.variant_value == x.variant.variant_value}
+                not_interesting_variants = {x for x in variants_at_this_position if aa_variant.variant_value == x.variant.wild_value}
+                variants_at_this_position = variants_at_this_position.difference(matching_variants)
+            else:
+                matching_variants = set()
+                variants_at_this_position = refdata_var_dict['p'][aa_var_position]
+                not_interesting_variants = set()
+
+            if len(not_interesting_variants) == 0:
+                var_tuple = (
+                    aa_var_position,
+                    'p',
+                    aa_var_string,
+                    aa_var_effect,
+                    mummer_variants_list,
+                    matching_variants,
+                    variants_at_this_position
+                )
+
+            used_known_variants.update(matching_variants, variants_at_this_position)
+        else: # this variant is not at a known position in the reference
+            var_tuple = (
+                aa_var_position,
+                'p',
+                aa_var_string,
+                aa_var_effect,
+                mummer_variants_list,
+                set(),
+                set()
+            )
+
+        return var_tuple, used_known_variants
+
+
+    @staticmethod
+    def _get_remaining_known_ref_variants(known_ref_variants, used_ref_variants, nucmer_coords):
+        '''Finds variants where the ref has the variant and so does the contig,
+           which means there was no mummer call to flag it up, so we need to look
+           through the known ref variants. Also need to check that the variant is
+           in a nucmer match to an assembly contig.'''
+        variants = []
+
+        for ref_variant_pos, ref_variants_set in sorted(known_ref_variants.items()):
+            for known_ref_variant in ref_variants_set:
+                if known_ref_variant not in used_ref_variants:
+                    variant_pos_matches_contig = False
+                    pos = known_ref_variant.variant.position
+
+                    if known_ref_variant.variant_type == 'n':
+                        ref_interval = intervals.Interval(pos, pos)
+                    elif known_ref_variant.variant_type == 'p':
+                        ref_interval = intervals.Interval(3 * pos, 3 * pos + 2)
+                    else:
+                        raise Error('Unexpected variant type "' + known_ref_variant.variant_type + '" in _get_remaining_known_ref_variants. Cannot continue')
+
+                    for interval in nucmer_coords:
+                        if ref_interval.intersects(interval):
+                            variant_pos_matches_contig = True
+                            break
+
+                    if variant_pos_matches_contig:
+                        variants.append((None, known_ref_variant.variant_type, None, None, None, {known_ref_variant}, set()))
+
+        return variants
+
+
+    def get_variants(self, ref_sequence_name, nucmer_coords):
+        '''nucmer_coords = dict. Key=contig name. Value = list of intervals of ref coords that match the contig.
+           Made by assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords
+           Returns dictionary. Key=contig name. Value = list of variants. Each variant
+           is a tuple: (
+               0 = position,
+               1 = type in ['n', 'p'],
+               2 = variant string, eg 'D2E',
+               3 = variant effect (as returned by _get_variant_effect),
+               4 = list of pymummer.variant.Variant that made up this variant (could be more than one because of
+                   variants in the same codon),
+               5 = set {matching known variants from metadata (=sequence_metadata.SequenceMetadata)},
+               6 = set {known ref metadata (=sequence_metadata.SequenceMetadata) at same position as SNP}, excluding those from 5
+           )
+        '''
+        mummer_variants = self._get_mummer_variants(self.nucmer_snp_file)
+        variants = {}
+        ref_sequence_type = self.refdata.sequence_type(ref_sequence_name)
+        assert ref_sequence_type is not None
+        ref_sequence = self.refdata.sequence(ref_sequence_name)
+
+        if ref_sequence_name in self.refdata.metadata:
+            refdata_var_dict = self.refdata.metadata[ref_sequence_name]
+        else:
+            refdata_var_dict = None
+
+        known_non_wild_variants_in_ref = self.refdata.all_non_wild_type_variants(ref_sequence_name)
+
+        for contig in nucmer_coords:
+            used_known_variants = set()
+            variants[contig] = []
+
+            if contig in mummer_variants:
+                for mummer_variant_list in mummer_variants[contig]:
+                    if ref_sequence_type == 'non_coding':
+                        for mummer_variant in mummer_variant_list:
+                            new_variant, used_variants = self._get_one_variant_for_one_contig_non_coding(refdata_var_dict, mummer_variant)
+                            if new_variant is not None:
+                                variants[contig].append(new_variant)
+                            used_known_variants.update(used_variants)
+                    else:
+                        new_variant, used_variants = self._get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mummer_variant_list)
+                        if new_variant is not None:
+                            variants[contig].append(new_variant)
+                        used_known_variants.update(used_variants)
+
+            # for this contig, need to know all the ref sequence and coords it maps to.
+            # Then report just the unused known variants, as the contig also has these variants
+            if ref_sequence_type == 'non_coding':
+                new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['n'], used_known_variants, nucmer_coords[contig])
+            else:
+                new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['p'], used_known_variants, nucmer_coords[contig])
+
+                if ref_sequence_type == 'variants_only':
+                    new_variants = [x for x in new_variants if len(x[5]) > 0]
+
+            variants[contig].extend(new_variants)
+            if len(variants[contig]) == 0:
+                del variants[contig]
+
+        return variants
+
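+# A minimal usage sketch (hypothetical names; refdata is an ariba
+# reference_data object, nucmer_coords as made by
+# AssemblyCompare.nucmer_hits_to_ref_coords):
+#     av = AssemblyVariants(refdata, 'nucmer.coords.snps')
+#     variants = av.get_variants('ref_seq_name', nucmer_coords)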
diff --git a/ariba/best_seq_chooser.py b/ariba/best_seq_chooser.py
new file mode 100644
index 0000000..0d0395c
--- /dev/null
+++ b/ariba/best_seq_chooser.py
@@ -0,0 +1,98 @@
+import shutil
+import tempfile
+import os
+import pyfastaq
+from ariba import mapping, faidx
+
+class Error (Exception): pass
+
+class BestSeqChooser:
+    def __init__(self,
+        reads1,
+        reads2,
+        references_fa,
+        log_fh,
+        samtools_exe='samtools',
+        bowtie2_exe='bowtie2',
+        bowtie2_preset='very-sensitive-local',
+        threads=1,
+    ):
+        self.reads1 = reads1
+        self.reads2 = reads2
+        self.references_fa = references_fa
+        self.log_fh = log_fh
+        self.samtools_exe = samtools_exe
+        self.bowtie2_exe = bowtie2_exe
+        self.bowtie2_preset = bowtie2_preset
+        self.threads = threads
+
+
+    def _total_alignment_score(self, seq_name):
+        tmpdir = tempfile.mkdtemp(prefix='tmp.get_total_aln_score.', dir=os.getcwd())
+        tmp_bam = os.path.join(tmpdir, 'tmp.get_total_alignment_score.bam')
+        tmp_fa = os.path.join(tmpdir, 'tmp.get_total_alignment_score.ref.fa')
+
+        faidx.write_fa_subset(
+            [seq_name],
+            self.references_fa,
+            tmp_fa,
+            samtools_exe=self.samtools_exe,
+            verbose=True,
+            verbose_filehandle=self.log_fh
+        )
+
+        mapping.run_bowtie2(
+            self.reads1,
+            self.reads2,
+            tmp_fa,
+            tmp_bam[:-4],
+            threads=self.threads,
+            samtools=self.samtools_exe,
+            bowtie2=self.bowtie2_exe,
+            bowtie2_preset=self.bowtie2_preset,
+            verbose=True,
+            verbose_filehandle=self.log_fh
+        )
+
+        score = mapping.get_total_alignment_score(tmp_bam)
+        shutil.rmtree(tmpdir)
+        return score
+
+
+    def _get_best_seq_by_alignment_score(self):
+        total_sequences = pyfastaq.tasks.count_sequences(self.references_fa)
+        if total_sequences == 1:
+            seqs = {}
+            pyfastaq.tasks.file_to_dict(self.references_fa, seqs)
+            assert len(seqs) == 1
+            seq_name = list(seqs.values())[0].id
+            print('No need to choose sequence for this cluster because it only has one sequence:', seq_name, file=self.log_fh)
+            return seq_name
+
+        print('\nChoosing best sequence from cluster of', total_sequences, 'sequences...', file=self.log_fh)
+        file_reader = pyfastaq.sequences.file_reader(self.references_fa)
+        best_score = 0
+        best_seq_name = None
+        for seq in file_reader:
+            score = self._total_alignment_score(seq.id)
+            print('Total alignment score for sequence', seq.id, 'is', score, file=self.log_fh)
+            if score > best_score:
+                best_score = score
+                best_seq_name = seq.id
+
+        print('\nBest sequence is', best_seq_name, 'with total alignment score of', best_score, file=self.log_fh)
+        print(file=self.log_fh)
+        return best_seq_name
+
+
+    def best_seq(self, outfile):
+        '''Finds the closest matching sequence, writes it to a FASTA file, and returns it as a pyfastaq.sequences.Fasta object'''
+        seq_name = self._get_best_seq_by_alignment_score()
+        if seq_name is None:
+            return None
+        faidx.write_fa_subset([seq_name], self.references_fa, outfile, samtools_exe=self.samtools_exe, verbose=True, verbose_filehandle=self.log_fh)
+        seqs = {}
+        pyfastaq.tasks.file_to_dict(outfile, seqs)
+        assert len(seqs) == 1
+        return list(seqs.values())[0]
+
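+# A minimal usage sketch (hypothetical filenames; log_fh is an open
+# filehandle for log messages):
+#     chooser = BestSeqChooser('reads_1.fq', 'reads_2.fq', 'refs.fa', log_fh)
+#     best_ref = chooser.best_seq('best_ref.fa')  # pyfastaq Fasta, or None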
diff --git a/ariba/card_record.py b/ariba/card_record.py
new file mode 100644
index 0000000..f1a1f49
--- /dev/null
+++ b/ariba/card_record.py
@@ -0,0 +1,111 @@
+import sys
+import pprint
+import re
+
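+# matches a typical bacterial gene name like 'mecA' (three lowercase letters
+# followed by one uppercase letter) as a whole word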
+aro_regex = re.compile(r'\b([a-z]{3}[A-Z])\b')
+
+class Error (Exception): pass
+
+class CardRecord:
+    def __init__(self, data_dict):
+        self.data_dict = data_dict
+
+
+    @staticmethod
+    def _ARO_id(data_dict):
+        return data_dict.get('ARO_id', None)
+
+
+    @staticmethod
+    def _ARO_accession(data_dict):
+        return data_dict.get('ARO_accession', None)
+
+
+    @staticmethod
+    def _ARO_name(data_dict):
+        return data_dict.get('ARO_name', None)
+
+
+    @staticmethod
+    def _ARO_description(data_dict):
+        return data_dict.get('ARO_description', None)
+
+
+    @staticmethod
+    def _ARO_name_to_fasta_name(aro_name):
+        if ' ' not in aro_name:
+            return aro_name
+
+        re_search = aro_regex.search(aro_name)
+        if re_search is not None:
+            return re_search.group(1).replace('.', '_')
+        else:
+            return '_'.join(aro_name.split()[:3]).replace('.', '_')
+
+
+    @staticmethod
+    def _dna_seqs_and_genbank_ids(gene_dict):
+        try:
+            seq_dict = gene_dict['model_sequences']['sequence']
+        except:
+            return []
+
+        if len(seq_dict) == 0:
+            return []
+
+        seqs_and_ids = []
+
+        for key, seq_dict in sorted(gene_dict['model_sequences']['sequence'].items()):
+            try:
+                dna_seq = seq_dict['dna_sequence']['sequence']
+                genbank_id = seq_dict['dna_sequence']['accession']
+                start = seq_dict['dna_sequence']['fmin']
+                end = seq_dict['dna_sequence']['fmax']
+                gi = seq_dict['protein_sequence']['GI']
+                protein_seq = seq_dict['protein_sequence']['sequence']
+            except:
+                print('Missing data from', key, file=sys.stderr)
+                continue
+
+            assert gi != 'NA'
+
+            if gi == '':
+                assert protein_seq == ''
+                gi = 'NA'
+
+            seqs_and_ids.append((key, gi, genbank_id, start, end, dna_seq, protein_seq))
+
+        return seqs_and_ids
+
+
+    @staticmethod
+    def _snps(data_dict):
+        try:
+            snps_dict = data_dict['model_param']['snp']
+        except:
+            return set()
+
+        try:
+            snps_set = set(snps_dict['param_value'].values())
+        except:
+            return set()
+
+        return snps_set
+
+
+    def get_data(self):
+        data = {
+            'ARO_id': self._ARO_id(self.data_dict),
+            'ARO_accession': self._ARO_accession(self.data_dict),
+            'ARO_name': self._ARO_name(self.data_dict),
+            'ARO_description': self._ARO_description(self.data_dict),
+            'dna_seqs_and_ids': self._dna_seqs_and_genbank_ids(self.data_dict),
+        }
+
+        if None in data.values() or len(data['dna_seqs_and_ids']) == 0:
+            pprint.pprint(self.data_dict)
+            raise Error('Error getting ARO_accession, ARO_id, ARO_name, dna_sequence(s) or genbank accession(s) from the above dictionary. Cannot continue.')
+
+        data['snps'] = self._snps(self.data_dict)
+        return data
+
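+# A minimal usage sketch, assuming card_json_record is one record dictionary
+# loaded from the CARD JSON file (hypothetical variable name):
+#     record = CardRecord(card_json_record)
+#     data = record.get_data()
+#     print(data['ARO_name'], len(data['dna_seqs_and_ids']), len(data['snps']))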
diff --git a/ariba/cdhit.py b/ariba/cdhit.py
index 3276404..c49750c 100644
--- a/ariba/cdhit.py
+++ b/ariba/cdhit.py
@@ -6,8 +6,6 @@ from ariba import common
 
 class Error (Exception): pass
 
-
-
 class Runner:
     def __init__(
       self,
@@ -17,6 +15,8 @@ class Runner:
       threads=1,
       length_diff_cutoff=0.9,
       verbose=False,
+      cd_hit_est='cd-hit-est',
+      rename_suffix='x',
     ):
 
         if not os.path.exists(infile):
@@ -28,113 +28,170 @@ class Runner:
         self.threads = threads
         self.length_diff_cutoff = length_diff_cutoff
         self.verbose = verbose
+        self.cd_hit_est = cd_hit_est
+        self.rename_suffix = rename_suffix
 
 
     def fake_run(self):
         '''Doesn't actually run cd-hit. Instead, puts each input sequence into its own cluster. So it's as if cdhit was run, but didn't cluster anything'''
-        cluster_to_name = {}
-        found_names = set()
+        tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
+        tmp_fa = os.path.join(tmpdir, 'cdhit.fa')
+        clusters = {}
         seq_reader = pyfastaq.sequences.file_reader(self.infile)
-        f = pyfastaq.utils.open_file_write(self.outfile)
+        f = pyfastaq.utils.open_file_write(tmp_fa)
+
         for seq in seq_reader:
-            if seq.id in found_names:
+            if seq.id in clusters:
+                pyfastaq.utils.close(f)
+                shutil.rmtree(tmpdir)
                 raise Error('Sequence name "' + seq.id + '" not unique. Cannot continue')
-            found_names.add(seq.id)
-            cluster_number = str(len(cluster_to_name))
-            cluster_to_name[cluster_number] = {seq.id}
-            seq.id = cluster_number
+
+            clusters[seq.id] = {seq.id}
             print(seq, file=f)
 
         pyfastaq.utils.close(f)
-        return cluster_to_name
+        clusters = self._rename_clusters(clusters, tmp_fa, self.outfile, rename_suffix=self.rename_suffix)
+        shutil.rmtree(tmpdir)
+        return clusters
 
 
-    def run(self):
-        tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
-        cdhit_fasta = os.path.join(tmpdir, 'cdhit')
-        cluster_info_outfile = cdhit_fasta + '.bak.clstr'
-        infile_renamed = os.path.join(tmpdir, 'input.renamed.fa')
+    @staticmethod
+    def _load_user_clusters_file(filename):
+        f = pyfastaq.utils.open_file_read(filename)
+        seq_to_cluster = {}
+        for line in f:
+            data = line.rstrip().split()
 
-        # cd-hit truncates all names to 19 bases in its report of which
-        # sequences belong to which clusters. So need to temporarily
-        # rename all sequences to have short enough names. Grrr.
-        new_to_old_name = self._enumerate_fasta(self.infile, infile_renamed)
+            for seq_name in data:
+                if seq_name in seq_to_cluster:
+                    pyfastaq.utils.close(f)
+                    raise Error('Error reading clusters file. The sequence "' + seq_name + '" was found more than once in the file ' + filename)
+                seq_to_cluster[seq_name] = data[0]
 
-        cmd = ' '.join([
-            'cd-hit-est',
-            '-i', infile_renamed,
-            '-o', cdhit_fasta,
-            '-c', str(self.seq_identity_threshold),
-            '-T', str(self.threads),
-            '-s', str(self.length_diff_cutoff),
-            '-bak 1',
-        ])
+        pyfastaq.utils.close(f)
+        return seq_to_cluster
 
-        common.syscall(cmd, verbose=self.verbose)
 
-        cluster_representatives = self._get_ids(cdhit_fasta)
-        clusters, cluster_rep_to_cluster = self._parse_cluster_info_file(cluster_info_outfile, new_to_old_name, cluster_representatives)
-        self._rename_fasta(cdhit_fasta, self.outfile, cluster_rep_to_cluster)
-        shutil.rmtree(tmpdir)
-        return clusters
+    def run_get_clusters_from_file(self, infile):
+        '''Instead of running cdhit, gets the clusters info from the given file.
+           Each line of the file lists the whitespace-separated names of the sequences
+           in one cluster; the first name on each line is used as the cluster name'''
+        seq_to_cluster = self._load_user_clusters_file(infile)
+        cluster_names = set(seq_to_cluster.values())
+        tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
+        tmp_fa = os.path.join(tmpdir, 'cdhit.fa')
+        clusters = {}
+        seq_reader = pyfastaq.sequences.file_reader(self.infile)
+        f = pyfastaq.utils.open_file_write(tmp_fa)
 
+        for seq in seq_reader:
+            if seq.id in clusters and seq.id in clusters[seq.id]:
+                pyfastaq.utils.close(f)
+                shutil.rmtree(tmpdir)
+                raise Error('Sequence name "' + seq.id + '" not unique. Cannot continue')
 
-    def _enumerate_fasta(self, infile, outfile):
-        rename_file = outfile + '.tmp.rename_info'
-        assert not os.path.exists(rename_file)
-        pyfastaq.tasks.enumerate_names(infile, outfile, rename_file=rename_file)
+            if seq.id not in seq_to_cluster:
+                raise Error('Error forcing cdhit clustering. Found sequence ' + seq.id + ' in FASTA file, but not in provided clusters info from file ' + infile)
 
-        with open(rename_file) as f:
-            lines = [x.rstrip().split('\t') for x in f.readlines() if x != '#old\tnew\n']
-            new_to_old_name = {x[1]: x[0] for x in lines}
-            if len(lines) != len(new_to_old_name):
-                raise Error('Sequence names in input file not unique! Cannot continue')
+            cluster = seq_to_cluster[seq.id]
+            if cluster not in clusters:
+                clusters[cluster] = set()
 
-        os.unlink(rename_file)
-        return new_to_old_name
+            clusters[cluster].add(seq.id)
+            if seq.id in cluster_names:
+                print(seq, file=f)
 
+        pyfastaq.utils.close(f)
+        clusters = self._rename_clusters(clusters, tmp_fa, self.outfile, rename_suffix=self.rename_suffix)
+        shutil.rmtree(tmpdir)
+        return clusters
 
-    def _rename_fasta(self, infile, outfile, names_dict):
-        seq_reader = pyfastaq.sequences.file_reader(infile)
-        f = pyfastaq.utils.open_file_write(outfile)
-        for seq in seq_reader:
-            seq.id = names_dict[seq.id]
-            print(seq, file=f)
 
-        pyfastaq.utils.close(f)
+    def _get_ids(self, infile):
+        seq_reader = pyfastaq.sequences.file_reader(infile)
+        return set([seq.id for seq in seq_reader])
 
 
-    def _parse_cluster_info_file(self, infile, names_dict, cluster_representatives):
+    @staticmethod
+    def _parse_cluster_info_file(infile, cluster_representatives):
         f = pyfastaq.utils.open_file_read(infile)
-        clusters = {}
-        cluster_representative_to_cluster_number = {}
+        cluster_sets = {}
+        found_representatives = {}  # store cluster number -> representative name
+
         for line in f:
             data = line.rstrip().split()
-            cluster = data[0]
             seqname = data[2]
             if not (seqname.startswith('>') and seqname.endswith('...')):
-                raise Error('Unexpected format of sequence name in line:\n' + line)
+                raise Error('Unexpected format of line from cdhit output file "' + infile + '". Line is:\n' + line)
             seqname = seqname[1:-3]
 
-            if seqname in cluster_representatives:
-                cluster_representative_to_cluster_number[seqname] = cluster
+            cluster_number = int(data[0]) # this is the cluster number used by cdhit
+            if cluster_number not in cluster_sets:
+                cluster_sets[cluster_number] = set()
 
-            seqname = names_dict[seqname]
+            cluster_sets[cluster_number].add(seqname)
 
-            if cluster not in clusters:
-                clusters[cluster] = set()
+            if data[3] == '*':
+                found_representatives[cluster_number] = seqname
 
-            if seqname in clusters[cluster]:
-                raise Error('Duplicate name "' + seqname + '" found in cluster ' + str(cluster))
+        pyfastaq.utils.close(f)
 
-            clusters[cluster].add(seqname)
+        if set(found_representatives.values()) != cluster_representatives:
+            raise Error('Mismatch in cdhit output sequence names between fasta file and clusters file. Cannot continue')
 
-        pyfastaq.utils.close(f)
+        clusters = {}
+        for cluster_number, cluster_name in found_representatives.items():
+            clusters[cluster_name] = cluster_sets[cluster_number]
 
-        return clusters, cluster_representative_to_cluster_number
+        return clusters
 
 
-    def _get_ids(self, infile):
-        seq_reader = pyfastaq.sequences.file_reader(infile)
-        return set([seq.id for seq in seq_reader])
+    @staticmethod
+    def _rename_clusters(clusters_dict, infile, outfile, rename_suffix='x'):
+        new_clusters_dict = {}
+        freader = pyfastaq.sequences.file_reader(infile)
+        f_out = pyfastaq.utils.open_file_write(outfile)
+
+        for seq in freader:
+            original_name = seq.id
+            assert original_name in clusters_dict
+            new_name = original_name.split('.')[0] + '.' + rename_suffix
+
+            if new_name in new_clusters_dict:
+                suffix = 2
+                while new_name + '.' + str(suffix) in new_clusters_dict:
+                    suffix += 1
+                new_name += '.' + str(suffix)
+
+            new_clusters_dict[new_name] = clusters_dict[original_name]
+            seq.id = new_name
+            print(seq, file=f_out)
+
+        pyfastaq.utils.close(f_out)
+
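+        # e.g. with rename_suffix 'x' (hypothetical names): 'gene1.abc' becomes
+        # 'gene1.x'; if a second sequence would also be named 'gene1.x', it
+        # becomes 'gene1.x.2'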
+        return new_clusters_dict
+
+
+    def run(self):
+        tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
+        cdhit_fasta = os.path.join(tmpdir, 'cdhit')
+        cluster_info_outfile = cdhit_fasta + '.bak.clstr'
+
+        cmd = ' '.join([
+            self.cd_hit_est,
+            '-i', self.infile,
+            '-o', cdhit_fasta,
+            '-c', str(self.seq_identity_threshold),
+            '-T', str(self.threads),
+            '-s', str(self.length_diff_cutoff),
+            '-d 0',
+            '-bak 1',
+        ])
+
+        common.syscall(cmd, verbose=self.verbose)
+        cluster_representatives = self._get_ids(cdhit_fasta)
+        clusters = self._parse_cluster_info_file(cluster_info_outfile, cluster_representatives)
+        clusters = self._rename_clusters(clusters, cdhit_fasta, self.outfile, rename_suffix=self.rename_suffix)
+
+        shutil.rmtree(tmpdir)
+        return clusters
 
diff --git a/ariba/cluster.py b/ariba/cluster.py
index f8023ca..5ee9db7 100644
--- a/ariba/cluster.py
+++ b/ariba/cluster.py
@@ -1,30 +1,38 @@
+import signal
 import os
-import copy
-from operator import itemgetter
-import sys
+import atexit
+import random
+import math
 import shutil
-import pysam
-import operator
+import sys
 import pyfastaq
-import pymummer
-from ariba import common, mapping, bam_parse, flag, faidx
+from ariba import assembly, assembly_compare, assembly_variants, bam_parse, best_seq_chooser, external_progs, flag, mapping, report, samtools_variants
 
 class Error (Exception): pass
 
+unittest = False
 
 class Cluster:
     def __init__(self,
       root_dir,
       name,
-      assembly_kmer=0,
-      assembler='velvet',
+      refdata,
+      total_reads,
+      total_reads_bases,
+      fail_file=None,
+      read_store=None,
+      reference_names=None,
+      logfile=None,
+      assembly_coverage=50,
+      assembly_kmer=21,
+      assembler='spades',
       max_insert=1000,
       min_scaff_depth=10,
       nucmer_min_id=90,
-      nucmer_min_len=50,
-      nucmer_breaklen=50,
-      sspace_k=20,
+      nucmer_min_len=20,
+      nucmer_breaklen=200,
       reads_insert=500,
+      sspace_k=20,
       sspace_sd=0.4,
       threads=1,
       bcf_min_dp=10,
@@ -33,35 +41,42 @@ class Cluster:
       bcf_min_qual=20,
       assembled_threshold=0.95,
       unique_threshold=0.03,
-      verbose=False,
-      bcftools_exe='bcftools',
-      gapfiller_exe='GapFiller.pl',
-      samtools_exe='samtools',
-      bowtie2_exe='bowtie2',
+      max_gene_nt_extend=30,
       bowtie2_preset='very-sensitive-local',
-      smalt_exe='smalt',
-      spades_exe='spades.py',
-      sspace_exe='SSPACE_Basic_v2.0.pl',
-      velvet_exe='velvet', # prefix of velvet{g,h}
-      spades_other=None,
-      clean=1,
+      spades_other_options=None,
+      clean=True,
+      extern_progs=None,
+      random_seed=42,
     ):
-
         self.root_dir = os.path.abspath(root_dir)
-        if not os.path.exists(self.root_dir):
-            raise Error('Directory ' + self.root_dir + ' not found. Cannot continue')
-
+        self.read_store = read_store
+        self.refdata = refdata
         self.name = name
-        self.reads1 = os.path.join(self.root_dir, 'reads_1.fq')
-        self.reads2 = os.path.join(self.root_dir, 'reads_2.fq')
-        self.gene_fa = os.path.join(self.root_dir, 'gene.fa')
-        self.genes_fa = os.path.join(self.root_dir, 'genes.fa')
-        self.gene_bam = os.path.join(self.root_dir, 'gene.reads_mapped.bam')
+        self.fail_file = fail_file
+        self.reference_fa = os.path.join(self.root_dir, 'reference.fa')
+        self.reference_names = reference_names
+        self.all_reads1 = os.path.join(self.root_dir, 'reads_1.fq')
+        self.all_reads2 = os.path.join(self.root_dir, 'reads_2.fq')
+        self.references_fa = os.path.join(self.root_dir, 'references.fa')
+
+        if os.path.exists(self.root_dir):
+            self._input_files_exist()
+
+        self.total_reads = total_reads
+        self.total_reads_bases = total_reads_bases
+        self.logfile = logfile
+        self.assembly_coverage = assembly_coverage
+        self.assembly_kmer = assembly_kmer
+        self.assembler = assembler
+        self.sspace_k = sspace_k
+        self.sspace_sd = sspace_sd
+        self.reads_insert = reads_insert
+        self.spades_other_options = spades_other_options
 
-        for fname in [self.reads1, self.reads2, self.genes_fa]:
-            if not os.path.exists(fname):
-                raise Error('File ' + fname + ' not found. Cannot continue')
+        self.reads_for_assembly1 = os.path.join(self.root_dir, 'reads_for_assembly_1.fq')
+        self.reads_for_assembly2 = os.path.join(self.root_dir, 'reads_for_assembly_2.fq')
 
+        self.ref_sequence = None
 
         self.max_insert = max_insert
         self.min_scaff_depth = min_scaff_depth
@@ -69,1037 +84,353 @@ class Cluster:
         self.nucmer_min_id = nucmer_min_id
         self.nucmer_min_len = nucmer_min_len
         self.nucmer_breaklen = nucmer_breaklen
-        self.assembly_vs_gene_coords = os.path.join(self.root_dir, 'assembly_vs_gene.coords')
 
         self.bcf_min_dp = bcf_min_dp
         self.bcf_min_dv = bcf_min_dv
         self.bcf_min_dv_over_dp = bcf_min_dv_over_dp
         self.bcf_min_qual = bcf_min_qual
 
-        self._set_assembly_kmer(assembly_kmer)
-        self.assembler = assembler
-        assert self.assembler in ['velvet', 'spades']
-        self.spades_exe = spades_exe
-        self.spades_other = spades_other
-
-        self.bcftools_exe = bcftools_exe
-
-        self.sspace_exe = shutil.which(sspace_exe)
-        if self.sspace_exe is None:
-            self.gapfiller_exe = None
-        else:
-            self.sspace_exe = os.path.realpath(self.sspace_exe) # otherwise sspace dies loading packages
-            self.gapfiller_exe = shutil.which(gapfiller_exe)
-            if self.gapfiller_exe is not None:
-                self.gapfiller_exe = os.path.realpath(self.gapfiller_exe) # otherwise gapfiller dies loading packages
-
-        self.samtools_exe = samtools_exe
-        self.smalt_exe = smalt_exe
-        self.bowtie2_exe = bowtie2_exe
         self.bowtie2_preset = bowtie2_preset
 
-        if self.assembler == 'velvet':
-            self.velveth = velvet_exe + 'h'
-            self.velvetg = velvet_exe + 'g'
-
-        self.sspace_k = sspace_k
-        self.reads_insert = reads_insert
-        self.sspace_sd = sspace_sd
-
         self.threads = threads
-        self.verbose = verbose
         self.assembled_threshold = assembled_threshold
         self.unique_threshold = unique_threshold
+        self.max_gene_nt_extend = max_gene_nt_extend
         self.status_flag = flag.Flag()
-        self.flag_file = os.path.join(self.root_dir, 'flag')
         self.clean = clean
 
         self.assembly_dir = os.path.join(self.root_dir, 'Assembly')
-        try:
-            os.mkdir(self.assembly_dir)
-        except:
-            raise Error('Error mkdir ' + self.assembly_dir)
-        self.assembler_dir = os.path.join(self.assembly_dir, 'Assemble')
-        self.assembly_contigs = os.path.join(self.assembly_dir, 'contigs.fa')
-        self.scaffold_dir = os.path.join(self.assembly_dir, 'Scaffold')
-        self.scaffolder_scaffolds = os.path.join(self.assembly_dir, 'scaffolds.fa')
-        self.gapfill_dir = os.path.join(self.assembly_dir, 'Gapfill')
-        self.gapfilled_scaffolds = os.path.join(self.assembly_dir, 'scaffolds.gapfilled.fa')
         self.final_assembly_fa = os.path.join(self.root_dir, 'assembly.fa')
         self.final_assembly_bam = os.path.join(self.root_dir, 'assembly.reads_mapped.bam')
         self.final_assembly_read_depths = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.read_depths.gz')
         self.final_assembly_vcf = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.vcf')
-        self.final_assembled_genes_fa = os.path.join(self.root_dir, 'assembly.genes.fa')
-        self.final_assembly = {}
+        self.samtools_vars_prefix = self.final_assembly_bam
+        self.assembly_compare = None
+        self.assembly_compare_prefix = os.path.join(self.root_dir, 'assembly_compare')
+
         self.mummer_variants = {}
         self.variant_depths = {}
         self.percent_identities = {}
 
-
-    def _get_read_counts(self):
-        count1 = pyfastaq.tasks.count_sequences(self.reads1)
-        count2 = pyfastaq.tasks.count_sequences(self.reads2)
-        if count1 == count2:
-            return count1 + count2
+        # The log filehandle self.log_fh is set at the start of the run() method.
+        # Lots of other methods use self.log_fh. But for unit testing, run() isn't
+        # run. So we need to set this to something for unit testing.
+        # On the other hand, setting it here breaks a real run of ARIBA because
+        # multiprocessing complains with the error:
+        # TypeError: cannot serialize '_io.TextIOWrapper' object.
+        # Hence the following two lines...
+        if unittest:
+            self.log_fh = sys.stdout
         else:
-            raise Error('Different number of fwd/rev reads in cluster ' + self.name + '! Cannot continue')
-
-
-    def _get_total_alignment_score(self, gene_name):
-        tmp_bam = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.bam')
-        assert not os.path.exists(tmp_bam)
-        tmp_fa = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.ref.fa')
-        assert not os.path.exists(tmp_fa)
-        faidx.write_fa_subset([gene_name], self.genes_fa, tmp_fa, samtools_exe=self.samtools_exe, verbose=self.verbose)
-        mapping.run_bowtie2(
-            self.reads1,
-            self.reads2,
-            tmp_fa,
-            tmp_bam[:-4],
-            threads=self.threads,
-            samtools=self.samtools_exe,
-            bowtie2=self.bowtie2_exe,
-            bowtie2_preset=self.bowtie2_preset,
-            verbose=self.verbose,
-        )
+            atexit.register(self._atexit)
+            self.log_fh = None
 
-        score = mapping.get_total_alignment_score(tmp_bam)
-        os.unlink(tmp_bam)
-        os.unlink(tmp_fa)
-        os.unlink(tmp_fa + '.fai')
-        return score
-
-
-    def _get_best_gene_by_alignment_score(self):
-        cluster_size = pyfastaq.tasks.count_sequences(self.genes_fa)
-        if cluster_size == 1:
-            seqs = {}
-            pyfastaq.tasks.file_to_dict(self.genes_fa, seqs)
-            assert len(seqs) == 1
-            gene_name = list(seqs.values())[0].id
-            if self.verbose:
-                print('No need to choose gene for this cluster because only has one gene:', gene_name)
-            return gene_name
-
-        if self.verbose:
-            print('\nChoosing best gene from cluster of', cluster_size, 'genes...')
-        file_reader = pyfastaq.sequences.file_reader(self.genes_fa)
-        best_score = 0
-        best_gene_name = None
-        for seq in file_reader:
-            score = self._get_total_alignment_score(seq.id)
-            if self.verbose:
-                print('Total alignment score for gene', seq.id, 'is', score)
-            if score > best_score:
-                best_score = score
-                best_gene_name = seq.id
-
-        if self.verbose:
-            print('Best gene is', best_gene_name, 'with total alignment score of', best_score)
-            print()
-
-        return best_gene_name
-
-
-    def _choose_best_gene(self):
-        gene_name = self._get_best_gene_by_alignment_score()
-        if gene_name is None:
-            return None
-        faidx.write_fa_subset([gene_name], self.genes_fa, self.gene_fa, samtools_exe=self.samtools_exe, verbose=self.verbose)
-        seqs = {}
-        pyfastaq.tasks.file_to_dict(self.gene_fa, seqs)
-        assert len(seqs) == 1
-        return list(seqs.values())[0]
-
-
-    def _set_assembly_kmer(self, k):
-        '''If the kmer is not given, uses 2/3 of the mean read length (using the first 1000 forward and first 1000 reverse reads)'''
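-        # e.g. mean fwd/rev read lengths of 100 and 110 (hypothetical) give
-        # round(210 / 3) = 70, which is bumped to 71 because the kmer must be odd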
-        if k == 0:
-            read_length1 = pyfastaq.tasks.mean_length(self.reads1, limit=1000)
-            read_length2 = pyfastaq.tasks.mean_length(self.reads2, limit=1000)
-            self.assembly_kmer = round((read_length1 + read_length2) / 3)
-            if self.assembly_kmer % 2 == 0:
-                self.assembly_kmer += 1
+        if extern_progs is None:
+            self.extern_progs = external_progs.ExternalProgs()
         else:
-            self.assembly_kmer = k
-
-
-    def _assemble_with_velvet(self):
-        # map reads to reference gene to make BAM input to velvet columbus
-        mapping.run_bowtie2(
-            self.reads1,
-            self.reads2,
-            self.gene_fa,
-            self.gene_bam[:-4],
-            threads=self.threads,
-            sort=True,
-            samtools=self.samtools_exe,
-            bowtie2=self.bowtie2_exe,
-            bowtie2_preset=self.bowtie2_preset,
-            verbose=self.verbose,
-        )
+            self.extern_progs = extern_progs
 
-        cmd = ' '.join([
-            self.velveth,
-            self.assembler_dir,
-            str(self.assembly_kmer),
-            '-reference', self.gene_fa,
-            '-shortPaired -bam', self.gene_bam[:-4] + '.unsorted.bam'
-        ])
-
-        cwd = os.getcwd()
-        os.chdir(self.assembly_dir)
-        velvet_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'contigs.fa')
-
-        self.velveth_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)
-        if not self.velveth_ok:
-            with open('velveth_errors', 'w') as f:
-                print(err, file=f)
-            self.status_flag.add('assembly_fail')
-            os.chdir(cwd)
-            return
+        self.random_seed = random_seed
+        wanted_signals = [signal.SIGABRT, signal.SIGINT, signal.SIGSEGV, signal.SIGTERM]
+        for s in wanted_signals:
+            signal.signal(s, self._receive_signal)
 
-        cmd = ' '.join([
-            self.velvetg,
-            self.assembler_dir,
-            '-ins_length', str(int(self.reads_insert)),
-            '-scaffolding no',
-            '-exp_cov auto',
-            '-very_clean yes',
-            '-cov_cutoff auto',
-        ])
-
-        self.assembled_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)
-        if self.assembled_ok:
-            os.symlink(velvet_contigs, os.path.basename(self.assembly_contigs))
-        else:
-            with open('velvetg_errors', 'w') as f:
-                print(err, file=f)
-            self.status_flag.add('assembly_fail')
 
-        os.chdir(cwd)
+    def _atexit(self):
+        if self.log_fh is not None:
+            pyfastaq.utils.close(self.log_fh)
+            self.log_fh = None
 
 
-    def _assemble_with_spades(self, unittest=False):
-        cmd = ' '.join([
-            self.spades_exe,
-            '-1', self.reads1,
-            '-2', self.reads2,
-            '-o', self.assembler_dir,
-            '-k', str(self.assembly_kmer),
-            '--threads', str(self.threads),
-            '--untrusted-contigs', self.gene_fa,
-        ])
-        if self.spades_other is not None:
-            cmd += ' ' + self.spades_other
+    def _receive_signal(self, signum, stack):
+        print('Signal', signum, 'received in cluster', self.name + '... Stopping!', file=sys.stderr, flush=True)
+        if self.log_fh is not None:
+            pyfastaq.utils.close(self.log_fh)
+            self.log_fh = None
+        if self.fail_file is not None:
+            with open(self.fail_file, 'w') as f:
+                pass
+        sys.exit(1)
 
-        cwd = os.getcwd()
-        os.chdir(self.assembly_dir)
-        spades_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'scaffolds.fasta')
 
-        if unittest:
-            os.mkdir(self.assembler_dir)
-            open(spades_contigs, 'w').close()
-            self.assembled_ok = True
-        else:
-            self.assembled_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)
-        if self.assembled_ok:
-            os.symlink(spades_contigs, os.path.basename(self.assembly_contigs))
-        else:
-            with open('spades_errors', 'w') as f:
-                print(err, file=f)
-            self.status_flag.add('assembly_fail')
+    def _input_files_exist(self):
+        assert self.read_store is None
+        if not (os.path.exists(self.all_reads1) and os.path.exists(self.all_reads2)):
+            raise Error('Error making cluster. Reads files not found')
+        if not os.path.exists(self.references_fa):
+            raise Error('Error making cluster. References fasta not found')
 
-        os.chdir(cwd)
 
+    def _set_up_input_files(self):
+        if os.path.exists(self.root_dir):
+            self._input_files_exist()
+        else:
+            assert self.read_store is not None
+            assert self.reference_names is not None
+            try:
+                os.mkdir(self.root_dir)
+            except:
+                raise Error('Error making directory ' + self.root_dir)
+            self.read_store.get_reads(self.name, self.all_reads1, self.all_reads2)
+            self.refdata.write_seqs_to_fasta(self.references_fa, self.reference_names)
 
-    def _scaffold_with_sspace(self):
-        if not os.path.exists(self.assembly_contigs):
-            raise Error('Cannot scaffold because contigs file not found: ' + self.assembly_contigs)
 
-        try:
-            os.mkdir(self.scaffold_dir)
-        except:
-            raise Error('Error mkdir ' + self.scaffold_dir)
+    def _clean_file(self, filename):
+        if self.clean:
+            print('Deleting file', filename, file=self.log_fh)
+            os.unlink(filename)
 
-        cwd = os.getcwd()
 
-        if self.sspace_exe is None:
-            os.chdir(self.assembly_dir)
-            os.symlink(os.path.basename(self.assembly_contigs), os.path.basename(self.scaffolder_scaffolds))
-            os.chdir(cwd)
+    def _clean(self):
+        if not self.clean:
+            print('   ... not deleting anything because --noclean used', file=self.log_fh, flush=True)
             return
 
-        os.chdir(self.scaffold_dir)
-        lib_file = 'lib'
-        with open(lib_file, 'w') as f:
-            print('LIB', self.reads1, self.reads2, int(self.reads_insert), self.sspace_sd, 'FR', file=f)
 
-        cmd = ' '.join([
-            'perl', self.sspace_exe,
-            '-k', str(self.sspace_k),
-            '-l', lib_file,
-            '-s', self.assembly_contigs
-        ])
-
-        sspace_scaffolds = os.path.abspath('standard_output.final.scaffolds.fasta')
-        common.syscall(cmd, verbose=self.verbose)
-        os.chdir(self.assembly_dir)
-        os.symlink(os.path.relpath(sspace_scaffolds), os.path.basename(self.scaffolder_scaffolds))
-        os.chdir(cwd)
-
-
-    def _has_gaps_to_fill(self, filename):
-        seq_reader = pyfastaq.sequences.file_reader(filename)
-        for seq in seq_reader:
-            if 'n' in seq.seq or 'N' in seq.seq:
-                return True
-        return False
+        to_delete = [
+            'assembly.fa',
+            'assembly.fa.fai',
+            'assembly_compare.nucmer.coords',
+            'assembly_compare.nucmer.coords.snps',
+            'assembly.reads_mapped.bam.bai',
+            'assembly.reads_mapped.bam.vcf',
+            'assembly.reads_mapped.bam',
+            'assembly.reads_mapped.bam.read_depths.gz',
+            'assembly.reads_mapped.bam.read_depths.gz.tbi',
+            'reads_1.fq',
+            'reads_2.fq',
+            'reference.fa',
+        ]
 
+        to_delete = [os.path.join(self.root_dir, x) for x in to_delete]
 
-    def _gap_fill_with_gapfiller(self):
-        if not os.path.exists(self.scaffolder_scaffolds):
-            raise Error('Cannot gap fill because scaffolds file not found: ' + self.scaffolder_scaffolds)
+        for filename in to_delete:
+            if os.path.exists(filename):
+                self._clean_file(filename)
 
 
-        cwd = os.getcwd()
+    @staticmethod
+    def _number_of_reads_for_assembly(reference_fa, insert_size, total_bases, total_reads, coverage):
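+        '''Returns the number of reads needed to give the wanted coverage of the
+           reference, which is padded by one insert size at each end. The answer
+           is rounded up to an even number, to keep reads paired. Worked example
+           (hypothetical numbers): a 1000bp reference with insert_size=500 is
+           padded to 2000bp; with mean read length total_bases/total_reads = 150
+           and coverage=100, wanted_bases = 200000 and
+           wanted_reads = ceil(200000 / 150) = 1334 (already even).'''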
+        file_reader = pyfastaq.sequences.file_reader(reference_fa)
+        ref_length = sum([len(x) for x in file_reader])
+        assert ref_length > 0
+        ref_length += 2 * insert_size
+        mean_read_length = total_bases / total_reads
+        wanted_bases = coverage * ref_length
+        wanted_reads = int(math.ceil(wanted_bases / mean_read_length))
+        wanted_reads += wanted_reads % 2
+        return wanted_reads
 
-        if self.gapfiller_exe is None or not self._has_gaps_to_fill(self.scaffolder_scaffolds):
-            self._rename_scaffolds(self.scaffolder_scaffolds, self.gapfilled_scaffolds)
-            return
 
-        try:
-            os.mkdir(self.gapfill_dir)
-        except:
-            raise Error('Error mkdir ' + self.gapfill_dir)
-
-        os.chdir(self.gapfill_dir)
-        lib_file = 'lib'
-        with open(lib_file, 'w') as f:
-            print('LIB', 'bwa', self.reads1, self.reads2, self.reads_insert, self.sspace_sd, 'FR', file=f)
-
-        cmd = ' '.join([
-            'perl', self.gapfiller_exe,
-            '-l', lib_file,
-            '-s', self.scaffolder_scaffolds
-        ])
-
-        gapfilled_scaffolds = os.path.join(self.gapfill_dir, 'standard_output', 'standard_output.gapfilled.final.fa')
-        common.syscall(cmd, verbose=self.verbose)
-        self._rename_scaffolds(gapfilled_scaffolds, self.gapfilled_scaffolds)
-        os.chdir(cwd)
-
-
-    def _rename_scaffolds(self, infile, outfile):
-        freader = pyfastaq.sequences.file_reader(infile)
-        f_out = pyfastaq.utils.open_file_write(outfile)
-        i = 1
-        for scaff in freader:
-            scaff.id = self.gene.id + '.scaffold.' + str(i)
-            i += 1
-            print(scaff, file=f_out)
-        pyfastaq.utils.close(f_out)
-
-
-    def _run_nucmer(self, qry, outfile, show_snps=False):
-        pymummer.nucmer.Runner(
-            self.gene_fa,
-            qry,
-            outfile,
-            min_id=self.nucmer_min_id,
-            min_length=self.nucmer_min_len,
-            breaklen=self.nucmer_breaklen,
-            show_snps=show_snps
-        ).run()
-
-
-    def _fix_contig_orientation(self):
-        if not os.path.exists(self.gapfilled_scaffolds):
-            raise Error('Cannot fix orientation of assembly contigs because file not found: ' + self.gapfilled_scaffolds)
-
-        tmp_coords = os.path.join(self.root_dir, 'tmp.coords')
-        self._run_nucmer(self.gapfilled_scaffolds, tmp_coords)
-
-        to_revcomp = set()
-        not_revcomp = set()
-        file_reader = pymummer.coords_file.reader(tmp_coords)
-        for hit in file_reader:
-            if hit.on_same_strand():
-                not_revcomp.add(hit.qry_name)
-            else:
-                to_revcomp.add(hit.qry_name)
-
-        os.unlink(tmp_coords)
-        in_both = to_revcomp.intersection(not_revcomp)
-        for name in in_both:
-            print('WARNING: scaffold has hits to both strands of the gene. Interpretation of any variants cannot be trusted for this scaffold:', name, file=sys.stderr)
-            to_revcomp.remove(name)
-            self.status_flag.add('hit_both_strands')
-
-        f = pyfastaq.utils.open_file_write(self.final_assembly_fa)
-        seq_reader = pyfastaq.sequences.file_reader(self.gapfilled_scaffolds)
-        for seq in seq_reader:
-            if seq.id in to_revcomp:
-                seq.revcomp()
-            print(seq, file=f)
-        pyfastaq.utils.close(f)
-
-
-    def _load_final_contigs(self):
-        if not os.path.exists(self.final_assembly_fa):
-            raise Error('Cannot load final assembled contigs because file not found: ' + self.final_assembly_fa)
-
-        self.final_assembly = {}
-        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.final_assembly)
-
-
-    def _parse_assembly_bam(self):
-        if not os.path.exists(self.final_assembly_bam):
-            raise Error('File not found: ' + self.final_assembly_bam)
-
-        bam_parser = bam_parse.Parser(self.final_assembly_bam, self.final_assembly)
-        bam_parser.parse()
-        bam_parser.write_files(self.final_assembly_bam)
-        if not bam_parser.scaff_graph_is_consistent(self.min_scaff_depth, self.max_insert):
-            self.status_flag.add('scaffold_graph_bad')
-
-
-    def _parse_assembly_vs_gene_coords(self):
-        file_reader = pymummer.coords_file.reader(self.assembly_vs_gene_coords)
-        self.nucmer_hits = {}
-        for hit in file_reader:
-            assert hit.ref_name == self.gene.id
-            contig = hit.qry_name
-            if contig not in self.nucmer_hits:
-                self.nucmer_hits[contig] = []
-            self.nucmer_hits[contig].append(copy.copy(hit))
-
-
-    def _nucmer_hits_to_percent_identity(self):
-        self.percent_identities = {}
-        for contig in self.nucmer_hits:
-            product_sum = 0
-            length_sum = 0
-            for hit in self.nucmer_hits[contig]:
-                product_sum += hit.hit_length_qry * hit.percent_identity
-                length_sum += hit.hit_length_qry
-            assert length_sum > 0
-            self.percent_identities[contig] = round(product_sum / length_sum, 2)
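-        # i.e. a length-weighted mean of the hit identities: e.g. (hypothetical)
-        # hits of length 100 at 95% id and length 300 at 99% id give
-        # (100*95 + 300*99) / 400 = 98.0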
-
-
-    def _nucmer_hits_to_scaff_coords(self):
-        coords = {}
-        for l in self.nucmer_hits.values():
-            for hit in l:
-                if hit.qry_name not in coords:
-                    coords[hit.qry_name] = []
-                coords[hit.qry_name].append(hit.qry_coords())
-
-        for scaff in coords:
-            pyfastaq.intervals.merge_overlapping_in_list(coords[scaff])
-
-        return coords
-
-
-    def _nucmer_hits_to_ref_coords(self, contig=None):
-        coords = []
-        if contig is None:
-            keys = list(self.nucmer_hits.keys())
+    @staticmethod
+    def _make_reads_for_assembly(number_of_wanted_reads, total_reads, reads_in1, reads_in2, reads_out1, reads_out2, random_seed=None):
+        '''Makes fastq files that are a random subset of the input files. Returns
+           the total number of reads in the output files. If the number of wanted
+           reads is >= total reads, then just makes symlinks instead of making
+           new copies of the input files.'''
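+        # A note on the sampling below: each read pair is kept independently with
+        # probability of about percent_wanted / 100, so the number of reads
+        # written is close to, but not exactly, number_of_wanted_reads.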
+        random.seed(random_seed)
+
+        if number_of_wanted_reads < total_reads:
+            reads_written = 0
+            percent_wanted = 100 * number_of_wanted_reads / total_reads
+            file_reader1 = pyfastaq.sequences.file_reader(reads_in1)
+            file_reader2 = pyfastaq.sequences.file_reader(reads_in2)
+            out1 = pyfastaq.utils.open_file_write(reads_out1)
+            out2 = pyfastaq.utils.open_file_write(reads_out2)
+
+            for read1 in file_reader1:
+                try:
+                    read2 = next(file_reader2)
+                except StopIteration:
+                    pyfastaq.utils.close(out1)
+                    pyfastaq.utils.close(out2)
+                    raise Error('Error subsetting reads. No mate found for read ' + read1.id)
+
+                if random.randint(0, 100) <= percent_wanted:
+                    print(read1, file=out1)
+                    print(read2, file=out2)
+                    reads_written += 2
+
+            pyfastaq.utils.close(out1)
+            pyfastaq.utils.close(out2)
+            return reads_written
         else:
-            keys = [contig]
-
-        for key in keys:
-            coords += [hit.ref_coords() for hit in self.nucmer_hits[key]]
-        coords.sort()
-        return coords
-
+            os.symlink(reads_in1, reads_out1)
+            os.symlink(reads_in2, reads_out2)
+            return total_reads
 
-    def _nucmer_hits_to_gene_cov_per_contig(self):
-        cov = {}
-        for contig in self.nucmer_hits:
-            coords = self._nucmer_hits_to_ref_coords(contig)
-            pyfastaq.intervals.merge_overlapping_in_list(coords)
-            cov[contig] = pyfastaq.intervals.length_sum_from_list(coords)
-        return cov
 
+    def run(self):
+        self._set_up_input_files()
 
-    @staticmethod
-    def _nucmer_hits_to_assembled_gene_sequences(nucmer_hits, ref_gene, assembly, outfile):
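-        # Output sequence names encode each match as
-        # <gene id>.<ref start>.<ref end>.<contig>.<qry start>.<qry end>.<strand>
-        # (coordinates 1-based), with '.complete' appended when the hit covers
-        # the whole reference gene.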
-        f = pyfastaq.utils.open_file_write(outfile)
-
-        for contig in sorted(nucmer_hits):
-            for hit in nucmer_hits[contig]:
-                qry_coords = hit.qry_coords()
-                fa = assembly[hit.qry_name].subseq(qry_coords.start, qry_coords.end + 1)
-                if hit.on_same_strand():
-                    strand = '+'
-                else:
-                    fa.revcomp()
-                    strand = '-'
-                ref_coords = hit.ref_coords()
-                fa.id = '.'.join([
-                    ref_gene.id,
-                    str(ref_coords.start + 1),
-                    str(ref_coords.end + 1),
-                    contig,
-                    str(qry_coords.start + 1),
-                    str(qry_coords.end + 1),
-                    strand
-                ])
-
-                if hit.hit_length_ref == hit.ref_length:
-                    fa.id += '.complete'
-
-                print(fa, file=f)
-
-        pyfastaq.utils.close(f)
-
-
-    def _whole_gene_covered_by_nucmer_hits(self):
-        covered = self._nucmer_hits_to_ref_coords()
-        pyfastaq.intervals.merge_overlapping_in_list(covered)
-        return pyfastaq.intervals.length_sum_from_list(covered) / len(self.gene) >= self.assembled_threshold
-
-
-    def _gene_coverage_unique(self):
-        covered = self._nucmer_hits_to_ref_coords()
-        covered.sort()
-        if len(covered) <= 1:
-            return True
-
-        coverage = {}
-        for i in covered:
-            for j in range(i.start, i.end + 1):
-                coverage[j] = coverage.get(j, 0) + 1
-
-        bases_depth_at_least_two = len([1 for x in coverage.values() if x > 1])
-        return bases_depth_at_least_two / len(self.gene) <= self.unique_threshold
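-        # e.g. (hypothetical) hits covering positions 0-9 and 5-14 of a 100bp gene
-        # overlap at 5 bases, so 5/100 = 0.05 of the gene is assembled more than
-        # once, which fails the default unique_threshold of 0.03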
-
-
-    def _gene_covered_by_complete_contig_with_orf(self):
-        for l in self.nucmer_hits.values():
-            for hit in l:
-                if hit.hit_length_ref == len(self.gene):
-                    start = min(hit.qry_start, hit.qry_end)
-                    end = max(hit.qry_start, hit.qry_end)
-                    assembled_gene = pyfastaq.sequences.Fasta('x', self.final_assembly[hit.qry_name][start:end+1])
-                    if (hit.ref_start < hit.ref_end) != (hit.qry_start < hit.qry_end):
-                        assembled_gene.revcomp()
-                    assembled_gene_aa = assembled_gene.translate()
-                    orfs = assembled_gene.orfs()
-                    if len(orfs) == 0:
-                        continue
-
-                    max_orf = orfs[0]
-                    for o in orfs:
-                        if len(o) > len(max_orf):
-                            max_orf = o
-
-                    if len(max_orf) == len(assembled_gene):
-                        return True
-        return False
-
-
-    def _gene_covered_by_at_least_one_full_length_contig(self):
-        for l in self.nucmer_hits.values():
-            for hit in l:
-                if len(hit.ref_coords()) == len(self.gene):
-                    return True
-        return False
-
-
-    def _update_flag_from_nucmer_file(self):
-        if self._whole_gene_covered_by_nucmer_hits():
-            self.status_flag.add('gene_assembled')
-
-        if self._gene_covered_by_at_least_one_full_length_contig():
-            self.status_flag.add('gene_assembled_into_one_contig')
-
-        if not self._gene_coverage_unique():
-            self.status_flag.add('gene_region_assembled_twice')
-
-        if self._gene_covered_by_complete_contig_with_orf():
-            self.status_flag.add('complete_orf')
-
-        if len(self.nucmer_hits) == 1:
-            self.status_flag.add('unique_contig')
-
-
-    def _get_mummer_variants(self):
-        snp_file = self.assembly_vs_gene_coords + '.snps'
-        if not os.path.exists(snp_file):
-            raise Error('File not found ' + snp_file)
-        variants = pymummer.snp_file.get_all_variants(snp_file)
-        self.mummer_variants = {}
+        for fname in [self.all_reads1, self.all_reads2, self.references_fa]:
+            if not os.path.exists(fname):
+                raise Error('File ' + fname + ' not found. Cannot continue')
 
-        if len(variants) == 0:
-            return
+        if self.logfile is None:
+            self.logfile = os.path.join(self.root_dir, 'log.txt')
 
-        variants.sort(key=operator.attrgetter('qry_name'))
-        variants.sort(key=operator.attrgetter('ref_start'))
-
-        for v in variants:
-            if v.qry_name not in self.mummer_variants:
-                self.mummer_variants[v.qry_name] = []
-            self.mummer_variants[v.qry_name].append(v)
-
-        for contig in self.mummer_variants:
-            l = self.mummer_variants[contig]
-            if len(l) > 1:
-                new_l = [[l[0]]]
-                previous_codon_start = self._get_codon_start(0, l[0].ref_start)
-                for variant in l[1:]:
-                    codon_start = self._get_codon_start(0, variant.ref_start)
-                    if codon_start == previous_codon_start:
-                        new_l[-1].append(variant)
-                    else:
-                        new_l.append([variant])
-                        previous_codon_start = codon_start
-                self.mummer_variants[contig] = new_l
-            else:
-                self.mummer_variants[contig] = [l]
-
-
-    def _filter_mummer_variants(self):
-        if len(self.mummer_variants) == 0:
-            return
+        self.log_fh = pyfastaq.utils.open_file_write(self.logfile)
 
-        for contig in self.mummer_variants:
-            variants = self.mummer_variants[contig]
-            for i in range(len(variants)):
-                t = self._get_variant_effect(variants[i])
-                if t is not None and t[0] in ['TRUNC', 'FSHIFT']:
-                    break
-            self.mummer_variants[contig] = variants[:i+1]
-
-
-    def _get_codon_start(self, gene_start, position):
-        assert position >= gene_start
-        while (position - gene_start) % 3 != 0:
-            position -= 1
-        return position
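-        # e.g. _get_codon_start(0, 7) returns 6, since positions 6,7,8 make up one codon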
-
-
-    def _get_variant_effect(self, variants):
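-        # Sketch of return values, assuming a hypothetical reference codon TTA
-        # (Leu) at the start of the gene: TTA->TCA gives ('NONSYN', 'L1S'),
-        # TTA->TTG gives ('SYN', '.'), TTA->TAA gives ('TRUNC', 'L1trunc'),
-        # and a 1bp insertion gives ('FSHIFT', 'L1fs').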
-        if len(variants) == 0:
-            return None
-
-        var_types = [x.var_type for x in variants]
-        if len(set(var_types)) != 1:
-            return None
-
-        var_type = var_types[0]
-
-        assert set([x.ref_name for x in variants]) == set([self.gene.id])
-        codon_starts = [self._get_codon_start(0, x.ref_start) for x in variants]
-        assert len(set(codon_starts)) == 1
-        codon_start = codon_starts[0]
-        aa_start = codon_start // 3
-        ref_codon = pyfastaq.sequences.Fasta('codon', self.gene[codon_start:codon_start+3])
-        ref_aa = ref_codon.translate()
-
-        if var_type == pymummer.variant.SNP:
-            new_codon = list(ref_codon.seq)
-            for v in variants:
-                new_codon[v.ref_start - codon_start] = v.qry_base
-            new_codon = pyfastaq.sequences.Fasta('new', ''.join(new_codon))
-            qry_aa = new_codon.translate()
-
-            if ref_aa.seq == qry_aa.seq:
-                return ('SYN', '.')
-            elif qry_aa.seq == '*':
-                return ('TRUNC', ref_aa.seq + str(aa_start + 1) + 'trunc')
-            else:
-                return ('NONSYN', ref_aa.seq + str(aa_start + 1) + qry_aa.seq)
-        elif var_type in [pymummer.variant.INS, pymummer.variant.DEL]:
-            if len(variants) > 1:
-                print('More than one indel in same codon not yet implemented!', self.gene.id, file=sys.stderr)
-                return None
-
-            var = variants[0]
-
-            if var_type == pymummer.variant.INS:
-                new_seq = pyfastaq.sequences.Fasta('seq', var.qry_base)
-            else:
-                new_seq = pyfastaq.sequences.Fasta('seq', var.ref_base)
-
-            if len(new_seq) % 3 != 0:
-                return ('FSHIFT', ref_aa.seq + str(aa_start + 1) + 'fs')
-
-            new_seq_aa = new_seq.translate()
-            if '*' in new_seq_aa.seq:
-                return ('TRUNC', ref_aa.seq + str(aa_start + 1) + 'trunc')
-            elif var_type == pymummer.variant.INS:
-                ref_codon_after_ins = pyfastaq.sequences.Fasta('codon', self.gene[codon_start+3:codon_start+6])
-                aa_after_ins = ref_codon_after_ins.translate()
-                return ('INS', ref_aa.seq + str(aa_start + 1) + '_' + aa_after_ins.seq + str(aa_start + 2) + 'ins' + new_seq_aa.seq )
-            else:
-                if len(new_seq) == 3:
-                    return ('DEL', ref_aa.seq + str(aa_start + 1) + 'del')
-                else:
-                    assert len(new_seq) % 3 == 0
-                    new_aa = new_seq.translate()
-                    ref_codon_after_ins = pyfastaq.sequences.Fasta('codon', self.gene[codon_start+3:codon_start+6])
-                    aa_after_ins = ref_codon_after_ins.translate()
-                    return ('DEL', ref_aa.seq + str(aa_start + 1)+ '_' + aa_after_ins.seq + str(aa_start + 2) + 'del')
+        original_dir = os.getcwd()
+        os.chdir(self.root_dir)
 
-        else:
-            return ('UNKNOWN', '.')
-
-
-    def _make_assembly_vcf(self):
-        tmp_vcf = self.final_assembly_vcf + '.tmp'
-        cmd = ' '.join([
-            self.samtools_exe, 'mpileup',
-            '-t INFO/DPR,DV',
-            '-A',
-            '-f', self.final_assembly_fa,
-            '-u',
-            '-v',
-            self.final_assembly_bam,
-            '>',
-            tmp_vcf
-        ])
-
-        common.syscall(cmd, verbose=self.verbose)
-
-        cmd = ' '.join([
-            self.bcftools_exe, 'call -m',
-            tmp_vcf,
-            '|',
-            self.bcftools_exe, 'query',
-            r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''',
-            '>',
-            self.final_assembly_read_depths + '.tmp'
-        ])
-
-        common.syscall(cmd, verbose=self.verbose)
-        pysam.tabix_compress(self.final_assembly_read_depths + '.tmp', self.final_assembly_read_depths)
-        pysam.tabix_index(self.final_assembly_read_depths, seq_col=0, start_col=1, end_col=1)
-        os.unlink(self.final_assembly_read_depths + '.tmp')
-
-        cmd = ' '.join([
-            self.bcftools_exe, 'call -m -v',
-            tmp_vcf,
-            '|',
-            self.bcftools_exe, 'filter',
-            '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
-                  ' & MIN(DV)>=' + str(self.bcf_min_dv),
-                  ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp),
-                  ' & QUAL >=', str(self.bcf_min_qual), '"',
-            '-o', self.final_assembly_vcf
-        ])
-
-        common.syscall(cmd, verbose=self.verbose)
-        os.unlink(tmp_vcf)
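-        # The bcftools filter expression keeps only calls whose total depth (DP),
-        # alt-supporting depth (DV), DV/DP ratio and QUAL are all at or above the
-        # configured minima (bcf_min_dp etc.)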
-
-
-    def _get_assembly_read_depths(self, ref, position):
-        '''Returns total read depth and depth of reads supporting alternative (if present)'''
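-        # e.g. a tabix row 'contig1 42 C T 12 9,3' (hypothetical, tab-separated)
-        # would be returned as ('C', 'T', 12, '9,3')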
-        assert os.path.exists(self.final_assembly_read_depths)
-        assert os.path.exists(self.final_assembly_read_depths + '.tbi')
-        tbx = pysam.TabixFile(self.final_assembly_read_depths)
         try:
-            rows = [x for x in tbx.fetch(ref, position, position + 1)]
-        except:
-            return None
-
-        if len(rows) > 1: # which happens with indels, multiple lines for same base of reference
-            test_rows = [x for x in rows if x.rstrip().split()[3] != '.']
-            if len(test_rows) != 1:
-                rows = [rows[-1]]
-            else:
-                rows = test_rows
-
-        if len(rows) == 1:
-            r, p, ref_base, alt_base, ref_counts, alt_counts = rows[0].rstrip().split()
-            return ref_base, alt_base, int(ref_counts), alt_counts
+            self._run()
+        except Error as err:
+            os.chdir(original_dir)
+            print('Error running cluster! Error was:', err, sep='\n', file=self.log_fh)
+            pyfastaq.utils.close(self.log_fh)
+            self.log_fh = None
+            raise Error('Error running cluster ' + self.name + '!')
+
+        os.chdir(original_dir)
+        print('Finished', file=self.log_fh, flush=True)
+        print('{:_^79}'.format(' LOG FILE END ' + self.name + ' '), file=self.log_fh, flush=True)
+
+        # This stops multiprocessing complaining with the error:
+        # multiprocessing.pool.MaybeEncodingError: Error sending result: '[<ariba.cluster.Cluster object at 0x7ffa50f8bcd0>]'. Reason: 'TypeError("cannot serialize '_io.TextIOWrapper' object",)'
+        pyfastaq.utils.close(self.log_fh)
+        self.log_fh = None
+
+
+    def _run(self):
+        print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '), file=self.log_fh, flush=True)
+
+        print('Choosing best reference sequence:', file=self.log_fh, flush=True)
+        seq_chooser = best_seq_chooser.BestSeqChooser(
+            self.all_reads1,
+            self.all_reads2,
+            self.references_fa,
+            self.log_fh,
+            samtools_exe=self.extern_progs.exe('samtools'),
+            bowtie2_exe=self.extern_progs.exe('bowtie2'),
+            bowtie2_preset=self.bowtie2_preset,
+            threads=1,
+        )
+        self.ref_sequence = seq_chooser.best_seq(self.reference_fa)
+        self._clean_file(self.references_fa)
+        self._clean_file(self.references_fa + '.fai')
+
+        if self.ref_sequence is None:
+            self.status_flag.add('ref_seq_choose_fail')
+            self.assembled_ok = False
         else:
-            return None
-
-
-    def _get_samtools_variant_positions(self):
-        if not os.path.exists(self.final_assembly_vcf):
-            return []
-        f = pyfastaq.utils.open_file_read(self.final_assembly_vcf)
-        positions = [l.rstrip().split('\t')[0:2] for l in f if not l.startswith('#')]
-        positions = [(t[0], int(t[1]) - 1) for t in positions]
-        pyfastaq.utils.close(f)
-        return positions
-
-
-    def _get_samtools_variants(self, positions=None):
-        if positions is None:
-            positions = self._get_samtools_variant_positions()
-        variants = {}
-        if len(positions) == 0:
-            return variants
-        if not (os.path.exists(self.final_assembly_vcf) and os.path.exists(self.final_assembly_read_depths)):
-            return variants
-        for t in positions:
-            name, pos = t[0], t[1]
-            depths = self._get_assembly_read_depths(name, pos)
-            if depths is None:
-                raise Error('Error getting read depths for sequence ' + name + ' at position ' + str(t[1]))
-            if name not in variants:
-                variants[name] = {}
-            variants[name][t[1]] = depths
-        return variants
-
-
-    def _get_vcf_variant_counts(self):
-        scaff_coords = self._nucmer_hits_to_scaff_coords()
-        self.vcf_variant_counts = {}
-        f = pyfastaq.utils.open_file_read(self.final_assembly_vcf)
-        for line in f:
-            if line.startswith('#'):
-                continue
-
-            data = line.rstrip().split('\t')
-            scaff = data[0]
-
-            if scaff in scaff_coords:
-                position = int(data[1]) - 1
-                i = pyfastaq.intervals.Interval(position, position)
-                intersects = len([x for x in scaff_coords[scaff] if x.intersects(i)]) > 0
-                if intersects:
-                    self.vcf_variant_counts[scaff] = self.vcf_variant_counts.get(scaff, 0) + 1
-
-        pyfastaq.utils.close(f)
-        total = sum(list(self.vcf_variant_counts.values()))
-        if total >= 1:
-            self.status_flag.add('variants_suggest_collapsed_repeat')
-
-
-    def _initial_make_report_lines(self):
-        '''Makes report lines. While they are being made, we discover if there were
-        any non-synonymous variants. This affects the flag, which the function also
-        updates. To then fix the report lines, run _update_flag_in_report_lines()'''
-        self.report_lines = []
-        total_reads = self._get_read_counts()
-
-        if not self.assembled_ok:
-            gene_name = 'NA' if self.gene is None else self.gene.id
-            gene_length = '.' if self.gene is None else len(self.gene)
-            self.report_lines.append([
-                    gene_name,
-                    self.status_flag.to_number(),
-                    total_reads,
-                    self.name,
-                    gene_length,
-                    '.',
-                    '.',
-                  ] + \
-                  ['.'] * 14
+            wanted_reads = self._number_of_reads_for_assembly(self.reference_fa, self.reads_insert, self.total_reads_bases, self.total_reads, self.assembly_coverage)
+            made_reads = self._make_reads_for_assembly(wanted_reads, self.total_reads, self.all_reads1, self.all_reads2, self.reads_for_assembly1, self.reads_for_assembly2, random_seed=self.random_seed)
+            print('\nUsing', made_reads, 'reads from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True)
+            print('Assembling reads:', file=self.log_fh, flush=True)
+            self.ref_sequence_type = self.refdata.sequence_type(self.ref_sequence.id)
+            assert self.ref_sequence_type is not None
+            self.assembly = assembly.Assembly(
+              self.reads_for_assembly1,
+              self.reads_for_assembly2,
+              self.reference_fa,
+              self.assembly_dir,
+              self.final_assembly_fa,
+              self.final_assembly_bam,
+              self.log_fh,
+              scaff_name_prefix=self.ref_sequence.id,
+              kmer=self.assembly_kmer,
+              assembler=self.assembler,
+              spades_other_options=self.spades_other_options,
+              sspace_k=self.sspace_k,
+              sspace_sd=self.sspace_sd,
+              reads_insert=self.reads_insert,
+              extern_progs=self.extern_progs,
+              clean=self.clean
             )
-            return
-
-        cov_per_contig = self._nucmer_hits_to_gene_cov_per_contig()
-        samtools_variants = self._get_samtools_variants()
 
+            self.assembly.run()
+            self.assembled_ok = self.assembly.assembled_ok
+            self._clean_file(self.reads_for_assembly1)
+            self._clean_file(self.reads_for_assembly2)
+            if self.clean:
+                print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True)
+                shutil.rmtree(self.assembly_dir)
 
-        for contig in self.mummer_variants:
-            for variants in self.mummer_variants[contig]:
-                t = self._get_variant_effect(variants)
-                if t is not None:
-                    effect, new_bases = t
-                    if effect != 'SYN':
-                        self.status_flag.add('has_nonsynonymous_variants')
-
-                    for v in variants:
-                        depths = self._get_assembly_read_depths(contig, v.qry_start)
-                        if depths is None:
-                            # This happens with low-coverage contigs: the contig
-                            # can get assembled, but some bases have no reads
-                            # mapped to them. If mummer called a variant at one
-                            # of these, then we are looking for read depth where
-                            # there is none.
-                            print('Warning: could not get read depth info on contig "' + contig + '" at position', str(v.qry_start + 1), 'from file', self.final_assembly_read_depths, file=sys.stderr)
-                            print(' - a variant was called at this position using nucmer, but there is no read depth (probably a mapping artifact)', file=sys.stderr)
-                            depths = ['.'] * 4
-
-                        ref_base, alt_base, ref_counts, alt_counts = depths
-
-                        self.report_lines.append([
-                            self.gene.id,
-                            self.status_flag.to_number(),
-                            total_reads,
-                            self.name,
-                            len(self.gene),
-                            cov_per_contig[contig],
-                            self.percent_identities[contig],
-                            pymummer.variant.var_types[v.var_type],
-                            effect,
-                            new_bases,
-                            v.ref_start + 1,
-                            v.ref_end + 1,
-                            v.ref_base,
-                            v.qry_name,
-                            v.qry_length,
-                            v.qry_start + 1,
-                            v.qry_end + 1,
-                            v.qry_base,
-                            ref_counts,
-                            alt_base,
-                            alt_counts,
-                        ])
-
-                        if contig in samtools_variants and v.qry_start in samtools_variants[contig]:
-                            del samtools_variants[contig][v.qry_start]
-                            if len(samtools_variants[contig]) == 0:
-                                del samtools_variants[contig]
-
-            if contig in samtools_variants:
-                for pos in samtools_variants[contig]:
-                    ref_base, alt_base, ref_counts, alt_counts = samtools_variants[contig][pos]
-                    self.report_lines.append(
-                      [
-                        self.gene.id,
-                        self.status_flag.to_number(),
-                        total_reads,
-                        self.name,
-                        len(self.gene),
-                        cov_per_contig[contig],
-                        self.percent_identities[contig],
-                      ] + \
-                      ['.'] * 6 + \
-                      [
-                        contig,
-                        len(self.final_assembly[contig]),
-                        pos + 1,
-                        pos + 1,
-                        ref_base,
-                        ref_counts,
-                        alt_base,
-                        alt_counts
-                      ]
-                    )
-
-        if len(self.report_lines) == 0:
-            for contig in self.percent_identities:
-                self.report_lines.append([
-                    self.gene.id,
-                    self.status_flag.to_number(),
-                    total_reads,
-                    self.name,
-                    len(self.gene),
-                    cov_per_contig[contig],
-                    self.percent_identities[contig],
-                  ] + \
-                  ['.'] * 6 + [contig, len(self.final_assembly[contig])] + ['.'] * 6
-                )
-
-        self.report_lines.sort(key=itemgetter(0, 14, 15))
-
-
-    def _update_flag_in_report_lines(self):
-        '''This corrects the flag in all the report lines made by _initial_make_report_lines()'''
-        flag_column = 1
-        if self.status_flag.has('has_nonsynonymous_variants'):
-            for line in self.report_lines:
-                line[flag_column] = self.status_flag.to_number()
-
-
-    def _make_report_lines(self):
-        self._initial_make_report_lines()
-        self._update_flag_in_report_lines()
+        if self.assembled_ok:
+            print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh, flush=True)
 
+            mapping.run_bowtie2(
+                self.all_reads1,
+                self.all_reads2,
+                self.final_assembly_fa,
+                self.final_assembly_bam[:-4],
+                threads=1,
+                sort=True,
+                samtools=self.extern_progs.exe('samtools'),
+                bowtie2=self.extern_progs.exe('bowtie2'),
+                bowtie2_preset=self.bowtie2_preset,
+                verbose=True,
+                verbose_filehandle=self.log_fh
+            )
 
-    def _clean(self):
-        if self.verbose:
-            print('Cleaning', self.root_dir)
-
-        if self.clean > 0:
-            if self.verbose:
-                print('  rm -r', self.assembly_dir)
-            shutil.rmtree(self.assembly_dir)
-
-        to_clean = [
-            [
-                'assembly.reads_mapped.unsorted.bam',
-            ],
-            [
-                'assembly.fa.fai',
-                'assembly.reads_mapped.bam.scaff',
-                'assembly.reads_mapped.bam.soft_clipped',
-                'assembly.reads_mapped.bam.unmapped_mates',
-                'assembly_vs_gene.coords',
-                'assembly_vs_gene.coords.snps',
-                'genes.fa',
-                'genes.fa.fai',
-                'reads_1.fq',
-                'reads_2.fq',
-            ],
-            [
-                'assembly.fa.fai',
-                'assembly.reads_mapped.bam',
-                'assembly.reads_mapped.bam.vcf',
-                'assembly_vs_gene.coords',
-                'assembly_vs_gene.coords.snps',
-            ]
-        ]
+            if self.assembly.has_contigs_on_both_strands:
+                self.status_flag.add('hit_both_strands')
+
+            print('\nMaking and checking scaffold graph', file=self.log_fh, flush=True)
+
+            if not self.assembly.scaff_graph_ok:
+                self.status_flag.add('scaffold_graph_bad')
+
+            print('Comparing assembly against reference sequence', file=self.log_fh, flush=True)
+            self.assembly_compare = assembly_compare.AssemblyCompare(
+              self.final_assembly_fa,
+              self.assembly.sequences,
+              self.reference_fa,
+              self.ref_sequence,
+              self.assembly_compare_prefix,
+              self.refdata,
+              nucmer_min_id=self.nucmer_min_id,
+              nucmer_min_len=self.nucmer_min_len,
+              nucmer_breaklen=self.nucmer_breaklen,
+              assembled_threshold=self.assembled_threshold,
+              unique_threshold=self.unique_threshold,
+              max_gene_nt_extend=self.max_gene_nt_extend,
+            )
+            self.assembly_compare.run()
+            self.status_flag = self.assembly_compare.update_flag(self.status_flag)
 
-        for i in range(self.clean + 1):
-            for fname in to_clean[i]:
-                fullname = os.path.join(self.root_dir, fname)
-                if os.path.exists(fullname):
-                    if self.verbose:
-                        print('  rm', fname)
-                    os.unlink(fullname)
+            nucmer_hits_to_ref = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(self.assembly_compare.nucmer_hits)
+            assembly_variants_obj = assembly_variants.AssemblyVariants(self.refdata, self.assembly_compare.nucmer_snps_file)
+            self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, nucmer_hits_to_ref)
 
+            for var_list in self.assembly_variants.values():
+                for var in var_list:
+                    if var[3] not in ['.', 'SYN', None]:
+                        self.status_flag.add('has_nonsynonymous_variants')
+                        break
 
-    def run(self):
-        self.gene = self._choose_best_gene()
-        if self.gene is None:
-            self.assembled_ok = False
-        else:
-            if self.assembler == 'velvet':
-                self._assemble_with_velvet()
-            elif self.assembler == 'spades':
-                self._assemble_with_spades()
+                if self.status_flag.has('has_nonsynonymous_variants'):
+                    break
 
-        # velvet can finish successfully, but make an empty contigs file
-        if self.assembled_ok:
-            number_of_contigs = pyfastaq.tasks.count_sequences(self.assembly_contigs)
-            if number_of_contigs == 0:
-                self.assembled_ok = False
-                self.status_flag.add('assembly_fail')
 
-        if self.assembled_ok:
-            # finish the assembly
-            self._scaffold_with_sspace()
-            self._gap_fill_with_gapfiller()
-            self._fix_contig_orientation()
-            self._load_final_contigs()
+            print('\nCalling variants with samtools:', file=self.log_fh, flush=True)
 
-            # map reads to assembly
-            mapping.run_bowtie2(
-                self.reads1,
-                self.reads2,
+            self.samtools_vars = samtools_variants.SamtoolsVariants(
                 self.final_assembly_fa,
-                self.final_assembly_bam[:-4],
-                threads=self.threads,
-                sort=True,
-                samtools=self.samtools_exe,
-                bowtie2=self.bowtie2_exe,
-                bowtie2_preset=self.bowtie2_preset,
-                verbose=self.verbose,
+                self.final_assembly_bam,
+                self.samtools_vars_prefix,
+                log_fh=self.log_fh,
+                samtools_exe=self.extern_progs.exe('samtools'),
+                bcftools_exe=self.extern_progs.exe('bcftools'),
+                bcf_min_dp=self.bcf_min_dp,
+                bcf_min_dv=self.bcf_min_dv,
+                bcf_min_dv_over_dp=self.bcf_min_dv_over_dp,
+                bcf_min_qual=self.bcf_min_qual,
             )
-            self._parse_assembly_bam()
+            self.samtools_vars.run()
+
+            self.total_contig_depths = self.samtools_vars.total_depth_per_contig(self.samtools_vars.read_depths_file)
 
+            if self.samtools_vars.variants_in_coords(self.assembly_compare.assembly_match_coords(), self.samtools_vars.vcf_file):
+                self.status_flag.add('variants_suggest_collapsed_repeat')
+        else:
+            print('\nAssembly failed\n', file=self.log_fh, flush=True)
+            self.status_flag.add('assembly_fail')
 
-            # compare gene and assembly
-            self._run_nucmer(self.final_assembly_fa, self.assembly_vs_gene_coords, show_snps=True)
-            self._parse_assembly_vs_gene_coords()
-            self._nucmer_hits_to_percent_identity()
-            self._get_mummer_variants()
-            self._filter_mummer_variants()
-            self._update_flag_from_nucmer_file()
-            self._make_assembly_vcf()
-            self._get_vcf_variant_counts()
-            self._nucmer_hits_to_assembled_gene_sequences(self.nucmer_hits, self.gene, self.final_assembly, self.final_assembled_genes_fa)
 
-        self._make_report_lines()
+        print('\nMaking report lines', file=self.log_fh, flush=True)
+        self.report_lines = report.report_lines(self)
         self._clean()
+        atexit.unregister(self._atexit)
diff --git a/ariba/clusters.py b/ariba/clusters.py
index 6f328cf..1606fdb 100644
--- a/ariba/clusters.py
+++ b/ariba/clusters.py
@@ -1,77 +1,110 @@
+import signal
+import time
+import atexit
 import os
+import copy
+import tempfile
+import pickle
+import itertools
 import sys
 import shutil
 import openpyxl
+import multiprocessing
 import pysam
 import pyfastaq
-from ariba import cdhit, cluster, common, mapping, histogram, faidx
+from ariba import cluster, common, mapping, histogram, read_store, report, report_filter, reference_data
 
 class Error (Exception): pass
 
 
+def _run_cluster(obj, verbose, clean, fails_dir):
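+    # A cluster that fails leaves an empty sentinel file, named after itself, in
+    # fails_dir (see the except branch below). Any sentinel already present means
+    # an earlier cluster failed, so this one refuses to start.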
+    failed_clusters = os.listdir(fails_dir)
+
+    if len(failed_clusters) > 0:
+        print('Other clusters failed. Will not start cluster', obj.name, file=sys.stderr)
+        return obj
+
+    if verbose:
+        print('Start running cluster', obj.name, 'in directory', obj.root_dir, flush=True)
+    try:
+        obj.run()
+    except:
+        print('Failed cluster:', obj.name, file=sys.stderr)
+        with open(os.path.join(fails_dir, obj.name), 'w'):
+            pass
+
+    if verbose:
+        print('Finished running cluster', obj.name, 'in directory', obj.root_dir, flush=True)
+
+    if clean:
+        if verbose:
+            print('Deleting cluster dir', obj.root_dir, flush=True)
+        if os.path.exists(obj.root_dir):
+            shutil.rmtree(obj.root_dir)
+
+    return obj
+
+
 class Clusters:
     def __init__(self,
-      db_fasta,
+      refdata_dir,
       reads_1,
       reads_2,
       outdir,
+      extern_progs,
+      version_report_lines=None,
       assembly_kmer=21,
+      assembly_coverage=100,
       threads=1,
       verbose=False,
-      assembler='velvet',
-      smalt_k=13,
-      smalt_s=2,
-      smalt_min_id=0.9,
+      assembler='spades',
       spades_other=None,
       max_insert=1000,
       min_scaff_depth=10,
       nucmer_min_id=90,
-      nucmer_min_len=50,
-      nucmer_breaklen=50,
+      nucmer_min_len=20,
+      nucmer_breaklen=200,
       assembled_threshold=0.95,
       unique_threshold=0.03,
-      bcftools_exe='bcftools',
-      gapfiller_exe='GapFiller.pl',
-      samtools_exe='samtools',
-      smalt_exe='smalt',
-      bowtie2_exe='bowtie2',
+      max_gene_nt_extend=30,
       bowtie2_preset='very-sensitive-local',
-      spades_exe='spades.py',
-      sspace_exe='SSPACE_Basic_v2.0.pl',
-      velvet_exe='velvet', # prefix of velvet{g,h}
-      cdhit_seq_identity_threshold=0.9,
-      cdhit_length_diff_cutoff=0.9,
-      run_cd_hit=True,
-      clean=1,
+      clean=True,
+      tmp_dir=None,
     ):
+        self.refdata_dir = os.path.abspath(refdata_dir)
+        self.refdata, self.cluster_ids = self._load_reference_data_from_dir(refdata_dir)
         self.reads_1 = os.path.abspath(reads_1)
         self.reads_2 = os.path.abspath(reads_2)
         self.outdir = os.path.abspath(outdir)
-        self.clusters_outdir = os.path.join(self.outdir, 'Clusters')
-        self.clusters_info_file = os.path.join(self.outdir, 'clusters.tsv')
+        self.extern_progs = extern_progs
+
+        if version_report_lines is None:
+            self.version_report_lines = []
+        else:
+            self.version_report_lines = version_report_lines
+
         self.clean = clean
+        self.logs_dir = os.path.join(self.outdir, 'Logs')
 
         self.assembler = assembler
-        assert self.assembler in ['velvet', 'spades']
+        assert self.assembler in ['spades']
         self.assembly_kmer = assembly_kmer
+        self.assembly_coverage = assembly_coverage
         self.spades_other = spades_other
 
-        self.db_fasta_clustered = os.path.join(self.outdir, 'input_genes.clustered.fa')
-        self.cluster_ids = {}
-        self.bam_prefix = os.path.join(self.outdir, 'map_all_reads')
+        self.cdhit_files_prefix = os.path.join(self.refdata_dir, 'cdhit')
+        self.cdhit_cluster_representatives_fa = self.cdhit_files_prefix + '.cluster_representatives.fa'
+        self.bam_prefix = os.path.join(self.outdir, 'map_reads_to_cluster_reps')
         self.bam = self.bam_prefix + '.bam'
-        self.report_file_tsv = os.path.join(self.outdir, 'report.tsv')
-        self.report_file_xls = os.path.join(self.outdir, 'report.xls')
-        self.catted_assembled_genes_fasta = os.path.join(self.outdir, 'assembled_genes.fa')
+        self.report_file_all_tsv = os.path.join(self.outdir, 'report.all.tsv')
+        self.report_file_all_xls = os.path.join(self.outdir, 'report.all.xls')
+        self.report_file_filtered_prefix = os.path.join(self.outdir, 'report')
+        self.catted_assembled_seqs_fasta = os.path.join(self.outdir, 'assembled_seqs.fa.gz')
+        self.catted_genes_matching_refs_fasta = os.path.join(self.outdir, 'assembled_genes.fa.gz')
         self.threads = threads
         self.verbose = verbose
 
-        self.smalt_k = smalt_k
-        self.smalt_s = smalt_s
-        self.smalt_min_id = smalt_min_id
         self.max_insert = max_insert
-        self.smalt_exe = smalt_exe
-        self.bowtie2_exe = bowtie2_exe
         self.bowtie2_preset = bowtie2_preset
 
         self.insert_hist_bin = 10
@@ -87,123 +120,146 @@ class Clusters:
 
         self.assembled_threshold = assembled_threshold
         self.unique_threshold = unique_threshold
+        self.max_gene_nt_extend = max_gene_nt_extend
 
         self.cluster_to_dir = {}  # gene name -> abs path of cluster directory
         self.clusters = {}        # gene name -> Cluster object
+        self.cluster_read_counts = {} # gene name -> number of reads
+        self.cluster_base_counts = {} # gene name -> number of bases
+        self.pool = None
+        self.fails_dir = os.path.join(self.outdir, '.fails')
+        self.clusters_all_ran_ok = True
 
-        self.bcftools_exe = bcftools_exe
+        for d in [self.outdir, self.logs_dir, self.fails_dir]:
+            try:
+                os.mkdir(d)
+            except:
+                raise Error('Error mkdir ' + d)
 
-        self.sspace_exe = shutil.which(sspace_exe)
-        if self.sspace_exe is None:
-            print('WARNING: SSPACE not found. Scaffolding and gap filling will be skipped!', file=sys.stderr)
-            self.gapfiller_exe = None
-        else:
-            self.sspace_exe = os.path.realpath(self.sspace_exe) # otherwise sspace dies loading packages
-            self.gapfiller_exe = shutil.which(gapfiller_exe)
-            if self.gapfiller_exe is None:
-                print('WARNING: GapFiller not found. No gap filling will be run after scaffolding!', file=sys.stderr)
+        if tmp_dir is None:
+            if 'ARIBA_TMPDIR' in os.environ:
+                tmp_dir = os.path.abspath(os.environ['ARIBA_TMPDIR'])
             else:
-                self.gapfiller_exe = os.path.realpath(self.gapfiller_exe) # otherwise gapfiller dies loading packages
+                tmp_dir = self.outdir
 
-        self.samtools_exe = samtools_exe
-        self.spades_exe = spades_exe
+        if not os.path.exists(tmp_dir):
+            raise Error('Temporary directory ' + tmp_dir + ' not found. Cannot continue')
 
-        self.velvet = velvet_exe
+        if self.clean:
+            self.tmp_dir_obj = tempfile.TemporaryDirectory(prefix='ariba.tmp.', dir=os.path.abspath(tmp_dir))
+            self.tmp_dir = self.tmp_dir_obj.name
+        else:
+            self.tmp_dir_obj = None
+            self.tmp_dir = os.path.join(self.outdir, 'clusters')
+            try:
+                os.mkdir(self.tmp_dir)
+            except:
+                raise Error('Error making directory ' + self.tmp_dir)
 
-        self.cdhit_seq_identity_threshold = cdhit_seq_identity_threshold
-        self.cdhit_length_diff_cutoff = cdhit_length_diff_cutoff
-        self.run_cd_hit = run_cd_hit
+        if self.verbose:
+            print('Temporary directory:', self.tmp_dir)
 
-        for d in [self.outdir, self.clusters_outdir]:
+        for i in [x for x in dir(signal) if x.startswith("SIG") and x not in {'SIGCHLD', 'SIGCLD'}]:
             try:
-                os.mkdir(d)
+                signum = getattr(signal, i)
+                signal.signal(signum, self._receive_signal)
             except:
-                raise Error('Error mkdir ' + d)
+                pass
 
-        self.db_fasta = os.path.join(self.outdir, 'input_genes.not_clustered.fa')
-        pyfastaq.tasks.to_fasta(db_fasta, self.db_fasta, check_unique=True)
 
+    def _stop_pool(self):
+        if self.pool is None:
+            return
+        self.pool.close()
+        self.pool.terminate()
+        while len(multiprocessing.active_children()) > 0:
+            time.sleep(1)
 
-    def _run_cdhit(self):
-        r = cdhit.Runner(
-            self.db_fasta,
-            self.db_fasta_clustered,
-            seq_identity_threshold=self.cdhit_seq_identity_threshold,
-            threads=self.threads,
-            length_diff_cutoff=self.cdhit_length_diff_cutoff,
-            verbose=self.verbose,
+
+    def _emergency_stop(self):
+        self._stop_pool()
+        if self.clean:
+            try:
+                self.tmp_dir_obj.cleanup()
+            except:
+                pass
+
+
+    def _receive_signal(self, signum, stack):
+        print('Stopping! Signal received:', signum, file=sys.stderr, flush=True)
+        self._emergency_stop()
+        sys.exit(1)
+
+
+    @classmethod
+    def _load_reference_data_info_file(cls, filename):
+        data = {
+            'genetic_code': None
+        }
+
+        with open(filename) as f:
+            for line in f:
+                key, val = line.rstrip().split('\t')
+                if key in data:
+                    data[key] = val
+
+        if None in data.values():
+            missing_values = [x for x in data if data[x] is None]
+            raise Error('Error reading reference info file ' + filename + '. These values were not found: ' + ','.join(missing_values))
+
+        data['genetic_code'] = int(data['genetic_code'])
+        return data
+
+
+    @staticmethod
+    def _load_reference_data_from_dir(indir):
+        if not os.path.exists(indir):
+            raise Error('Error loading reference data. Input directory ' + indir + ' not found. Cannot continue')
+
+        variants_only_fa = os.path.join(indir, 'refcheck.01.check_variants.variants_only.fa')
+        presence_absence_fa = os.path.join(indir, 'refcheck.01.check_variants.presence_absence.fa')
+        non_coding_fa = os.path.join(indir, 'refcheck.01.check_variants.non_coding.fa')
+        metadata_tsv = os.path.join(indir, 'refcheck.01.check_variants.tsv')
+        info_file = os.path.join(indir, 'info.txt')
+        clusters_file = os.path.join(indir, 'cdhit.clusters.pickle')
+        params = Clusters._load_reference_data_info_file(info_file)
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa if os.path.exists(presence_absence_fa) else None,
+            variants_only_fa=variants_only_fa if os.path.exists(variants_only_fa) else None,
+            non_coding_fa=non_coding_fa if os.path.exists(non_coding_fa) else None,
+            metadata_tsv=metadata_tsv if os.path.exists(metadata_tsv) else None,
+            genetic_code=params['genetic_code'],
         )
-        if self.run_cd_hit:
-            self.cluster_ids = r.run()
-        else:
-            if self.verbose:
-                print('Skipping cd-hit because --no_cdhit option used')
-            self.cluster_ids = r.fake_run()
 
+        with open(clusters_file, 'rb') as f:
+            cluster_ids = pickle.load(f)
 
-    def _write_clusters_info_file(self):
-        f = pyfastaq.utils.open_file_write(self.clusters_info_file)
-        print('#Cluster\tGene', file=f)
-        for c in sorted([int(x) for x in self.cluster_ids]):
-            for seqname in sorted(list(self.cluster_ids[str(c)])):
-                print(c, seqname, sep='\t', file=f)
-        pyfastaq.utils.close(f)
+        return refdata, cluster_ids
 
 
     def _map_reads_to_clustered_genes(self):
         mapping.run_bowtie2(
             self.reads_1,
             self.reads_2,
-            self.db_fasta_clustered,
+            self.cdhit_cluster_representatives_fa,
             self.bam_prefix,
             threads=self.threads,
-            samtools=self.samtools_exe,
-            bowtie2=self.bowtie2_exe,
+            samtools=self.extern_progs.exe('samtools'),
+            bowtie2=self.extern_progs.exe('bowtie2'),
             bowtie2_preset=self.bowtie2_preset,
             verbose=self.verbose,
+            remove_both_unmapped=True,
         )
 
 
-    def _sam_to_fastq(self, s):
-        name = s.qname
-        if s.is_read1:
-            name += '/1'
-        elif s.is_read2:
-            name += '/2'
-        else:
-            raise Error('Read ' + name + ' must be first or second of pair according to flag. Cannot continue')
-
-        seq = pyfastaq.sequences.Fastq(name, common.decode(s.seq), common.decode(s.qual))
-        if s.is_reverse:
-            seq.revcomp()
-
-        return seq
-
-
-    def _sam_pair_to_insert(self, s1, s2):
-        if s1.is_unmapped or s2.is_unmapped or (s1.tid != s2.tid) or (s1.is_reverse == s2.is_reverse):
-            return None
-
-        # If here, reads are both mapped to the same ref, and in opposite orientations
-        if s1.is_reverse:
-            end = s1.reference_end - 1
-            start = s2.reference_start
-        else:
-            end = s2.reference_end - 1
-            start = s1.reference_start
-
-        if start < end:
-            return end - start + 1
-        else:
-            return None
-
-
     def _bam_to_clusters_reads(self):
-        '''Sets up Cluster directories (one for each gene that has reads that mapped to it), writes reads fwd and rev files. Also gathers histogram data of insert size'''
-        filehandles_1 = {} # gene name -> filehandle of fwd_reads
-        filehandles_2 = {} # gene name -> filehandle of rev_reads
+        '''Sets up a ReadStore of reads for all the clusters, and gathers insert size histogram data'''
+        reads_file_for_read_store = os.path.join(self.outdir, 'reads')
+        f_out = pyfastaq.utils.open_file_write(reads_file_for_read_store)
+
         sam_reader = pysam.Samfile(self.bam, "rb")
         sam1 = None
+        self.proper_pairs = 0
 
         for s in sam_reader.fetch(until_eof=True):
             if sam1 is None:
@@ -216,55 +272,65 @@ class Clusters:
             if not sam1.is_unmapped:
                 ref_seqs.add(sam_reader.getrname(sam1.tid))
 
-            read1 = self._sam_to_fastq(sam1)
-            read2 = self._sam_to_fastq(s)
+            read1 = mapping.sam_to_fastq(sam1)
+            read2 = mapping.sam_to_fastq(s)
             if read1.id.endswith('/2'):
                 read1, read2 = read2, read1
 
-            insert = self._sam_pair_to_insert(s, sam1)
+            insert = mapping.sam_pair_to_insert(s, sam1)
             if insert is not None:
                 self.insert_hist.add(insert)
+                self.proper_pairs += 1
 
             for ref in ref_seqs:
                 if ref not in self.cluster_to_dir:
-                    assert ref not in filehandles_1
-                    assert ref not in filehandles_2
-
-                    new_dir = os.path.join(self.clusters_outdir, ref)
-                    try:
-                        os.mkdir(new_dir)
-                    except:
-                        raise Error('Error mkdir ' + new_dir)
-
+                    new_dir = os.path.join(self.tmp_dir, ref)
                     self.cluster_to_dir[ref] = new_dir
-                    filehandles_1[ref] = pyfastaq.utils.open_file_write(os.path.join(new_dir, 'reads_1.fq'))
-                    filehandles_2[ref] = pyfastaq.utils.open_file_write(os.path.join(new_dir, 'reads_2.fq'))
                     if self.verbose:
                         print('New cluster with reads that hit:', ref, flush=True)
 
-                print(read1, file=filehandles_1[ref])
-                print(read2, file=filehandles_2[ref])
+                self.cluster_read_counts[ref] = self.cluster_read_counts.get(ref, 0) + 2
+                self.cluster_base_counts[ref] = self.cluster_base_counts.get(ref, 0) + len(read1) + len(read2)
+                print(ref, self.cluster_read_counts[ref] - 1, read1.seq, read1.qual, sep='\t', file=f_out)
+                print(ref, self.cluster_read_counts[ref], read2.seq, read2.qual, sep='\t', file=f_out)
 
             sam1 = None
 
-        for ref in filehandles_1:
-            pyfastaq.utils.close(filehandles_1[ref])
-            pyfastaq.utils.close(filehandles_2[ref])
+        pyfastaq.utils.close(f_out)
+
+        if len(self.cluster_read_counts):
+            if self.verbose:
+                filehandle = sys.stdout
+            else:
+                filehandle = None
+
+            self.read_store = read_store.ReadStore(
+                reads_file_for_read_store,
+                os.path.join(self.outdir, 'read_store'),
+                log_fh=filehandle
+            )
+
+        os.unlink(reads_file_for_read_store)
 
         if self.verbose:
+            print('Found', self.proper_pairs, 'proper read pairs')
             print('Total clusters to perform local assemblies:', len(self.cluster_to_dir), flush=True)
 
+
     def _set_insert_size_data(self):
-        assert len(self.insert_hist) > 0
-        (x, self.insert_size, pc95, self.insert_sspace_sd) = self.insert_hist.stats()
-        self.insert_sspace_sd = min(1, self.insert_sspace_sd)
-        self.insert_proper_pair_max = 1.1 * pc95
-        if self.verbose:
-            print('\nInsert size information from reads mapped to reference genes:')
-            print('Insert size:', self.insert_size, sep='\t')
-            print('Insert sspace sd:', self.insert_sspace_sd, sep='\t')
-            print('Max insert:', self.insert_proper_pair_max, sep='\t')
-            print()
+        if len(self.insert_hist) == 0:
+            return False
+        else:
+            (x, self.insert_size, pc95, self.insert_sspace_sd) = self.insert_hist.stats()
+            self.insert_sspace_sd = min(1, self.insert_sspace_sd)
+            self.insert_proper_pair_max = 1.1 * pc95
+            if self.verbose:
+                print('\nInsert size information from reads mapped to reference genes:')
+                print('Insert size:', self.insert_size, sep='\t')
+                print('Insert sspace sd:', self.insert_sspace_sd, sep='\t')
+                print('Max insert:', self.insert_proper_pair_max, sep='\t')
+                print()
+            return True
 
 
     def _init_and_run_clusters(self):
@@ -272,173 +338,245 @@ class Clusters:
             raise Error('Did not get any reads mapped to genes. Cannot continue')
 
         counter = 0
+        cluster_list = []
+        self.log_files = []
 
-        for gene in sorted(self.cluster_to_dir):
-            counter += 1
-            if self.verbose:
-                print('\nAssembling cluster', counter, 'of', str(len(self.cluster_to_dir)))
-            new_dir = self.cluster_to_dir[gene]
-
-            faidx.write_fa_subset(
-                self.cluster_ids[gene],
-                self.db_fasta,
-                os.path.join(new_dir, 'genes.fa'),
-                samtools_exe=self.samtools_exe,
-                verbose=self.verbose
-            )
+        for seq_type in sorted(self.cluster_ids):
+            if self.cluster_ids[seq_type] is None:
+                continue
 
-            self.clusters[gene] = cluster.Cluster(
-                new_dir,
-                gene,
-                assembly_kmer=self.assembly_kmer,
-                assembler=self.assembler,
-                max_insert=self.insert_proper_pair_max,
-                min_scaff_depth=self.min_scaff_depth,
-                nucmer_min_id=self.nucmer_min_id,
-                nucmer_min_len=self.nucmer_min_len,
-                nucmer_breaklen=self.nucmer_breaklen,
-                sspace_k=self.min_scaff_depth,
-                reads_insert=self.insert_size,
-                sspace_sd=self.insert_sspace_sd,
-                threads=self.threads,
-                assembled_threshold=self.assembled_threshold,
-                unique_threshold=self.unique_threshold,
-                verbose=self.verbose,
-                bcftools_exe=self.bcftools_exe,
-                gapfiller_exe=self.gapfiller_exe,
-                samtools_exe=self.samtools_exe,
-                bowtie2_exe=self.bowtie2_exe,
-                bowtie2_preset=self.bowtie2_preset,
-                spades_exe=self.spades_exe,
-                sspace_exe=self.sspace_exe,
-                velvet_exe=self.velvet,
-                spades_other=self.spades_other,
-                clean=self.clean,
-            )
+            for seq_name in sorted(self.cluster_ids[seq_type]):
+                if seq_name not in self.cluster_to_dir:
+                    continue
+                counter += 1
+                if self.verbose:
+                    print('Constructing cluster', seq_name + '.', counter, 'of', str(len(self.cluster_to_dir)))
+                new_dir = self.cluster_to_dir[seq_name]
+                self.log_files.append(os.path.join(self.logs_dir, seq_name + '.log'))
+
+                cluster_list.append(cluster.Cluster(
+                    new_dir,
+                    seq_name,
+                    self.refdata,
+                    self.cluster_read_counts[seq_name],
+                    self.cluster_base_counts[seq_name],
+                    fail_file=os.path.join(self.fails_dir, seq_name),
+                    read_store=self.read_store,
+                    reference_names=self.cluster_ids[seq_type][seq_name],
+                    logfile=self.log_files[-1],
+                    assembly_coverage=self.assembly_coverage,
+                    assembly_kmer=self.assembly_kmer,
+                    assembler=self.assembler,
+                    max_insert=self.insert_proper_pair_max,
+                    min_scaff_depth=self.min_scaff_depth,
+                    nucmer_min_id=self.nucmer_min_id,
+                    nucmer_min_len=self.nucmer_min_len,
+                    nucmer_breaklen=self.nucmer_breaklen,
+                    reads_insert=self.insert_size,
+                    sspace_k=self.min_scaff_depth,
+                    sspace_sd=self.insert_sspace_sd,
+                    threads=1, # clusters now run in parallel, so this should always be 1!
+                    bcf_min_dp=10,            # let the user change this in a future version?
+                    bcf_min_dv=5,             # let the user change this in a future version?
+                    bcf_min_dv_over_dp=0.3,   # let the user change this in a future version?
+                    bcf_min_qual=20,          # let the user change this in a future version?
+                    assembled_threshold=self.assembled_threshold,
+                    unique_threshold=self.unique_threshold,
+                    max_gene_nt_extend=self.max_gene_nt_extend,
+                    bowtie2_preset=self.bowtie2_preset,
+                    spades_other_options=self.spades_other,
+                    clean=self.clean,
+                    extern_progs=self.extern_progs,
+                ))
+
+        try:
+            if self.threads > 1:
+                self.pool = multiprocessing.Pool(self.threads)
+                cluster_list = self.pool.starmap(_run_cluster, zip(cluster_list, itertools.repeat(self.verbose), itertools.repeat(self.clean), itertools.repeat(self.fails_dir)))
+            else:
+                for c in cluster_list:
+                    _run_cluster(c, self.verbose, self.clean, self.fails_dir)
+        except:
+            self.clusters_all_ran_ok = False
+
+        if len(os.listdir(self.fails_dir)) > 0:
+            self.clusters_all_ran_ok = False
+
+        self.clusters = {c.name: c for c in cluster_list}
+
+
+    @staticmethod
+    def _write_reports(clusters_in, tsv_out, xls_out=None):
+        columns = copy.copy(report.columns)
+        columns[0] = '#' + columns[0]
 
-            self.clusters[gene].run()
-
-
-    def _write_reports(self):
-        columns = [
-            '#gene',
-            'flag',
-            'reads',
-            'cluster',
-            'gene_len',
-            'assembled',
-            'pc_ident',
-            'var_type',
-            'var_effect',
-            'new_aa',
-            'gene_start',
-            'gene_end',
-            'gene_nt',
-            'scaffold',
-            'scaff_len',
-            'scaff_start',
-            'scaff_end',
-            'scaff_nt',
-            'read_depth',
-            'alt_bases',
-            'ref_alt_depth'
-        ]
-
-        f = pyfastaq.utils.open_file_write(self.report_file_tsv)
+        f = pyfastaq.utils.open_file_write(tsv_out)
         print('\t'.join(columns), file=f)
 
-        columns[0] = 'gene'
+        columns[0] = columns[0][1:]
         workbook = openpyxl.Workbook()
         worksheet = workbook.worksheets[0]
         worksheet.title = 'ARIBA_report'
         worksheet.append(columns)
 
+        for seq_name in sorted(clusters_in):
+            if clusters_in[seq_name].report_lines is None:
+                continue
+
+            for line in clusters_in[seq_name].report_lines:
+                print(line, file=f)
+                worksheet.append(line.split('\t'))
+
+        pyfastaq.utils.close(f)
+        if xls_out is not None:
+            workbook.save(xls_out)
+
+
+    def _write_catted_assembled_seqs_fasta(self, outfile):
+        f = pyfastaq.utils.open_file_write(outfile)
+
         for gene in sorted(self.clusters):
-            for line in self.clusters[gene].report_lines:
-                print('\t'.join([str(x) for x in line]), file=f)
-                worksheet.append(line)
+            try:
+                seq_dict = self.clusters[gene].assembly_compare.assembled_reference_sequences
+            except:
+                continue
+
+            for seq_name in sorted(seq_dict):
+                print(seq_dict[seq_name], file=f)
+
         pyfastaq.utils.close(f)
-        workbook.save(self.report_file_xls)
 
 
-    def _write_catted_assembled_genes_fasta(self):
-        f = pyfastaq.utils.open_file_write(self.catted_assembled_genes_fasta)
+    def _write_catted_genes_matching_refs_fasta(self, outfile):
+        f = pyfastaq.utils.open_file_write(outfile)
 
         for gene in sorted(self.clusters):
-            cluster_fasta = self.clusters[gene].final_assembled_genes_fa
-            if os.path.exists(cluster_fasta):
-                file_reader = pyfastaq.sequences.file_reader(cluster_fasta)
-                for seq in file_reader:
-                    print(seq, file=f)
+            if self.clusters[gene].assembly_compare is not None and self.clusters[gene].assembly_compare.gene_matching_ref is not None:
+                seq = copy.copy(self.clusters[gene].assembly_compare.gene_matching_ref)
+                seq.id += '.' + '.'.join([
+                    self.clusters[gene].assembly_compare.gene_matching_ref_type,
+                    str(self.clusters[gene].assembly_compare.gene_start_bases_added),
+                    str(self.clusters[gene].assembly_compare.gene_end_bases_added)
+                ])
+                print(seq, file=f)
 
         pyfastaq.utils.close(f)
 
 
     def _clean(self):
-        to_clean = [
-            [
-            ],
-            [
-                self.bam,
-                self.db_fasta,
-                self.db_fasta + '.fai',
-            ],
-            [
-                self.db_fasta_clustered,
-                self.db_fasta_clustered + '.fai',
-                self.clusters_info_file,
-            ]
-        ]
-
-        for i in range(self.clean + 1):
-            for fname in to_clean[i]:
-                if os.path.exists(fname):
-                    if self.verbose:
-                        print('  rm', fname)
-                    os.unlink(fname)
+        if self.clean:
+            shutil.rmtree(self.fails_dir)
+
+            try:
+                self.tmp_dir_obj.cleanup()
+            except:
+                pass
 
-        if self.clean >= 2:
             if self.verbose:
-                print('  rm -r', self.clusters_outdir)
-                shutil.rmtree(self.clusters_outdir)
+                print('Deleting Logs directory', self.logs_dir)
+            try:
+                shutil.rmtree(self.logs_dir)
+            except:
+                pass
+
+            if self.verbose:
+                print('Deleting read store files', self.read_store.outfile + '[.tbi]')
+            try:
+                self.read_store.clean()
+            except:
+                pass
+        else:
+            if self.verbose:
+                print('Not deleting anything because --noclean used')
+
+
+    def write_versions_file(self, original_dir):
+        with open('version_info.txt', 'w') as f:
+            print('ARIBA run with this command:', file=f)
+            print(' '.join([sys.argv[0]] + ['run'] + sys.argv[1:]), file=f)
+            print('from this directory:', original_dir, file=f)
+            print(file=f)
+            print(*self.version_report_lines, sep='\n', file=f)
 
 
     def run(self):
+        try:
+            self._run()
+        except Error as err:
+            self._emergency_stop()
+            raise Error('Something went wrong during ariba run. Cannot continue. Error was:\n' + str(err))
+
+
+    def _run(self):
         cwd = os.getcwd()
         os.chdir(self.outdir)
+        self.write_versions_file(cwd)
 
         if self.verbose:
-            print('{:_^79}'.format(' Running cd-hit '), flush=True)
-        self._run_cdhit()
-        self._write_clusters_info_file()
-        if self.verbose:
-            print('Finished cd-hit\n')
             print('{:_^79}'.format(' Mapping reads to clustered genes '), flush=True)
         self._map_reads_to_clustered_genes()
+
         if self.verbose:
             print('Finished mapping\n')
             print('{:_^79}'.format(' Generating clusters '), flush=True)
         self._bam_to_clusters_reads()
-        if len(self.cluster_to_dir) > 0:
-            self._set_insert_size_data()
-            if self.verbose:
-                print('{:_^79}'.format(' Assembling each cluster '), flush=True)
-            self._init_and_run_clusters()
+        if self.clean:
             if self.verbose:
-                print('Finished assembling clusters\n')
+                print('Deleting BAM', self.bam, flush=True)
+            os.unlink(self.bam)
+
+        if len(self.cluster_to_dir) > 0:
+            got_insert_data_ok = self._set_insert_size_data()
+            if not got_insert_data_ok:
+                print('WARNING: not enough proper read pairs (found ' + str(self.proper_pairs) + ') to determine insert size.', file=sys.stderr)
+                print('This probably means that very few reads were mapped at all. No local assemblies will be run', file=sys.stderr)
+                if self.verbose:
+                    print('Not enough proper read pairs mapped to determine insert size. Skipping all assemblies.', flush=True)
+            else:
+                if self.verbose:
+                    print('{:_^79}'.format(' Assembling each cluster '))
+                    print('Will run', self.threads, 'cluster(s) in parallel', flush=True)
+                self._init_and_run_clusters()
+                if self.verbose:
+                    print('Finished assembling clusters\n')
         else:
             if self.verbose:
                 print('No reads mapped. Skipping all assemblies', flush=True)
             print('WARNING: no reads mapped to reference genes. Therefore no local assemblies will be run', file=sys.stderr)
 
+        if not self.clusters_all_ran_ok:
+            raise Error('At least one cluster failed! Stopping...')
+
         if self.verbose:
-            print('{:_^79}'.format(' Writing report files '), flush=True)
-        self._write_reports()
-        self._write_catted_assembled_genes_fasta()
+            print('{:_^79}'.format(' Writing reports '), flush=True)
+            print('Making', self.report_file_all_tsv)
+        self._write_reports(self.clusters, self.report_file_all_tsv)
+
         if self.verbose:
-            print('Finished writing report files. Cleaning files', flush=True)
-        self._clean()
+            print('Making', self.report_file_filtered_prefix + '.tsv')
+        rf = report_filter.ReportFilter(infile=self.report_file_all_tsv)
+        rf.run(self.report_file_filtered_prefix)
+
+        if self.verbose:
+            print()
+            print('{:_^79}'.format(' Writing fasta of assembled sequences '), flush=True)
+            print(self.catted_assembled_seqs_fasta, 'and', self.catted_genes_matching_refs_fasta, flush=True)
+        self._write_catted_assembled_seqs_fasta(self.catted_assembled_seqs_fasta)
+        self._write_catted_genes_matching_refs_fasta(self.catted_genes_matching_refs_fasta)
+
+        clusters_log_file = os.path.join(self.outdir, 'log.clusters.gz')
         if self.verbose:
+            print()
+            print('{:_^79}'.format(' Catting cluster log files '), flush=True)
+            print('Writing file', clusters_log_file, flush=True)
+        common.cat_files(self.log_files, clusters_log_file)
+
+        if self.verbose:
+            print()
+            print('{:_^79}'.format(' Cleaning files '), flush=True)
+        self._clean()
+
+        if self.clusters_all_ran_ok and self.verbose:
             print('\nAll done!\n')
 
         os.chdir(cwd)
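
A note on the new reference data loading: the info.txt file read by
_load_reference_data_info_file is tab-separated key/value pairs, and
genetic_code is the only required key in this version. A minimal sketch of a
valid file and its parsed result, assuming nothing beyond the code above:

    # Sketch: write the smallest valid info.txt for _load_reference_data_info_file.
    with open('info.txt', 'w') as f:
        print('genetic_code', 11, sep='\t', file=f)

    # Clusters._load_reference_data_info_file('info.txt') then returns
    # {'genetic_code': 11}, with the value cast to int.
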
diff --git a/ariba/common.py b/ariba/common.py
index fcc298e..e4b5977 100644
--- a/ariba/common.py
+++ b/ariba/common.py
@@ -1,19 +1,20 @@
+import os
 import sys
 import subprocess
+import pyfastaq
 
-version = '0.6.0'
-
-def syscall(cmd, allow_fail=False, verbose=False):
+def syscall(cmd, allow_fail=False, verbose=False, verbose_filehandle=sys.stdout, print_errors=True):
     if verbose:
-        print('syscall:', cmd, flush=True)
+        print('syscall:', cmd, flush=True, file=verbose_filehandle)
     try:
         subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
     except subprocess.CalledProcessError as error:
         errors = error.output.decode()
-        print('The following command failed with exit code', error.returncode, file=sys.stderr)
-        print(cmd, file=sys.stderr)
-        print('\nThe output was:\n', file=sys.stderr)
-        print(errors, file=sys.stderr, flush=True)
+        if print_errors:
+            print('The following command failed with exit code', error.returncode, file=sys.stderr)
+            print(cmd, file=sys.stderr)
+            print('\nThe output was:\n', file=sys.stderr)
+            print(errors, file=sys.stderr, flush=True)
 
         if allow_fail:
             return False, errors
@@ -29,3 +30,17 @@ def decode(x):
     except:
         return x
     return s
+
+
+def cat_files(infiles, outfile):
+    '''Concatenates all the files in the list infiles into outfile'''
+    f_out = pyfastaq.utils.open_file_write(outfile)
+
+    for filename in infiles:
+        if os.path.exists(filename):
+            f_in = pyfastaq.utils.open_file_read(filename)
+            for line in f_in:
+                print(line, end='', file=f_out)
+            pyfastaq.utils.close(f_in)
+
+    pyfastaq.utils.close(f_out)
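
The new cat_files helper is what run() uses to combine the per-cluster logs
into log.clusters.gz: inputs that do not exist are skipped, and because it
writes through pyfastaq's open_file_write, a '.gz' suffix on the output gives
a gzipped file. A minimal usage sketch (file names illustrative):

    from ariba import common

    # Concatenate two cluster logs into one gzipped file; missing inputs
    # are skipped rather than raising an error.
    common.cat_files(['cluster1.log', 'cluster2.log'], 'log.clusters.gz')
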
diff --git a/ariba/external_progs.py b/ariba/external_progs.py
index a1d7239..35fca54 100644
--- a/ariba/external_progs.py
+++ b/ariba/external_progs.py
@@ -4,14 +4,10 @@ import os
 from distutils.version import LooseVersion
 import re
 import sys
-import pyfastaq
 from ariba import common
 
 class Error (Exception): pass
 
-def is_in_path(prog):
-    return shutil.which(prog) is not None
-
 
 prog_to_default = {
     'bcftools': 'bcftools',
@@ -20,33 +16,25 @@ prog_to_default = {
     'gapfiller': 'GapFiller.pl',
     'nucmer' : 'nucmer',
     'samtools': 'samtools',
-    'smalt': 'smalt',
     'spades': 'spades.py',
     'sspace': 'SSPACE_Basic_v2.0.pl',
-    'velvetg': 'velvetg',
-    'velveth': 'velveth',
+    'r': 'Rscript',
 }
 
 
-prog_to_env_var = {
-    'bcftools': 'ARIBA_BCFTOOLS',
-    'samtools': 'ARIBA_SAMTOOLS',
-    'spades': 'ARIBA_SPADES', 
-}
+prog_to_env_var = {x: 'ARIBA_' + x.upper() for x in prog_to_default if x not in {'nucmer'}}
 
 
 prog_to_version_cmd = {
     'bcftools': ('', re.compile('^Version: ([0-9\.]+)')),
-    'bowtie2': ('--version', re.compile('.*bowtie2-align version (.*)$')),
+    'bowtie2': ('--version', re.compile('.*bowtie2.*version (.*)$')),
     'cdhit': ('', re.compile('CD-HIT version ([0-9\.]+) \(')),
     'gapfiller': ('', re.compile('^Usage: .*pl \[GapFiller_(.*)\]')),
     'nucmer': ('--version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')),
     'samtools': ('', re.compile('^Version: ([0-9\.]+)')),
-    'smalt': ('version', re.compile('^Version: ([0-9\.]+)')),
-    'spades': ('', re.compile('^SPAdes genome assembler v.([0-9\.]+)')),
+    'spades': ('', re.compile('^SPAdes genome assembler v.?([0-9\.]+)')),
     'sspace': ('', re.compile('^Usage: .*pl \[SSPACE_(.*)\]')),
-    'velvetg': ('', re.compile('Version ([0-9\.]+)')),
-    'velveth': ('', re.compile('Version ([0-9\.]+)')),
+    'r': ('--version', re.compile('^R .*version ([0-9\.]+)')),
 }
 
 
@@ -56,110 +44,118 @@ min_versions = {
     'cd-hit': '4.6',
     'nucmer': '3.1',
     'samtools': '1.2',
-    'smalt': '0.7.4',
     'spades': '3.5.0',
-    'velvetg': '1.2.07',
-    'velveth': '1.2.07',
+    'r': '2.14.0'
 }
 
 
-def set_path(prog, opts):
-    path_from_opts = eval('opts.' + prog)
-    if path_from_opts is not None:
-        return
-
-    if prog in prog_to_env_var:
-        env_var = prog_to_env_var[prog]
-        if env_var in os.environ:
-            exec('opts.' + prog + ' = "' + os.environ[env_var] + '"')
-            return
-
-    exec('opts.' + prog + ' = "' + prog_to_default[prog] + '"')
+class ExternalProgs:
+    def __init__(self, verbose=False, fail_on_error=True):
+        optional_progs = {'sspace', 'gapfiller'}
+        self.progs = {}
+        self.version_report = []
+        self.all_deps_ok = True
 
+        if verbose:
+            print('{:_^79}'.format(' Checking dependencies and their versions '))
+
+        errors = []
+        warnings = []
+
+        for prog in sorted(prog_to_default):
+            prog_exe = self._get_exe(prog)
+            self.progs[prog] = shutil.which(prog_exe)
+            # Travis is using python3.4, and actually "python" in travis means
+            # python3.4, not python2. SPAdes throws an error about not being
+            # compatible with python3.4.
+            # This means we need to explicitly run SPAdes with python2.
+            if prog == 'spades' and self.progs[prog] is not None:
+                self.progs[prog] = 'python2 ' + self.progs[prog]
+            if self.progs[prog] is None:
+                if prog in optional_progs:
+                    warnings.append(prog + ' not found in path. Looked for ' + prog_exe + '. It is optional, so it will be skipped during assembly')
+                else:
+                    errors.append(prog + ' not found in path. Looked for ' + prog_exe)
+
+                self.version_report.append('\t'.join([prog, 'NA', 'NOT_FOUND']))
+                if verbose:
+                    print(self.version_report[-1])
+                continue
+            elif prog in {'sspace', 'gapfiller'}:
+                self.progs[prog] = os.path.realpath(self.progs[prog])
+
+            got_version, version = self._get_version(prog, self.progs[prog])
+
+            if got_version:
+                if prog in min_versions and LooseVersion(version) < LooseVersion(min_versions[prog]):
+                    errors.append(' '.join(['Found version', version, 'of', prog, 'which is too low! Please update to at least', min_versions[prog] + '. Found it here:', prog_exe]))
+            else:
+                errors.append(version)
+                version = 'ERROR'
+
+            self.version_report.append('\t'.join([prog, version, self.progs[prog]]))
+            if verbose:
+                print(self.version_report[-1])
 
-def get_version(prog, path=None, raise_error=True):
-    assert prog in prog_to_version_cmd
-    if path is None:
-        path = prog
 
-    if not is_in_path(path):
-        if raise_error:
-            raise Error('Error getting version of ' + path + ' - not found in path.')
-        else:
-            return 'Not_in_path', 'Not_in_path'
-
-    path = shutil.which(path)
-
-    if prog in ['sspace', 'gapfiller']:
-        cmd = 'perl ' + os.path.realpath(shutil.which(path))
-        regex = prog_to_version_cmd[prog][1]
-    else:
-        cmd, regex = prog_to_version_cmd[prog]
-        cmd = path + ' ' + cmd
-
-    cmd_output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
-    cmd_output = common.decode(cmd_output[0]).split('\n')[:-1] + common.decode(cmd_output[1]).split('\n')[:-1]
-    for line in cmd_output:
-        hits = regex.search(line)
-        if hits:
-            return hits.group(1), path
-    return 'UNKNOWN ...\n I tried running this to get the version: "' + cmd + '"\n and the output didn\'t match this regular expression: "' + regex.pattern + '"', path
-
-
-def check_versions(opts, verbose=False, not_required=None):
-    if not_required is None:
-        not_required = set()
-
-    if verbose:
-        print('{:_^79}'.format(' Checking dependencies and their versions '))
-        print('tool', 'version', 'path', sep='\t')
-
-    to_check = [
-        'bcftools',
-        'bowtie2',
-        'cdhit',
-        'nucmer',
-        'samtools',
-        'sspace',
-        'gapfiller',
-    ]
-    
-    if opts.assembler == 'spades':
-        to_check.append('spades')
-    elif opts.assembler == 'velvet':
-        to_check.append('velvetg')
-        to_check.append('velveth')
-    else:
-        raise Error('Assembler ' + opts.assembler + ' not recognised. Cannot continue')
-
-    errors = []
-    failed_to_find = set()
-
-    for prog in to_check:
-        set_path(prog, opts)
-        version, path = get_version(prog, path=eval('opts.' + prog), raise_error=prog not in not_required)
         if verbose:
-            print(prog, version, path, sep='\t')
-        if path == 'Not_in_path':
-            print('\nWARNING:', prog, 'not found in path, so will be skipped during assembly\n', file=sys.stderr)
-
-        if prog in min_versions and LooseVersion(version) < LooseVersion(min_versions[prog]):
-            errors.append(' '.join(['Found version', version, 'of', prog, 'which is too low! Please update to at least', min_versions[prog] + '\n   Found it here:', path]))
-            failed_to_find.add(prog)
-
-    if len(errors):
-        for e in errors:
-            print('\n*** Error! Bad dependency! ***', file=sys.stderr)
-            print(e, file=sys.stderr)
             print()
-        if len(failed_to_find.difference(not_required)) > 0:
-            raise Error('Cannot continue. Some dependencies need updating')
+
+        for line in warnings:
+            print('WARNING:', line, file=sys.stderr)
+
+
+        if len(errors):
+            self.all_deps_ok = False
+
+            for line in errors:
+                print('ERROR:', line, file=sys.stderr)
+            print('\nSomething is wrong with at least one dependency. Please see the above error message(s)', file=sys.stderr)
+            if fail_on_error:
+                raise Error('Dependency error(s). Cannot continue')
+        elif verbose:
+            if len(warnings):
+                print('\nWARNING: Required dependencies found, but at least one optional one was not. Please see previous warning(s) for more details.', file=sys.stderr)
+            else:
+                print('\nDependencies look OK')
+
+
+    def exe(self, prog):
+        return self.progs[prog]
+
+
+    @staticmethod
+    def _get_exe(prog):
+        '''Given a program name, return what we expect its executable to be called'''
+        if prog in prog_to_env_var:
+            env_var = prog_to_env_var[prog]
+            if env_var in os.environ:
+                return os.environ[env_var]
+
+        return prog_to_default[prog]
+
+
+    @staticmethod
+    def _get_version(prog, path):
+        '''Given a program name and expected path, tries to determine its version.
+           Returns tuple (bool, version). First element True iff found version ok.
+           Second element is version string (if found), otherwise an error message'''
+        assert prog in prog_to_version_cmd
+
+        if prog in ['sspace', 'gapfiller']:
+            cmd = 'perl ' + os.path.realpath(shutil.which(path))
+            regex = prog_to_version_cmd[prog][1]
         else:
-            assert failed_to_find.issubset(not_required)
-            if 'sspace' in failed_to_find:
-                print('WARNING: SSPACE not found. Will not run scaffolding or gap filling', file=sys.stderr)
-            elif 'gapfiller' in failed_to_find:
-                print('WARNING: GapFiller not found. Will not run gap filling after scaffolding', file=sys.stderr)
-
-    if verbose:
-        print('\nDependencies look OK (but check in case there are warnings about SSPACE or GapFiller)\n')
+            cmd, regex = prog_to_version_cmd[prog]
+            cmd = path + ' ' + cmd
+
+        cmd_output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
+        cmd_output = common.decode(cmd_output[0]).split('\n')[:-1] + common.decode(cmd_output[1]).split('\n')[:-1]
+
+        for line in cmd_output:
+            hits = regex.search(line)
+            if hits:
+                return True, hits.group(1)
+
+        return False, 'I tried to get the version of ' + prog + ' with: "' + cmd + '" and the output didn\'t match this regular expression: "' + regex.pattern + '"'
+
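
The ExternalProgs class resolves each dependency once at construction time,
records a version report, and lets per-program paths be overridden through
ARIBA_* environment variables (every program except nucmer gets one). A short
usage sketch; the override path is illustrative:

    import os
    from ariba import external_progs

    os.environ['ARIBA_SAMTOOLS'] = '/opt/samtools/bin/samtools'  # optional override
    progs = external_progs.ExternalProgs(verbose=True)  # raises on missing/outdated required deps
    print(progs.exe('samtools'))
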
diff --git a/ariba/faidx.py b/ariba/faidx.py
index cefd7c6..339c6fe 100644
--- a/ariba/faidx.py
+++ b/ariba/faidx.py
@@ -1,10 +1,11 @@
+import sys
 import os
 from ariba import common
 
 
-def write_fa_subset(seq_names, infile, outfile, samtools_exe='samtools', verbose=False):
+def write_fa_subset(seq_names, infile, outfile, samtools_exe='samtools', verbose=False, verbose_filehandle=sys.stdout):
     if not os.path.exists(infile + '.fai'):
-        common.syscall(samtools_exe + ' faidx ' + infile, verbose=verbose)
+        common.syscall(samtools_exe + ' faidx ' + infile, verbose=verbose, verbose_filehandle=verbose_filehandle)
 
     if os.path.exists(outfile):
         os.unlink(outfile)
diff --git a/ariba/flag.py b/ariba/flag.py
index fa0d5ce..dc580d5 100644
--- a/ariba/flag.py
+++ b/ariba/flag.py
@@ -2,16 +2,17 @@ class Error (Exception): pass
 
 
 flags_in_order = [
-    'gene_assembled',
-    'gene_assembled_into_one_contig',
-    'gene_region_assembled_twice',
-    'complete_orf',
+    'assembled',
+    'assembled_into_one_contig',
+    'region_assembled_twice',
+    'complete_gene',
     'unique_contig',
     'scaffold_graph_bad',
     'assembly_fail',
     'variants_suggest_collapsed_repeat',
     'hit_both_strands',
     'has_nonsynonymous_variants',
+    'ref_seq_choose_fail',
 ]
 
 
diff --git a/ariba/mapping.py b/ariba/mapping.py
index f6ad182..331d5ed 100644
--- a/ariba/mapping.py
+++ b/ariba/mapping.py
@@ -1,9 +1,33 @@
 import os
+import sys
 import pysam
+import pyfastaq
 from ariba import common
 
 class Error (Exception): pass
 
+bowtie2_index_extensions = [x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]
+
+def bowtie2_index(ref_fa, outprefix, bowtie2='bowtie2', verbose=False, verbose_filehandle=sys.stdout):
+    expected_files = [outprefix + '.' + ext for ext in bowtie2_index_extensions]
+    file_missing = False
+    for filename in expected_files:
+        if not os.path.exists(filename):
+            file_missing = True
+            break
+
+    if not file_missing:
+        return
+
+    cmd = ' '.join([
+        bowtie2 + '-build',
+        '-q',
+        ref_fa,
+        outprefix
+    ])
+
+    common.syscall(cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
+
 
 def run_bowtie2(
       reads_fwd,
@@ -16,17 +40,30 @@ def run_bowtie2(
       samtools='samtools',
       bowtie2='bowtie2',
       bowtie2_preset='very-sensitive-local',
-      verbose=False
+      verbose=False,
+      verbose_filehandle=sys.stdout,
+      remove_both_unmapped=False,
+      clean_index=True,
     ):
 
-    map_index = out_prefix + '.map_index'
-    clean_files = [map_index + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]
-    index_cmd = ' '.join([
-        bowtie2 + '-build',
-        '-q',
-        ref_fa,
-        map_index
-    ])
+    ref_is_indexed = True
+    for ext in bowtie2_index_extensions:
+        if not os.path.exists(ref_fa + '.' + ext):
+            ref_is_indexed = False
+            break
+
+    clean_files = []
+
+    if ref_is_indexed:
+        if verbose:
+            print('Bowtie2 index files found (', ref_fa, '.*.bt2) so no need to index', sep='', file=verbose_filehandle)
+        map_index = ref_fa
+    else:
+        map_index = out_prefix + '.map_index'
+        bowtie2_index(ref_fa, map_index, bowtie2=bowtie2, verbose=verbose, verbose_filehandle=verbose_filehandle)
+
+        if clean_index:
+            clean_files = [map_index + '.' + ext for ext in bowtie2_index_extensions]
 
     final_bam = out_prefix + '.bam'
     if sort:
@@ -34,101 +71,49 @@ def run_bowtie2(
     else:
         intermediate_bam = final_bam
 
-    map_cmd = ' '.join([
+    map_cmd = [
         bowtie2,
         '--threads', str(threads),
+        '--reorder',
         '--' + bowtie2_preset,
         '-X', str(max_insert),
         '-x', map_index,
         '-1', reads_fwd,
         '-2', reads_rev,
-        '|', samtools, 'view',
-        '-bS -T', ref_fa,
-        '- >', intermediate_bam
-    ])
+    ]
 
-    common.syscall(index_cmd, verbose=verbose)
-    common.syscall(map_cmd, verbose=verbose)
-
-    if sort:
-        threads = min(4, threads)
-        thread_mem = int(500 / threads)
-        sort_cmd = samtools + ' sort -@' + str(threads) + ' -m ' + str(thread_mem) + 'M ' + intermediate_bam + ' ' + out_prefix
-        index_cmd = samtools + ' index ' + final_bam
-        common.syscall(sort_cmd, verbose=verbose)
-        common.syscall(index_cmd, verbose=verbose)
-    for fname in clean_files:
-        os.unlink(fname)
+    if remove_both_unmapped:
+        map_cmd.append(r''' | awk ' !(and($2,4)) || !(and($2,8)) ' ''')
 
 
-def run_smalt(
-      reads_fwd,
-      reads_rev,
-      ref_fa,
-      out_prefix,
-      index_k=9,
-      index_s=2,
-      threads=1,
-      max_insert=1000,
-      minid=0.9,
-      sort=False,
-      extra_smalt_map_ops='-x',
-      samtools='samtools',
-      smalt='smalt',
-      verbose=False
-    ):
-    if extra_smalt_map_ops is None:
-        extra_smalt_map_ops = ''
-    map_index = out_prefix + '.map_index'
-    clean_files = [map_index + '.' + x for x in ['smi', 'sma']]
-    index_cmd = ' '.join([
-        smalt, 'index',
-        '-k', str(index_k),
-        '-s', str(index_s),
-        map_index,
-        ref_fa
+    map_cmd.extend([
+        '|', samtools, 'view',
+        '-bS -T', ref_fa,
+        '- >', intermediate_bam
     ])
 
-    map_cmd = smalt + ' map ' + extra_smalt_map_ops + ' '
+    map_cmd = ' '.join(map_cmd)
 
-    # depending on OS, -n can break smalt, so only use -n if it's > 1.
-    if threads > 1:
-        map_cmd += '-n ' + str(threads) + ' -O '
-
-    if reads_rev is None:
-        map_cmd += ' '.join([
-            '-y', str(minid),
-            map_index,
-            reads_fwd,
-        ])
-    else:
-        map_cmd += ' '.join([
-            '-i', str(max_insert),
-            '-y', str(minid),
-            map_index,
-            reads_fwd,
-            reads_rev,
-        ])
-
-    map_cmd += ' | ' + samtools + ' view'
-
-    final_bam = out_prefix + '.bam'
-    if sort:
-        intermediate_bam = out_prefix + '.unsorted.bam'
-    else:
-        intermediate_bam = final_bam
-
-    map_cmd += ' -bS -T ' + ref_fa + '  - > ' + intermediate_bam
-    common.syscall(index_cmd, verbose=verbose)
-    common.syscall(map_cmd, verbose=verbose)
+    common.syscall(map_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
 
     if sort:
         threads = min(4, threads)
         thread_mem = int(500 / threads)
-        sort_cmd = samtools + ' sort -@' + str(threads) + ' -m ' + str(thread_mem) + 'M ' + intermediate_bam + ' ' + out_prefix
+        sort_cmd = ' '.join([
+            samtools,
+            'sort',
+            '-@' + str(threads),
+            '-m' + str(thread_mem) + 'M',
+            '-o', final_bam,
+            '-O bam',
+            '-T', out_prefix + '.tmp.samtool_sort',
+            intermediate_bam,
+        ])
         index_cmd = samtools + ' index ' + final_bam
-        common.syscall(sort_cmd, verbose=verbose)
-        common.syscall(index_cmd, verbose=verbose)
+        common.syscall(sort_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
+        common.syscall(index_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
+        clean_files.append(intermediate_bam)
+
     for fname in clean_files:
         os.unlink(fname)
 
@@ -144,3 +129,42 @@ def get_total_alignment_score(bam):
             pass
     return total
 
+
+def sam_to_fastq(sam):
+    '''Given a pysam alignment, returns the sequence as a Fastq object.
+       Reverse complements as required and adds the suffix /1 or /2, as determined from the flag'''
+    name = sam.qname
+    if sam.is_read1:
+        name += '/1'
+    elif sam.is_read2:
+        name += '/2'
+    else:
+        raise Error('Read ' + name + ' must be first or second of pair according to flag. Cannot continue')
+
+    seq = pyfastaq.sequences.Fastq(name, common.decode(sam.seq), common.decode(sam.qual))
+    if sam.is_reverse:
+        seq.revcomp()
+
+    return seq
+
+
+def sam_pair_to_insert(s1, s2):
+    '''Returns the insert size from a pair of sam records, provided the reads
+       point towards each other ("innies"). Otherwise returns None.'''
+    if s1.is_unmapped or s2.is_unmapped or (s1.tid != s2.tid) or (s1.is_reverse == s2.is_reverse):
+        return None
+
+    # If here, reads are both mapped to the same ref, and in opposite orientations
+    if s1.is_reverse:
+        end = s1.reference_end - 1
+        start = s2.reference_start
+    else:
+        end = s2.reference_end - 1
+        start = s1.reference_start
+
+    if start < end:
+        return end - start + 1
+    else:
+        return None
+
+
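
To make the insert-size arithmetic in sam_pair_to_insert concrete, here is a
worked sketch using a namedtuple stand-in for the pysam records (only the
attributes the function reads are provided):

    from collections import namedtuple
    from ariba import mapping

    Rec = namedtuple('Rec', 'is_unmapped tid is_reverse reference_start reference_end')
    fwd = Rec(False, 0, False, 100, 250)  # forward read starting at base 100
    rev = Rec(False, 0, True, 200, 350)   # reverse mate; reference_end is half-open

    # end = 350 - 1 = 349, start = 100, so insert = 349 - 100 + 1 = 250
    print(mapping.sam_pair_to_insert(fwd, rev))  # 250
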
diff --git a/ariba/read_store.py b/ariba/read_store.py
new file mode 100644
index 0000000..d75bea8
--- /dev/null
+++ b/ariba/read_store.py
@@ -0,0 +1,60 @@
+import pyfastaq
+import os
+import pysam
+from ariba import common
+
+class Error (Exception): pass
+
+class ReadStore:
+    def __init__(self, infile, outprefix, log_fh=None):
+        assert infile != outprefix
+        self.infile = os.path.abspath(infile)
+        self.outprefix = os.path.abspath(outprefix)
+        self.outfile = os.path.abspath(outprefix) + '.gz'
+
+        if not os.path.exists(self.infile):
+            raise Error('File not found ' + self.infile + '. Cannot continue')
+
+        self._sort_file(self.infile, self.outprefix, log_fh)
+        self._compress_and_index_file(self.outprefix, log_fh)
+        os.unlink(self.outprefix)
+
+
+    @staticmethod
+    def _sort_file(infile, outfile, log_fh=None):
+        cmd = 'sort -k1,1 -k2,2n ' + infile + ' > ' + outfile
+        verbose = log_fh is not None
+        common.syscall(cmd, verbose=verbose, verbose_filehandle=log_fh)
+
+
+    @staticmethod
+    def _compress_and_index_file(infile, log_fh=None):
+        if log_fh is not None:
+            print('Compressing file', infile, file=log_fh, flush=True)
+        pysam.tabix_compress(infile, infile + '.gz')
+        pysam.tabix_index(infile + '.gz', seq_col=0, start_col=1, end_col=1)
+
+
+    def get_reads(self, cluster_name, out1, out2, log_fh=None):
+        if log_fh is not None:
+            print('Getting reads for', cluster_name, 'from', self.outfile, file=log_fh)
+        tabix_file = pysam.TabixFile(self.outfile)
+        f_out1 = pyfastaq.utils.open_file_write(out1)
+        f_out2 = pyfastaq.utils.open_file_write(out2)
+
+        for line in tabix_file.fetch(reference=cluster_name):
+            cluster, number, seq, qual = line.rstrip().split()
+            number = int(number)
+            if number % 2 == 0:
+                print('@' + str(number - 1) + '/2', seq, '+', qual, sep='\n', file=f_out2)
+            else:
+                print('@' + str(number) + '/1', seq, '+', qual, sep='\n', file=f_out1)
+
+        pyfastaq.utils.close(f_out1)
+        pyfastaq.utils.close(f_out2)
+        if log_fh is not None:
+            print('Finished getting reads for', cluster_name, 'from', self.outfile, file=log_fh)
+
+    def clean(self):
+        os.unlink(self.outfile)
+        os.unlink(self.outfile + '.tbi')
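
Putting the pieces together: ReadStore takes one tab-separated line per read
(cluster name, read number where odd means first of the pair, sequence,
quality), sorts it, bgzip-compresses it and tabix-indexes it so each cluster's
reads can be fetched by name. An end-to-end sketch with made-up reads:

    from ariba import read_store

    with open('reads.tsv', 'w') as f:
        print('clusterA', 1, 'ACGTACGT', 'IIIIIIII', sep='\t', file=f)
        print('clusterA', 2, 'TTGCAACC', 'IIIIIIII', sep='\t', file=f)

    rs = read_store.ReadStore('reads.tsv', 'read_store')  # writes read_store.gz[.tbi]
    rs.get_reads('clusterA', 'reads_1.fq', 'reads_2.fq')  # paired FASTQ for one cluster
    rs.clean()  # removes the .gz and .tbi files
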
diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py
new file mode 100644
index 0000000..07f097b
--- /dev/null
+++ b/ariba/ref_genes_getter.py
@@ -0,0 +1,267 @@
+import os
+import sys
+import shutil
+import tarfile
+import pyfastaq
+import urllib.request
+import time
+import json
+from ariba import common, card_record, vfdb_parser
+
+
+class Error (Exception): pass
+
+
+class RefGenesGetter:
+    def __init__(self, ref_db, genetic_code=11):
+        allowed_ref_dbs = {'card', 'argannot', 'resfinder', 'vfdb'}
+        if ref_db not in allowed_ref_dbs:
+            raise Error('Error in RefGenesGetter. ref_db must be one of: ' + str(allowed_ref_dbs) + ', but I got "' + ref_db + '"')
+        self.ref_db = ref_db
+        self.genetic_code = genetic_code
+        self.max_download_attempts = 3
+        self.sleep_time = 2
+        pyfastaq.sequences.genetic_code = self.genetic_code
+
+
+    def _download_file(self, url, outfile):
+        print('Downloading "', url, '" and saving as "', outfile, '" ...', end='', sep='')
+        for i in range(self.max_download_attempts):
+            time.sleep(self.sleep_time)
+            try:
+                urllib.request.urlretrieve(url, filename=outfile)
+            except:
+                continue
+            break
+        else:
+            raise Error('Error downloading: ' + url)
+        print(' done', flush=True)
+
+
+
+    def _get_from_card(self, outprefix):
+        outprefix = os.path.abspath(outprefix)
+        tmpdir = outprefix + '.download'
+        current_dir = os.getcwd()
+
+        try:
+            os.mkdir(tmpdir)
+            os.chdir(tmpdir)
+        except:
+            raise Error('Error mkdir/chdir ' + tmpdir)
+
+        card_version = '1.0.6'
+        card_tarball_url = 'https://card.mcmaster.ca/download/0/broadsteet-v' + card_version + '.tar.gz'
+        card_tarball = 'card.tar.gz'
+        print('Working in temporary directory', tmpdir)
+        print('Downloading data from card:', card_tarball_url, flush=True)
+        common.syscall('wget -O ' + card_tarball + ' ' + card_tarball_url, verbose=True)
+        print('...finished downloading', flush=True)
+        if not tarfile.is_tarfile(card_tarball):
+            raise Error('File ' + card_tarball + ' downloaded from ' + card_tarball_url + ' does not look like a valid tar archive. Cannot continue')
+
+        json_file = './card.json'
+        with tarfile.open(card_tarball, 'r') as tfile:
+            tfile.extract(json_file)
+
+        print('Extracted json data file ', json_file, '. Reading its contents...', sep='')
+
+        variant_metadata_tsv = outprefix + '.metadata.tsv'
+        presence_absence_fa = outprefix + '.presence_absence.fa'
+        variants_only_fa = outprefix + '.variants_only.fa'
+        noncoding_fa = outprefix + '.noncoding.fa'
+        log_file = outprefix + '.log'
+        f_out_tsv = pyfastaq.utils.open_file_write(variant_metadata_tsv)
+        f_out_presabs = pyfastaq.utils.open_file_write(presence_absence_fa)
+        f_out_var_only = pyfastaq.utils.open_file_write(variants_only_fa)
+        f_out_noncoding = pyfastaq.utils.open_file_write(noncoding_fa)
+        f_out_log = pyfastaq.utils.open_file_write(log_file)
+
+        with open(json_file) as f:
+            json_data = json.load(f)
+
+        json_data = {int(x): json_data[x] for x in json_data if not x.startswith('_')}
+        print('Found', len(json_data), 'records in the json file. Analysing...', flush=True)
+
+        for gene_key, gene_dict in sorted(json_data.items()):
+            crecord = card_record.CardRecord(gene_dict)
+            data = crecord.get_data()
+            fasta_name_prefix = '.'.join([
+                card_record.CardRecord._ARO_name_to_fasta_name(data['ARO_name']),
+                data['ARO_accession'],
+            ])
+
+            for card_key, gi, genbank_id, start, end, dna_seq, protein_seq in data['dna_seqs_and_ids']:
+                if dna_seq == '':
+                    print('Empty dna sequence', gene_key, data['ARO_id'], data['ARO_accession'], sep='\t', file=f_out_log)
+                    continue
+
+                fasta_id = '.'.join([
+                    fasta_name_prefix,
+                    genbank_id,
+                    start + '-' + end,
+                    card_key
+                ])
+                fasta = pyfastaq.sequences.Fasta(fasta_id, dna_seq)
+                variant_type = 'p'
+
+                if gi != 'NA':
+                    gene_tuple = fasta.make_into_gene()
+                    if gene_tuple is None:
+                        print('Could not make gene from sequence', fasta.id, sep='\t', file=f_out_log)
+                        continue
+                    else:
+                        translated = gene_tuple[0].translate()
+                        if gene_tuple[0][:3] in pyfastaq.genetic_codes.starts[self.genetic_code]:
+                            translated.seq = 'M' + translated.seq[1:]
+
+                        if translated.seq[:-1] != protein_seq:
+                            print('Translation of inferred gene dna sequence does not match protein sequence', fasta.id, sep='\t', file=f_out_log)
+                            continue
+
+                if gi == 'NA':
+                    fasta_filehandle = f_out_noncoding
+                    variant_type = 'n'
+                elif len(data['snps']) == 0:
+                    fasta_filehandle = f_out_presabs
+                else:
+                    fasta_filehandle = f_out_var_only
+
+                print(fasta.id, '.', '.', '.', data['ARO_name'], sep='\t', file=f_out_tsv)
+
+                if len(data['snps']) == 0:
+                    print(fasta, file=fasta_filehandle)
+                    print(fasta.id, '.', '.', '.', data['ARO_description'], sep='\t', file=f_out_tsv)
+                else:
+                    print(fasta, file=fasta_filehandle)
+                    for snp in data['snps']:
+                        print(fasta.id, variant_type, snp, '.', data['ARO_description'], sep='\t', file=f_out_tsv)
+
+
+        pyfastaq.utils.close(f_out_tsv)
+        pyfastaq.utils.close(f_out_presabs)
+        pyfastaq.utils.close(f_out_var_only)
+        pyfastaq.utils.close(f_out_noncoding)
+        pyfastaq.utils.close(f_out_log)
+        os.chdir(current_dir)
+        print('Extracted data and wrote ARIBA input files\n')
+        print('Final genes files and metadata file:')
+        print('   ', presence_absence_fa)
+        print('   ', variants_only_fa)
+        print('   ', variant_metadata_tsv)
+
+        print('\nYou can use those files with ARIBA like this:')
+        print('ariba prepareref --ref_prefix', outprefix, 'output_directory\n')
+
+        print('If you use this downloaded data, please cite:')
+        print('"The Comprehensive Antibiotic Resistance Database", McArthur et al 2013, PMID: 23650175')
+        print('and in your methods say that version', card_version, 'of the database was used')
+
+
+    def _get_from_resfinder(self, outprefix):
+        outprefix = os.path.abspath(outprefix)
+        final_fasta = outprefix + '.presence_absence.fa'
+        tmpdir = outprefix + '.tmp.download'
+        current_dir = os.getcwd()
+
+        try:
+            os.mkdir(tmpdir)
+            os.chdir(tmpdir)
+        except:
+            raise Error('Error mkdir/chdir ' + tmpdir)
+
+        zipfile = 'resfinder.zip'
+        cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
+        print('Downloading data with:', cmd, sep='\n')
+        common.syscall(cmd)
+        common.syscall('unzip ' + zipfile)
+
+        print('Combining downloaded fasta files...')
+        f = pyfastaq.utils.open_file_write(final_fasta)
+
+        for filename in os.listdir('database'):
+            if filename.endswith('.fsa'):
+                print('   ', filename)
+                prefix = filename.split('.')[0]
+                file_reader = pyfastaq.sequences.file_reader(os.path.join('database', filename))
+                for seq in file_reader:
+                    seq.id = prefix + '.' + seq.id
+                    print(seq, file=f)
+
+        pyfastaq.utils.close(f)
+
+        print('\nCombined files. Final genes file is called', final_fasta, end='\n\n')
+        os.chdir(current_dir)
+        shutil.rmtree(tmpdir)
+
+        print('You can use it with ARIBA like this:')
+        print('ariba prepareref --ref_prefix', outprefix, 'output_directory\n')
+        print('If you use this downloaded data, please cite:')
+        print('"Identification of acquired antimicrobial resistance genes", Zankari et al 2012, PMID: 22782487\n')
+
+
+    def _get_from_argannot(self, outprefix):
+        outprefix = os.path.abspath(outprefix)
+        tmpdir = outprefix + '.tmp.download'
+        current_dir = os.getcwd()
+
+        try:
+            os.mkdir(tmpdir)
+            os.chdir(tmpdir)
+        except:
+            raise Error('Error mkdir/chdir ' + tmpdir)
+
+        zipfile = 'arg-annot-database_doc.zip'
+        self._download_file('http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip', zipfile)
+        common.syscall('unzip ' + zipfile)
+        os.chdir(current_dir)
+        print('Extracted files.')
+
+        genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
+        final_fasta = outprefix + '.presence_absence.fa'
+
+        seq_reader = pyfastaq.sequences.file_reader(genes_file)
+        ids = {}
+        for seq in seq_reader:
+            ids[seq.id] = ids.get(seq.id, 0) + 1
+
+        for name, count in sorted(ids.items()):
+            if count > 1:
+                print('Warning! Sequence name', name, 'found', count, 'times in download. Keeping longest sequence', file=sys.stderr)
+
+        pyfastaq.tasks.to_unique_by_id(genes_file, final_fasta)
+        shutil.rmtree(tmpdir)
+
+        print('Finished. Final genes file is called', final_fasta, end='\n\n')
+        print('You can use it with ARIBA like this:')
+        print('ariba prepareref --ref_prefix', outprefix, 'output_directory\n')
+        print('If you use this downloaded data, please cite:')
+        print('"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n')
+
+    def _get_from_vfdb(self, outprefix):
+        outprefix = os.path.abspath(outprefix)
+        tmpdir = outprefix + '.tmp.download'
+        current_dir = os.getcwd()
+
+        try:
+            os.mkdir(tmpdir)
+        except:
+            raise Error('Error mkdir ' + tmpdir)
+
+        zipfile = os.path.join(tmpdir, 'VFDB_setA_nt.fas.gz')
+        self._download_file('http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz', zipfile)
+        vparser = vfdb_parser.VfdbParser(zipfile, outprefix)
+        vparser.run()
+        shutil.rmtree(tmpdir)
+        print('Extracted files.')
+        final_fasta = outprefix + '.presence_absence.fa'
+        final_tsv = outprefix + '.metadata.tsv'
+
+        print('Extracted core DNA sequence dataset and metadata. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
+        print('You can use it with ARIBA like this:')
+        print('ariba prepareref --ref_prefix', outprefix, 'output_directory\n')
+        print('If you use this downloaded data, please cite:')
+        print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
+
+    def run(self, outprefix):
+        getattr(self, '_get_from_' + self.ref_db)(outprefix)
+
diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py
new file mode 100644
index 0000000..b4b4938
--- /dev/null
+++ b/ariba/ref_preparer.py
@@ -0,0 +1,189 @@
+import sys
+import os
+import pickle
+from ariba import common, mapping, reference_data
+
+class Error (Exception): pass
+
+
+class RefPreparer:
+    def __init__(self,
+        extern_progs,
+        version_report_lines=None,
+        ref_prefix=None,
+        presabs=None,
+        varonly=None,
+        noncoding=None,
+        metadata=None,
+        min_gene_length=6,
+        max_gene_length=10000,
+        genetic_code=11,
+        cdhit_min_id=0.9,
+        cdhit_min_length=0.9,
+        run_cdhit=True,
+        clusters_file=None,
+        threads=1,
+        verbose=False,
+    ):
+        self.extern_progs = extern_progs
+
+        if version_report_lines is None:
+            self.version_report_lines = []
+        else:
+            self.version_report_lines = version_report_lines
+
+        self.ref_prefix = ref_prefix
+        self.presabs = presabs
+        self.varonly = varonly
+        self.noncoding = noncoding
+        self.metadata = metadata
+        self.min_gene_length = min_gene_length
+        self.max_gene_length = max_gene_length
+        self.genetic_code = genetic_code
+        self.cdhit_min_id = cdhit_min_id
+        self.cdhit_min_length = cdhit_min_length
+        self.run_cdhit = run_cdhit
+        self.clusters_file = clusters_file
+        self.threads = threads
+        self.verbose = verbose
+
+
+    @staticmethod
+    def _get_ref_files(ref_prefix, presabs, varonly, noncoding, metadata, verbose=False):
+        if {None} == {ref_prefix, presabs, varonly, noncoding}:
+            raise Error('Error in RefPreparer._get_ref_files. All input files and ref_prefix were None. Cannot continue')
+
+        filenames = {
+            'presabs': presabs,
+            'varonly': varonly,
+            'noncoding': noncoding,
+            'metadata': metadata,
+        }
+
+        file_suffixes = {
+            'presabs': 'presence_absence.fa',
+            'varonly': 'variants_only.fa',
+            'noncoding': 'noncoding.fa',
+            'metadata': 'metadata.tsv',
+        }
+
+        if verbose:
+            print('\nLooking for input files ...')
+
+        for key in file_suffixes:
+            if ref_prefix is not None:
+                filename = os.path.abspath(ref_prefix + '.' + file_suffixes[key])
+
+                if os.path.exists(filename):
+                    if verbose:
+                        print('Found: ', filename, '.\n    ...treating it as if this was used: --', key, ' ', filename, sep='')
+                    filenames[key] = filename
+                else:
+                    if verbose:
+                        print('Not found:', filename)
+                    filenames[key] = None
+            elif filenames[key] is not None:
+                if os.path.exists(filenames[key]):
+                    filenames[key] = os.path.abspath(filenames[key])
+                    if verbose:
+                        print('Found: ', filenames[key], ' from option --', key, sep='')
+                else:
+                    raise Error('File not found! Cannot continue. Looked for: ' + filenames[key])
+
+        if {None} == {filenames['presabs'], filenames['varonly'], filenames['noncoding']}:
+            raise Error('Error in RefPreparer._get_ref_files. No FASTA files given! Cannot continue')
+
+        return filenames
+
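
To make the prefix convention concrete, a hedged usage sketch (the prefix mydb is hypothetical): given a ref_prefix, the method probes for the four suffixed files, keeps whichever exist, and raises if no FASTA is found at all.

    from ariba import ref_preparer

    # Probes for mydb.presence_absence.fa, mydb.variants_only.fa,
    # mydb.noncoding.fa and mydb.metadata.tsv; missing files map to None.
    filenames = ref_preparer.RefPreparer._get_ref_files('mydb', None, None, None, None, verbose=True)
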
+
+    def _write_info_file(self, outfile):
+        with open(outfile, 'w') as fout:
+            for key in ('presabs', 'varonly', 'noncoding', 'metadata'):
+                print('input_' + key, self.filenames[key], sep='\t', file=fout)
+
+            print('genetic_code', self.genetic_code, sep='\t', file=fout)
+
+
+    def run(self, outdir):
+        original_dir = os.getcwd()
+
+        if os.path.exists(outdir):
+            raise Error('Error! Output directory ' + outdir + ' already exists. Cannot continue')
+
+        try:
+            os.mkdir(outdir)
+        except:
+            raise Error('Error making output directory ' + outdir + '. Cannot continue')
+
+        with open(os.path.join(outdir, 'version_info.txt'), 'w') as f:
+            print('ARIBA run with this command:', file=f)
+            print(' '.join([sys.argv[0]] + ['prepareref'] + sys.argv[1:]), file=f)
+            print('from this directory:', original_dir, file=f)
+            print(file=f)
+            print(*self.version_report_lines, sep='\n', file=f)
+
+        self.filenames = self._get_ref_files(self.ref_prefix, self.presabs, self.varonly, self.noncoding, self.metadata, self.verbose)
+        self._write_info_file(os.path.join(outdir, 'info.txt'))
+
+        self.refdata = reference_data.ReferenceData(
+            presence_absence_fa=self.filenames['presabs'],
+            variants_only_fa=self.filenames['varonly'],
+            non_coding_fa=self.filenames['noncoding'],
+            metadata_tsv=self.filenames['metadata'],
+            min_gene_length=self.min_gene_length,
+            max_gene_length=self.max_gene_length,
+            genetic_code=self.genetic_code,
+        )
+
+        if self.verbose:
+            print('\nLoading and checking input data', flush=True)
+
+        refdata_outprefix = os.path.join(outdir, 'refcheck')
+        self.refdata.rename_sequences(refdata_outprefix + '.rename_info')
+        self.refdata.sanity_check(refdata_outprefix)
+
+        if self.verbose:
+            print('\nRunning cdhit', flush=True)
+        cdhit_outprefix = os.path.join(outdir, 'cdhit')
+
+        clusters = self.refdata.cluster_with_cdhit(
+            refdata_outprefix + '.01.check_variants',
+            cdhit_outprefix,
+            seq_identity_threshold=self.cdhit_min_id,
+            threads=self.threads,
+            length_diff_cutoff=self.cdhit_min_length,
+            nocluster=not self.run_cdhit,
+            verbose=self.verbose,
+            cd_hit_est=self.extern_progs.exe('cdhit'),
+            clusters_file=self.clusters_file,
+        )
+
+        if self.verbose:
+            print('\nWriting clusters to file.', sum(len(v) for v in clusters.values() if v is not None), 'in total', flush=True)
+
+        clusters_pickle_file = cdhit_outprefix + '.clusters.pickle'
+        with open(clusters_pickle_file, 'wb') as f:
+            pickle.dump(clusters, f)
+
+        cluster_representatives_fa = cdhit_outprefix + '.cluster_representatives.fa'
+
+        if self.verbose:
+            print('\nRunning bowtie2-build on FASTA of cluster representatives')
+
+        mapping.bowtie2_index(
+            cluster_representatives_fa,
+            cluster_representatives_fa,
+            bowtie2=self.extern_progs.exe('bowtie2'),
+            verbose=self.verbose,
+        )
+
+        if self.verbose:
+            print('\nRunning samtools faidx on FASTA of cluster representatives')
+
+        cmd = ' '.join([
+            self.extern_progs.exe('samtools'),
+            'faidx',
+            cluster_representatives_fa
+        ])
+
+        common.syscall(cmd, verbose=self.verbose)
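
A minimal driver for the class above, assuming ExternalProgs from ariba.external_progs can be constructed with its defaults; the input file names are illustrative:

    from ariba import external_progs, ref_preparer

    extern_progs = external_progs.ExternalProgs()
    preparer = ref_preparer.RefPreparer(
        extern_progs,
        presabs='genes.presence_absence.fa',
        metadata='genes.metadata.tsv',
        verbose=True,
    )
    preparer.run('prepareref.out')  # the output directory must not already exist
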
diff --git a/ariba/refcheck.py b/ariba/refcheck.py
deleted file mode 100644
index 551b648..0000000
--- a/ariba/refcheck.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import os
-import pyfastaq
-
-class Error (Exception): pass
-
-
-class Checker:
-    def __init__(self, infile, min_length=1, max_length=10000, outprefix=None):
-        self.infile = os.path.abspath(infile)
-        if not os.path.exists(self.infile):
-            raise Error('File not found: "' + self.infile + '". Cannot continue')
-
-        self.min_length = min_length
-        self.max_length = max_length
-        self.outprefix = outprefix
-
-
-    def run(self):
-        file_reader = pyfastaq.sequences.file_reader(self.infile)
-        names = {}
-
-        if self.outprefix is not None:
-            old2new_out = self.outprefix + '.rename'
-            fasta_out = self.outprefix + '.fa'
-            bad_seqs_out = self.outprefix + '.removed.fa'
-            log_out = self.outprefix + '.log'
-            old2new_out_fh = pyfastaq.utils.open_file_write(old2new_out)
-            fasta_out_fh = pyfastaq.utils.open_file_write(fasta_out)
-            bad_seqs_out_fh = pyfastaq.utils.open_file_write(bad_seqs_out)
-            log_out_fh = pyfastaq.utils.open_file_write(log_out)
-
-        for seq in file_reader:
-            seq.seq = seq.seq.upper()
-            if len(seq) < self.min_length:
-                if self.outprefix is None:
-                    return False, 'Too short', seq
-                else:
-                    print(seq.id, 'Too short. Skipping', sep='\t', file=log_out_fh)
-                    print(seq, file=bad_seqs_out_fh)
-                    continue
-            elif len(seq) > self.max_length:
-                if self.outprefix is None:
-                    return False, 'Too long', seq
-                else:
-                    print(seq.id, 'Too long. Skipping', sep='\t', file=log_out_fh)
-                    print(seq, file=bad_seqs_out_fh)
-                    continue
-
-            if not seq.looks_like_gene():
-                if self.outprefix is None:
-                    return False, 'Not a gene', seq
-                else:
-                    seq.revcomp()
-                    if seq.looks_like_gene():
-                        print(seq.id, 'Reverse complemented', sep='\t', file=log_out_fh)
-                    else:
-                        print(seq.id, 'Does not look like a gene. Skipping', sep='\t', file=log_out_fh)
-                        seq.revcomp()
-                        print(seq, file=bad_seqs_out_fh)
-                        continue
-
-            original_id = seq.id
-            # replace unwanted characters with underscores
-            to_replace = ' '
-            seq.id = seq.id.translate(str.maketrans(to_replace, '_' *  len(to_replace)))
-
-            if self.outprefix is None and original_id != seq.id:
-                seq.id = original_id
-                return False, 'Name has spaces', seq
-
-            if seq.id in names:
-                if self.outprefix is None:
-                    return False, 'Duplicate name', seq
-                else:
-                    names[seq.id] += 1
-                    seq.id += '.' + str(names[seq.id])
-            else:
-                names[seq.id] = 1
-
-            if self.outprefix is not None:
-                print(original_id, seq.id, sep='\t', file=old2new_out_fh)
-                print(seq, file=fasta_out_fh)
-
-        if self.outprefix is not None:
-            pyfastaq.utils.close(fasta_out_fh)
-            pyfastaq.utils.close(bad_seqs_out_fh)
-            pyfastaq.utils.close(log_out_fh)
-            pyfastaq.utils.close(old2new_out_fh)
-
-        return True, None, None
diff --git a/ariba/reference_data.py b/ariba/reference_data.py
new file mode 100644
index 0000000..3251a05
--- /dev/null
+++ b/ariba/reference_data.py
@@ -0,0 +1,510 @@
+import os
+import sys
+import re
+import copy
+import pyfastaq
+from ariba import sequence_metadata, cdhit
+
+
+class Error (Exception): pass
+
+rename_sub_regex = re.compile(r'[^\w.-]')
+
+
+class ReferenceData:
+    def __init__(self,
+        presence_absence_fa=None,
+        variants_only_fa=None,
+        non_coding_fa=None,
+        metadata_tsv=None,
+        min_gene_length=6,
+        max_gene_length=10000,
+        genetic_code=11,
+    ):
+        self.seq_filenames = {}
+        self.seq_dicts = {}
+        self.min_gene_length = min_gene_length
+        self.max_gene_length = max_gene_length
+
+        total_ref_seqs_loaded = 0
+
+        input_fastas = {
+            'presence_absence': presence_absence_fa,
+            'variants_only': variants_only_fa,
+            'non_coding': non_coding_fa,
+        }
+
+        for x in ['presence_absence', 'variants_only', 'non_coding']:
+            self.seq_filenames[x] = self._get_filename(input_fastas[x])
+            self.seq_dicts[x] = self._load_fasta_file(self.seq_filenames[x])
+            total_ref_seqs_loaded += len(self.seq_dicts[x])
+
+        if {None} == set(self.seq_filenames.values()):
+            raise Error('Error! Must supply at least one of presence_absence_fa, variants_only_fa, non_coding_fa. Cannot continue')
+
+        if total_ref_seqs_loaded == 0:
+            raise Error('Error! No sequences found in input file(s). Maybe they were empty? Cannot continue.')
+
+        self.metadata = self._load_metadata_tsv(metadata_tsv)
+        self.genetic_code = genetic_code
+        pyfastaq.sequences.genetic_code = self.genetic_code
+        common_names = self._dict_keys_intersection(list(self.seq_dicts.values()))
+        if len(common_names):
+            raise Error('Error! Non-unique names found in input fasta files:\n' + '\n'.join(common_names))
+
+
+    @staticmethod
+    def _dict_keys_intersection(dicts):
+        dicts = [x for x in dicts if x is not None]
+        if len(dicts) == 0:
+            return set()
+
+        inter = set(dicts[0].keys())
+
+        for d in dicts[1:]:
+            inter = inter.intersection(set(d.keys()))
+        return inter
+
+
+    @staticmethod
+    def _get_filename(filename):
+        if filename is None:
+            return None
+        else:
+            if os.path.exists(filename):
+                return os.path.abspath(filename)
+            else:
+                raise Error('Error! File not found: ' + filename)
+
+
+    @staticmethod
+    def _load_metadata_tsv(filename):
+        if filename is None:
+            return {}
+
+        f = pyfastaq.utils.open_file_read(filename)
+        metadata_dict = {}
+
+        for line in f:
+            try:
+                metadata = sequence_metadata.SequenceMetadata(line)
+            except:
+                print('Problem with this line of metadata, which will be ignored:', line.rstrip(), file=sys.stderr)
+                continue
+
+            if metadata.name not in metadata_dict:
+                metadata_dict[metadata.name] = {'n': {}, 'p': {}, '.': set()}
+
+            if metadata.variant_type == '.':
+                metadata_dict[metadata.name]['.'].add(metadata)
+            else:
+                if metadata.variant.position not in metadata_dict[metadata.name][metadata.variant_type]:
+                    metadata_dict[metadata.name][metadata.variant_type][metadata.variant.position] = set()
+
+                metadata_dict[metadata.name][metadata.variant_type][metadata.variant.position].add(metadata)
+
+        pyfastaq.utils.close(f)
+        return metadata_dict
+
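
A shape sketch of the dict this builds, assuming variant positions are stored zero-based by sequence_variant (so p.I42L lives under key 41); the gene name and the metadata entry are placeholders:

    metadata_dict = {
        'geneX': {
            'n': {},                                          # nucleotide variants, keyed by position
            'p': {41: {'<SequenceMetadata: geneX p I42L>'}},  # protein variants, keyed by position
            '.': set(),                                       # non-variant metadata lines
        },
    }
    print(metadata_dict['geneX']['p'].get(41, set()))
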
+
+    @staticmethod
+    def _load_fasta_file(filename):
+        d = {}
+
+        if filename is not None:
+            seq_reader = pyfastaq.sequences.file_reader(filename)
+            for seq in seq_reader:
+                if seq.id in d:
+                    raise Error('Duplicate name "' + seq.id + '" found in file ' + filename + '. Cannot continue')
+                d[seq.id] = copy.copy(seq)
+
+        return d
+
+
+    @staticmethod
+    def _find_gene_in_seqs(name, dicts):
+        for dict_name, this_dict in dicts.items():
+            if this_dict is None:
+                continue
+            elif name in this_dict:
+                return dict_name
+
+        return None
+
+
+    @staticmethod
+    def _write_metadata_tsv(metadata, filename):
+        f = pyfastaq.utils.open_file_write(filename)
+
+        for gene_name, data_dict in sorted(metadata.items()):
+            for meta in sorted([str(x) for x in data_dict['.']]):
+                print(meta, file=f)
+
+            variants = []
+
+            for variant_type in ['n', 'p']:
+                for position in data_dict[variant_type]:
+                    for meta in data_dict[variant_type][position]:
+                        variants.append(meta)
+
+            variants.sort()
+            for v in variants:
+                print(v, file=f)
+
+        pyfastaq.utils.close(f)
+
+
+    @staticmethod
+    def _write_dict_of_sequences(seq_dict, filename):
+        f = pyfastaq.utils.open_file_write(filename)
+        for seq in sorted(seq_dict):
+            print(seq_dict[seq], file=f)
+        pyfastaq.utils.close(f)
+
+
+    def _write_sequences(self, filename, sequences_to_write):
+        assert sequences_to_write in self.seq_dicts and sequences_to_write in self.seq_filenames
+        if self.seq_filenames[sequences_to_write] is not None:
+            self._write_dict_of_sequences(self.seq_dicts[sequences_to_write], filename)
+
+
+    def _filter_bad_variant_data(self, out_prefix, presence_absence_removed, variants_only_removed):
+        genes_to_remove = set()
+        variants_only_genes_not_found = set(self.seq_dicts['variants_only'].keys())
+        log_file = out_prefix + '.log'
+        tsv_file = out_prefix + '.tsv'
+        log_fh = pyfastaq.utils.open_file_write(log_file)
+
+        for gene_name, metadata_dict in sorted(self.metadata.items()):
+            if gene_name in presence_absence_removed:
+                print(gene_name, 'was removed from presence/absence fasta, so removing its metadata', file=log_fh)
+                genes_to_remove.add(gene_name)
+                continue
+            elif gene_name in variants_only_removed:
+                print(gene_name, 'was removed from variants only fasta, so removing its metadata', file=log_fh)
+                genes_to_remove.add(gene_name)
+                continue
+
+            gene_in_seq_dict = self._find_gene_in_seqs(gene_name, self.seq_dicts)
+            if gene_in_seq_dict is None:
+                print(gene_name, 'is in input tsv file, but not found in any input sequence files. Removing', file=log_fh)
+                genes_to_remove.add(gene_name)
+                continue
+
+            # take out any metadata that is not a variant and has no extra info.
+            to_remove = []
+
+            for metadata in metadata_dict['.']:
+                if metadata.free_text == '.':
+                    print(gene_name, 'metadata has no info. Just gene name given. Removing. Line of file was:', metadata, file=log_fh)
+                    to_remove.append(metadata)
+
+            for metadata in to_remove:
+                metadata_dict['.'].remove(metadata)
+
+
+            # if this is non_coding, we shouldn't have any amino acid variants
+            if gene_in_seq_dict == 'non_coding':
+                for position in metadata_dict['p']:
+                    for metadata in metadata_dict['p'][position]:
+                        print(gene_name, 'variant of type "p" for protein, but sequence is non-coding. Removing. Line of file was:', metadata, file=log_fh)
+
+                metadata_dict['p'] = {}
+
+
+            # take out variant metadata that doesn't make sense (eg bases not matching ref sequence)
+            for variant_type in ['n', 'p']:
+                positions_to_remove = []
+                for position in metadata_dict[variant_type]:
+                    meta_to_remove = []
+                    for metadata in metadata_dict[variant_type][position]:
+                        to_translate = variant_type == 'p'
+
+                        if not metadata.variant.sanity_check_against_seq(self.seq_dicts[gene_in_seq_dict][gene_name], translate_seq=to_translate):
+                            print(gene_name, 'variant does not match reference. Removing. Line of file was:', metadata, file=log_fh)
+                            meta_to_remove.append(metadata)
+                            continue
+
+                        if gene_in_seq_dict == 'variants_only':
+                            variants_only_genes_not_found.discard(gene_name)
+
+                    for metadata in meta_to_remove:
+                        metadata_dict[variant_type][position].remove(metadata)
+                    if len(metadata_dict[variant_type][position]) == 0:
+                        positions_to_remove.append(position)
+
+                for position in positions_to_remove:
+                    del metadata_dict[variant_type][position]
+
+
+            if gene_in_seq_dict == 'variants_only' and len(metadata_dict['n']) == len(metadata_dict['p']) == len(metadata_dict['.']) == 0:
+                print(gene_name, 'No remaining data after checks. Removing this sequence because it is in the variants only file', file=log_fh)
+                genes_to_remove.add(gene_name)
+
+        for gene_name in genes_to_remove:
+            self.metadata.pop(gene_name)
+
+        for gene_name in variants_only_genes_not_found:
+            print(gene_name, 'is in variants only gene file, but no variants found. Removing.', file=log_fh)
+            self.seq_dicts['variants_only'].pop(gene_name)
+
+        pyfastaq.utils.close(log_fh)
+        self._write_metadata_tsv(self.metadata, tsv_file)
+        self._write_sequences(out_prefix + '.presence_absence.fa', 'presence_absence')
+        self._write_sequences(out_prefix + '.non_coding.fa', 'non_coding')
+        self._write_sequences(out_prefix + '.variants_only.fa', 'variants_only')
+
+
+    @staticmethod
+    def _try_to_get_gene_seq(seq, min_length, max_length):
+        seq.seq = seq.seq.upper()
+        if len(seq) < min_length:
+            return None, 'Remove: too short. Length: ' + str(len(seq))
+        elif len(seq) > max_length:
+            return None, 'Remove: too long. Length: ' + str(len(seq))
+        else:
+            got = seq.make_into_gene()
+            if got is None:
+                return None, 'Does not look like a gene (tried both strands and all reading frames) ' + seq.seq
+            else:
+                return got[0], 'Made ' + seq.id + ' into gene. strand=' + got[1] + ', frame=' + str(got[2])
+
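
An illustrative call, assuming pyfastaq's default genetic code: the input is a complete 9 bp ORF, so the expected outcome is the gene plus the strand/frame log message built above.

    import pyfastaq
    from ariba import reference_data

    seq = pyfastaq.sequences.Fasta('x', 'ATGGCTTAA')
    print(reference_data.ReferenceData._try_to_get_gene_seq(seq, 6, 10000))
    # expected (sketch): (gene sequence, 'Made x into gene. strand=+, frame=0')
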
+
+    def _remove_bad_genes(self, seqs_dict, log_file):
+        to_remove = set()
+
+        if len(seqs_dict) == 0:
+            return to_remove
+
+        log_fh = pyfastaq.utils.open_file_write(log_file)
+
+        for name in sorted(seqs_dict):
+            new_seq, message = self._try_to_get_gene_seq(seqs_dict[name], self.min_gene_length, self.max_gene_length)
+            if new_seq is None:
+                to_remove.add(name)
+            else:
+                seqs_dict[name] = new_seq
+
+            if message is not None:
+                print(name, message, file=log_fh)
+
+        pyfastaq.utils.close(log_fh)
+
+        for name in to_remove:
+            seqs_dict.pop(name)
+
+        return to_remove
+
+
+    def sanity_check(self, outprefix):
+        variants_only_removed = self._remove_bad_genes(self.seq_dicts['variants_only'], outprefix + '.00.check_fasta_variants_only.log')
+        presence_absence_removed = self._remove_bad_genes(self.seq_dicts['presence_absence'], outprefix + '.00.check_fasta_presence_absence.log')
+        self._filter_bad_variant_data(outprefix + '.01.check_variants', presence_absence_removed, variants_only_removed)
+
+
+    @classmethod
+    def _new_seq_name(cls, name):
+        name = name.split()[0]
+        return re.sub(rename_sub_regex, '_', name)
+
+
+    @classmethod
+    def _seq_names_to_rename_dict(cls, names):
+        used_names = set()
+        old_name_to_new = {}
+
+        for old_name in sorted(names):
+            new_name = ReferenceData._new_seq_name(old_name)
+            if new_name in used_names:
+                i = 1
+                new_name_prefix = new_name
+                while new_name in used_names:
+                    new_name = new_name_prefix + '_' + str(i)
+                    i += 1
+
+            assert new_name not in used_names
+            if new_name != old_name:
+                old_name_to_new[old_name] = new_name
+
+            used_names.add(new_name)
+
+        return old_name_to_new
+
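
A worked example of the renaming rules: everything after the first whitespace is dropped, characters outside [A-Za-z0-9_.-] become underscores, and clashes get numeric suffixes in sorted order of the old names. The names here are made up:

    from ariba import reference_data

    print(reference_data.ReferenceData._seq_names_to_rename_dict({'a 1', 'a 2', 'ok.name'}))
    # expected: {'a 1': 'a', 'a 2': 'a_1'}; 'ok.name' maps to itself, so it is omitted
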
+
+    @classmethod
+    def _rename_names_in_seq_dicts(cls, seq_dicts, rename_dict):
+        '''Changes seq_dicts in place'''
+        for seq_type in ['presence_absence', 'variants_only', 'non_coding']:
+            new_dict = {}
+            while len(seq_dicts[seq_type]):
+                old_name, seq = seq_dicts[seq_type].popitem()
+                if old_name in rename_dict:
+                    seq.id = rename_dict[old_name]
+
+                new_dict[seq.id] = seq
+            seq_dicts[seq_type] = new_dict
+
+
+    @classmethod
+    def _rename_metadata_set(cls, metadata_set, new_name):
+        new_set = set()
+        for meta in metadata_set:
+            new_meta = copy.copy(meta)
+            new_meta.name = new_name
+            new_set.add(new_meta)
+        return new_set
+
+
+    @classmethod
+    def _rename_names_in_metadata(cls, meta_dict, rename_dict):
+        new_dict = {}
+
+        while len(meta_dict):
+            old_name, gene_dict = meta_dict.popitem()
+            if old_name in rename_dict:
+                new_name = rename_dict[old_name]
+                for seq_type in ['n', 'p']:
+                    for position, metaset in gene_dict[seq_type].items():
+                        gene_dict[seq_type][position] = ReferenceData._rename_metadata_set(metaset, new_name)
+
+                gene_dict['.'] = ReferenceData._rename_metadata_set(gene_dict['.'], new_name)
+            else:
+                new_name = old_name
+
+            new_dict[new_name] = gene_dict
+
+        return new_dict
+
+
+    def rename_sequences(self, outfile):
+        presabs_names = set(self.seq_dicts['presence_absence'].keys())
+        noncoding_names = set(self.seq_dicts['non_coding'].keys())
+        varonly_names = set(self.seq_dicts['variants_only'].keys())
+        # we should have already checked that all the names are unique, but let's do it again!
+        all_names = presabs_names.union(noncoding_names).union(varonly_names)
+        if len(all_names) != len(presabs_names) + len(noncoding_names) + len(varonly_names):
+            raise Error('Got a non-unique name in input data. Cannot continue')
+
+        rename_dict = ReferenceData._seq_names_to_rename_dict(all_names)
+        if len(rename_dict):
+            print('Had to rename some sequences. See', outfile, 'for old -> new names', file=sys.stderr)
+            with open(outfile, 'w') as f:
+                for old_name, new_name in sorted(rename_dict.items()):
+                    print(old_name, new_name, sep='\t', file=f)
+
+            ReferenceData._rename_names_in_seq_dicts(self.seq_dicts, rename_dict)
+            self.metadata = ReferenceData._rename_names_in_metadata(self.metadata, rename_dict)
+
+
+    def make_catted_fasta(self, outfile):
+        f = pyfastaq.utils.open_file_write(outfile)
+
+        for key in ['presence_absence', 'variants_only', 'non_coding']:
+            filename = self.seq_filenames[key]
+            if filename is not None:
+                file_reader = pyfastaq.sequences.file_reader(filename)
+                for seq in file_reader:
+                    print(seq, file=f)
+
+        pyfastaq.utils.close(f)
+
+
+    def sequence_type(self, sequence_name):
+        return self._find_gene_in_seqs(sequence_name, self.seq_dicts)
+
+
+    def sequence(self, sequence_name):
+        d = self._find_gene_in_seqs(sequence_name, self.seq_dicts)
+        if d is None:
+            return None
+        else:
+            return self.seq_dicts[d][sequence_name]
+
+
+    def sequence_length(self, sequence_name):
+        seq = self.sequence(sequence_name)
+        assert seq is not None
+        return len(seq)
+
+
+    def all_non_wild_type_variants(self, ref_name):
+        ref_seq = self.sequence(ref_name)
+        variants = {'n': {}, 'p': {}}
+
+        if ref_seq is None or ref_name not in self.metadata:
+            return variants
+
+        for variant_type in ['n', 'p']:
+            for position, metadata_set in self.metadata[ref_name][variant_type].items():
+                for metadata in metadata_set:
+                    if position not in variants[variant_type]:
+                        variants[variant_type][position] = set()
+
+                    variants[variant_type][position].add(metadata)
+
+        return variants
+
+
+    @staticmethod
+    def write_cluster_allocation_file(clusters, outfile):
+        f_out = pyfastaq.utils.open_file_write(outfile)
+
+        for seq_type in ['presence_absence', 'variants_only', 'non_coding']:
+            if clusters[seq_type] is None:
+                continue
+
+            for seq_name in sorted(clusters[seq_type]):
+                other_seqs = clusters[seq_type][seq_name].difference({seq_name})
+                if len(other_seqs) > 0:
+                    other_seq_string = '\t'.join(sorted(list(other_seqs)))
+                    print(seq_name, other_seq_string, sep='\t', file=f_out)
+                else:
+                    print(seq_name, file=f_out)
+
+        pyfastaq.utils.close(f_out)
+
+
+    def cluster_with_cdhit(self, inprefix, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.9, nocluster=False, verbose=False, cd_hit_est='cd-hit-est', clusters_file=None):
+        files_to_cat = []
+        clusters = {}
+
+        for seqs_type in ['presence_absence', 'variants_only', 'non_coding']:
+            if len(self.seq_dicts[seqs_type]) > 0:
+                outfile = outprefix + '.' + seqs_type + '.cdhit'
+                files_to_cat.append(outfile)
+                cdhit_runner = cdhit.Runner(
+                  inprefix + '.' + seqs_type + '.fa',
+                  outfile,
+                  seq_identity_threshold=seq_identity_threshold,
+                  threads=threads,
+                  length_diff_cutoff=length_diff_cutoff,
+                  verbose=verbose,
+                  cd_hit_est=cd_hit_est,
+                  rename_suffix=seqs_type[0]
+                )
+
+                if clusters_file is not None:
+                    new_clusters = cdhit_runner.run_get_clusters_from_file(clusters_file)
+                elif nocluster:
+                    new_clusters = cdhit_runner.fake_run()
+                else:
+                    new_clusters = cdhit_runner.run()
+
+                clusters[seqs_type] = new_clusters
+            else:
+                clusters[seqs_type] = None
+
+        assert len(files_to_cat) > 0
+        f_out = pyfastaq.utils.open_file_write(outprefix + '.cluster_representatives.fa')
+
+        for filename in files_to_cat:
+            for seq in pyfastaq.sequences.file_reader(filename):
+                print(seq, file=f_out)
+
+        pyfastaq.utils.close(f_out)
+        self.write_cluster_allocation_file(clusters, outprefix + '.clusters.tsv')
+        return clusters
+
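
A shape sketch of the return value: one entry per sequence type, None when that type had no input sequences, otherwise a dict mapping each cluster's representative name to its member set (the representative is included in its own set). The sequence names here are hypothetical:

    clusters = {
        'presence_absence': {'geneA': {'geneA', 'geneB'}},
        'variants_only': None,
        'non_coding': None,
    }
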
+
+    def write_seqs_to_fasta(self, outfile, names):
+        f_out = pyfastaq.utils.open_file_write(outfile)
+
+        for name in sorted(names):
+            print(self.sequence(name), file=f_out)
+
+        pyfastaq.utils.close(f_out)
diff --git a/ariba/report.py b/ariba/report.py
new file mode 100644
index 0000000..f32234e
--- /dev/null
+++ b/ariba/report.py
@@ -0,0 +1,268 @@
+import sys
+import pymummer
+
+class Error (Exception): pass
+
+columns = [
+    'ref_name',              # 0  name of reference sequence
+    'ref_type',              # 1  type of reference sequence (presence/absence, variants only, noncoding)
+    'flag',                  # 2  cluster flag
+    'reads',                 # 3  number of reads in this cluster
+    'cluster',               # 4  name of cluster
+    'ref_len',               # 5  length of reference sequence
+    'ref_base_assembled',    # 6  number of reference nucleotides assembled by this contig
+    'pc_ident',              # 7  %identity between ref sequence and contig
+    'ctg',                   # 8  name of contig matching reference
+    'ctg_len',               # 9  length of contig matching reference
+    'ctg_cov',               # 10 mean mapped read depth of this contig
+    'known_var',             # 11 is this a known SNP from reference metadata? 1|0
+    'var_type',              # 12 The type of variant. Currently only SNP supported
+    'var_seq_type',          # 13 if known_var=1, n|p for nucleotide or protein
+    'known_var_change',      # 14 if known_var=1, the wild/variant change, eg I42L
+    'has_known_var',         # 15 if known_var=1, 1|0 for whether or not the assembly has the variant
+    'ref_ctg_change',        # 16 amino acid or nucleotide change between reference and contig, eg I42L
+    'ref_ctg_effect',        # 17 effect of change between reference and contig, eg SYN, NONSYN (amino acid changes only)
+    'ref_start',             # 18 start position of variant in reference
+    'ref_end',               # 19 end position of variant in reference
+    'ref_nt',                # 20 nucleotide(s) in reference at variant position
+    'ctg_start',             # 21 start position of variant in contig
+    'ctg_end',               # 22 end position of variant in contig
+    'ctg_nt',                # 23 nucleotide(s) in contig at variant position
+    'smtls_total_depth',     # 24 total read depth at variant start position in contig, reported by mpileup
+    'smtls_alt_nt',          # 25 alt nucleotides on contig, reported by mpileup
+    'smtls_alt_depth',       # 26 alt depth on contig, reported by mpileup
+    'var_description',       # 27 description of variant from reference metadata
+    'free_text',             # 28 other free text about reference sequence, from reference metadata
+]
+
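
A report line maps onto these columns one-to-one, so a name -> value dict can be built directly (essentially what report_filter does when loading a report; line is any data row from the file):

    def line_to_dict(line):
        fields = line.rstrip('\n').split('\t')
        assert len(fields) == len(columns)
        return dict(zip(columns, fields))
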
+
+var_columns = [
+    'known_var',
+    'var_type',
+    'var_seq_type',
+    'known_var_change',
+    'has_known_var',
+    'ref_ctg_change',
+    'ref_ctg_effect',
+    'ref_start',
+    'ref_end',
+    'ref_nt',
+    'ctg_start',
+    'ctg_end',
+    'ctg_nt',
+    'smtls_total_depth',
+    'smtls_alt_nt',
+    'smtls_alt_depth',
+    'var_description',
+]
+
+int_columns = [
+    'reads',
+    'ref_len',
+    'ref_base_assembled',
+    'ctg_len',
+    'ref_start',
+    'ref_end',
+    'ctg_start',
+    'ctg_end',
+]
+
+
+float_columns = [
+    'ctg_cov',
+    'pc_ident',
+]
+
+
+def header_line():
+    return '\t'.join(columns)
+
+
+def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster, variant_list):
+    '''Input is a known variant, as a sequence_metadata object. The
+       assumption is that both the reference and the assembly have the
+       variant type, not wild type. variant_list should be a list of
+       pymummer.variant.Variant objects, containing only variants of the
+       relevant query contig'''
+    ref_nuc_range = sequence_meta.variant.nucleotide_range()
+
+    if ref_nuc_range is None:
+        return None
+
+    ctg_nts = []
+    ref_nts = []
+    smtls_total_depths = []
+    smtls_alt_nts = []
+    smtls_alt_depths = []
+    contig_positions = []
+
+    for ref_position in range(ref_nuc_range[0], ref_nuc_range[1]+1, 1):
+        nucmer_match = cluster.assembly_compare.nucmer_hit_containing_reference_position(cluster.assembly_compare.nucmer_hits, cluster.ref_sequence.id, ref_position)
+
+        if nucmer_match is not None:
+            # work out contig position. Needs indels variants to correct the position
+            ref_nts.append(cluster.ref_sequence[ref_position])
+            contig_position, in_indel = nucmer_match.qry_coords_from_ref_coord(ref_position, variant_list)
+            contig_positions.append(contig_position)
+            ref, alt, total_depth, alt_depths = cluster.samtools_vars.get_depths_at_position(contig_name, contig_position)
+            ctg_nts.append(ref)
+            smtls_alt_nts.append(alt)
+            smtls_total_depths.append(total_depth)
+            smtls_alt_depths.append(alt_depths)
+
+    ctg_nts = ';'.join(ctg_nts) if len(ctg_nts) else '.'
+    ref_nts = ';'.join(ref_nts) if len(ref_nts) else '.'
+    smtls_alt_nts = ';'.join(smtls_alt_nts) if len(smtls_alt_nts) else '.'
+    smtls_total_depths = ';'.join([str(x) for x in smtls_total_depths]) if len(smtls_total_depths) else '.'
+    smtls_alt_depths = ';'.join([str(x) for x in smtls_alt_depths]) if len(smtls_alt_depths) else '.'
+    ctg_start = str(min(contig_positions) + 1) if len(contig_positions) else '.'
+    ctg_end = str(max(contig_positions) + 1) if len(contig_positions) else '.'
+
+    return [str(x) for x in [
+        ref_nuc_range[0] + 1,
+        ref_nuc_range[1] + 1,
+        ref_nts,
+        ctg_start,
+        ctg_end,
+        ctg_nts,
+        smtls_total_depths,
+        smtls_alt_nts,
+        smtls_alt_depths
+    ]]
+
+
+def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymummer_variants):
+    lines = []
+    contig_length = len(cluster.assembly.sequences[contig_name])
+    assert contig_length != 0
+
+    common_first_columns = [
+        cluster.ref_sequence.id,
+        cluster.ref_sequence_type,
+        str(cluster.status_flag),
+        str(cluster.total_reads),
+        cluster.name,
+        str(len(cluster.ref_sequence)),
+        str(ref_cov_per_contig[contig_name]) if contig_name in ref_cov_per_contig else '0', # 6 ref bases assembled
+        str(cluster.assembly_compare.percent_identities[contig_name]) if contig_name in cluster.assembly_compare.percent_identities else '0',
+        contig_name,
+        str(contig_length),  # 9 length of scaffold matching reference
+    ]
+
+    # it's possible that there is no read depth on an assembled contig
+    if contig_name in cluster.total_contig_depths:
+        common_first_columns.append(str(round(cluster.total_contig_depths[contig_name] / contig_length, 1)))
+    else:
+        common_first_columns.append('0')
+
+    if cluster.ref_sequence.id in cluster.refdata.metadata and len(cluster.refdata.metadata[cluster.ref_sequence.id]['.']) > 0:
+        free_text_column = ';'.join([x.free_text for x in cluster.refdata.metadata[cluster.ref_sequence.id]['.']])
+    else:
+        free_text_column = '.'
+
+    if cluster.assembled_ok and contig_name in cluster.assembly_variants and len(cluster.assembly_variants[contig_name]) > 0:
+        for (position, var_seq_type, ref_ctg_change, var_effect, contributing_vars, matching_vars_set, metainfo_set) in cluster.assembly_variants[contig_name]:
+            if len(matching_vars_set) > 0:
+                is_known_var = '1'
+                known_var_change = 'unknown'
+                var_type = 'SNP'
+                has_known_var = '1'
+                matching_vars_column = ';;;'.join([x.to_string(separator=':') for x in matching_vars_set])
+            else:
+                is_known_var = '0'
+                known_var_change = '.'
+                has_known_var = '0'
+                var_type = '.'
+                matching_vars_column = '.'
+
+            variant_columns = ['.' if x is None else str(x) for x in [is_known_var, var_type, var_seq_type, known_var_change, has_known_var, ref_ctg_change, var_effect]]
+
+            if contributing_vars is None:
+                samtools_columns = ['.'] * 9
+            else:
+                contributing_vars.sort(key = lambda x: x.qry_start)
+
+                smtls_total_depth = []
+                smtls_alt_nt = []
+                smtls_alt_depth = []
+
+                for var in contributing_vars:
+                    depths_tuple = cluster.samtools_vars.get_depths_at_position(contig_name, var.qry_start)
+                    if depths_tuple is not None:
+                        smtls_alt_nt.append(depths_tuple[1])
+                        smtls_total_depth.append(str(depths_tuple[2]))
+                        smtls_alt_depth.append(str(depths_tuple[3]))
+
+                smtls_total_depth = ';'.join(smtls_total_depth) if len(smtls_total_depth) else '.'
+                smtls_alt_nt = ';'.join(smtls_alt_nt) if len(smtls_alt_nt) else '.'
+                smtls_alt_depth = ';'.join(smtls_alt_depth) if len(smtls_alt_depth) else '.'
+                samtools_columns = [
+                    str(contributing_vars[0].ref_start),  # ref_start
+                    str(contributing_vars[0].ref_end),  # ref_end
+                    ';'.join([x.ref_base for x in contributing_vars]),  # ref_nt
+                    str(contributing_vars[0].qry_start),  # ctg_start
+                    str(contributing_vars[0].qry_end),  # ctg_end
+                    ';'.join([x.qry_base for x in contributing_vars]),  # ctg_nt
+                    smtls_total_depth,
+                    smtls_alt_nt,
+                    smtls_alt_depth,
+                ]
+
+
+            if len(matching_vars_set) > 0:
+                for matching_var in matching_vars_set:
+                    if contributing_vars is None:
+                        samtools_columns = _samtools_depths_at_known_snps_all_wild(matching_var, contig_name, cluster, pymummer_variants)
+                    variant_columns[3] = str(matching_var.variant)
+
+                    if matching_var.has_variant(cluster.ref_sequence) == (ref_ctg_change is not None):
+                        variant_columns[4] = '0'
+                    else:
+                        variant_columns[4] = '1'
+
+                    if samtools_columns is None:
+                        samtools_columns = ['.'] * 9
+
+                    lines.append('\t'.join(common_first_columns + variant_columns + samtools_columns + [matching_vars_column] + [free_text_column]))
+            else:
+                lines.append('\t'.join(
+                    common_first_columns + variant_columns + \
+                    samtools_columns + \
+                    [matching_vars_column] + [free_text_column]
+                ))
+    else:
+        lines.append('\t'.join(common_first_columns + ['.'] * (len(columns) - len(common_first_columns) - 1) + [free_text_column]))
+
+    return lines
+
+
+def report_lines(cluster):
+    if cluster.status_flag.has('ref_seq_choose_fail'):
+        return ['\t'.join(['.', '.', str(cluster.status_flag), str(cluster.total_reads), cluster.name] + ['.'] * (len(columns) - 5))]
+    elif cluster.status_flag.has('assembly_fail'):
+        return ['\t'.join([cluster.ref_sequence.id, cluster.ref_sequence_type, str(cluster.status_flag), str(cluster.total_reads), cluster.name] + ['.'] * (len(columns) - 5))]
+
+
+    ref_cov_per_contig = cluster.assembly_compare.ref_cov_per_contig(cluster.assembly_compare.nucmer_hits)
+    lines = []
+    pymummer_variants = pymummer.snp_file.get_all_variants(cluster.assembly_compare.nucmer_snps_file)
+
+    for contig_name in sorted(cluster.assembly.sequences):
+        contig_pymummer_variants = [x for x in pymummer_variants if x.qry_name == contig_name]
+        lines.extend(_report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, contig_pymummer_variants))
+
+    lines_ok = True
+
+    for line in lines:
+        if len(line.split('\t')) != len(columns):
+            cols = line.split('\t')
+            print('Error making report - wrong number of columns. Expected', len(columns), 'but got', len(cols), file=sys.stderr)
+            for i in range(len(cols)):
+                print(i, cols[i], sep='\t', file=sys.stderr)
+            lines_ok = False
+
+    if not lines_ok:
+        raise Error('Error making report. Cannot continue')
+
+    return lines if len(lines) > 0 else None
+
diff --git a/ariba/report_filter.py b/ariba/report_filter.py
new file mode 100644
index 0000000..166fcc4
--- /dev/null
+++ b/ariba/report_filter.py
@@ -0,0 +1,216 @@
+import openpyxl
+import pyfastaq
+from ariba import report, flag
+
+class Error (Exception): pass
+
+class ReportFilter:
+    def __init__(self,
+            infile=None,
+            min_pc_ident=90,
+            min_ref_base_assembled=1,
+            ignore_not_has_known_variant=False,
+            remove_synonymous_snps=True,
+            exclude_flags=None,
+        ):
+
+        if infile is not None:
+            self.report = self._load_report(infile)
+        else:
+            self.report = {}
+
+        self.min_pc_ident = min_pc_ident
+        self.min_ref_base_assembled = min_ref_base_assembled
+        self.ignore_not_has_known_variant = ignore_not_has_known_variant
+        self.remove_synonymous_snps = remove_synonymous_snps
+
+        if exclude_flags is None:
+            self.exclude_flags = ['assembly_fail', 'ref_seq_choose_fail']
+        else:
+            self.exclude_flags = exclude_flags
+
+
+    @classmethod
+    def _report_line_to_dict(cls, line):
+        '''Takes report line string as input. Returns a dict of column name -> value in line'''
+        data = line.split('\t')
+        if len(data) != len(report.columns):
+            return None
+
+        d = dict(zip(report.columns, data))
+        for key in report.int_columns:
+            try:
+                d[key] = int(d[key])
+            except:
+                assert d[key] == '.'
+
+        for key in report.float_columns:
+            try:
+                d[key] = float(d[key])
+            except:
+                assert d[key] == '.'
+
+        d['flag'] = flag.Flag(int(d['flag']))
+        return d
+
+
+    @classmethod
+    def _dict_to_report_line(cls, report_dict):
+        '''Takes a report_dict as input and returns a report line'''
+        return '\t'.join([str(report_dict[x]) for x in report.columns])
+
+
+    @staticmethod
+    def _load_report(infile):
+        '''Loads report file into a dictionary. Key = reference name.
+        Value = dict of contig name -> list of report line dicts for that contig'''
+        report_dict = {}
+        f = pyfastaq.utils.open_file_read(infile)
+        first_line = True
+
+        for line in f:
+            line = line.rstrip()
+
+            if first_line:
+                expected_first_line = '#' + '\t'.join(report.columns)
+                if line != expected_first_line:
+                    pyfastaq.utils.close(f)
+                    raise Error('Error reading report file. Expected first line of file is\n' + expected_first_line + '\nbut got:\n' + line)
+                first_line = False
+            else:
+                line_dict = ReportFilter._report_line_to_dict(line)
+                if line_dict is None:
+                    pyfastaq.utils.close(f)
+                    raise Error('Error reading report file at this line:\n' + line)
+                ref_name = line_dict['ref_name']
+                ctg_name = line_dict['ctg']
+                if ref_name not in report_dict:
+                    report_dict[ref_name] = {}
+                if ctg_name not in report_dict[ref_name]:
+                    report_dict[ref_name][ctg_name] = []
+
+                report_dict[ref_name][ctg_name].append(line_dict)
+
+        pyfastaq.utils.close(f)
+        return report_dict
+
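
The resulting structure, sketched with made-up names: two dict levels and then a list, so every line for one contig of one reference stays together:

    report_dict = {'geneX': {'ctg1': [{'ref_name': 'geneX', 'ctg': 'ctg1'}]}}  # each inner dict carries all the report columns
    print(report_dict['geneX']['ctg1'])
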
+
+    @staticmethod
+    def _flag_passes_filter(flag, exclude_flags):
+        for f in exclude_flags:
+            if flag.has(f):
+                return False
+        return True
+
+
+    def _report_dict_passes_non_essential_filters(self, report_dict):
+        # known_var == '.' iff this line is not reporting a variant, in which case it passes all the non-essential filters
+        if report_dict.get('known_var', '.') == '.':
+            return True
+
+        if self.remove_synonymous_snps and report_dict.get('ref_ctg_effect', None) == 'SYN':
+            return False
+
+        if self.ignore_not_has_known_variant and report_dict['known_var'] == '1' and report_dict['has_known_var'] == '0':
+            return False
+
+        return True
+
+
+    def _report_dict_passes_essential_filters(self, report_dict):
+        return ReportFilter._flag_passes_filter(report_dict['flag'], self.exclude_flags) \
+                   and report_dict['pc_ident'] >= self.min_pc_ident \
+                   and report_dict['ref_base_assembled'] >= self.min_ref_base_assembled
+
+
+    def _filter_list_of_dicts(self, dicts_list):
+        if len(dicts_list) == 0:
+            return []
+
+        pass_dicts = []
+        essential_dicts = []
+        fail_dicts = []
+
+        for d in dicts_list:
+            if self._report_dict_passes_essential_filters(d):
+                if self._report_dict_passes_non_essential_filters(d):
+                    pass_dicts.append(d)
+                else:
+                    essential_dicts.append(d)
+            else:
+                fail_dicts.append(d)
+
+        if len(pass_dicts) == 0:
+            assert len(fail_dicts) + len(essential_dicts) > 0
+            if len(essential_dicts) > 0:
+                new_d = essential_dicts[0]
+                for key in report.var_columns:
+                    new_d[key] = '.'
+                pass_dicts.append(new_d)
+
+        return ReportFilter._remove_all_after_first_frameshift(pass_dicts)
+
+
+    @staticmethod
+    def _remove_all_after_first_frameshift(dicts_list):
+        fshift_starts = [int(d['ref_start']) for d in dicts_list if d.get('ref_ctg_effect', None) == 'FSHIFT']
+        if len(fshift_starts) == 0:
+            return dicts_list
+
+        first_start = min(fshift_starts)
+
+        return [d for d in dicts_list if d['ref_start'] == '.' or d['ref_start'] <= first_start]
+
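
A worked example with hypothetical rows: a frameshift at ref_start=100 keeps everything up to and including position 100 and drops the later rows.

    from ariba import report_filter

    rows = [
        {'ref_start': 90, 'ref_ctg_effect': 'SYN'},
        {'ref_start': 100, 'ref_ctg_effect': 'FSHIFT'},
        {'ref_start': 150, 'ref_ctg_effect': 'NONSYN'},
    ]
    print(report_filter.ReportFilter._remove_all_after_first_frameshift(rows))
    # expected: the ref_start=90 and ref_start=100 rows only
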
+
+    def _filter_dicts(self):
+        '''Filters out all the report_dicts that do not pass the cutoffs. If any ref sequence
+           loses all of its report_dicts, then it is completely removed.'''
+        keys_to_remove = set()
+
+        for ref_name in self.report:
+            for ctg_name in self.report[ref_name]:
+                self.report[ref_name][ctg_name] = self._filter_list_of_dicts(self.report[ref_name][ctg_name])
+                if len(self.report[ref_name][ctg_name]) == 0:
+                    keys_to_remove.add((ref_name, ctg_name))
+
+        refs_to_remove = set()
+
+        for ref_name, ctg_name in keys_to_remove:
+            del self.report[ref_name][ctg_name]
+            if len(self.report[ref_name]) == 0:
+                refs_to_remove.add(ref_name)
+
+        for ref_name in refs_to_remove:
+            del self.report[ref_name]
+
+
+    def _write_report_tsv(self, outfile):
+        f = pyfastaq.utils.open_file_write(outfile)
+        print('#' + '\t'.join(report.columns), file=f)
+
+        for ref_name in sorted(self.report):
+            for ctg_name, report_dicts in sorted(self.report[ref_name].items()):
+                for d in report_dicts:
+                    print(ReportFilter._dict_to_report_line(d), file=f)
+
+        pyfastaq.utils.close(f)
+
+
+    def _write_report_xls(self, outfile):
+        workbook = openpyxl.Workbook()
+        worksheet = workbook.worksheets[0]
+        worksheet.title = 'ARIBA_report'
+        worksheet.append(report.columns)
+
+        for ref_name in sorted(self.report):
+            for ctg_name, report_dicts in sorted(self.report[ref_name].items()):
+                for d in report_dicts:
+                    worksheet.append([str(d[x]) for x in report.columns])
+
+        workbook.save(outfile)
+
+
+    def run(self, outprefix):
+        self._filter_dicts()
+        self._write_report_tsv(outprefix + '.tsv')
+
diff --git a/ariba/samtools_variants.py b/ariba/samtools_variants.py
new file mode 100644
index 0000000..1385121
--- /dev/null
+++ b/ariba/samtools_variants.py
@@ -0,0 +1,194 @@
+import os
+import sys
+import pysam
+import pyfastaq
+from ariba import common
+
+class Error (Exception): pass
+
+
+class SamtoolsVariants:
+    def __init__(self,
+      ref_fa,
+      bam,
+      outprefix,
+      log_fh=sys.stdout,
+      samtools_exe='samtools',
+      bcftools_exe='bcftools',
+      bcf_min_dp=10,
+      bcf_min_dv=5,
+      bcf_min_dv_over_dp=0.3,
+      bcf_min_qual=20,
+    ):
+        self.ref_fa = os.path.abspath(ref_fa)
+        self.bam = os.path.abspath(bam)
+        self.outprefix = os.path.abspath(outprefix)
+        self.log_fh = log_fh
+        self.samtools_exe = samtools_exe
+        self.bcftools_exe = bcftools_exe
+        self.bcf_min_dp = bcf_min_dp
+        self.bcf_min_dv = bcf_min_dv
+        self.bcf_min_dv_over_dp = bcf_min_dv_over_dp
+        self.bcf_min_qual = bcf_min_qual
+
+        self.vcf_file = self.outprefix + '.vcf'
+        self.read_depths_file = self.outprefix + '.read_depths.gz'
+
+
+    def _make_vcf_and_read_depths_files(self):
+        tmp_vcf = self.vcf_file + '.tmp'
+        cmd = ' '.join([
+            self.samtools_exe, 'mpileup',
+            '-t INFO/DPR,DV',
+            '-A',
+            '-f', self.ref_fa,
+            '-u',
+            '-v',
+            self.bam,
+            '>',
+            tmp_vcf
+        ])
+
+        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
+
+        cmd = ' '.join([
+            self.bcftools_exe, 'call -m',
+            tmp_vcf,
+            '|',
+            self.bcftools_exe, 'query',
+            r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''',
+            '>',
+            self.read_depths_file + '.tmp'
+        ])
+
+        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
+        pysam.tabix_compress(self.read_depths_file + '.tmp', self.read_depths_file)
+        pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
+        os.unlink(self.read_depths_file + '.tmp')
+
+        cmd = ' '.join([
+            self.bcftools_exe, 'call -m -v',
+            tmp_vcf,
+            '|',
+            self.bcftools_exe, 'filter',
+            '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
+                  ' & MIN(DV)>=' + str(self.bcf_min_dv),
+                  ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp),
+                  ' & QUAL >=', str(self.bcf_min_qual), '"',
+            '-o', self.vcf_file
+        ])
+
+        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
+        os.unlink(tmp_vcf)
+
+
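+    # A sketch of the read-depths file format, inferred from the bcftools query
+    # format string above: six tab-separated fields per line, e.g. (hypothetical)
+    #     contig1    42    C    T    67    39,28
+    # i.e. name, 1-based position, ref base, alt base, DP, DPR.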
+    @classmethod
+    def _get_read_depths(cls, read_depths_file, sequence_name, position):
+        '''Returns total read depth and depth of reads supporting alternative (if present)'''
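+        # e.g. (hypothetical) a fetched line 'contig1\t42\tC\tT\t67\t39,28'
+        # yields ('C', 'T', 67, '39,28').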
+        assert os.path.exists(read_depths_file)
+        assert os.path.exists(read_depths_file + '.tbi')
+        tbx = pysam.TabixFile(read_depths_file)
+        try:
+            rows = [x for x in tbx.fetch(sequence_name, position, position + 1)]
+        except:
+            return None
+
+        if len(rows) > 1: # which happens with indels: multiple lines for the same reference base
+            test_rows = [x for x in rows if x.rstrip().split()[3] != '.']
+            if len(test_rows) != 1:
+                rows = [rows[-1]]
+            else:
+                rows = test_rows
+
+        if len(rows) == 1:
+            r, p, ref_base, alt_base, ref_counts, alt_counts = rows[0].rstrip().split()
+            return ref_base, alt_base, int(ref_counts), alt_counts
+        else:
+            return None
+
+
+    @classmethod
+    def _get_variant_positions_from_vcf(cls, vcf_file):
+        if not os.path.exists(vcf_file):
+            return []
+        f = pyfastaq.utils.open_file_read(vcf_file)
+        positions = [l.rstrip().split('\t')[0:2] for l in f if not l.startswith('#')]
+        positions = [(t[0], int(t[1]) - 1) for t in positions]
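+        # (VCF positions are 1-based; the subtraction converts them to 0-based.)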
+        pyfastaq.utils.close(f)
+        return positions
+
+
+    @staticmethod
+    def _get_variants(vcf_file, read_depths_file, positions=None):
+        if positions is None:
+            positions = SamtoolsVariants._get_variant_positions_from_vcf(vcf_file)
+        variants = {}
+        if len(positions) == 0:
+            return variants
+        if not (os.path.exists(vcf_file) and os.path.exists(read_depths_file)):
+            return variants
+        for t in positions:
+            name, pos = t[0], t[1]
+            depths = SamtoolsVariants._get_read_depths(read_depths_file, name, pos)
+            if depths is None:
+                continue
+            if name not in variants:
+                variants[name] = {}
+            variants[name][t[1]] = depths
+        return variants
+
+
+    @staticmethod
+    def total_depth_per_contig(read_depths_file):
+        f = pyfastaq.utils.open_file_read(read_depths_file)
+        depths = {}
+        for line in f:
+            try:
+                name, pos, base, var, depth, depth2 = line.rstrip().split('\t')
+                depth = int(depth)
+            except:
+                pyfastaq.utils.close(f)
+                raise Error('Error getting read depth from the following line of file ' + read_depths_file + ':\n' + line)
+
+            depths[name] = depths.get(name, 0) + depth
+
+        pyfastaq.utils.close(f)
+        return depths
+
+
+    @staticmethod
+    def variants_in_coords(nucmer_matches, vcf_file):
+        '''nucmer_matches = made by assembly_compare.assembly_match_coords().
+           Returns number of variants that lie in nucmer_matches'''
+        vcf_variant_counts = {}
+        f = pyfastaq.utils.open_file_read(vcf_file)
+        for line in f:
+            if line.startswith('#'):
+                continue
+
+            data = line.rstrip().split('\t')
+            scaff = data[0]
+
+            if scaff in nucmer_matches:
+                position = int(data[1]) - 1
+                i = pyfastaq.intervals.Interval(position, position)
+                intersects = len([x for x in nucmer_matches[scaff] if x.intersects(i)]) > 0
+                if intersects:
+                    vcf_variant_counts[scaff] = vcf_variant_counts.get(scaff, 0) + 1
+
+        pyfastaq.utils.close(f)
+        return sum(list(vcf_variant_counts.values()))
+
+
+    def get_depths_at_position(self, seq_name, position):
+        d = self._get_variants(self.vcf_file, self.read_depths_file, [(seq_name, position)])
+        if seq_name in d and position in d[seq_name]:
+            return d[seq_name][position]
+        else:
+            return 'ND', 'ND', 'ND', 'ND'
+
+
+    def run(self):
+        self._make_vcf_and_read_depths_files()
+        # This is to make this object picklable, to keep multithreading happy
+        self.log_fh = None
diff --git a/ariba/sequence_metadata.py b/ariba/sequence_metadata.py
new file mode 100644
index 0000000..5b888a2
--- /dev/null
+++ b/ariba/sequence_metadata.py
@@ -0,0 +1,48 @@
+from ariba import sequence_variant
+
+class Error (Exception): pass
+
+
+class SequenceMetadata:
+    def __init__(self, line):
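+        # Expects one tab-separated line with five fields: name, variant type
+        # ('n', 'p' or '.'), variant string, identifier, free text. Hypothetical
+        # example: 'gene1\tp\tI42L\tid1\tconfers resistance to drug X'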
+        try:
+            self.name, variant_type, variant_string, identifier, self.free_text = line.rstrip().split('\t')
+        except:
+            raise Error('Error parsing line of file:\n' + line)
+
+        self.variant_type = variant_type
+
+        if self.variant_type == '.':
+            self.variant = None
+        else:
+            self.variant = sequence_variant.Variant(self.variant_type, variant_string, identifier)
+
+
+    def __eq__(self, other):
+        return type(other) is type(self) and self.name == other.name and self.variant == other.variant and self.variant_type == other.variant_type and self.free_text == other.free_text
+
+
+    def __lt__(self, other):
+        return self.name < other.name or (self.name == other.name and self.variant < other.variant)
+
+
+    def __hash__(self):
+        return hash((self.name, self.variant_type, str(self.variant), self.free_text))
+
+
+    def __str__(self):
+        return self.to_string()
+
+
+    def to_string(self, separator='\t'):
+        return separator.join([
+            self.name,
+            self.variant_type,
+            '.' if self.variant is None else str(self.variant),
+            '.' if (self.variant is None or self.variant.identifier is None) else self.variant.identifier,
+            self.free_text
+        ])
+
+
+    def has_variant(self, seq):
+        return self.variant is not None and self.variant.has_variant(seq)
diff --git a/ariba/sequence_variant.py b/ariba/sequence_variant.py
new file mode 100644
index 0000000..7e32515
--- /dev/null
+++ b/ariba/sequence_variant.py
@@ -0,0 +1,70 @@
+import pyfastaq
+import re
+
+class Error (Exception): pass
+
+
+allowed_variant_types = {'n', 'p'}
+
+class Variant:
+    def __init__(self, variant_type, variant_string, identifier):
+        if variant_type not in allowed_variant_types:
+            raise Error('Error! Variant type "' + variant_type + '" not recognised.\n' + \
+                        'Must be one of: ' + ', '.join(allowed_variant_types))
+
+        self.variant_type = variant_type
+        self.identifier = None if identifier == '.' else identifier
+
+
+        m = re.match('^([A-Z])([0-9]+)([A-Z])$', variant_string.upper())
+        if m is None:
+            raise Error('Unexpected format of variant string: ' + variant_string)
+
+        try:
+            self.wild_value, self.position, self.variant_value = m.group(1, 2, 3)
+        except:
+            raise Error('Error getting amino acids and position of variant from: ' + variant_string)
+
+        self.position = int(self.position) - 1
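+        # e.g. (hypothetical) Variant('p', 'I42L', '.') stores wild_value='I',
+        # variant_value='L' and position=41 (0-based).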
+
+
+    def __eq__(self, other):
+        return type(other) is type(self) and self.__dict__ == other.__dict__
+
+
+    def __lt__(self, other):
+        return self.position < other.position or \
+            (self.position == other.position and self.variant_type < other.variant_type) or \
+            (self.position == other.position and self.variant_type == other.variant_type and self.wild_value < other.wild_value) or \
+            (self.position == other.position and self.variant_type == other.variant_type and self.wild_value == other.wild_value and self.variant_value < other.variant_value)
+
+
+    def __str__(self):
+        return ''.join([self.wild_value, str(self.position + 1), self.variant_value])
+
+
+    def sanity_check_against_seq(self, seq, translate_seq=False):
+        if translate_seq:
+            seq = pyfastaq.sequences.Fasta('x', seq).translate().seq
+
+        return len(seq) >= self.position + 1 and seq[self.position].upper() in [self.wild_value, self.variant_value]
+
+
+    def has_variant(self, seq):
+        if self.variant_type == 'p':
+            test_seq = seq.translate()
+        else:
+            test_seq = seq
+
+        assert self.position < len(test_seq)
+        return test_seq[self.position] == self.variant_value
+
+
+    def nucleotide_range(self):
+        '''Returns the nucleotide (start, end) positions inclusive of this variant.
+           start==end if it's an amino acid variant, otherwise start+2==end'''
+        if self.variant_type == 'p':
+            return 3 * self.position, 3 * self.position + 2
+        else:
+            return self.position, self.position
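+        # e.g. for the hypothetical p.I42L above (position 41) this returns
+        # (123, 125); a nucleotide variant at position 41 returns (41, 41).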
+
diff --git a/ariba/summary.py b/ariba/summary.py
index 935846b..40a48a5 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -1,55 +1,27 @@
 import os
+import copy
+import re
+import sys
 import openpyxl
 import pyfastaq
-from ariba import flag
+from ariba import flag, common, report, summary_cluster, summary_sample
 
 class Error (Exception): pass
 
-columns = [
-    'gene',
-    'flag',
-    'reads',
-    'cluster',
-    'gene_len',
-    'assembled',
-    'pc_ident',
-    'var_type',
-    'var_effect',
-    'new_aa',
-    'gene_start',
-    'gene_end',
-    'gene_nt',
-    'scaffold',
-    'scaff_len',
-    'scaff_start',
-    'scaff_end',
-    'scaff_nt',
-    'read_depth',
-    'alt_bases',
-    'ref_alt_depth'
-]
-
-int_columns = [
-    'reads',
-    'gene_len',
-    'assembled',
-    'gene_start',
-    'gene_end',
-    'scaff_len',
-    'scaff_start',
-    'scaff_end',
-    'read_depth',
-]
-
+required_keys_for_difference = {'no', 'yes', 'yes_nonunique', 'fragmented'}
 
 class Summary:
     def __init__(
       self,
-      outfile,
+      outprefix,
       filenames=None,
       fofn=None,
-      filter_output=True,
-      min_id=90.0
+      filter_rows=True,
+      filter_columns=True,
+      min_id=90.0,
+      cluster_cols='assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+      variant_cols='groups,grouped,ungrouped,novel',
+      verbose=False,
     ):
         if filenames is None and fofn is None:
             raise Error('Error! Must supply filenames or fofn to Summary(). Cannot continue')
@@ -62,9 +34,35 @@ class Summary:
         if fofn is not None:
             self.filenames.extend(self._load_fofn(fofn))
 
-        self.filter_output = filter_output
+        self.cluster_columns = self._determine_cluster_cols(cluster_cols)
+        self.var_columns = self._determine_var_cols(variant_cols)
+        self.filter_rows = filter_rows
+        self.filter_columns = filter_columns
         self.min_id = min_id
-        self.outfile = outfile
+        self.outprefix = outprefix
+        self.verbose = verbose
+
+
+    @classmethod
+    def _determine_cols(cls, cols_string, allowed_cols, error_string):
+        if cols_string == '' or cols_string is None:
+            return {x: False for x in allowed_cols}
+        wanted_cols = set(cols_string.split(','))
+        if not wanted_cols.issubset(allowed_cols):
+            raise Error('Error in ' + error_string + '. Allowed values are: ' + ','.join(allowed_cols) + '. Got: ' + cols_string)
+        return {x: x in wanted_cols for x in allowed_cols}
+
+
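+    # For example (hypothetical call), _determine_cols('assembled,pct_id',
+    # allowed_cols, 'cluster columns') maps 'assembled' and 'pct_id' to True
+    # and every other allowed column to False.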
+    @staticmethod
+    def _determine_cluster_cols(cols_string):
+        allowed_cols = {'assembled', 'has_res', 'ref_seq', 'pct_id', 'known_var', 'novel_var'}
+        return Summary._determine_cols(cols_string, allowed_cols, 'cluster columns')
+
+
+    @staticmethod
+    def _determine_var_cols(cols_string):
+        allowed_cols = {'groups', 'grouped', 'ungrouped', 'novel'}
+        return Summary._determine_cols(cols_string, allowed_cols, 'variant columns')
 
 
     def _load_fofn(self, fofn):
@@ -80,137 +78,325 @@ class Summary:
                 raise Error('File not found: "' + fname + '". Cannot continue')
 
 
-    def _line2dict(self, line):
-        data = line.rstrip().split('\t')
-        d = {columns[i]: data[i] for i in range(len(data))}
-        d['flag'] = flag.Flag(int(d['flag']) )
-        for key in int_columns:
-            try:
-                d[key] = int(d[key])
-            except:
-                assert d[key] == '.'
-        try:
-            d['pc_ident'] = float(d['pc_ident'])
-        except:
-            assert d['pc_ident'] == '.'
-        return d
+    @classmethod
+    def _load_input_files(cls, filenames, min_id, verbose=False):
+        samples = {}
+        for filename in filenames:
+            samples[filename] = summary_sample.SummarySample(filename, min_pc_id=min_id)
+            samples[filename].run()
+            if verbose:
+                print('Loaded file', filename, flush=True)
+        return samples
+
+
+    @classmethod
+    def _get_all_cluster_names(cls, samples_dict):
+        '''Input should be output of _load_input_files'''
+        cluster_names = set()
+        for filename, sample in samples_dict.items():
+            cluster_names.update(set(sample.clusters.keys()))
+        return cluster_names
+
+
+    @classmethod
+    def _get_all_variant_columns(cls, samples_dict):
+        '''Input should be output of _load_input_files'''
+        columns = {}
+        for filename, sample in samples_dict.items():
+            for cluster in sample.column_summary_data:
+                if sample.column_summary_data[cluster]['assembled'] == 'yes':
+                    for key, tuple_set in sample.variant_column_names_tuples.items():
+                        for t in tuple_set:
+                            if key not in columns:
+                                columns[key] = set()
+                            columns[key].add(t)
+        return columns
+
+
+    @classmethod
+    def _get_all_var_groups(cls, samples_dict):
+        groups = {}
+        for filename, sample in samples_dict.items():
+            for name, name_set in sample.var_groups.items():
+                if name not in groups:
+                    groups[name] = set()
+                groups[name].update(name_set)
+        return groups
 
 
-    def _load_file(self, filename):
-        f = pyfastaq.utils.open_file_read(filename)
-        d = {}
-
-        for line in f:
-            if line.startswith('#'):
-                if line.rstrip()[1:].split('\t') != columns:
-                    raise Error('Error parsing the following line.\n' + line)
-                continue
-            data = self._line2dict(line)
-
-            if data['gene'] not in d:
-                d[data['gene']] = []
-
-            d[data['gene']].append(data)
-
-        pyfastaq.utils.close(f)
-        return d
+    def _gather_output_rows(self):
+        all_cluster_names = Summary._get_all_cluster_names(self.samples)
+        all_var_columns = Summary._get_all_variant_columns(self.samples)
+        if self.var_columns['groups']:
+            var_groups = Summary._get_all_var_groups(self.samples)
+        else:
+            var_groups = {}
+        rows = {}
 
+        for filename, sample in self.samples.items():
+            rows[filename] = {}
 
-    def _to_summary_number(self, l):
-        f = l[0]['flag']
-        if f.has('assembly_fail') or not f.has('gene_assembled') or self._pc_id_of_longest(l) <= self.min_id:
-            return 0
+            for cluster in all_cluster_names:
+                rows[filename][cluster] = {}
 
-        if f.has('hit_both_strands') or (not f.has('complete_orf')):
-            return 1
+                if cluster in sample.column_summary_data and sample.column_summary_data[cluster]['assembled'].startswith('yes'):
+                    rows[filename][cluster] = sample.column_summary_data[cluster]
+                else:
+                    rows[filename][cluster] = {
+                        'assembled': 'no',
+                        'has_res': 'no',
+                        'ref_seq': 'NA',
+                        'known_var': 'NA',
+                        'novel_var': 'NA',
+                        'pct_id': 'NA'
+                    }
+
+                if self.var_columns['groups']:
+                    for group_name in var_groups[cluster]:
+                        if cluster in sample.var_groups and group_name in sample.var_groups[cluster]:
+                            rows[filename][cluster]['vgroup.' + group_name] = 'yes'
+                        else:
+                            rows[filename][cluster]['vgroup.' + group_name] = 'no'
+
+                if cluster in all_var_columns:
+                    for (ref_name, variant, grouped_or_novel, group_name) in all_var_columns[cluster]:
+                        if not self.var_columns[grouped_or_novel]:
+                            continue
+
+                        key = ref_name + '.' + variant
+                        if rows[filename][cluster]['assembled'] == 'no':
+                            rows[filename][cluster][key] = 'NA'
+                        elif cluster in sample.variant_column_names_tuples and (ref_name, variant, grouped_or_novel, group_name) in sample.variant_column_names_tuples[cluster]:
+                            rows[filename][cluster][key] = 'yes'
+                        else:
+                            rows[filename][cluster][key] = 'no'
+
+                for key, wanted in self.cluster_columns.items():
+                    if not wanted:
+                        del rows[filename][cluster][key]
+
+        return rows
+
+
+    @classmethod
+    def _to_matrix(cls, filenames, rows, cluster_cols):
+        '''rows = output from _gather_output_rows().
+           filenames = self.filenames
+           cluster_cols = self.cluster_columns'''
+        matrix = []
+        making_header_lines = True
+        phandango_header = ['name']
+        phandango_suffixes = {'assembled': ':o1', 'has_res': ':o1', 'ref_seq': ':o2', 'pct_id': ':c1', 'known_var': ':o1', 'novel_var': ':o1'}
+        csv_header = ['name']
+        all_cluster_cols_in_order = ['assembled', 'has_res', 'ref_seq', 'pct_id', 'known_var', 'novel_var']
+        all_cluster_cols_in_order_set = set(['assembled', 'has_res', 'ref_seq', 'pct_id', 'known_var', 'novel_var'])
+        cluster_cols_in_order = [x for x in all_cluster_cols_in_order if cluster_cols[x]]
+        cluster_cols_set = set(cluster_cols_in_order)
+
+        for filename in filenames:
+            assert filename in rows
+            line = [filename]
+
+            for cluster_name in sorted(rows[filename]):
+                for col in cluster_cols_in_order:
+                    if making_header_lines:
+                        csv_header.append(cluster_name + '.' + col)
+                        phandango_header.append(cluster_name + '.' + col + phandango_suffixes[col])
+
+                    line.append(rows[filename][cluster_name][col])
+
+                for col in sorted(rows[filename][cluster_name]):
+                    if col in all_cluster_cols_in_order_set:
+                        continue
+
+                    if making_header_lines:
+                        csv_header.append(cluster_name + '.' + col)
+                        phandango_header.append(cluster_name + '.' + col + ':o1')
+
+                    line.append(rows[filename][cluster_name][col])
+
+            making_header_lines = False
+            matrix.append(line)
+
+        return phandango_header, csv_header, matrix
+
+
+    @classmethod
+    def _filter_matrix_rows(cls, matrix):
+        '''matrix = output from _to_matrix'''
+        indexes_to_keep = []
+
+        for i in range(len(matrix)):
+            keep_row = False
+            for element in matrix[i]:
+                if element not in {'NA', 'no'}:
+                    keep_row = True
+                    break
+            if keep_row:
+                indexes_to_keep.append(i)
+
+        return [matrix[i] for i in indexes_to_keep]
+
+
+    @classmethod
+    def _filter_matrix_columns(cls, matrix, phandango_header, csv_header):
+        '''phandango_header, csv_header, matrix = output from _to_matrix'''
+        indexes_to_keep = set()
+
+        for row in matrix:
+            for i in range(len(row)):
+                if row[i] not in {'NA', 'no'}:
+                    indexes_to_keep.add(i)
+
+        indexes_to_keep = sorted(list(indexes_to_keep))
+
+        for i in range(len(matrix)):
+            matrix[i] = [matrix[i][j] for j in indexes_to_keep]
+
+        phandango_header = [phandango_header[i] for i in indexes_to_keep]
+        csv_header = [csv_header[i] for i in indexes_to_keep]
+        return phandango_header, csv_header, matrix
+
+
+    @classmethod
+    def _add_phandango_colour_columns(cls, header, matrix):
+        header = copy.deepcopy(header)
+        matrix = copy.deepcopy(matrix)
+        cols_to_add_colour_col = [i for i in range(len(header)) if header[i].endswith(':o1')]
+        field_to_col = {
+            'yes': '#1f78b4',
+            'yes_nonunique': '#a6cee3',
+            'no': '#33a02c',
+            'NA': '#b2df8a',
+        }
 
-        if f.has('unique_contig') and f.has('gene_assembled_into_one_contig') and f.has('complete_orf'):
-            if f.has('has_nonsynonymous_variants'):
-                return 3
-            else:
-                return 4
-        else:
-            return 2
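+        # Work right-to-left so inserting a colour column does not shift the
+        # indexes of the columns still to be processed.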
+        cols_to_add_colour_col.reverse()
 
+        for col_index in cols_to_add_colour_col:
+            header[col_index] = header[col_index][:-3]
+            header.insert(col_index + 1, header[col_index] + ':colour')
+
+            for row_index in range(len(matrix)):
+                colour = field_to_col[matrix[row_index][col_index]]
+                matrix[row_index].insert(col_index + 1, colour)
+
+        return header, matrix
 
-    def _pc_id_of_longest(self, l):
-        longest = 0
-        identity = None
-        for data in l:
-            if data['assembled'] > longest:
-                longest = data['assembled']
-                identity = data['pc_ident']
 
-        assert identity is not None
-        return identity
+    @classmethod
+    def _matrix_to_csv(cls, matrix, header, outfile):
+        f = pyfastaq.utils.open_file_write(outfile)
+        print(*header, sep=',', file=f)
+        for line in matrix:
+            print(*line, sep=',', file=f)
+        pyfastaq.utils.close(f)
 
 
+    @staticmethod
+    def _distance_score_between_values(value1, value2):
+        value_set = {value1, value2}
+        if value_set.isdisjoint(required_keys_for_difference) or value1 == value2 or value_set == {'NA', 'no'}:
+            return 0
+        else:
+            return 1
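+        # e.g. ('yes', 'no') scores 1; ('NA', 'no') scores 0, as does a pair
+        # like ('seqA', 'seqB') where neither value is an assembly status.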
 
-    def _gather_output_rows(self):
-        self.data = {filename: self._load_file(filename) for filename in self.filenames}
-
-        all_genes = set()
-        for l in self.data.values():
-            all_genes.update(set(l.keys()))
-        all_genes = list(all_genes)
-        all_genes.sort()
-
-        self.rows_out = []
-        self.rows_out.append(['filename'] + all_genes)
-
-        for filename in self.filenames:
-            new_row = [filename]
-            for gene in all_genes:
-                if gene not in self.data[filename]:
-                    new_row.append(0)
-                else:
-                    new_row.append(self._to_summary_number(self.data[filename][gene]))
 
-            self.rows_out.append(new_row)
+    @classmethod
+    def _distance_score_between_lists(cls, scores1, scores2):
+        assert len(scores1) == len(scores2)
+        return sum([cls._distance_score_between_values(scores1[i], scores2[i]) for i in range(1, len(scores1))])
 
 
-    def _filter_output_rows(self):
-        if not self.filter_output:
-            return
+    @classmethod
+    def _write_distance_matrix(cls, lines, outfile):
+        if len(lines) < 2:
+            raise Error('Cannot calculate distance matrix to make tree for phandango.\n' +
+                        'Only one sample present.')
 
-        # remove rows that are all zeros
-        self.rows_out = [x for x in self.rows_out if x[1:] != [0]*(len(x)-1)]
+        if len(lines[0]) < 2:
+            raise Error('Cannot calculate distance matrix to make tree for phandango. Not enough columns')
 
-        # remove columns that are all zeros
-        to_remove = []
-        for i in range(1, len(self.rows_out[0])):
-            if sum([x[i] for x in self.rows_out[1:]]) == 0:
-                to_remove.append(i)
+        scores = [[0 for i in range(len(lines))] for j in range(len(lines))]
 
-        for i in range(len(self.rows_out)):
-            self.rows_out[i] = [self.rows_out[i][j] for j in range(len(self.rows_out[i])) if j not in to_remove]
+        for i in range(len(lines)):
+            for j in range(i + 1, len(lines), 1):
+                scores[i][j] = Summary._distance_score_between_lists(lines[i], lines[j])
+                scores[j][i] = scores[i][j]
 
+        with open(outfile, 'w') as f:
+            sample_names = [x[0] for x in lines]
+            print(*sample_names, sep='\t', file=f)
+            for i in range(len(scores)):
+                print(lines[i][0], *scores[i][1:], sep='\t', file=f)
 
 
-    def _write_tsv(self):
-        f = pyfastaq.utils.open_file_write(self.outfile)
-        print('#', end='', file=f)
-        for row in self.rows_out:
-            print('\t'.join([str(x) for x in row]), file=f)
-        pyfastaq.utils.close(f)
+    @classmethod
+    def _newick_from_dist_matrix(cls, distance_file, outfile):
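+        # Writes a throwaway R script to cluster the distance matrix and save a
+        # Newick tree; assumes Rscript and the R package 'ape' are installed.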
+        r_script = outfile + '.tmp.R'
 
+        with open(r_script, 'w') as f:
+            print('library(ape)', file=f)
+            print('a=read.table("', distance_file, '", header=TRUE, row.names=1, comment.char="")', sep='', file=f)
+            print('h=hclust(dist(a))', file=f)
+            print('write.tree(as.phylo(h), file="', outfile, '")', sep='', file=f)
 
-    def _write_xls(self):
-        workbook = openpyxl.Workbook()
-        worksheet = workbook.worksheets[0]
-        worksheet.title = 'ARIBA_summary'
-        for row in self.rows_out:
-            worksheet.append(row)
-        workbook.save(self.outfile)
+        common.syscall('Rscript --no-save ' + r_script)
+        if os.path.exists(r_script + 'out'):
+            os.unlink(r_script + 'out')
+        os.unlink(r_script)
 
 
     def run(self):
+        if self.verbose:
+            print('Loading input files...', flush=True)
         self._check_files_exist()
-        self._gather_output_rows()
-        self._filter_output_rows()
-        if self.outfile.endswith('.xls'):
-            self._write_xls()
+        self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose)
+        if self.verbose:
+            print('Generating output rows', flush=True)
+        self.rows = self._gather_output_rows()
+        phandango_header, csv_header, matrix = Summary._to_matrix(self.filenames, self.rows, self.cluster_columns)
+
+        if self.filter_rows:
+            if self.verbose:
+                print('Filtering rows', flush=True)
+            matrix = Summary._filter_matrix_rows(matrix)
+
+        if len(matrix) == 0:
+            print('No rows left after filtering rows. Cannot continue', file=sys.stderr)
+            sys.exit(1)
+
+        if self.filter_columns:
+            if self.verbose:
+                print('Filtering columns', flush=True)
+            phandango_header, csv_header, matrix = Summary._filter_matrix_columns(matrix, phandango_header, csv_header)
+
+        if len(matrix) == 0 or len(matrix[0]) == 0:
+            print('No columns left after filtering columns. Cannot continue', file=sys.stderr)
+            sys.exit(1)
+
+        csv_file = self.outprefix + '.csv'
+        if self.verbose:
+            print('Writing csv file', csv_file, flush=True)
+        Summary._matrix_to_csv(matrix, csv_header, csv_file)
+
+        if len(matrix) > 1:
+            csv_file = self.outprefix + '.phandango.csv'
+            if self.verbose:
+                print('Making Phandango csv file', csv_file, flush=True)
+            phandango_header, phandango_matrix = Summary._add_phandango_colour_columns(phandango_header, matrix)
+            Summary._matrix_to_csv(phandango_matrix, phandango_header, csv_file)
+            dist_matrix_file = self.outprefix + '.phandango.distance_matrix'
+            tree_file = self.outprefix + '.phandango.tre'
+
+            if self.verbose:
+                print('Making Phandango distance matrix', dist_matrix_file, flush=True)
+            Summary._write_distance_matrix(matrix, dist_matrix_file)
+
+            if self.verbose:
+                print('Making Phandango tree file', tree_file, flush=True)
+            Summary._newick_from_dist_matrix(dist_matrix_file, tree_file)
+            os.unlink(dist_matrix_file)
         else:
-            self._write_tsv()
-
+            print('Made csv file. Not making Phandango files because only one sample remains after filtering', file=sys.stderr)
 
+        if self.verbose:
+            print('Finished', flush=True)
diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py
new file mode 100644
index 0000000..f1bc7bb
--- /dev/null
+++ b/ariba/summary_cluster.py
@@ -0,0 +1,258 @@
+from ariba import flag, report
+
+class Error (Exception): pass
+
+int_columns = [
+    'reads',
+    'ref_len',
+    'ref_base_assembled',
+    'ctg_len',
+    'ref_start',
+    'ref_end',
+    'ctg_start',
+    'ctg_end',
+]
+
+
+float_columns = ['pc_ident']
+
+class SummaryCluster:
+    def __init__(self, min_pc_id=90):
+        self.min_pc_id = min_pc_id
+        self.name = None
+        self.ref_name = None
+        self.flag = None
+        self.data = []
+
+
+    def __eq__(self, other):
+        return type(other) is type(self) and self.__dict__ == other.__dict__
+
+
+    @classmethod
+    def line2dict(cls, line):
+        data = line.rstrip().split('\t')
+        if len(data) != len(report.columns):
+            raise Error('Wrong number of columns in the following line. Expected ' + str(len(report.columns)) + ' but got ' + str(len(data)) + '\n' + line)
+        d = {report.columns[i]: data[i] for i in range(len(data))}
+        try:
+            d['flag'] = flag.Flag(int(d['flag']) )
+        except:
+            raise Error('Error getting flag in the following line. Got "' + d['flag'] + '" for the flag.\n' + line)
+
+        for key in int_columns:
+            try:
+                d[key] = int(d[key])
+            except:
+                assert d[key] == '.'
+
+        for key in float_columns:
+            try:
+                d[key] = float(d[key])
+            except:
+                assert d[key] == '.'
+
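+        # var_description is colon-delimited; its fourth field is the variant group ID.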
+        if d['var_description'] == '.':
+            d['var_group'] = '.'
+        else:
+            try:
+                d['var_group'] = d['var_description'].split(':')[3]
+            except:
+                raise Error('Error getting variant group from the following line:\n' + line)
+
+        return d
+
+
+    def add_data_dict(self, data_dict):
+        if data_dict['pc_ident'] == '.' or data_dict['pc_ident'] < self.min_pc_id:
+            return
+
+        if self.name is None:
+            assert self.ref_name is None and self.flag is None
+            self.name = data_dict['cluster']
+            self.ref_name = data_dict['ref_name']
+            self.flag = data_dict['flag']
+
+        if self.name != data_dict['cluster']:
+            raise Error('Cannot add dict to SummaryCluster. Expected cluster name "' + self.name + '" but got "' + data_dict['cluster'] + '".')
+
+        if self.ref_name != data_dict['ref_name']:
+            raise Error('Cannot add dict to SummaryCluster. Expected ref_name "' + self.ref_name + '" but got "' + data_dict['ref_name'] + '".')
+
+        if self.flag != data_dict['flag']:
+            raise Error('Cannot add dict to SummaryCluster. Expected flag "' + str(self.flag) + '" but got "' + str(data_dict['flag']) + '".')
+        self.data.append(data_dict)
+
+
+    def pc_id_of_longest(self):
+        longest = 0
+        identity = 0
+
+        for d in self.data:
+            if d['ref_base_assembled'] > longest:
+                longest = d['ref_base_assembled']
+                identity = d['pc_ident']
+
+        return identity
+
+
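+    # Summarises assembly status from the flag as one of: 'no', 'yes',
+    # 'yes_nonunique' or 'fragmented'.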
+    def _to_cluster_summary_assembled(self):
+        if len(self.data) == 0:
+            return 'no'
+
+        if self.data[0]['ref_type'] == 'non_coding':
+            has_complete_gene = True
+        else:
+            has_complete_gene = self.flag.has('complete_gene')
+
+        if self.flag.has('assembly_fail') or \
+          (not self.flag.has('assembled')) or \
+          self.flag.has('ref_seq_choose_fail'):
+            return 'no'
+        elif self.flag.has('assembled_into_one_contig') and has_complete_gene:
+            if self.flag.has('unique_contig') and \
+              (not self.flag.has('scaffold_graph_bad')) and \
+              (not self.flag.has('variants_suggest_collapsed_repeat')) and \
+              (not self.flag.has('hit_both_strands')) and \
+              (not self.flag.has('region_assembled_twice')):
+                return 'yes'
+            else:
+                return 'yes_nonunique'
+        else:
+            return 'fragmented'
+
+
+    @classmethod
+    def _has_known_variant(cls, data_dict):
+        return data_dict['has_known_var'] == '1'
+
+
+    def _has_any_known_variant(self):
+        for d in self.data:
+            if self._has_known_variant(d):
+                return 'yes'
+        return 'no'
+
+
+    @classmethod
+    def _has_nonsynonymous(cls, data_dict):
+        return data_dict['ref_ctg_effect'] != 'SYN' and \
+          (
+              data_dict['has_known_var'] == '1' or \
+              (data_dict['known_var'] != '1' and (data_dict['ref_ctg_change'] != '.' or data_dict['ref_ctg_effect'] != '.'))
+          )
+
+
+    def _has_any_nonsynonymous(self):
+        for d in self.data:
+            if self._has_nonsynonymous(d):
+                return 'yes'
+        return 'no'
+
+
+    @classmethod
+    def _has_novel_nonsynonymous(cls, data_dict):
+        return SummaryCluster._has_nonsynonymous(data_dict) and not SummaryCluster._has_known_variant(data_dict)
+
+
+    def _has_any_novel_nonsynonymous(self):
+        for d in self.data:
+            if self._has_novel_nonsynonymous(d):
+                return 'yes'
+        return 'no'
+
+
+    def _to_cluster_summary_has_known_nonsynonymous(self, assembled_summary):
+        '''assembled_summary should be output of _to_cluster_summary_assembled'''
+        if assembled_summary == 'no':
+            return 'NA'
+        else:
+            return self._has_any_known_variant()
+
+
+    def _to_cluster_summary_has_novel_nonsynonymous(self, assembled_summary):
+        '''assembled_summary should be output of _to_cluster_summary_assembled'''
+        if assembled_summary == 'no':
+            return 'NA'
+        else:
+            return self._has_any_novel_nonsynonymous()
+
+
+    def _to_cluster_summary_has_nonsynonymous(self, assembled_summary):
+        '''assembled_summary should be output of _to_cluster_summary_assembled'''
+        if assembled_summary == 'no':
+            return 'NA'
+        else:
+            return self._has_any_nonsynonymous()
+
+
+    @staticmethod
+    def _get_nonsynonymous_var(data_dict):
+        '''if data_dict has a non synonymous variant, return string:
+        ref_name.change. Otherwise return None'''
+        has_nonsyn = SummaryCluster._has_nonsynonymous(data_dict)
+
+        if not has_nonsyn:
+            return None
+        elif data_dict['known_var_change'] == '.' and data_dict['ref_ctg_change'] == '.' and data_dict['ref_ctg_effect'] == '.':
+            raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue')
+        else:
+            if '.' not in [data_dict['known_var_change'], data_dict['ref_ctg_change']] and \
+              data_dict['known_var_change'] != data_dict['ref_ctg_change']:
+                raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. Cannot continue')
+
+            var_group = 'novel', None
+
+            if data_dict['known_var'] == '1' and data_dict['known_var_change'] != '.':
+                var_change = data_dict['known_var_change']
+                if data_dict['var_group'] == '.':
+                    var_group = 'ungrouped', None
+                else:
+                    var_group = 'grouped', data_dict['var_group']
+            elif data_dict['ref_ctg_change'] != '.':
+                var_change = data_dict['ref_ctg_change']
+            else:
+                var_change = data_dict['ref_ctg_effect']
+
+            return (data_dict['ref_name'], var_change) + var_group
+
+    def _has_resistance(self, assembled_summary):
+        '''assembled_summary should be output of _to_cluster_summary_assembled'''
+        if assembled_summary.startswith('yes'):
+            if self.data[0]['ref_type'] in ['non_coding', 'presence_absence'] or self._to_cluster_summary_has_known_nonsynonymous(assembled_summary) == 'yes':
+                return 'yes'
+            else:
+                return 'no'
+        else:
+            return 'no'
+
+
+    def has_var_groups(self):
+        '''Returns a set of the variant group ids that this cluster has'''
+        ids = set()
+        for d in self.data:
+            if self._has_known_variant(d) and d['var_group'] != '.':
+                ids.add(d['var_group'])
+        return ids
+
+
+    def column_summary_data(self):
+        '''Returns a dictionary of column name -> value, for cluster-level results'''
+        assembled_summary = self._to_cluster_summary_assembled()
+
+        columns = {
+            'assembled': self._to_cluster_summary_assembled(),
+            'has_res': self._has_resistance(assembled_summary),
+            'ref_seq': self.ref_name,
+            'pct_id': str(self.pc_id_of_longest()),
+            'known_var': self._to_cluster_summary_has_known_nonsynonymous(assembled_summary),
+            'novel_var': self._to_cluster_summary_has_novel_nonsynonymous(assembled_summary)
+        }
+
+        return columns
+
+
+    def non_synon_variants(self):
+        variants = {self._get_nonsynonymous_var(d) for d in self.data}
+        variants.discard(None)
+        return variants
diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py
new file mode 100644
index 0000000..fcf0533
--- /dev/null
+++ b/ariba/summary_sample.py
@@ -0,0 +1,61 @@
+import pyfastaq
+from ariba import report, summary_cluster
+
+class Error (Exception): pass
+
+class SummarySample:
+    def __init__(self, report_tsv, min_pc_id=90):
+        self.report_tsv = report_tsv
+        self.min_pc_id = min_pc_id
+        self.clusters = {}
+
+
+    def __eq__(self, other):
+        return type(other) is type(self) and self.__dict__ == other.__dict__
+
+
+    @staticmethod
+    def _load_file(filename, min_pc_id):
+        f = pyfastaq.utils.open_file_read(filename)
+        clusters = {}
+
+        for line in f:
+            if line.startswith('#'):
+                if line.rstrip()[1:].split('\t') != report.columns:
+                    pyfastaq.utils.close(f)
+                    raise Error('Error parsing the following line.\n' + line)
+                continue
+
+            data_dict = summary_cluster.SummaryCluster.line2dict(line)
+            cluster = data_dict['cluster']
+            if cluster not in clusters:
+                clusters[cluster] = summary_cluster.SummaryCluster(min_pc_id=min_pc_id)
+            clusters[cluster].add_data_dict(data_dict)
+
+        pyfastaq.utils.close(f)
+        return clusters
+
+
+    def _column_summary_data(self):
+        return {c: self.clusters[c].column_summary_data() for c in self.clusters}
+
+
+    def _var_groups(self):
+        return {c: self.clusters[c].has_var_groups() for c in self.clusters}
+
+
+    def _variant_column_names_tuples(self):
+        variants = {}
+        for cluster_name, cluster in self.clusters.items():
+            cluster_vars = cluster.non_synon_variants()
+            if len(cluster_vars):
+                variants[cluster_name] = cluster_vars
+        return variants
+
+
+    def run(self):
+        self.clusters = self._load_file(self.report_tsv, self.min_pc_id)
+        self.column_summary_data = self._column_summary_data()
+        self.variant_column_names_tuples = self._variant_column_names_tuples()
+        self.var_groups = self._var_groups()
+
diff --git a/ariba/tasks/aln2meta.py b/ariba/tasks/aln2meta.py
new file mode 100644
index 0000000..f0b655f
--- /dev/null
+++ b/ariba/tasks/aln2meta.py
@@ -0,0 +1,28 @@
+import argparse
+from ariba import aln_to_metadata
+
+
+def run():
+    coding_choices = ['coding', 'noncoding']
+    parser = argparse.ArgumentParser(
+        description = 'Converts multi-alignment fasta and SNP info to metadata',
+        usage = 'ariba aln2meta [options] <aln_fasta> <variants_tsv> <(non)coding> <cluster_rep> <outprefix>'
+    )
+
+    parser.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
+    parser.add_argument('aln_fasta', help='Multi-fasta file of alignments')
+    parser.add_argument('variants_tsv', help='TSV file of variants information')
+    parser.add_argument('coding_or_non', help='Sequences are coding or noncoding. Must be one of: ' + ' '.join(coding_choices), choices=coding_choices, metavar='(non)coding')
+    parser.add_argument('cluster_rep', help='Name of sequence to be used as cluster representative. Must exactly match a sequence in aln_fasta file')
+    parser.add_argument('outprefix', help='Prefix of output filenames')
+    options = parser.parse_args()
+
+    aln_to_meta = aln_to_metadata.AlnToMetadata(
+      options.aln_fasta,
+      options.variants_tsv,
+      options.coding_or_non == 'coding',
+      options.cluster_rep,
+      genetic_code=options.genetic_code
+    )
+    aln_to_meta.run(options.outprefix)
+
diff --git a/ariba/tasks/getref.py b/ariba/tasks/getref.py
new file mode 100644
index 0000000..1abcc0b
--- /dev/null
+++ b/ariba/tasks/getref.py
@@ -0,0 +1,19 @@
+import argparse
+from ariba import ref_genes_getter
+
+
+def run():
+    allowed_dbs = ['argannot', 'card', 'resfinder','vfdb']
+    parser = argparse.ArgumentParser(
+        description = 'Downloads reference data',
+        usage = 'ariba getref [options] <' + '|'.join(allowed_dbs) + '> <outprefix>'
+    )
+
+    parser.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
+    parser.add_argument('db', help='Database to download. Must be one of: ' + ' '.join(allowed_dbs), choices=allowed_dbs)
+    parser.add_argument('outprefix', help='Prefix of output filenames')
+    options = parser.parse_args()
+
+    getter = ref_genes_getter.RefGenesGetter(options.db, genetic_code=options.genetic_code)
+    getter.run(options.outprefix)
+
diff --git a/ariba/tasks/prepareref.py b/ariba/tasks/prepareref.py
new file mode 100644
index 0000000..62fb74e
--- /dev/null
+++ b/ariba/tasks/prepareref.py
@@ -0,0 +1,59 @@
+import sys
+import argparse
+from ariba import ref_preparer, external_progs, versions
+
+def run():
+    parser = argparse.ArgumentParser(
+        description = 'ARIBA: Antibiotic Resistance Identification By Assembly',
+        usage = 'ariba prepareref [options] <outdir>',
+        epilog = 'REQUIRED: either --ref_prefix, or at least one of --presabs, --varonly, --noncoding')
+    input_group = parser.add_argument_group('input files options')
+    input_group.add_argument('--ref_prefix', help='Prefix of input files (same as was used with getref), to save listing --presabs,--varonly etc. Will look for files called "ref_prefix." followed by: metadata.tsv,presence_absence.fa,noncoding.fa,variants_only.fa. Using this will cause these to be ignored if used: --presabs,--varonly,--noncoding,--metadata', metavar='FILENAME_PREFIX')
+    input_group.add_argument('--presabs', help='FASTA file of presence absence genes', metavar='FILENAME')
+    input_group.add_argument('--varonly', help='FASTA file of variants only genes', metavar='FILENAME')
+    input_group.add_argument('--noncoding', help='FASTA file of noncoding sequences', metavar='FILENAME')
+    input_group.add_argument('--metadata', help='tsv file of metadata about the reference sequences', metavar='FILENAME')
+
+    cdhit_group = parser.add_argument_group('cd-hit options')
+    cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
+    cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. First name in line is the cluster representative. Incompatible with --no_cdhit', metavar='FILENAME')
+    cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
+    cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT')
+
+    other_group = parser.add_argument_group('other options')
+    other_group.add_argument('--min_gene_length', type=int, help='Minimum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=6)
+    other_group.add_argument('--max_gene_length', type=int, help='Maximum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=10000)
+    other_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
+    other_group.add_argument('--threads', type=int, help='Number of threads (currently only applies to cdhit) [%(default)s]', default=1, metavar='INT')
+    other_group.add_argument('--verbose', action='store_true', help='Be verbose')
+
+    parser.add_argument('outdir', help='Output directory (must not already exist)')
+    options = parser.parse_args()
+
+    if options.no_cdhit and options.cdhit_clusters is not None:
+        sys.exit('Cannot use both --no_cdhit and --cdhit_clusters. Neither or exactly one of those options must be used')
+
+    extern_progs, version_report_lines = versions.get_all_versions()
+    if options.verbose:
+        print(*version_report_lines, sep='\n')
+
+    preparer = ref_preparer.RefPreparer(
+        extern_progs,
+        version_report_lines=version_report_lines,
+        ref_prefix=options.ref_prefix,
+        presabs=options.presabs,
+        varonly=options.varonly,
+        noncoding=options.noncoding,
+        metadata=options.metadata,
+        min_gene_length=options.min_gene_length,
+        max_gene_length=options.max_gene_length,
+        genetic_code=options.genetic_code,
+        cdhit_min_id=options.cdhit_min_id,
+        cdhit_min_length=options.cdhit_min_length,
+        run_cdhit=not options.no_cdhit,
+        clusters_file=options.cdhit_clusters,
+        threads=options.threads,
+        verbose=options.verbose,
+    )
+
+    preparer.run(options.outdir)
diff --git a/ariba/tasks/refcheck.py b/ariba/tasks/refcheck.py
deleted file mode 100644
index 4184494..0000000
--- a/ariba/tasks/refcheck.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import argparse
-import sys
-import pyfastaq
-import ariba
-
-def run():
-    parser = argparse.ArgumentParser(
-        description = 'Check or fix resistance genes FASTA file',
-        usage = 'ariba refcheck [options] <infile>')
-    parser.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
-    parser.add_argument('-m', '--min_length', type=int, help='Minimum length in nucleotides of gene [%(default)s]', metavar='INT', default=6)
-    parser.add_argument('-n', '--max_length', type=int, help='Maximum length in nucleotides of gene [%(default)s]', metavar='INT', default=10000)
-    parser.add_argument('-o', '--outprefix', help='Prefix of output files. If this option is used, a fixed file will be output, together with information on what was changed in the input file. If this option is not used, the script dies if any input sequence is not OK')
-    parser.add_argument('infile', help='Input file containing genes to be checked', metavar='Filename')
-    options = parser.parse_args()
-
-    pyfastaq.sequences.genetic_code = options.genetic_code
-    checker = ariba.refcheck.Checker(
-        options.infile,
-        min_length=options.min_length,
-        max_length=options.max_length,
-        outprefix=options.outprefix
-    )
-
-    ok, reason, seq = checker.run()
-
-    if options.outprefix is None and not ok:
-        print('The following sequence not OK, for the reason:', reason)
-        print(seq)
-        sys.exit(1)
-
diff --git a/ariba/tasks/reportfilter.py b/ariba/tasks/reportfilter.py
new file mode 100644
index 0000000..4e85fa0
--- /dev/null
+++ b/ariba/tasks/reportfilter.py
@@ -0,0 +1,35 @@
+import argparse
+import sys
+import ariba
+
+def run():
+    parser = argparse.ArgumentParser(
+        description = 'Filters an ARIBA report tsv file',
+        usage = 'ariba reportfilter [options] <infile> <outprefix>'
+    )
+    parser.add_argument('--exclude_flags', help='Comma-separated list of flags to exclude. [%(default)s]', default='assembly_fail,ref_seq_choose_fail')
+    parser.add_argument('--min_pc_id', type=float, help='Minimum percent identity of nucmer match between contig and reference [%(default)s]', default=90.0, metavar='FLOAT')
+    parser.add_argument('--min_ref_base_asm', type=int, help='Minimum number of reference bases matching assembly [%(default)s]', default=1, metavar='INT')
+    parser.add_argument('--keep_syn', action='store_true', help='Keep synonymous variants (by default they are removed)')
+    parser.add_argument('--discard_without_known_var', action='store_true', help='Applies to variant only genes. Filter out where there is a known variant, but the assembly has the wild type. By default these rows are kept.')
+    parser.add_argument('infile', help='Name of input tsv file')
+    parser.add_argument('outprefix', help='Prefix of output files. outprefix.tsv and outprefix.xls will be made')
+    options = parser.parse_args()
+
+    flags_to_exclude = options.exclude_flags.split(',')
+    allowed_flags = set(ariba.flag.flags_in_order)
+    bad_flags = [x for x in flags_to_exclude if x not in allowed_flags]
+    if len(bad_flags):
+        print('Error in option --exclude_flags. The following were not recognised:', ','.join(bad_flags), file=sys.stderr)
+        print('Must choose from:', ','.join(ariba.flag.flags_in_order), file=sys.stderr)
+        sys.exit(1)
+
+    rf = ariba.report_filter.ReportFilter(
+        infile=options.infile,
+        min_pc_ident=options.min_pc_id,
+        min_ref_base_assembled=options.min_ref_base_asm,
+        ignore_not_has_known_variant=options.discard_without_known_var,
+        remove_synonymous_snps=not options.keep_syn,
+    )
+    rf.run(options.outprefix)
+
diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py
index c52ccba..f7392b8 100644
--- a/ariba/tasks/run.py
+++ b/ariba/tasks/run.py
@@ -1,70 +1,74 @@
 import argparse
-import pyfastaq
+import os
+import sys
 import ariba
 
 
 def run():
     parser = argparse.ArgumentParser(
         description = 'ARIBA: Antibiotic Resistance Identification By Assembly',
-        usage = 'ariba run [options] <db.fa> <reads1.fq> <reads2.fq> <outdir>')
-    parser.add_argument('db_fasta', help='FASTA file of reference genes')
+        usage = 'ariba run [options] <prepareref_dir> <reads1.fq> <reads2.fq> <outdir>')
+    parser.add_argument('prepareref_dir', help='Name of output directory when "ariba prepareref" was run')
     parser.add_argument('reads_1', help='Name of fwd reads fastq file')
     parser.add_argument('reads_2', help='Name of rev reads fastq file')
     parser.add_argument('outdir', help='Output directory (must not already exist)')
 
-    cdhit_group = parser.add_argument_group('cd-hit options')
-    cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit')
-    cdhit_group.add_argument('--cdhit_seq_identity_threshold', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
-    cdhit_group.add_argument('--cdhit_length_diff_cutoff', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT')
-
     nucmer_group = parser.add_argument_group('nucmer options')
     nucmer_group.add_argument('--nucmer_min_id', type=int, help='Minimum alignment identity (delta-filter -i) [%(default)s]', default=90, metavar='INT')
-    nucmer_group.add_argument('--nucmer_min_len', type=int, help='Minimum alignment length (delta-filter -i) [%(default)s]', default=50, metavar='INT')
-    nucmer_group.add_argument('--nucmer_breaklen', type=int, help='Value to use for -breaklen when running nucmer [%(default)s]', default=50, metavar='INT')
+    nucmer_group.add_argument('--nucmer_min_len', type=int, help='Minimum alignment length (delta-filter -l) [%(default)s]', default=20, metavar='INT')
+    nucmer_group.add_argument('--nucmer_breaklen', type=int, help='Value to use for -breaklen when running nucmer [%(default)s]', default=200, metavar='INT')
 
     assembly_group = parser.add_argument_group('Assembly options')
-    allowed_assemblers = ['velvet', 'spades']
-    assembly_group.add_argument('--assembler', help='Assembler to use. Available options: ' + ','.join(allowed_assemblers) + ' [%(default)s]', choices=allowed_assemblers, default='spades', metavar='Assembler')
-    assembly_group.add_argument('--min_scaff_depth', type=int, help='Minimum number of read pairs needed as evidence for scaffold link between two contigs. This is also the value used for sspace -k when scaffolding [%(default)s]', default=10, metavar='INT')
+    assembly_group.add_argument('--assembly_cov', type=int, help='Target read coverage when sampling reads for assembly [%(default)s]', default=50, metavar='INT')
     assembly_group.add_argument('--assembler_k', type=int, help='kmer size to use with assembler. You can use 0 to set kmer to 2/3 of the read length. Warning - lower kmers are usually better. [%(default)s]', metavar='INT', default=21)
-    assembly_group.add_argument('--spades_other', help='Put options string to be used with spades in quotes. This will NOT be sanity checked. Do not use -k or -t: for these options you should use the ariba run options --assembler_k and --threads [%(default)s]', default="--only-assembler", metavar="OPTIONS")
+    assembly_group.add_argument('--spades_other', help='Put options string to be used with spades in quotes. This will NOT be sanity checked. Do not use -k (see --assembler_k), --untrusted-contigs (it is always used), or -t [%(default)s]', default="--only-assembler -m 4", metavar="OPTIONS")
+    assembly_group.add_argument('--min_scaff_depth', type=int, help='Minimum number of read pairs needed as evidence for scaffold link between two contigs. This is also the value used for sspace -k when scaffolding [%(default)s]', default=10, metavar='INT')
 
     other_group = parser.add_argument_group('Other options')
-    other_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
-    other_group.add_argument('--threads', type=int, help='Number of threads for bowtie2 and spades [%(default)s]', default=1, metavar='INT')
+    other_group.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
     bowtie2_presets = ['very-fast-local', 'fast-local', 'sensitive-local', 'very-sensitive-local']
     other_group.add_argument('--bowtie2_preset', choices=bowtie2_presets, help='Preset option for bowtie2 mapping [%(default)s]', default='very-sensitive-local', metavar='|'.join(bowtie2_presets))
     other_group.add_argument('--assembled_threshold', type=float, help='If proportion of gene assembled (regardless of into how many contigs) is at least this value then the flag gene_assembled is set [%(default)s]', default=0.95, metavar='FLOAT (between 0 and 1)')
+    other_group.add_argument('--gene_nt_extend', type=int, help='Max number of nucleotides to extend ends of gene matches to look for start/stop codons [%(default)s]', default=30, metavar='INT')
     other_group.add_argument('--unique_threshold', type=float, help='If proportion of bases in gene assembled more than once is <= this value, then the flag unique_contig is set [%(default)s]', default=0.03, metavar='FLOAT (between 0 and 1)')
-    other_group.add_argument('--clean', type=int, choices=[0,1,2], help='Specify how much cleaning to do. 0=none, 1=some, 2=only keep the report [%(default)s]', default=1, metavar='INT')
+    other_group.add_argument('--noclean', action='store_true', help='Do not clean up intermediate files')
+    other_group.add_argument('--tmp_dir', help='Existing directory in which to create a temporary directory used for local assemblies')
     other_group.add_argument('--verbose', action='store_true', help='Be verbose')
 
-    executables_group = parser.add_argument_group('executables locations')
-    executables_group.add_argument('--bcftools', help='bcftools executable [bcftools]', metavar='PATH')
-    executables_group.add_argument('--bowtie2', help='bowtie2 executable [bowtie2]', metavar='PATH')
-    executables_group.add_argument('--cdhit', help=argparse.SUPPRESS)
-    executables_group.add_argument('--gapfiller', help='GapFiller executable [GapFiller.pl]', metavar='PATH')
-    executables_group.add_argument('--nucmer', help=argparse.SUPPRESS, default='nucmer')
-    executables_group.add_argument('--samtools', help='samtools executable [samtools]', metavar='PATH')
-    executables_group.add_argument('--spades', help='SPAdes executable [spades.py]',  metavar='PATH')
-    executables_group.add_argument('--sspace', help='SSPACE executable [SSPACE_Basic_v2.0.pl]', metavar='PATH')
-    executables_group.add_argument('--velvet', help='prefix of velvet{g,h} executables [velvet]', metavar='PATH')
-    executables_group.add_argument('--velvetg', help=argparse.SUPPRESS)
-    executables_group.add_argument('--velveth', help=argparse.SUPPRESS)
-
     options = parser.parse_args()
-    if options.assembler == 'velvet':
-        options.velvet = 'velvet'
-    ariba.external_progs.check_versions(options, verbose=options.verbose, not_required=set(['sspace', 'gapfiller']))
-    pyfastaq.sequences.genetic_code = options.genetic_code
+
+    reads_not_found = []
+
+    for filename in [options.reads_1, options.reads_2]:
+        if not os.path.exists(filename):
+            reads_not_found.append(filename)
+        elif options.verbose:
+            print('Found reads file:', filename)
+
+    if reads_not_found:
+        print('\nThe following reads file(s) were not found:', file=sys.stderr)
+        print(*reads_not_found, sep='\n', file=sys.stderr)
+        print('Cannot continue', file=sys.stderr)
+        sys.exit(1)
+
+    if not os.path.exists(options.prepareref_dir):
+        print('Input directory', options.prepareref_dir, 'not found. Cannot continue', file=sys.stderr)
+        sys.exit(1)
+
+    extern_progs, version_report_lines = ariba.versions.get_all_versions()
+    if options.verbose:
+        print(*version_report_lines, sep='\n')
 
     c = ariba.clusters.Clusters(
-          options.db_fasta,
+          options.prepareref_dir,
           options.reads_1,
           options.reads_2,
           options.outdir,
+          extern_progs,
+          version_report_lines=version_report_lines,
           assembly_kmer=options.assembler_k,
-          assembler=options.assembler,
+          assembly_coverage=options.assembly_cov,
+          assembler='spades',
           threads=options.threads,
           verbose=options.verbose,
           min_scaff_depth=options.min_scaff_depth,
@@ -74,18 +78,10 @@ def run():
           spades_other=options.spades_other,
           assembled_threshold=options.assembled_threshold,
           unique_threshold=options.unique_threshold,
-          bcftools_exe=options.bcftools,
-          gapfiller_exe=options.gapfiller,
-          samtools_exe=options.samtools,
-          bowtie2_exe=options.bowtie2,
+          max_gene_nt_extend=options.gene_nt_extend,
           bowtie2_preset=options.bowtie2_preset,
-          spades_exe=options.spades,
-          sspace_exe=options.sspace,
-          velvet_exe=options.velvet,
-          cdhit_seq_identity_threshold=options.cdhit_seq_identity_threshold,
-          cdhit_length_diff_cutoff=options.cdhit_length_diff_cutoff,
-          clean=options.clean,
-          run_cd_hit=(not options.no_cdhit)
+          clean=(not options.noclean),
+          tmp_dir=options.tmp_dir,
         )
     c.run()
 
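The rewritten run task now collects every missing reads file before exiting, rather than stopping at the first failure. A minimal standalone sketch of that validation pattern, with hypothetical filenames:

    import os
    import sys

    def check_reads_files(filenames):
        # Gather *all* missing paths so the user sees every problem at once.
        missing = [f for f in filenames if not os.path.exists(f)]
        if missing:
            print('The following reads file(s) were not found:', file=sys.stderr)
            print(*missing, sep='\n', file=sys.stderr)
            print('Cannot continue', file=sys.stderr)
            sys.exit(1)

    check_reads_files(['reads_1.fq', 'reads_2.fq'])  # hypothetical inputs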
diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
index 02cf4ed..d4a22f4 100644
--- a/ariba/tasks/summary.py
+++ b/ariba/tasks/summary.py
@@ -1,25 +1,117 @@
 import argparse
 import ariba
 
+
+def use_preset(options):
+    if options.preset is None:
+        return options
+
+    preset_to_vals = {
+        'minimal': {
+            'cluster_cols': 'has_res',
+            'var_cols': '',
+            'col_filter': 'y',
+            'row_filter': 'y',
+            'var_groups': 'n',
+            'known_vars': 'n',
+            'novel_vars': 'n'
+        },
+        'cluster_small': {
+            'cluster_cols': 'assembled,has_res,ref_seq,known_var',
+            'var_cols': '',
+            'col_filter': 'y',
+            'row_filter': 'y',
+            'var_groups': 'n',
+            'known_vars': 'n',
+            'novel_vars': 'n'
+        },
+        'cluster_all': {
+            'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'var_cols': '',
+            'col_filter': 'y',
+            'row_filter': 'y',
+            'var_groups': 'n',
+            'known_vars': 'n',
+            'novel_vars': 'n'
+        },
+        'cluster_var_groups': {
+            'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'var_cols': 'groups',
+            'col_filter': 'y',
+            'row_filter': 'y',
+            'var_groups': 'y',
+            'known_vars': 'n',
+            'novel_vars': 'n'
+        },
+        'cluster_known_vars': {
+            'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'var_cols': 'groups,grouped,ungrouped',
+            'col_filter': 'y',
+            'row_filter': 'y',
+            'var_groups': 'y',
+            'known_vars': 'y',
+            'novel_vars': 'n'
+        },
+        'all': {
+            'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'var_cols': 'groups,grouped,ungrouped,novel',
+            'col_filter': 'y',
+            'row_filter': 'y',
+            'var_groups': 'y',
+            'known_vars': 'y',
+            'novel_vars': 'y'
+        },
+        'all_no_filter': {
+            'cluster_cols': 'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'var_cols': 'groups,grouped,ungrouped,novel',
+            'col_filter': 'n',
+            'row_filter': 'n',
+            'var_groups': 'y',
+            'known_vars': 'y',
+            'novel_vars': 'y'
+        },
+    }
+
+    assert options.preset in preset_to_vals
+
+    for key, val in preset_to_vals[options.preset].items():
+        setattr(options, key, val)
+
+    return options
+
+
 def run():
+    presets = ['minimal', 'cluster_small', 'cluster_all', 'cluster_var_groups', 'cluster_known_vars', 'all', 'all_no_filter']
+
     parser = argparse.ArgumentParser(
-        description = 'Make a summry of ARIBA report files',
-        usage = 'ariba summary [options] <outfile> [report1.tsv report2.tsv ...]',
-        epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input. The input report files must be in tsv format, not xls.')
+        description = 'Make a summary of ARIBA report files, plus files for Phandango',
+        usage = 'ariba summary [options] <outprefix> [report1.tsv report2.tsv ...]',
+        epilog = 'Files must be listed after the output prefix and/or the option --fofn must be used. If both are used, all files listed in the file given by --fofn AND the files listed after the output prefix will be used as input.')
     parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME')
+    parser.add_argument('--preset', choices=presets, help='Shorthand for setting --cluster_cols, --var_cols, --col_filter and --row_filter. Using this overrides those options', metavar='|'.join(presets))
+    parser.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, has_res, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='has_res', metavar='col1,col2,...')
+    parser.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
+    parser.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n')
+    parser.add_argument('--var_cols', help='Comma separated list of variant columns to include. Choose from: groups, grouped, ungrouped, novel [none by default]', metavar='col1,col2,...', default='')
     parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
-    parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by deafult, they are removed from the output)')
-    parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')
+    parser.add_argument('--verbose', action='store_true', help='Be verbose')
+    parser.add_argument('outprefix', help='Prefix of output files')
     parser.add_argument('infiles', nargs='*', help='Files to be summarised')
     options = parser.parse_args()
     if len(options.infiles) == 0:
         options.infiles = None
 
+    options = use_preset(options)
+
     s = ariba.summary.Summary(
-        options.outfile,
+        options.outprefix,
         fofn=options.fofn,
         filenames=options.infiles,
-        filter_output=(not options.no_filter),
-        min_id=options.min_id
+        filter_rows=options.row_filter == 'y',
+        filter_columns=options.col_filter == 'y',
+        min_id=options.min_id,
+        cluster_cols=options.cluster_cols,
+        variant_cols=options.var_cols,
+        verbose=options.verbose
     )
     s.run()
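With the exec call replaced by setattr above, applying a preset reduces to copying the preset's key/value pairs onto the argparse namespace. A small self-contained sketch of that mechanism (the values are taken from the 'minimal' preset; the starting namespace fields are illustrative):

    import argparse

    preset = {'cluster_cols': 'has_res', 'var_cols': '', 'col_filter': 'y', 'row_filter': 'y'}
    options = argparse.Namespace(cluster_cols='', var_cols='', col_filter='n', row_filter='n')

    for key, val in preset.items():
        setattr(options, key, val)  # safe for any string value, unlike exec

    assert options.cluster_cols == 'has_res'

A typical invocation matching the new usage string would then be: ariba summary --preset minimal out report1.tsv report2.tsv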
diff --git a/ariba/tasks/test.py b/ariba/tasks/test.py
new file mode 100644
index 0000000..00177d1
--- /dev/null
+++ b/ariba/tasks/test.py
@@ -0,0 +1,94 @@
+import argparse
+import subprocess
+import shutil
+import os
+import sys
+import ariba
+
+
+def boxymcboxface(message):
+    print('-' * 79)
+    print('|', '=' * 77, '|', sep='')
+    print('|', '{: ^75}'.format(message), '|')
+    print('|', '=' * 77, '|', sep='')
+    print('-' * 79)
+
+
+def run():
+    parser = argparse.ArgumentParser(
+        description = 'Run ARIBA on a small test dataset',
+        usage = 'ariba test [options] <outdir>')
+    parser.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+    parser.add_argument('outdir', help='Name of output directory')
+    options = parser.parse_args()
+    ariba_exe = os.path.abspath(sys.argv[0])
+
+    print('Running ARIBA on test data...')
+
+    boxymcboxface('Preparing input data')
+
+    try:
+        os.mkdir(options.outdir)
+        os.chdir(options.outdir)
+    except OSError:
+        print('Error making output directory "', options.outdir, '". Cannot continue.', sep='', file=sys.stderr)
+        sys.exit(1)
+
+    print('Made output directory ', options.outdir, '. Copying test data files into it:', sep='')
+
+    modules_dir = os.path.dirname(os.path.abspath(ariba.__file__))
+    test_data_dir = os.path.join(modules_dir, 'test_run_data')
+
+    for filename in ['presence_absence.fa', 'non_coding.fa', 'variants_only.fa', 'metadata.tsv', 'reads_1.fq', 'reads_2.fq']:
+        shutil.copy(os.path.join(test_data_dir, filename), filename)
+        print('    copied', filename)
+
+
+    boxymcboxface('Try running ariba prepareref')
+
+    prepareref_command = ' '.join([
+        ariba_exe,
+        'prepareref',
+        '--verbose',
+        '--presabs presence_absence.fa',
+        '--varonly variants_only.fa',
+        '--noncoding non_coding.fa',
+        '--metadata metadata.tsv',
+        '--threads', str(options.threads),
+        'PREPAREREF',
+    ])
+
+    print('\nRunning ariba prepareref with:', prepareref_command, '', sep='\n')
+    return_code = subprocess.call(prepareref_command, shell=True)
+
+    if return_code != 0:
+        print('\nSomething went wrong. See above for error message(s). Return code was', return_code)
+        sys.exit(1)
+
+    print()
+    print('ariba prepareref finished OK')
+
+
+    ariba_command = ' '.join([
+        ariba_exe,
+        'run',
+        '--verbose',
+        '--threads', str(options.threads),
+        'PREPAREREF',
+        'reads_1.fq',
+        'reads_2.fq',
+        'OUT'
+    ])
+
+    boxymcboxface('Try running ariba run')
+    print('\nRunning ARIBA with:', ariba_command, '', sep='\n')
+
+    return_code = subprocess.call(ariba_command, shell=True)
+
+    if return_code != 0:
+        print('\nSomething went wrong. See above for error message(s). Return code was', return_code)
+        sys.exit(1)
+
+    print()
+    print('ariba run finished OK')
+    print('Finished run on test data OK')
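The test task builds each command as a single string and runs it with subprocess.call(..., shell=True). An equivalent shell-free sketch, passing the arguments as a list (this assumes an 'ariba' executable on PATH and the test filenames copied above):

    import subprocess
    import sys

    prepareref_command = [
        'ariba', 'prepareref', '--verbose',
        '--presabs', 'presence_absence.fa',
        '--varonly', 'variants_only.fa',
        '--noncoding', 'non_coding.fa',
        '--metadata', 'metadata.tsv',
        'PREPAREREF',
    ]

    # The list form needs no quoting and cannot be mangled by the shell.
    if subprocess.call(prepareref_command) != 0:
        sys.exit(1)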
diff --git a/ariba/tasks/version.py b/ariba/tasks/version.py
index 3414601..b89f8cc 100644
--- a/ariba/tasks/version.py
+++ b/ariba/tasks/version.py
@@ -1,4 +1,6 @@
-import ariba
+import sys
+from ariba import versions
 
 def run():
-    print(ariba.common.version)
+    extern_progs, report_lines = versions.get_all_versions(raise_error=False)
+    print(*report_lines, sep='\n')
diff --git a/ariba/test_run_data/metadata.tsv b/ariba/test_run_data/metadata.tsv
new file mode 100644
index 0000000..04b6107
--- /dev/null
+++ b/ariba/test_run_data/metadata.tsv
@@ -0,0 +1,14 @@
+presence_absence1	.	.	.	Generic description of presence_absence1
+presence_absence1	p	R3S	.	Ref and assembly have wild type, so do not report
+presence_absence1	p	A10V	.	Ref has wild, reads have variant so report
+presence_absence1	p	I5A	.	Ref and reads have variant so report
+variants_only1	.	.	.	Generic description of variants_only1
+variants_only1	p	I3L	.	Ref and assembly have wild type, so do not report
+variants_only1	p	S5T	.	Ref and reads have variant so report
+variants_only2	p	R3I	.	Ref and reads have wild so do not report
+variants_only2	.	.	.	Generic description of variants_only2
+noncoding1	.	.	.	generic description of noncoding1
+noncoding1	n	A6G	.	variant in ref and reads so should report
+noncoding1	n	G9T	.	wild type in ref and reads so should not report
+noncoding1	n	A14T	noncoding_group1	ref has wild type, reads have variant so should report
+noncoding1	n	A40C	.	ref has variant, reads have wild type so should not report
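Each metadata row above has five tab-separated fields: sequence name, variant type ('p' for protein, 'n' for nucleotide, '.' for none), the variant itself (e.g. A10V), an optional group name, and a free-text description. A minimal reader sketch under that assumption (field meanings inferred from the rows, not from ARIBA's own loader):

    import csv

    def load_metadata(filename):
        '''Yield (name, variant_type, variant, group, description) per row.'''
        with open(filename) as f:
            for row in csv.reader(f, delimiter='\t'):
                name, var_type, variant, group, description = row
                yield name, var_type, variant, group, description

    for record in load_metadata('metadata.tsv'):
        print(record)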
diff --git a/ariba/test_run_data/non_coding.fa b/ariba/test_run_data/non_coding.fa
new file mode 100644
index 0000000..df05244
--- /dev/null
+++ b/ariba/test_run_data/non_coding.fa
@@ -0,0 +1,10 @@
+>noncoding1
+CGTACGCGGGTGGAGACATGTACTCCACTCCCATACATCCCTAAGTTTGTCCCTAAGGCA
+GTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCAC
+>noncoding2
+TCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTTCCCAAGCGCGCT
+GCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCC
+>noncoding3
+CTAACTTACTACTATGACTGACTGACTGACTGACTGATCGACTGCTGACATCTGATCGAT
+CATCCTGTCGACATCATATCTCGATCGATCGATCGACTGACTGACTGACTGACTGAATCT
+CACGTACTGACTCATCATCATCATACTCATCATATCATCGATCGATCATCTGATCTGATG
diff --git a/ariba/test_run_data/presence_absence.fa b/ariba/test_run_data/presence_absence.fa
new file mode 100644
index 0000000..638f8a0
--- /dev/null
+++ b/ariba/test_run_data/presence_absence.fa
@@ -0,0 +1,5 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCAACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGATTTAA
+>presence_absence2
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/test_run_data/reads_1.fq b/ariba/test_run_data/reads_1.fq
new file mode 100644
index 0000000..70deca9
--- /dev/null
+++ b/ariba/test_run_data/reads_1.fq
@@ -0,0 +1,908 @@
+ at presence_absence1:1:154:213/1
+TAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAGGACATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:2:64:123/1
+CTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:3:21:79/1
+ACGTTCAGCACTCTAAACCGCGCCTAAACAGGTACACTTCTTCCTTTCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:4:116:174/1
+ATGACCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:5:16:76/1
+CTATAACGTTCAGCACTCTAAACCGCGCCTAAACAGGTACACTTCTTCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:6:76:136/1
+TGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATGACCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:7:120:180/1
+CCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:8:69:129/1
+TTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:9:31:91/1
+CTCTAAACCGCGCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:10:76:135/1
+TGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATGACCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:11:173:233/1
+AGCGCGTGGGAAAGCATGGAATAAGGACATACCTAGGTGCGAAGTGCAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:12:37:99/1
+ACCGCGCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTGGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:13:131:190/1
+ACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:14:94:153/1
+GCGGCATATGGATCGCGAAGCGATGACCCATGAAGTAACCGAACGCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:15:46:104/1
+AAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:16:115:175/1
+GATGACCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:17:44:103/1
+CTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:18:141:201/1
+CGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:19:149:210/1
+AACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:20:63:124/1
+CCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:21:60:122/1
+CTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:22:111:169/1
+AAGCGATGACCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:23:90:150/1
+TGTTGCGGCATATGGATCGCGAAGCGATGACCCATGAAGTAACCGAACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:24:56:114/1
+ACTTCTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:25:21:80/1
+ACGTTCAGCACTCTAAACCGCGCCTAAACAGGTACACTTCTTCCTTTCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:26:59:118/1
+TCTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:27:19:77/1
+TAACGTTCAGCACTCTAAACCGCGCCTAAACAGGTACACTTCTTCCTTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:28:125:186/1
+GAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:29:31:89/1
+CTCTAAACCGCGCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:30:64:125/1
+CTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:31:150:211/1
+ACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:32:144:205/1
+GCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:33:59:119/1
+TCTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:34:40:99/1
+GCGCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTGGCATAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:35:105:165/1
+ATCGCGAAGCGATGACCCATGAAGTAACCGAACGCGCGAGCACCAACATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:36:114:174/1
+CGATGACCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:37:82:142/1
+TGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATGACCCATGAAGTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:38:75:137/1
+CTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATGACCCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:39:67:129/1
+TCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:40:34:93/1
+TAAACCGCGCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:41:163:222/1
+TAACGGCATTAGCGCGTGGGAAAGCATGGAATAAGGACATACCTAGGTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:42:42:102/1
+GCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTGGCATAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:43:148:207/1
+CAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:44:117:177/1
+TGACCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:45:54:114/1
+AGCGGTAGGGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:46:8:66/1
+ATCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:47:74:134/1
+GCTAGCAGGATTACTTCGTGCTGATCTATGCTGATTAGCACCGAAAACAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:48:6:65/1
+AAATCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:49:131:189/1
+AGCAGCGAACGCGCGTGCACCGCGCTGACCGAACGCGAAGATAGCACCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:50:179:238/1
+GCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACCGTCATTTCTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:51:60:121/1
+AGGGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGCTGATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:52:117:176/1
+AAAACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACCGAACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:53:161:222/1
+GAACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:54:99:157/1
+CTATGCTGATTAGCACCGAAAACACCACCGAAAGCAGCGAACGCGCGTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:55:168:227/1
+AAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:56:7:67/1
+AATCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:57:9:67/1
+TCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:58:163:221/1
+ACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:59:31:93/1
+GGTTGTTCTCAGTTCTACTGATAAGCGGTAGGGAATCTCAAGTGCTAGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:60:57:117/1
+GGTAGGGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:61:13:72/1
+AGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:62:19:79/1
+TTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTAGGGAATCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:63:153:215/1
+CGCTGACCGAACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:64:116:173/1
+GAAAACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACCGAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:65:18:76/1
+CTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTAGGGAATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:66:155:215/1
+CTGACCGAACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:67:84:143/1
+TTACTTCGTGCTGATCTATGCTGATTAGCACCGAAAACACCACCGAAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:68:62:120/1
+GGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGCTGATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:69:165:223/1
+GCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:70:1:62/1
+TAAGTAAATCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:71:25:86/1
+CCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTAGGGAATCTCAAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:72:120:180/1
+ACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACCGAACGCGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:73:164:222/1
+CGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:74:149:210/1
+ACCGCGCTGACCGAACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:75:173:233/1
+AGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACCGTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:76:179:238.dup.2/1
+GCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACCGTCATTTCTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:77:62:122/1
+GGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGCTGATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:78:76:136/1
+TAGCAGGATTACTTCGTGCTGATCTATGCTGATTAGCACCGAAAACACCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:79:12:72/1
+GAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:80:107:167/1
+ATTAGCACCGAAAACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:81:55:113/1
+GCGGTAGGGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:82:35:95/1
+GTTCTCAGTTCTACTGATAAGCGGTAGGGAATCTCAAGTGCTAGCAGGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:83:168:228/1
+AAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:84:180:240/1
+CGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACCGTCATTTCTCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:85:97:157/1
+ATCTATGCTGATTAGCACCGAAAACACCACCGAAAGCAGCGAACGCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:86:119:177/1
+AACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACCGAACGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:87:157:216/1
+GACCGAACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:88:40:101/1
+CATGCTATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:89:42:101/1
+TGCTATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:90:14:75/1
+GTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:91:47:106/1
+TTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:92:9:67/1
+GCCCGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:93:56:116/1
+GGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTTCTACATGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:94:18:78/1
+AGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTTCGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:95:47:106.dup.2/1
+TTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:96:38:97/1
+TCCATGCTATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:97:144:203/1
+CGACCACCATTGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:98:31:90/1
+GAACTCCTCCATGCTATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:99:147:206/1
+CCACCATTGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:100:11:71/1
+CCGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:101:8:69/1
+GGCCCGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:102:45:106/1
+TATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:103:83:142/1
+CAAGTGTTGACGTTCTACATGTTTCGCGAAGCGAAAAAAATTACCTGCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:104:124:183/1
+TACCTGCCATGAAAACATGGCGACCACCATTGCGAGCGAAAAACTGAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:105:32:91/1
+AACTCCTCCATGCTATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:106:12:72/1
+CGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:107:156:214/1
+CGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:108:85:145/1
+AGTGTTGACGTTCTACATGTTTCGCGAAGCGAAAAAAATTACCTGCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:109:84:145/1
+AAGTGTTGACGTTCTACATGTTTCGCGAAGCGAAAAAAATTACCTGCCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:110:60:117/1
+CTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTTCTACATGTTTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:111:131:191/1
+CATGAAAACATGGCGACCACCATTGCGAGCGAAAAACTGAACGATCATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:112:23:84/1
+TGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTTCGCCGTGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:113:125:186/1
+ACCTGCCATGAAAACATGGCGACCACCATTGCGAGCGAAAAACTGAACGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:114:160:220/1
+CGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCATATTAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:115:148:207/1
+CACCATTGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:116:126:186/1
+CCTGCCATGAAAACATGGCGACCACCATTGCGAGCGAAAAACTGAACGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:117:155:214/1
+GCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:118:79:138/1
+CCGCCAAGTGTTGACGTTCTACATGTTTCGCGAAGCGAAAAAAATTACCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:119:158:216/1
+AGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCATATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:120:139:199/1
+CATGGCGACCACCATTGCGAGCGAAAAACTGAACGATCATTATGAAGCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:121:55:114/1
+CGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTTCTACATGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:122:153:213/1
+TTGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:123:20:79/1
+AGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTTCGCCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:124:154:215/1
+TGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:125:178:235/1
+TTATGAAGCGCATTAAAATCAAGCATATTAGTTTATTAAATGGTACCAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:126:150:210/1
+CCATTGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:127:16:76/1
+TAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:128:6:65/1
+CAGGCCCGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:129:15:75/1
+TTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only2:130:116:175/1
+AAAAAAATTACCTGCCATGAAAACATGGCGACCACCATTGCGAGCGAAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:131:204:263/1
+ACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTCGGATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:132:64:123/1
+ACTTTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:133:104:164/1
+ACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:134:131:188/1
+CCATACATCACTAAGTTTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:135:65:126/1
+CTTTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:136:152:213/1
+CCTAAGGCAGTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:137:73:133/1
+ATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:138:35:94/1
+TAGGGAGAATCCTCTCAGTGTTACCTTACACTTTTCGAATCAGGGTTATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:139:142:202/1
+TAAGTTTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACGAACTGCGGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:140:196:256/1
+CTTAGGGAACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:141:198:255/1
+TAGGGAACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:142:27:86/1
+GGGTACTTTAGGGAGAATCCTCTCAGTGTTACCTTACACTTTTCGAATCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:143:153:212/1
+CTAAGGCAGTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:144:1:59/1
+ATGTGGTGTGTCCGTCTGACTTAGCTGGGTACTTTAGGGAGAATCCTCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:145:89:149/1
+GTCGCCCAACCTCGTACGCGGGTGGTGACATGTACTCCACTCCCATACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:146:178:237/1
+ACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:147:67:126/1
+TTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:148:17:77/1
+TGACTTAGCTGGGTACTTTAGGGAGAATCCTCTCAGTGTTACCTTACACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:149:64:124/1
+ACTTTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:150:116:174/1
+ACATGTACTCCACTCCCATACATCACTAAGTTTGTCCCTAAGGCAGTGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:151:147:205/1
+TTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:152:95:155/1
+CAACCTCGTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:153:176:235/1
+GAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:154:202:262/1
+GAACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTCGGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:155:176:236/1
+GAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:156:94:154/1
+CCAACCTCGTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:157:71:128/1
+GAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:158:200:259/1
+GGGAACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:159:102:162/1
+GTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:160:142:199/1
+TAAGTTTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACGAACTGCGGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:161:195:254/1
+GCTTAGGGAACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:162:73:133.dup.2/1
+ATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:163:117:175/1
+CATGTACTCCACTCCCATACATCACTAAGTTTGTCCCTAAGGCAGTGCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:164:49:109/1
+TCAGTGTTACCTTACACTTTTCGAATCAGGGTTATCTTCAGTCGCCCAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:165:80:138/1
+TTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATGTACTCCACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:166:177:235/1
+AACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:167:71:131/1
+GAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:168:56:116/1
+TACCTTACACTTTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:169:171:230/1
+CCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:170:134:194/1
+TACATCACTAAGTTTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACGAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:171:26:86/1
+TGGGTACTTTAGGGAGAATCCTCTCAGTGTTACCTTACACTTTTCGAATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:172:182:241/1
+ACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACATAGCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:173:80:141/1
+TTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATGTACTCCACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:174:208:267/1
+CATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTCGGATTCGAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:175:30:91/1
+TACTTTAGGGAGAATCCTCTCAGTGTTACCTTACACTTTTCGAATCAGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:176:206:266/1
+GCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTCGGATTCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:177:81:139/1
+TATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATGTACTCCACTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding1:178:10:67/1
+GTCCGTCTGACTTAGCTGGGTACTTTAGGGAGAATCCTCTCAGTGTTACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:179:199:258/1
+GGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTCTCTGTATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:180:35:94/1
+GGACATCGAATACACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:181:186:244/1
+ACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:182:43:100/1
+AATACACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:183:75:135/1
+TGTTCCGGAGCGTTGACTCTCATAGATCTTTAACTGTTCACGACTGTATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:184:57:117/1
+ATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCATAGATCTTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:185:122:181/1
+ATCGCGGCTTGCAAATCTTAAGTTCTTCCCAAGCGCGCTGCGATACAAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:186:186:245/1
+ACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:187:99:158/1
+GATCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:188:57:115/1
+ATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCATAGATCTTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:189:136:196/1
+ATCTTAAGTTCTTCCCAAGCGCGCTGCGATACAAATCCCAAGTTTAGCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:190:23:81/1
+CACGGCACATTTGGACATCGAATACACTAGCGGTATTTATGCTTATCTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:191:97:156/1
+TAGATCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:192:58:118/1
+TTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCATAGATCTTTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:193:156:214/1
+GCGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:194:157:215/1
+CGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:195:11:71/1
+AGGTACCGGGCCCACGGCACATTTGGACATCGAATACACTAGCGGTATTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:196:164:226/1
+ATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:197:60:119/1
+TATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCATAGATCTTTAACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:198:152:212/1
+AAGCGCGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:199:200:260/1
+GTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTCTCTGTATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:200:167:225/1
+CAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:201:24:86/1
+ACGGCACATTTGGACATCGAATACACTAGCGGTATTTATGCTTATCTGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:202:144:203/1
+TTCTTCCCAAGCGCGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:203:41:101/1
+CGAATACACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:204:189:249/1
+GTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:205:29:89/1
+ACATTTGGACATCGAATACACTAGCGGTATTTATGCTTATCTGCCCTGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:206:194:253/1
+CGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTCTCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:207:140:199/1
+TAAGTTCTTCCCAAGCGCGCTGCGATACAAATCCCAAGTTTAGCGGACAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:208:168:227/1
+AAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:209:89:148/1
+GACTCTCATAGATCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:210:193:252/1
+ACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:211:166:224/1
+ACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:212:150:209/1
+CCAAGCGCGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:213:197:256/1
+CGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTCTCTGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:214:36:97/1
+GACATCGAATACACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:215:47:107/1
+CACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:216:165:224/1
+TACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:217:150:208/1
+CCAAGCGCGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:218:26:87/1
+GGCACATTTGGACATCGAATACACTAGCGGTATTTATGCTTATCTGCCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:219:43:102/1
+AATACACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:220:161:221/1
+GCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:221:129:189/1
+CTTGCAAATCTTAAGTTCTTCCCAAGCGCGCTGCGATACAAATCCCAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:222:185:244/1
+GACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:223:8:68/1
+ACAAGGTACCGGGCCCACGGCACATTTGGACATCGAATACACTAGCGGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:224:183:242/1
+CGGACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:225:100:159/1
+ATCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding2:226:100:159.dup.2/1
+ATCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at noncoding3/1
+CATCCTGTCGACATCATATCTCGATCGATCGATCGACTGACTGACTGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
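Read names in these simulated files appear to encode reference:serial:fragment_start:fragment_end, with a /1 or /2 mate suffix pairing the two files and an occasional .dup.N marker on deliberately duplicated reads. A small parsing sketch based on that inferred convention:

    def parse_read_name(name):
        '''Split e.g. 'presence_absence1:1:154:213/1' into its apparent parts.'''
        body, mate = name.rsplit('/', 1)
        fields = body.split(':')
        if len(fields) == 1:               # e.g. 'noncoding3' carries no coordinates
            return fields[0], None, None, None, int(mate)
        ref, serial, start, end = fields[:4]
        end = end.split('.dup.')[0]        # drop duplicate markers like '238.dup.2'
        return ref, int(serial), int(start), int(end), int(mate)

    print(parse_read_name('presence_absence1:1:154:213/1'))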
diff --git a/ariba/test_run_data/reads_2.fq b/ariba/test_run_data/reads_2.fq
new file mode 100644
index 0000000..fa54d4a
--- /dev/null
+++ b/ariba/test_run_data/reads_2.fq
@@ -0,0 +1,908 @@
+ at presence_absence1:1:154:213/2
+TAATCTTAAGTCTGGGTTTTCTATATCCACACCGAAACCATCTGCACTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:2:64:123/2
+AATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:3:21:79/2
+CTTCATGGGTCATCGCTTCGCGATCCATATGCCGCAACACTATGCCAAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:4:116:174/2
+ATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACGCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:5:16:76/2
+CATGGGTCATCGCTTCGCGATCCATATGCCGCAACACTATGCCAAATACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:6:76:136/2
+TTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:7:120:180/2
+GAAACCATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:8:69:129/2
+CGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:9:31:91/2
+CGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCATATGCCGCAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:10:76:135/2
+TTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:11:173:233/2
+AGTGAAGCTTTCAGAGGATTTAATCTTAAGTCTGGGTTTTCTATATCCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:12:37:99/2
+GGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:13:131:190/2
+TATCCACACCGAAACCATCTGCACTTCGCACCTAGGTATGTCCTTATTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:14:94:153/2
+ATGTCCTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:15:46:104/2
+ATGTTGGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:16:115:175/2
+CATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:17:44:103/2
+TGTTGGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:18:141:201/2
+TGGGTTTTCTATATCCACACCGAAACCATCTGCACTTCGCACCTAGGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:19:149:210/2
+TCTTAAGTCTGGGTTTTCTATATCCACACCGAAACCATCTGCACTTCGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:20:63:124/2
+TAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:21:60:122/2
+ATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:22:111:169/2
+CACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACGCGCTAATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:23:90:150/2
+TCCTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:24:56:114/2
+AATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:25:21:80/2
+ACTTCATGGGTCATCGCTTCGCGATCCATATGCCGCAACACTATGCCAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:26:59:118/2
+CGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCATGGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:27:19:77/2
+TCATGGGTCATCGCTTCGCGATCCATATGCCGCAACACTATGCCAAATAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:28:125:186/2
+CACACCGAAACCATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:29:31:89/2
+CGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCATATGCCGCAACAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:30:64:125/2
+CTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:31:150:211/2
+ATCTTAAGTCTGGGTTTTCTATATCCACACCGAAACCATCTGCACTTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:32:144:205/2
+AGTCTGGGTTTTCTATATCCACACCGAAACCATCTGCACTTCGCACCTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:33:59:119/2
+CCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCATGGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:34:40:99/2
+GGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:35:105:165/2
+TCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACGCGCTAATGCCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:36:114:174/2
+ATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACGCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:37:82:142/2
+CCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:38:75:137/2
+CTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:39:67:129/2
+CGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:40:34:93/2
+CGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCATATGCCGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:41:163:222/2
+CAGAGGATTTAATCTTAAGTCTGGGTTTTCTATATCCACACCGAAACCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:42:42:102/2
+GTTGGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:43:148:207/2
+TAAGTCTGGGTTTTCTATATCCACACCGAAACCATCTGCACTTCGCACCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at presence_absence1:44:117:177/2
+ACCATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:45:54:114/2
+TTCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGTGTTTTCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:46:8:66/2
+GGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCACTTGAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:47:74:134/2
+GTCGCGGTGCTATCTTCGCGTTCGGTCAGCGCGGTGCACGCGCGTTCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:48:6:65/2
+GTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCACTTGAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:49:131:189/2
+AAGTAGCAGCTGAGAAATGACGGTGCGGAACATCTGGTAACGCGGGCTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:50:179:238/2
+TGGTTCCAAGCCATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:51:60:121/2
+CTTCGCGTTCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:52:117:176/2
+GAAATGACGGTGCGGAACATCTGGTAACGCGGGCTTTATTCGGTCGCGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:53:161:222/2
+AATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:54:99:157/2
+TCTGGTAACGCGGGCTTTATTCGGTCGCGGTGCTATCTTCGCGTTCGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:55:168:227/2
+CATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:56:7:67/2
+CGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCACTTGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:57:9:67/2
+CGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCACTTGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:58:163:221/2
+ATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:59:31:93/2
+GCGTTCGCTGCTTTCGGTGGTGTTTTCGGTGCTAATCAGCATAGATCAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at variants_only1:60:57:117/2
+GCGTTCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGTGTTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:61:13:72/2
+GTTTTCGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:62:19:79/2
+CGGTGGTGTTTTCGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:63:153:215/2
+GATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAATGACGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:64:116:173/2
+ATGACGGTGCGGAACATCTGGTAACGCGGGCTTTATTCGGTCGCGGTGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:65:18:76/2
+TGGTGTTTTCGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:66:155:215/2
+GATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAATGACGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:67:84:143/2
+CTTTATTCGGTCGCGGTGCTATCTTCGCGTTCGGTCAGCGCGGTGCACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:68:62:120/2
+TTCGCGTTCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGTGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:69:165:223/2
+GAATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:70:1:62/2
+CTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCACTTGAGATTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:71:25:86/2
+CTGCTTTCGGTGGTGTTTTCGGTGCTAATCAGCATAGATCAGCACGAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:72:120:180/2
+CTGAGAAATGACGGTGCGGAACATCTGGTAACGCGGGCTTTATTCGGTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:73:164:222/2
+AATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:74:149:210/2
+TGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAATGACGGTGCGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:75:173:233/2
+CCAAGCCATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:76:179:238.dup.2/2
+TGGTTCCAAGCCATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:77:62:122/2
+TCTTCGCGTTCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:78:76:136/2
+CGGTCGCGGTGCTATCTTCGCGTTCGGTCAGCGCGGTGCACGCGCGTTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:79:12:72/2
+GTTTTCGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:80:107:167/2
+GTGCGGAACATCTGGTAACGCGGGCTTTATTCGGTCGCGGTGCTATCTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:81:55:113/2
+TCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGTGTTTTCGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:82:35:95/2
+GCGCGTTCGCTGCTTTCGGTGGTGTTTTCGGTGCTAATCAGCATAGATCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:83:168:228/2
+CCATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:84:180:240/2
+CTTGGTTCCAAGCCATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:85:97:157/2
+TCTGGTAACGCGGGCTTTATTCGGTCGCGGTGCTATCTTCGCGTTCGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:86:119:177/2
+AGAAATGACGGTGCGGAACATCTGGTAACGCGGGCTTTATTCGGTCGCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:87:157:216/2
+TGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAATGACGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:88:40:101/2
+GTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGAAACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:89:42:101/2
+GTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGAAACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:90:14:75/2
+AATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:91:47:106/2
+CAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:92:9:67/2
+TCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGGTGACCACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:93:56:116/2
+TTTTCGCTCGCAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:94:18:78/2
+GGTAATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:95:47:106.dup.2/2
+CAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:96:38:97/2
+TCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGAAACATGTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:97:144:203/2
+GTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTAATAAACTAATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:98:31:90/2
+GTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGAAACATGTAGAACGTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:99:147:206/2
+GCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTAATAAACTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:100:11:71/2
+TTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGGTGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:101:8:69/2
+TTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGGTGACCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:102:45:106/2
+CAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:103:83:142/2
+AATGCGCTTCATAATGATCGTTCAGTTTTTCGCTCGCAATGGTGGTCGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:104:124:183/2
+GGGGAGTTGGTACCATTTAATAAACTAATATGCTTGATTTTAATGCGCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:105:32:91/2
+TGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGAAACATGTAGAACGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:106:12:72/2
+TTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:107:156:214/2
+TATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:108:85:145/2
+TTTAATGCGCTTCATAATGATCGTTCAGTTTTTCGCTCGCAATGGTGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:109:84:145/2
+TTTAATGCGCTTCATAATGATCGTTCAGTTTTTCGCTCGCAATGGTGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:110:60:117/2
+TTTTTCGCTCGCAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:111:131:191/2
+TAATGGGGGGGGAGTTGGTACCATTTAATAAACTAATATGCTTGATTTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:112:23:84/2
+ATGGCAGGTAATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:113:125:186/2
+GGGGGGGAGTTGGTACCATTTAATAAACTAATATGCTTGATTTTAATGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:114:160:220/2
+CTTTTGTATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:115:148:207/2
+GGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTAATAAACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:116:126:186/2
+GGGGGGGAGTTGGTACCATTTAATAAACTAATATGCTTGATTTTAATGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:117:155:214/2
+TATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:118:79:138/2
+CGCTTCATAATGATCGTTCAGTTTTTCGCTCGCAATGGTGGTCGCCATGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:119:158:216/2
+TGTATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:120:139:199/2
+GTACATTTTAATGGGGGGGGAGTTGGTACCATTTAATAAACTAATATGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:121:55:114/2
+TTCGCTCGCAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:122:153:213/2
+ATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:123:20:79/2
+AGGTAATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:124:154:215/2
+GTATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:125:178:235/2
+AACTGGTAAAAAGCCCTTTTGTATCAAGGGCAGTGGGTACATTTTAATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:126:150:210/2
+AAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTAATAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:127:16:76/2
+TAATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:128:6:65/2
+GCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGGTGACCACGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:129:15:75/2
+AATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:130:116:175/2
+GGTACCATTTAATAAACTAATATGCTTGATTTTAATGCGCTTCATAATGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:131:204:263/2
+CAAACATTTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:132:64:123/2
+GGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:133:104:164/2
+CGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:134:131:188/2
+TGTTCTTTGCTATGTGGGTGATCTCGGATGGCGTTCCCTAAGCATCTCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:135:65:126/2
+GTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:136:152:213/2
+GATTAATTCGAATCCGAAGTCGGTCTGTTCTTTGCTATGTGGGTGATCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:137:73:133/2
+TTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:138:35:94/2
+TAGTGATGTATGGGAGTGGAGTACATGTCACCACCCGCGTACGAGGTTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:139:142:202/2
+ATCCGAAGTCGGTCTGTTCTTTGCTATGTGGGTGATCTCGGATGGCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:140:196:256/2
+TTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGAGATTAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:141:198:255/2
+TCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGAGATTAATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:142:27:86/2
+TATGGGAGTGGAGTACATGTCACCACCCGCGTACGAGGTTGGGCGACTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:143:153:212/2
+ATTAATTCGAATCCGAAGTCGGTCTGTTCTTTGCTATGTGGGTGATCTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:144:1:59/2
+CGCGTACGAGGTTGGGCGACTGAAGATAACCCTGATTCGAAAAGTGTAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:145:89:149/2
+AAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:146:178:237/2
+CAGACTCTCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCGGTCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:147:67:126/2
+GTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:148:17:77/2
+GGAGTACATGTCACCACCCGCGTACGAGGTTGGGCGACTGAAGATAACCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:149:64:124/2
+GGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:150:116:174/2
+TGGGTGATCTCGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:151:147:205/2
+CGAATCCGAAGTCGGTCTGTTCTTTGCTATGTGGGTGATCTCGGATGGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:152:95:155/2
+TTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:153:176:235/2
+GACTCTCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCGGTCTGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:154:202:262/2
+AAACATTTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:155:176:236/2
+AGACTCTCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCGGTCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:156:94:154/2
+TCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:157:71:128/2
+TCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:158:200:259/2
+CATTTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGAGATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:159:102:162/2
+GATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:160:142:199/2
+CGAAGTCGGTCTGTTCTTTGCTATGTGGGTGATCTCGGATGGCGTTCCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:161:195:254/2
+CCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGAGATTAATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:162:73:133.dup.2/2
+TTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:163:117:175/2
+GTGGGTGATCTCGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:164:49:109/2
+CCTTAGGGACAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:165:80:138/2
+CGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:166:177:235/2
+GACTCTCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCGGTCTGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:167:71:131/2
+CGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:168:56:116/2
+GGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGGAGTACATGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:169:171:230/2
+TCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCGGTCTGTTCTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:170:134:194/2
+TCGGTCTGTTCTTTGCTATGTGGGTGATCTCGGATGGCGTTCCCTAAGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:171:26:86/2
+TATGGGAGTGGAGTACATGTCACCACCCGCGTACGAGGTTGGGCGACTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:172:182:241/2
+GATCCAGACTCTCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:173:80:141/2
+CACCGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:174:208:267/2
+CTAGCAAACATTTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:175:30:91/2
+TGATGTATGGGAGTGGAGTACATGTCACCACCCGCGTACGAGGTTGGGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:176:206:266/2
+TAGCAAACATTTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:177:81:139/2
+CCGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:178:10:67/2
+TCACCACCCGCGTACGAGGTTGGGCGACTGAAGATAACCCTGATTCGAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:179:199:258/2
+ATAGCGCCATGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:180:35:94/2
+CTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGATCTATGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:181:186:244/2
+AGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTGGTCCATAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:182:43:100/2
+GAAGAACTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:183:75:135/2
+CGCTAAACTTGGGATTTGTATCGCAGCGCGCTTGGGAAGAACTTAAGATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:184:57:117/2
+TATCGCAGCGCGCTTGGGAAGAACTTAAGATTTGCAAGCCGCGATACAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:185:122:181/2
+CAAAACGAACGGACGCATACATTCTTAGAACCCGGCGTGAACTGTCCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:186:186:245/2
+GAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTGGTCCATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:187:99:158/2
+CTTAGAACCCGGCGTGAACTGTCCGCTAAACTTGGGATTTGTATCGCAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:188:57:115/2
+TCGCAGCGCGCTTGGGAAGAACTTAAGATTTGCAAGCCGCGATACAGTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:189:136:196/2
+ACAGAGACAATTGCACAAAACGAACGGACGCATACATTCTTAGAACCCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:190:23:81/2
+AGCCGCGATACAGTCGTGAACAGTTAAAGATCTATGAGAGTCAACGCTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:191:97:156/2
+TAGAACCCGGCGTGAACTGTCCGCTAAACTTGGGATTTGTATCGCAGCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:192:58:118/2
+GTATCGCAGCGCGCTTGGGAAGAACTTAAGATTTGCAAGCCGCGATACAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:193:156:214/2
+AGTAGACAATTGGTCCATACAGAGACAATTGCACAAAACGAACGGACGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:194:157:215/2
+CAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAACGAACGGACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:195:11:71/2
+CAGTCGTGAACAGTTAAAGATCTATGAGAGTCAACGCTCCGGAACAGGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:196:164:226/2
+TTCTACCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:197:60:119/2
+TGTATCGCAGCGCGCTTGGGAAGAACTTAAGATTTGCAAGCCGCGATACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:198:152:212/2
+TAGACAATTGGTCCATACAGAGACAATTGCACAAAACGAACGGACGCATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:199:200:260/2
+CGATAGCGCCATGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:200:167:225/2
+TCTACCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:201:24:86/2
+TTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGATCTATGAGAGTCAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:202:144:203/2
+GGTCCATACAGAGACAATTGCACAAAACGAACGGACGCATACATTCTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:203:41:101/2
+GGAAGAACTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:204:189:249/2
+TGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:205:29:89/2
+GATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGATCTATGAGAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:206:194:253/2
+GCCATGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:207:140:199/2
+CATACAGAGACAATTGCACAAAACGAACGGACGCATACATTCTTAGAACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:208:168:227/2
+TTTCTACCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:209:89:148/2
+GGCGTGAACTGTCCGCTAAACTTGGGATTTGTATCGCAGCGCGCTTGGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:210:193:252/2
+CCATGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:211:166:224/2
+CTACCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:212:150:209/2
+ACAATTGGTCCATACAGAGACAATTGCACAAAACGAACGGACGCATACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:213:197:256/2
+AGCGCCATGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:214:36:97/2
+GAACTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGATCTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:215:47:107/2
+CGCTTGGGAAGAACTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:216:165:224/2
+CTACCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:217:150:208/2
+CAATTGGTCCATACAGAGACAATTGCACAAAACGAACGGACGCATACATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:218:26:87/2
+TTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGATCTATGAGAGTCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:219:43:102/2
+GGGAAGAACTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:220:161:221/2
+CCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAACGAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:221:129:189/2
+CAATTGCACAAAACGAACGGACGCATACATTCTTAGAACCCGGCGTGAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:222:185:244/2
+AGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTGGTCCATAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:223:8:68/2
+TCGTGAACAGTTAAAGATCTATGAGAGTCAACGCTCCGGAACAGGGCAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:224:183:242/2
+TAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTGGTCCATACAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:225:100:159/2
+TCTTAGAACCCGGCGTGAACTGTCCGCTAAACTTGGGATTTGTATCGCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:226:100:159.dup.2/2
+TCTTAGAACCCGGCGTGAACTGTCCGCTAAACTTGGGATTTGTATCGCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding3/2
+CATCAGATCAGATGATCGATCGATGATATGATGAGTATGATGATGATGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/test_run_data/ref_fasta_to_make_reads_from.fa b/ariba/test_run_data/ref_fasta_to_make_reads_from.fa
new file mode 100644
index 0000000..7ec1baa
--- /dev/null
+++ b/ariba/test_run_data/ref_fasta_to_make_reads_from.fa
@@ -0,0 +1,36 @@
+>presence_absence1
+TTAAGCTGCCTAACCCTATAACGTTCAGCACTCTAAACCGCGCCTAAACAGGTACACTTC
+TTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATGAC
+CCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTG
+GGAAAGCATGGAATAAGGACATACCTAGGTGCGAAGTGCAGATGGTTTCGGTGTGGATAT
+AGAAAACCCAGACTTAAGATTAAATCCTCTGAAAGCTTCACTGACGTCATGACTCA
+>variants_only1
+TAAGTAAATCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTA
+GGGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGCTGATTAGCACCGAAAA
+CACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACCGAACGCGAAGATAGCACCGC
+GACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACCGTCATTTCTCAGCTGCTACTTTC
+ACTTTAGCTGAAGCGTCCAGTATCAGCCATTCGATGGCTTGGAACCAAGT
+>variants_only2
+ATGCGCAGGCCCGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCC
+TTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTTCTACATGTTTCGCGAAGCGAAAAA
+AATTACCTGCCATGAAAACATGGCGACCACCATTGCGAGCGAAAAACTGAACGATCATTA
+TGAAGCGCATTAAAATCAAGCATATTAGTTTATTAAATGGTACCAACTCCCCCCCCATTA
+AAATGTACCCACTGCCCTTGATACAAAAGGGCTTTTTACCAGTTCTACGACCG
+>noncoding1
+ATGTGGTGTGTCCGTCTGACTTAGCTGGGTACTTTAGGGAGAATCCTCTCAGTGTTACCT
+TACACTTTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATG
+TACTCCACTCCCATACATCACTAAGTTTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACG
+AACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACATAGCAAAGAACAGAC
+CGACTTCGGATTCGAATTAATCTCGCATGAGGCAGGCGAGAGTCTGGATCTGTATTCGAC
+GGGAAATGTTTGCTAGGTCT
+>noncoding2
+CGACTACACAAGGTACCGGGCCCACGGCACATTTGGACATCGAATACACTAGCGGTATTT
+ATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCATAGATCTTTAACTGTTCACGACTG
+TATCGCGGCTTGCAAATCTTAAGTTCTTCCCAAGCGCGCTGCGATACAAATCCCAAGTTT
+AGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTC
+TCTGTATGGACCAATTGTCTACTGGACCGGTAGAAACATAAACGCTCTCTACTCTTCATG
+GCGCTATCGGGGAGGGGGCG
+>noncoding3
+CTAACTTACTACTATGACTGACTGACTGACTGACTGATCGACTGCTGACATCTGATCGAT
+CATCCTGTCGACATCATATCTCGATCGATCGATCGACTGACTGACTGACTGACTGAATCT
+CACGTACTGACTCATCATCATCATACTCATCATATCATCGATCGATCATCTGATCTGATG
diff --git a/ariba/test_run_data/variants_only.fa b/ariba/test_run_data/variants_only.fa
new file mode 100644
index 0000000..2a3b01c
--- /dev/null
+++ b/ariba/test_run_data/variants_only.fa
@@ -0,0 +1,6 @@
+>variants_only1
+ATGCTGATTAGCACCGAAAACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACC
+GAACGCGAAGATAGCACCGCGACCGAATAA
+>variants_only2
+ATGTTTCGCGAAGCGAAAAAAATTACCTGCCATGAAAACATGGCGACCACCATTGCGAGC
+GAAAAACTGAACGATCATTATGAAGCGCATTAA
diff --git a/ariba/tests/aln_to_metadata_test.py b/ariba/tests/aln_to_metadata_test.py
new file mode 100644
index 0000000..7f55dd3
--- /dev/null
+++ b/ariba/tests/aln_to_metadata_test.py
@@ -0,0 +1,411 @@
+import unittest
+import os
+import copy
+import shutil
+import filecmp
+import pyfastaq
+from ariba import aln_to_metadata, sequence_variant
+
+modules_dir = os.path.dirname(os.path.abspath(aln_to_metadata.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestAlnToMetadata(unittest.TestCase):
+    def test_load_aln_file(self):
+        '''test _load_aln_file'''
+        aln_file = os.path.join(data_dir, 'aln_to_metadata_load_aln_file.in.fa')
+        expected = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ABC-DE'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ABCQDE'),
+        }
+        got = aln_to_metadata.AlnToMetadata._load_aln_file(aln_file)
+        self.assertEqual(expected, got)
+
+
+    def test_load_vars_file_good_file(self):
+        '''test _load_vars_file good input file'''
+        infile = os.path.join(data_dir, 'aln_to_metadata_load_vars_file_good.tsv')
+        variant1 = sequence_variant.Variant('p', 'A42B', 'id1')
+        variant2 = sequence_variant.Variant('p', 'C43D', 'id2')
+        variant3 = sequence_variant.Variant('p', 'E100F', 'id3')
+        expected = {
+            'seq1': [(variant1, 'description 1')],
+            'seq2': [(variant2, 'description 2'), (variant3, 'description 3')]
+        }
+        got = aln_to_metadata.AlnToMetadata._load_vars_file(infile, True)
+        self.assertEqual(expected, got)
+
+
+    def test_load_vars_bad_files(self):
+        '''test _load_vars_file bad input files'''
+        infiles = [
+            os.path.join(data_dir, 'aln_to_metadata_load_vars_file_bad.1.tsv'),
+            os.path.join(data_dir, 'aln_to_metadata_load_vars_file_bad.2.tsv')
+        ]
+
+        for infile in infiles:
+            with self.assertRaises(aln_to_metadata.Error):
+                aln_to_metadata.AlnToMetadata._load_vars_file(infile, True)
+
+
+    def test_make_unpadded_seqs(self):
+        '''test _make_unpadded_seqs'''
+        padded = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'acg---t'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', '---a-cgt-'),
+        }
+        expected = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'acgt'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'acgt'),
+        }
+        got = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded)
+        self.assertEqual(expected, got)
+
+
+    def test_check_seq_lengths_same(self):
+        '''test _check_seq_lengths_same'''
+        seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'acgt'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'acgt'),
+        }
+
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_seq_lengths_same(seqs))
+        seqs['seq1'].seq = 'a'
+        with self.assertRaises(aln_to_metadata.Error):
+            aln_to_metadata.AlnToMetadata._check_seq_lengths_same(seqs)
+
+
+    def test_insertion_coords(self):
+        '''test _insertion_coords'''
+        ivl = pyfastaq.intervals.Interval
+        tests = [
+            ('acgt', []),
+            ('-a', [ivl(0, 0)]),
+            ('a---cgt--', [ivl(1, 3), ivl(7, 8)]),
+        ]
+
+        for seq, expected in tests:
+            fa = pyfastaq.sequences.Fasta('x', seq)
+            got = aln_to_metadata.AlnToMetadata._insertion_coords(fa)
+            self.assertEqual(expected, got)
+
+
+    def test_make_unpadded_insertion_coords(self):
+        '''test _make_unpadded_insertion_coords'''
+        seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'acgt'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ac-gt'),
+            'seq3': pyfastaq.sequences.Fasta('seq3', '--acg-t'),
+        }
+
+        expected = {
+            'seq1': [],
+            'seq2': [pyfastaq.intervals.Interval(2, 2)],
+            'seq3': [pyfastaq.intervals.Interval(0, 1), pyfastaq.intervals.Interval(5, 5)],
+        }
+        got = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(seqs)
+        self.assertEqual(expected, got)
+
+
+    def test_check_insertion_coords(self):
+        '''test _check_insertion_coords'''
+        seq = pyfastaq.sequences.Fasta('name', 'AAA---GGG------TTT---')
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_insertion_coords(seq))
+
+        bad_seqs = [
+            pyfastaq.sequences.Fasta('name', 'AAA--GGG'),  # bad length
+            pyfastaq.sequences.Fasta('name', 'A---AA'),  # bad start position
+            pyfastaq.sequences.Fasta('name', 'AA---AA'), # bad start position
+        ]
+
+        for seq in bad_seqs:
+            with self.assertRaises(aln_to_metadata.Error):
+                aln_to_metadata.AlnToMetadata._check_insertion_coords(seq)
+
+
+    def test_check_coding_seq(self):
+        '''test _check_coding_seq'''
+        seq = pyfastaq.sequences.Fasta('name', 'ATGCTTTAG')
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_coding_seq(seq))
+
+        bad_seqs = [
+            pyfastaq.sequences.Fasta('name', 'TTGCTTAG'), # length not a multiple of 3
+            pyfastaq.sequences.Fasta('name', 'TTTCTTTAG'), # no start codon
+            pyfastaq.sequences.Fasta('name', 'ATGTAGCTTTAG'), # stop codon in middle
+            pyfastaq.sequences.Fasta('name', 'TTGCTTTTT'), # no stop at end
+        ]
+
+        for seq in bad_seqs:
+            with self.assertRaises(aln_to_metadata.Error):
+                aln_to_metadata.AlnToMetadata._check_coding_seq(seq)
+
+
+    def test_check_sequences_non_coding(self):
+        '''test _check_sequences with noncoding seqs'''
+        padded_sequences = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'AC-T')
+        }
+
+        unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences)
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, False))
+        padded_sequences['seq2'] = pyfastaq.sequences.Fasta('seq2', 'AC-')
+        unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences)
+        with self.assertRaises(aln_to_metadata.Error):
+            aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, False)
+
+
+    def test_check_sequences_coding(self):
+        '''test _check_sequences with coding seqs'''
+        padded_sequences = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ATGCTTTAG'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ATG---TAG')
+        }
+
+        unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences)
+
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, True))
+
+        bad_seqs = [
+            'ATGCTTAG', # length not a multiple of 3
+            'TTTCTTTAG', # no start codon
+            'ATGTAGCTTTAG', # stop codon in middle
+            'ATGTTTTTT', # no stop at end
+            'ATGC---TTTAG', # bad insertion
+            'ATGCT---TTAG', # bad insertion
+            'ATG-CTTTAG', # bad insertion
+            'ATG--CTTTAG', # bad insertion
+            'ATG----CTTTAG', # bad insertion
+        ]
+
+        for seq in bad_seqs:
+            padded_sequences['seq2'] = pyfastaq.sequences.Fasta('seq2', seq)
+            unpadded_sequences = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_sequences)
+            with self.assertRaises(aln_to_metadata.Error):
+                aln_to_metadata.AlnToMetadata._check_sequences(padded_sequences, unpadded_sequences, True)
+
+
+    def test_check_variants_match_sequences(self):
+        '''test _check_variants_match_sequences'''
+        seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ATGCTTTAG'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ATGCTTCTTTAG'),
+            'seq3': pyfastaq.sequences.Fasta('seq3', 'ATG---TAG')
+        }
+
+        variants = {'seq1': [(sequence_variant.Variant('p', 'L2M', 'id1'), 'description1')]}
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True))
+        variants = {'seq1': [(sequence_variant.Variant('p', 'M2L', 'id1'), 'description1')]}
+        self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True))
+
+        variants = {'seq1': [(sequence_variant.Variant('p', 'A2M', 'id1'), 'description1')]}
+        with self.assertRaises(aln_to_metadata.Error):
+            self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True))
+
+        variants = {'seq4': [(sequence_variant.Variant('p', 'A2M', 'id1'), 'description1')]}
+        with self.assertRaises(aln_to_metadata.Error):
+            self.assertTrue(aln_to_metadata.AlnToMetadata._check_variants_match_sequences(seqs, variants, True))
+
+
+    def test_variant_ids_are_unique(self):
+        '''test _variant_ids_are_unique'''
+        variants = {
+            'seq1': [(sequence_variant.Variant('p', 'L2M', 'id1'), 'description1')],
+            'seq2': [(sequence_variant.Variant('p', 'L2M', 'id2'), 'description2')]
+        }
+
+        self.assertTrue(aln_to_metadata.AlnToMetadata._variant_ids_are_unique(variants))
+        variants['seq2'].append((sequence_variant.Variant('p', 'I3K', 'id1'), 'description3'))
+        with self.assertRaises(aln_to_metadata.Error):
+            self.assertTrue(aln_to_metadata.AlnToMetadata._variant_ids_are_unique(variants))
+
+
+    def test_unpadded_to_padded_nt_position(self):
+        '''test _unpadded_to_padded_nt_position'''
+        ivl = pyfastaq.intervals.Interval
+
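+        # Each tuple is (unpadded position, insertion intervals in padded
+        # coordinates, expected padded position): a position is shifted right
+        # by the total length of the insertions lying at or before it.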
+        tests = [
+            (0, [], 0),
+            (1, [], 1),
+            (2, [], 2),
+            (0, [ivl(3, 5)], 0),
+            (1, [ivl(3, 5)], 1),
+            (2, [ivl(3, 5)], 2),
+            (3, [ivl(3, 5)], 6),
+            (4, [ivl(3, 5)], 7),
+            (5, [ivl(3, 5)], 8),
+            (0, [ivl(3, 5), ivl(9,14)], 0),
+            (1, [ivl(3, 5), ivl(9,14)], 1),
+            (2, [ivl(3, 5), ivl(9,14)], 2),
+            (3, [ivl(3, 5), ivl(9,14)], 6),
+            (4, [ivl(3, 5), ivl(9,14)], 7),
+            (5, [ivl(3, 5), ivl(9,14)], 8),
+            (6, [ivl(3, 5), ivl(9,14)], 15),
+            (7, [ivl(3, 5), ivl(9,14)], 16),
+            (8, [ivl(3, 5), ivl(9,14)], 17),
+        ]
+
+        for position, insertions, expected in tests:
+            got = aln_to_metadata.AlnToMetadata._unpadded_to_padded_nt_position(position, insertions)
+            self.assertEqual(expected, got)
+
+
+    def test_padded_to_unpadded_nt_position(self):
+        '''test _padded_to_unpadded_nt_position'''
+        ivl = pyfastaq.intervals.Interval
+
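+        # Each tuple is (padded position, insertion intervals, expected
+        # unpadded position); padded positions that fall inside an insertion
+        # have no unpadded equivalent, hence the expected value None.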
+        tests = [
+            (0, [], 0),
+            (1, [], 1),
+            (2, [], 2),
+            (0, [ivl(3, 5)], 0),
+            (1, [ivl(3, 5)], 1),
+            (2, [ivl(3, 5)], 2),
+            (3, [ivl(3, 5)], None),
+            (4, [ivl(3, 5)], None),
+            (5, [ivl(3, 5)], None),
+            (6, [ivl(3, 5)], 3),
+            (7, [ivl(3, 5)], 4),
+            (8, [ivl(3, 5)], 5),
+            (0, [ivl(3, 5), ivl(7,10)], 0),
+            (1, [ivl(3, 5), ivl(7,10)], 1),
+            (2, [ivl(3, 5), ivl(7,10)], 2),
+            (3, [ivl(3, 5), ivl(7,10)], None),
+            (4, [ivl(3, 5), ivl(7,10)], None),
+            (5, [ivl(3, 5), ivl(7,10)], None),
+            (6, [ivl(3, 5), ivl(7,10)], 3),
+            (7, [ivl(3, 5), ivl(7,10)], None),
+            (8, [ivl(3, 5), ivl(7,10)], None),
+            (9, [ivl(3, 5), ivl(7,10)], None),
+            (10, [ivl(3, 5), ivl(7,10)], None),
+            (11, [ivl(3, 5), ivl(7,10)], 4),
+            (12, [ivl(3, 5), ivl(7,10)], 5),
+        ]
+
+        for position, insertions, expected in tests:
+            got = aln_to_metadata.AlnToMetadata._padded_to_unpadded_nt_position(position, insertions)
+            self.assertEqual(expected, got)
+
+
+    def test_variants_to_tsv_lines_coding(self):
+        '''test _variants_to_tsv_lines coding sequences'''
+        padded_seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ATG---GCTAATTAG'), # M-AN*
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ATG---GCTAATTAG'), # M-AN*
+            'seq3': pyfastaq.sequences.Fasta('seq3', 'ATGTTT---AATTAG'), # MF-N*
+            'seq4': pyfastaq.sequences.Fasta('seq4', 'ATGTTTTGTAATTAG'), # MFCN*
+            'seq5': pyfastaq.sequences.Fasta('seq5', 'ATGTTTGATAATTAG'), # MFDN*
+        }
+
+        unpadded_seqs = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_seqs)
+        insertions = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(padded_seqs)
+
+        variant1 = sequence_variant.Variant('p', 'A2D', 'id1')
+        variant2 = sequence_variant.Variant('p', 'F2E', 'id2')
+        variants = {
+            'seq1': [(variant1, 'description 1')],
+            'seq5': [(variant2, 'description 2')],
+        }
+
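+        # A variant defined on one sequence should be transferred via the
+        # alignment to every sequence without a gap at that column, with the
+        # position and wild-type residue adjusted for each target sequence.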
+        expected = [
+            'seq1\tp\tA2D\tid1\tdescription 1',
+            'seq2\tp\tA2D\tid1\tdescription 1',
+            'seq4\tp\tC3D\tid1\tdescription 1',
+            'seq5\tp\tA3D\tid1\tdescription 1',
+            'seq5\tp\tF2E\tid2\tdescription 2',
+            'seq3\tp\tF2E\tid2\tdescription 2',
+            'seq4\tp\tF2E\tid2\tdescription 2',
+        ]
+
+        got = aln_to_metadata.AlnToMetadata._variants_to_tsv_lines(variants, unpadded_seqs, padded_seqs, insertions, True)
+        self.assertEqual(expected, got)
+
+
+    def test_variants_to_tsv_lines_noncoding(self):
+        '''test _variants_to_tsv_lines noncoding sequences'''
+        padded_seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ATG---GCTAATTAG'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'ATG---GCTAATTAG'),
+            'seq3': pyfastaq.sequences.Fasta('seq3', 'ATGTAT---AATTAG'),
+            'seq4': pyfastaq.sequences.Fasta('seq4', 'ATGTGTTGTAATTAG'),
+            'seq5': pyfastaq.sequences.Fasta('seq5', 'ATGTTTGATAATTAG'),
+        }
+
+        unpadded_seqs = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_seqs)
+        unpadded_aa_seqs = {x: unpadded_seqs[x].translate() for x in unpadded_seqs}
+        insertions = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(padded_seqs)
+
+        variant1 = sequence_variant.Variant('n', 'C5T', 'id1')
+        variant2 = sequence_variant.Variant('n', 'A5T', 'id2')
+        variants = {
+            'seq1': [(variant1, 'description 1')],
+            'seq5': [(variant2, 'description 2')],
+        }
+
+        expected = [
+            'seq1\tn\tC5T\tid1\tdescription 1',
+            'seq2\tn\tC5T\tid1\tdescription 1',
+            'seq4\tn\tG8T\tid1\tdescription 1',
+            'seq5\tn\tA8T\tid1\tdescription 1',
+            'seq5\tn\tA5T\tid2\tdescription 2',
+            'seq3\tn\tA5T\tid2\tdescription 2',
+            'seq4\tn\tG5T\tid2\tdescription 2',
+        ]
+
+        got = aln_to_metadata.AlnToMetadata._variants_to_tsv_lines(variants, unpadded_seqs, padded_seqs, insertions, False)
+        self.assertEqual(expected, got)
+
+
+    def test_make_cluster_file(self):
+        '''test _make_cluster_file'''
+        seqs = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'a'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'c'),
+            'seq3': pyfastaq.sequences.Fasta('seq3', 'g'),
+        }
+        tmpfile = 'tmp.aln_to_meta_test_make_cluster_file.out'
+        expected_file = os.path.join(data_dir, 'aln_to_metadata_make_cluster_file.out')
+
+        with self.assertRaises(aln_to_metadata.Error):
+            aln_to_metadata.AlnToMetadata._make_cluster_file('not_found', seqs, tmpfile)
+
+        aln_to_metadata.AlnToMetadata._make_cluster_file('seq2', seqs, tmpfile)
+        self.assertTrue(filecmp.cmp(expected_file, tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
+    def test_run_coding(self):
+        '''test run coding sequences'''
+        fa_in = os.path.join(data_dir, 'aln_to_metadata_run_coding.in.fa')
+        fa_expected = os.path.join(data_dir, 'aln_to_metadata_run_coding.out.fa')
+        tsv_in = os.path.join(data_dir, 'aln_to_metadata_run_coding.in.tsv')
+        tsv_expected = os.path.join(data_dir, 'aln_to_metadata_run_coding.out.tsv')
+        cluster_expected = os.path.join(data_dir, 'aln_to_metadata_run_coding.out.cluster')
+        a_to_m = aln_to_metadata.AlnToMetadata(fa_in, tsv_in, True, 'seq3')
+        outprefix = 'tmp.test.aln_to_metadata.run_coding'
+        a_to_m.run(outprefix)
+        self.assertTrue(filecmp.cmp(tsv_expected, outprefix + '.tsv', shallow=False))
+        self.assertTrue(filecmp.cmp(fa_expected, outprefix + '.fa', shallow=False))
+        self.assertTrue(filecmp.cmp(cluster_expected, outprefix + '.cluster', shallow=False))
+        os.unlink(outprefix + '.tsv')
+        os.unlink(outprefix + '.fa')
+        os.unlink(outprefix + '.cluster')
+
+
+    def test_run_noncoding(self):
+        '''test run noncoding sequences'''
+        fa_in = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.in.fa')
+        fa_expected = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.out.fa')
+        tsv_in = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.in.tsv')
+        tsv_expected = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.out.tsv')
+        cluster_expected = os.path.join(data_dir, 'aln_to_metadata_run_noncoding.out.cluster')
+        a_to_m = aln_to_metadata.AlnToMetadata(fa_in, tsv_in, False, 'seq2')
+        outprefix = 'tmp.test.aln_to_metadata.run_noncoding'
+        a_to_m.run(outprefix)
+        self.assertTrue(filecmp.cmp(tsv_expected, outprefix + '.tsv', shallow=False))
+        self.assertTrue(filecmp.cmp(fa_expected, outprefix + '.fa', shallow=False))
+        self.assertTrue(filecmp.cmp(cluster_expected, outprefix + '.cluster', shallow=False))
+        os.unlink(outprefix + '.tsv')
+        os.unlink(outprefix + '.fa')
+        os.unlink(outprefix + '.cluster')
+
diff --git a/ariba/tests/assembly_compare_test.py b/ariba/tests/assembly_compare_test.py
new file mode 100644
index 0000000..df4001a
--- /dev/null
+++ b/ariba/tests/assembly_compare_test.py
@@ -0,0 +1,381 @@
+import unittest
+import os
+import copy
+import shutil
+import filecmp
+import pyfastaq
+import pysam
+import pymummer
+from ariba import assembly_compare
+
+modules_dir = os.path.dirname(os.path.abspath(assembly_compare.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestAssemblyCompare(unittest.TestCase):
+    def test_parse_nucmer_coords_file(self):
+        '''test _parse_nucmer_coords_file'''
+        coords_file = os.path.join(data_dir, 'assembly_compare_parse_nucmer_coords_file.coords')
+        ref_name = 'ref'
+        got = assembly_compare.AssemblyCompare._parse_nucmer_coords_file(coords_file, ref_name)
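+        # Each tab-separated line parsed by pymummer holds: ref start, ref end,
+        # qry start, qry end, ref hit length, qry hit length, percent identity,
+        # ref length, qry length, ref frame, qry frame, ref name, qry name.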
+        line1 = ['1', '1000', '1', '1000', '1000', '1000', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1']
+        line2 = ['1', '240', '1', '240', '240', '240', '100.00', '1000', '580', '1', '1', 'ref', 'contig2']
+        line3 = ['661', '1000', '241', '580', '340', '340', '100.00', '1000', '580', '1', '1', 'ref', 'contig2']
+        expected = {
+            'contig1': [pymummer.alignment.Alignment('\t'.join(line1))],
+            'contig2': [pymummer.alignment.Alignment('\t'.join(line2)), pymummer.alignment.Alignment('\t'.join(line3))],
+        }
+        self.assertEqual(expected, got)
+
+
+    def test_nucmer_hits_to_percent_identity(self):
+        '''test _nucmer_hits_to_percent_identity'''
+        hits = [
+            ['1', '10', '1', '10', '10', '10', '90.00', '1000', '1000', '1', '1', 'ref', 'scaff1'],
+            ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'ref', 'scaff1'],
+            ['1', '42', '1', '42', '42', '42', '42.42', '1000', '1000', '1', '1', 'ref', 'scaff2'],
+        ]
+        nucmer_hits = {
+            'scaff1': [
+                pymummer.alignment.Alignment('\t'.join(hits[0])),
+                pymummer.alignment.Alignment('\t'.join(hits[1])),
+            ],
+            'scaff2': [
+                pymummer.alignment.Alignment('\t'.join(hits[2])),
+            ]
+        }
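+        # Expected per-contig identity is the mean of the hit identities,
+        # weighted by hit length (see the formula for scaff1 below).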
+        expected = {'scaff1': round((90*10 + 100*34) / (10+34), 2), 'scaff2': 42.42}
+        got = assembly_compare.AssemblyCompare._nucmer_hits_to_percent_identity(nucmer_hits)
+        self.assertEqual(expected, got)
+
+
+    def test_nucmer_hits_to_assembly_coords(self):
+        '''test _nucmer_hits_to_assembly_coords'''
+        hits = [
+            ['1', '10', '1', '10', '10', '10', '100.00', '1000', '1000', '1', '1', 'ref', 'scaff1'],
+            ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'ref', 'scaff1'],
+            ['50', '52', '50', '52', '3', '3', '100.00', '1000', '1000', '1', '1', 'ref', 'scaff1'],
+            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'ref', 'scaff2'],
+        ]
+        nucmer_hits = {
+            'scaff1': [
+                pymummer.alignment.Alignment('\t'.join(hits[0])),
+                pymummer.alignment.Alignment('\t'.join(hits[1])),
+                pymummer.alignment.Alignment('\t'.join(hits[2])),
+            ],
+            'scaff2': [
+                pymummer.alignment.Alignment('\t'.join(hits[3])),
+            ]
+        }
+        expected = {
+            'scaff1': [
+                pyfastaq.intervals.Interval(0, 41),
+                pyfastaq.intervals.Interval(49, 51)
+            ],
+            'scaff2': [
+                pyfastaq.intervals.Interval(0, 41),
+            ]
+        }
+        got = assembly_compare.AssemblyCompare._nucmer_hits_to_assembly_coords(nucmer_hits)
+        self.assertEqual(expected, got)
+
+
+    def test_nucmer_hits_to_ref_coords(self):
+        '''test nucmer_hits_to_ref_coords'''
+        hits = [
+            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['31', '52', '1', '22', '22', '22', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'ref', 'contig2'],
+        ]
+        nucmer_hits = {
+            'contig1': [
+                pymummer.alignment.Alignment('\t'.join(hits[0])),
+                pymummer.alignment.Alignment('\t'.join(hits[1])),
+                pymummer.alignment.Alignment('\t'.join(hits[2])),
+            ],
+            'contig2': [
+                pymummer.alignment.Alignment('\t'.join(hits[3])),
+            ]
+        }
+        got = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(nucmer_hits)
+        expected = {
+            'contig1': [pyfastaq.intervals.Interval(0,51), pyfastaq.intervals.Interval(99, 141)],
+            'contig2': [pyfastaq.intervals.Interval(99, 109)]
+        }
+
+        self.assertEqual(expected, got)
+
+        got = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(nucmer_hits, contig='contig2')
+        del expected['contig1']
+        self.assertEqual(expected, got)
+
+
+    def test_ref_cov_per_contig(self):
+        '''test ref_cov_per_contig'''
+        hits = [
+            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'ref', 'contig2'],
+        ]
+        nucmer_hits = {
+            'contig1': [
+                pymummer.alignment.Alignment('\t'.join(hits[0])),
+                pymummer.alignment.Alignment('\t'.join(hits[1])),
+            ],
+            'contig2': [
+                pymummer.alignment.Alignment('\t'.join(hits[2])),
+            ]
+        }
+
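+        # contig1 covers reference bases 1-42 and 100-142 (42 + 43 = 85);
+        # contig2 covers bases 100-110 (11).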
+        expected = {'contig1': 85, 'contig2': 11}
+        got = assembly_compare.AssemblyCompare.ref_cov_per_contig(nucmer_hits)
+        self.assertEqual(expected, got)
+
+
+    def test_get_assembled_reference_sequences(self):
+        '''test _get_assembled_reference_sequences'''
+        ref_sequence = pyfastaq.sequences.Fasta('ref_seq', 'ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAG')
+        assembly = {
+            'contig1': pyfastaq.sequences.Fasta('contig1', 'CATCTATGCTGCATCGATCACTGACGTATCATCATCAGCGTACTGACGTATTAGTTTGTAATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAGACGTCGTACTACTGTATATGCATCGATCTGAA'),
+            'contig2': pyfastaq.sequences.Fasta('contig2', 'AGTGATATCCTGCGATCTATAATTTTTTTCGCGGGATCTTGAACGCGACGATGTTCGATAATTCAATGCAAAGGAGCGACCCGCAAGTACACAGGACTGCAAA')
+        }
+
+        hits = [
+            ['1', '147', '61', '207', '147', '147', '100.00', '147', '239', '1', '1', 'ref_seq', 'contig1'],
+            ['18', '120', '103', '1', '103', '103', '100.00', '147', '103', '1', '-1', 'ref_seq', 'contig2']
+        ]
+        nucmer_hits = {
+            'contig1': [
+                pymummer.alignment.Alignment('\t'.join(hits[0])),
+            ],
+            'contig2': [
+                pymummer.alignment.Alignment('\t'.join(hits[1])),
+            ]
+        }
+
+        expected = {'ref_seq.1.147.contig1.61.207.+.complete': pyfastaq.sequences.Fasta('ref_seq.1.147.contig1.61.207.+.complete', 'ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAG'),
+            'ref_seq.18.120.contig2.1.103.-': pyfastaq.sequences.Fasta('ref_seq.18.120.contig2.1.103.-', 'TTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT')
+        }
+
+        got = assembly_compare.AssemblyCompare._get_assembled_reference_sequences(nucmer_hits, ref_sequence, assembly)
+        self.assertEqual(expected, got)
+
+
+    def test_whole_gene_covered_by_nucmer_hits(self):
+        '''test _whole_gene_covered_by_nucmer_hits'''
+        ref_seq = pyfastaq.sequences.Fasta('ref', 'ACGTGTGCAT')
+        hit1 = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'ref', 'contig1']
+        hit2 = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'ref', 'contig2']
+        hit3 = ['6', '10', '6', '10', '5', '5', '100.00', '10', '10', '1', '1', 'ref', 'contig2']
+        nucmer_hits = [
+            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]},
+            {'contig2': [pymummer.alignment.Alignment('\t'.join(hit2))]},
+            {'contig2': [pymummer.alignment.Alignment('\t'.join(hit2)), pymummer.alignment.Alignment('\t'.join(hit3))]}
+        ]
+        expected = [True, False, True]
+        for i in range(len(nucmer_hits)):
+            got = assembly_compare.AssemblyCompare._whole_gene_covered_by_nucmer_hits(nucmer_hits[i], ref_seq, 0.95, 0)
+            self.assertEqual(expected[i], got)
+
+
+    def test_ref_has_region_assembled_twice(self):
+        '''test _ref_has_region_assembled_twice'''
+        ref_seq = pyfastaq.sequences.Fasta('gene', 'ACGTGTGCAT')
+        hit1 = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'gene', 'contig1']
+        hit2 = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2']
+        nucmer_hits = { 'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))] }
+        self.assertFalse(assembly_compare.AssemblyCompare._ref_has_region_assembled_twice(nucmer_hits, ref_seq, 0.03))
+        nucmer_hits['contig2'] = [pymummer.alignment.Alignment('\t'.join(hit2))]
+        self.assertTrue(assembly_compare.AssemblyCompare._ref_has_region_assembled_twice(nucmer_hits, ref_seq, 0.03))
+
+
+    def test_longest_nucmer_hit_in_ref(self):
+        '''test _longest_nucmer_hit_in_ref'''
+        hits = [
+            ['1', '39', '1', '39', '39', '39', '100.00', '39', '39', '1', '1', 'gene', 'contig1'],
+            ['1', '20', '1', '20', '20', '20', '100.00', '39', '39', '1', '1', 'gene', 'contig1'],
+            ['21', '39', '21', '39', '19', '19', '100.00', '39', '39', '1', '1', 'gene', 'contig2'],
+        ]
+        alignments = [pymummer.alignment.Alignment('\t'.join(x)) for x in hits]
+        nucmer_hits = {
+            'contig1': [alignments[0]],
+            'contig2': [alignments[1], alignments[2]],
+        }
+        got = assembly_compare.AssemblyCompare._longest_nucmer_hit_in_ref(nucmer_hits)
+        self.assertEqual(alignments[0], got)
+
+
+    def test_find_previous_start_codon(self):
+        '''test _find_previous_start_codon'''
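+        # Each tuple is (sequence, start coordinate, minimum allowed
+        # coordinate, expected coordinate of the nearest in-frame ATG at or
+        # before the start, or None if there is no such start codon).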
+        tests = [
+            ('ATGTTTAAA', 0, 0, 0),
+            ('TATGTTTAAA', 0, 0, None),
+            ('TATGTTTAAA', 1, 0, 1),
+            ('ATATGTTTAAA', 2, 0, 2),
+            ('AATGTTTAAA', 7, 6, None),
+            ('AATGTTTAAA', 7, 5, None),
+            ('AATGTTTAAA', 7, 4, None),
+            ('AATGTTTAAA', 7, 3, None),
+            ('AATGTTTAAA', 7, 2, None),
+            ('AATGTTTAAA', 7, 1, 1),
+            ('AATGTTTAAA', 7, 0, 1),
+            ('AGTGTTTAAA', 7, 0, None),
+            ('AATGTAGAAA', 7, 0, None),
+        ]
+
+        for seq, start_coord, min_coord, expected in tests:
+            fa = pyfastaq.sequences.Fasta('x', seq)
+            got = assembly_compare.AssemblyCompare._find_previous_start_codon(fa, start_coord, min_coord)
+            self.assertEqual(expected, got)
+
+
+    def test_find_next_stop_codon(self):
+        '''test _find_next_stop_codon'''
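+        # Each tuple is (sequence, current codon start, maximum allowed end
+        # coordinate, expected start of the first in-frame stop codon at or
+        # after that position, or None if no stop fits within the range).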
+        tests = [
+            ('ATGTTTAGA', 2, 5, None),
+            ('ATGTTTAGA', 2, 6, None),
+            ('ATGTTTAGA', 2, 7, 5),
+            ('ATGTTTGGA', 2, 4, None),
+            ('ATGTTTAGA', 5, 6, None),
+            ('ATGTTTAGA', 5, 7, 5),
+            ('ATGTTTGGA', 5, 7, None),
+            ('ATGTTTAGA', 4, 7, None),
+            ('ATGTTTAGA', 3, 7, None),
+        ]
+
+        for seq, end_coord, max_coord, expected in tests:
+            fa = pyfastaq.sequences.Fasta('x', seq)
+            got = assembly_compare.AssemblyCompare._find_next_stop_codon(fa, end_coord, max_coord)
+            self.assertEqual(expected, got)
+
+
+    def test_gene_from_nucmer_match(self):
+        '''test _gene_from_nucmer_match'''
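+        # Each tuple is (nucmer hit fields, contig sequence, max_extend,
+        # expected (gene Fasta, status, bases the start was extended by,
+        # bases the end was extended by; None where extension failed)).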
+        tests = [
+            (
+             ['1', '15', '2', '16', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGTAGTTTCCCTAGATAT', 5,
+             (pyfastaq.sequences.Fasta('contig.2-16', 'ATGTAGTTTCCCTAG'), 'HAS_STOP', None, None)
+            ),
+            (
+             ['1', '15', '2', '16', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 5,
+             (pyfastaq.sequences.Fasta('contig.2-16', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 0, 0)
+            ),
+            (
+             ['1', '15', '2', '15', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 5,
+             (pyfastaq.sequences.Fasta('contig.2-16', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 0, 1)
+            ),
+            (
+             ['1', '15', '2', '14', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 5,
+             (pyfastaq.sequences.Fasta('contig.2-16', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 0, 2)
+            ),
+            (
+             ['1', '15', '2', '13', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 5,
+             (pyfastaq.sequences.Fasta('contig.2-16', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 0, 3)
+            ),
+            (
+             ['2', '15', '3', '16', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 5,
+             (pyfastaq.sequences.Fasta('contig.2-16', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 1, 0)
+            ),
+            (
+             ['3', '15', '4', '16', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 5,
+             (pyfastaq.sequences.Fasta('contig.2-16', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 2, 0)
+            ),
+            (
+             ['4', '15', '5', '16', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 5,
+             (pyfastaq.sequences.Fasta('contig.2-16', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 3, 0)
+            ),
+            (
+             ['1', '15', '2', '14', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 0,
+             (pyfastaq.sequences.Fasta('contig.2-13', 'ATGAAATTTCCC'), 'START_OR_END_FAIL', 0, None)
+            ),
+            (
+             ['1', '15', '2', '14', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 1,
+             (pyfastaq.sequences.Fasta('contig.2-13', 'ATGAAATTTCCC'), 'START_OR_END_FAIL', 0, None)
+            ),
+            (
+             ['1', '15', '2', '14', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 2,
+             (pyfastaq.sequences.Fasta('contig.2-16', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 0, 2)
+            ),
+            (
+             ['2', '15', '3', '14', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'AATGAAATTTCCCTAGATAT', 2,
+             (pyfastaq.sequences.Fasta('contig.2-16', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 1, 2)
+            ),
+            (
+             ['2', '15', '18', '7', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig'], 'ATATCTAGGGAAATTTCATT', 2,
+             (pyfastaq.sequences.Fasta('contig.16-2', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 1, 2)
+            ),
+        ]
+
+        for hit, seq, max_extend, expected in tests:
+            nucmer_match = pymummer.alignment.Alignment('\t'.join(hit))
+            contig = pyfastaq.sequences.Fasta('contig', seq)
+            got = assembly_compare.AssemblyCompare._gene_from_nucmer_match(nucmer_match, contig, max_extend)
+            self.assertEqual(expected, got)
+
+
+    def test_get_gene_matching_ref(self):
+        '''test _get_gene_matching_ref'''
+        hit1 = ['2', '15', '3', '14', '11', '11', '100.00', '20', '20', '1', '1', 'ref', 'contig']
+        hit2 = ['2', '7', '3', '8', '6', '6', '100.00', '20', '20', '1', '1', 'ref', 'contig2']
+        contigs = {
+            'contig': pyfastaq.sequences.Fasta('contig', 'AATGAAATTTCCCTAGATAT'),
+            'contig2': pyfastaq.sequences.Fasta('contig2', 'AATGAAATTTCCCTAGATAT')
+        }
+        nucmer_hits = {
+            'contig': [pymummer.alignment.Alignment('\t'.join(hit1))],
+            'contig2': [pymummer.alignment.Alignment('\t'.join(hit2))],
+        }
+
+        got = assembly_compare.AssemblyCompare._get_gene_matching_ref(nucmer_hits, contigs, 10)
+        expected = (pyfastaq.sequences.Fasta('contig.2-16', 'ATGAAATTTCCCTAG'), 'GENE_FOUND', 1, 2)
+        self.assertEqual(expected, got)
+
+
+    def test_ref_covered_by_at_least_one_full_length_contig(self):
+        '''test _ref_covered_by_at_least_one_full_length_contig'''
+        ref = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
+        hits = [
+            ['1', '100', '1', '100', '100', '100', '100.00', '100', '100', '1', '1', 'ref', 'contig1'],
+            ['1', '99', '1', '99', '99', '99', '100.00', '100', '100', '1', '1', 'ref', 'contig1'],
+            ['1', '96', '1', '96', '96', '96', '100.00', '100', '100', '1', '1', 'ref', 'contig1'],
+            ['1', '95', '1', '95', '95', '95', '100.00', '100', '100', '1', '1', 'ref', 'contig1'],
+            ['1', '94', '1', '94', '94', '94', '100.00', '100', '100', '1', '1', 'ref', 'contig1'],
+        ]
+        nucmer_hits = [{'contig1': [pymummer.alignment.Alignment('\t'.join(hit))]} for hit in hits]
+        expected = [True, True, True, True, False]
+        assert len(expected) == len(nucmer_hits)
+        for i in range(len(expected)):
+            self.assertEqual(expected[i], assembly_compare.AssemblyCompare._ref_covered_by_at_least_one_full_length_contig(nucmer_hits[i], 0.95, 0))
+
+
+    def test_nucmer_hit_containing_reference_position(self):
+        '''test nucmer_hit_containing_reference_position'''
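+        # ref_pos is 0-based, whereas the show-coords fields are 1-based, so
+        # hit1 (ref coords 100-200) contains 0-based positions 99-199.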
+        listhit1 = ['100', '200', '300', '400', '100', '100', '100.00', '600', '500', '1', '1', 'ref', 'contig1']
+        listhit2 = ['400', '500', '500', '600', '100', '100', '100.00', '600', '600', '1', '1', 'ref', 'contig2']
+        hit1 = pymummer.alignment.Alignment('\t'.join(listhit1))
+        hit2 = pymummer.alignment.Alignment('\t'.join(listhit2))
+        nucmer_hits = {
+            'contig1': [hit1],
+            'contig2': [hit2],
+        }
+
+        tests = [
+            ('ref2', 150, None),
+            ('ref', 42, None),
+            ('ref', 98, None),
+            ('ref', 200, None),
+            ('ref', 99, hit1),
+            ('ref', 142, hit1),
+            ('ref', 199, hit1),
+            ('ref', 200, None),
+            ('ref', 398, None),
+            ('ref', 399, hit2),
+            ('ref', 442, hit2),
+            ('ref', 499, hit2),
+            ('ref', 500, None),
+        ]
+
+        for ref_name, ref_pos, expected in tests:
+            got = assembly_compare.AssemblyCompare.nucmer_hit_containing_reference_position(nucmer_hits, ref_name, ref_pos)
+            self.assertEqual(expected, got)
diff --git a/ariba/tests/assembly_test.py b/ariba/tests/assembly_test.py
new file mode 100644
index 0000000..129b80b
--- /dev/null
+++ b/ariba/tests/assembly_test.py
@@ -0,0 +1,135 @@
+import unittest
+import sys
+import os
+import shutil
+import filecmp
+import pyfastaq
+from ariba import assembly
+
+modules_dir = os.path.dirname(os.path.abspath(assembly.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestAssembly(unittest.TestCase):
+    def test_get_assembly_kmer(self):
+        '''test _get_assembly_kmer'''
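+        # A value of 0 means "pick a kmer from the reads"; any other value
+        # should be returned unchanged.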
+        reads1 = os.path.join(data_dir, 'assembly_test_set_assembly_kmer_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_set_assembly_kmer_reads_2.fq')
+        got = assembly.Assembly._get_assembly_kmer(0, reads1, reads2)
+        self.assertEqual(got, 5)
+        got = assembly.Assembly._get_assembly_kmer(42, reads1, reads2)
+        self.assertEqual(got, 42)
+
+
+    def test_check_spades_log_file(self):
+        '''test _check_spades_log_file'''
+        good_file = os.path.join(data_dir, 'assembly_test_check_spades_log_file.log.good')
+        bad_file = os.path.join(data_dir, 'assembly_test_check_spades_log_file.log.bad')
+        self.assertTrue(assembly.Assembly._check_spades_log_file(good_file))
+        with self.assertRaises(assembly.Error):
+            self.assertTrue(assembly.Assembly._check_spades_log_file(bad_file))
+
+
+    def test_assemble_with_spades(self):
+        '''test _assemble_with_spades'''
+        reads1 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_2.fq')
+        ref_fasta = os.path.join(data_dir, 'assembly_test_assemble_with_spades_ref.fa')
+        tmp_dir = 'tmp.test_assemble_with_spades'
+        a = assembly.Assembly(reads1, reads2, ref_fasta, tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', sys.stdout)
+        a._assemble_with_spades(unittest=True)
+        self.assertTrue(a.assembled_ok)
+        shutil.rmtree(tmp_dir)
+
+
+    def test_assemble_with_spades_fail(self):
+        '''test _assemble_with_spades handles spades fail'''
+        reads1 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_2.fq')
+        ref_fasta = os.path.join(data_dir, 'assembly_test_assemble_with_spades_ref.fa')
+        tmp_dir = 'tmp.test_assemble_with_spades'
+        a = assembly.Assembly(reads1, reads2, ref_fasta, tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', sys.stdout)
+        a._assemble_with_spades(unittest=False)
+        self.assertFalse(a.assembled_ok)
+        shutil.rmtree(tmp_dir)
+
+
+    def test_scaffold_with_sspace(self):
+        '''test _scaffold_with_sspace'''
+        reads1 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_2.fq')
+        ref_fasta = os.path.join(data_dir, 'assembly_test_assemble_with_spades_ref.fa')
+        tmp_dir = 'tmp.test_scaffold_with_sspace'
+        a = assembly.Assembly(reads1, reads2, ref_fasta, tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', sys.stdout)
+        a.assembly_contigs = os.path.join(data_dir, 'assembly_test_scaffold_with_sspace_contigs.fa')
+        a._scaffold_with_sspace()
+        self.assertTrue(os.path.exists(a.scaffolder_scaffolds))
+        shutil.rmtree(tmp_dir)
+
+
+    def test_has_gaps_to_fill(self):
+        '''test _has_gaps_to_fill'''
+        no_gaps = os.path.join(data_dir, 'assembly_test_has_gaps_to_fill.no_gaps.fa')
+        has_gaps = os.path.join(data_dir, 'assembly_test_has_gaps_to_fill.has_gaps.fa')
+        self.assertTrue(assembly.Assembly._has_gaps_to_fill(has_gaps))
+        self.assertFalse(assembly.Assembly._has_gaps_to_fill(no_gaps))
+
+
+    def test_rename_scaffolds(self):
+        '''test _rename_scaffolds'''
+        infile = os.path.join(data_dir, 'assembly_test_rename_scaffolds.in.fa')
+        outfile = os.path.join(data_dir, 'assembly_test_rename_scaffolds.out.fa')
+        tmpfile = 'tmp.fa'
+        assembly.Assembly._rename_scaffolds(infile, tmpfile, 'prefix')
+        self.assertTrue(filecmp.cmp(outfile, tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
+    def test_gap_fill_with_gapfiller_no_gaps(self):
+        '''test _gap_fill_with_gapfiller no gaps'''
+        reads1 = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller_reads_2.fq')
+        tmp_dir = 'tmp.gap_fill_with_gapfiller_no_gaps'
+        a = assembly.Assembly(reads1, reads2, 'ref.fa', tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', sys.stdout)
+        a.scaffolder_scaffolds = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa')
+        a._gap_fill_with_gapfiller()
+        self.assertTrue(os.path.exists(a.gapfilled_scaffolds))
+        shutil.rmtree(tmp_dir)
+
+
+    def test_gap_fill_with_gapfiller_with_gaps(self):
+        '''test _gap_fill_with_gapfiller with gaps'''
+        reads1 = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller_reads_2.fq')
+        tmp_dir = 'tmp.gap_fill_with_gapfiller_with_gaps'
+        a = assembly.Assembly(reads1, reads2, 'ref.fa', tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', sys.stdout)
+        a.scaffolder_scaffolds = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa')
+        a._gap_fill_with_gapfiller()
+        self.assertTrue(os.path.exists(a.gapfilled_scaffolds))
+        shutil.rmtree(tmp_dir)
+
+
+    def test_fix_contig_orientation(self):
+        '''test _fix_contig_orientation'''
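+        # The returned set presumably names the contigs that matched the
+        # reference on both strands.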
+        scaffs_in = os.path.join(data_dir, 'assembly_test_fix_contig_orientation.in.fa')
+        expected_out = os.path.join(data_dir, 'assembly_test_fix_contig_orientation.out.fa')
+        ref_fa = os.path.join(data_dir, 'assembly_test_fix_contig_orientation.ref.fa')
+        tmp_out = 'tmp.assembly_test_fix_contig_orientation.out.fa'
+        got = assembly.Assembly._fix_contig_orientation(scaffs_in, ref_fa, tmp_out)
+        expected = {'match_both_strands'}
+        self.assertTrue(filecmp.cmp(expected_out, tmp_out, shallow=False))
+        self.assertEqual(expected, got)
+        os.unlink(tmp_out)
+
+
+    def test_parse_bam(self):
+        '''test _parse_bam'''
+        bam = os.path.join(data_dir, 'assembly_test_parse_assembly_bam.bam')
+        assembly_fa = os.path.join(data_dir, 'assembly_test_parse_assembly_bam.assembly.fa')
+        assembly_seqs = {}
+        pyfastaq.tasks.file_to_dict(assembly_fa, assembly_seqs)
+        self.assertTrue(assembly.Assembly._parse_bam(assembly_seqs, bam, 10, 1000))
+        os.unlink(bam + '.soft_clipped')
+        os.unlink(bam + '.unmapped_mates')
+        os.unlink(bam + '.scaff')
+
diff --git a/ariba/tests/assembly_variants_test.py b/ariba/tests/assembly_variants_test.py
new file mode 100644
index 0000000..6061b29
--- /dev/null
+++ b/ariba/tests/assembly_variants_test.py
@@ -0,0 +1,385 @@
+import unittest
+import os
+import pymummer
+import pyfastaq
+from ariba import assembly_variants, reference_data, sequence_variant, sequence_metadata
+
+modules_dir = os.path.dirname(os.path.abspath(assembly_variants.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestAssemblyVariants(unittest.TestCase):
+    def test_get_codon_start(self):
+        '''test _get_codon_start'''
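+        # Tuples are (gene start position, position in the sequence, expected
+        # start position of the codon containing that position).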
+        tests = [
+            (0, 5, 3),
+            (0, 0, 0),
+            (0, 1, 0),
+            (0, 2, 0),
+            (1, 3, 1),
+            (2, 3, 2),
+            (3, 3, 3),
+            (3, 6, 6),
+            (3, 7, 6),
+            (3, 8, 6),
+        ]
+        for start, position, expected in tests:
+            self.assertEqual(expected, assembly_variants.AssemblyVariants._get_codon_start(start, position))
+
+
+    def test_get_mummer_variants_no_variants(self):
+        '''test _get_mummer_variants when no variants'''
+        snp_file = os.path.join(data_dir, 'assembly_variants_test_get_mummer_variants.none.snps')
+        got = assembly_variants.AssemblyVariants._get_mummer_variants(snp_file)
+        self.assertEqual({}, got)
+
+
+    def test_get_mummer_variants_has_variants(self):
+        '''test _get_mummer_variants when there are variants'''
+        snp_file = os.path.join(data_dir, 'assembly_variants_test_get_mummer_variants.snp.snps')
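+        # Each Snp is built from a tab-separated nucmer show-snps line: ref
+        # position, ref base, qry base, qry position, two context columns,
+        # ref length, qry length, two frame columns, ref name, qry name.
+        # In the expected output, v3 (position 40) and v1 (position 42) on
+        # contig1 appear to be grouped because they fall in the same codon.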
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig2'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('40\tT\tC\t40\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
+        v4 = pymummer.variant.Variant(pymummer.snp.Snp('2\tC\tG\t2\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
+        expected = {
+            'contig1': [[v4], [v3, v1]],
+            'contig2': [[v2]]
+        }
+        got = assembly_variants.AssemblyVariants._get_mummer_variants(snp_file)
+        self.assertEqual(expected, got)
+
+
+    def test_get_variant_effect(self):
+        '''test _get_variant_effect'''
+        ref_seq = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v4 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v5 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v6 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tA\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v7 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v7.qry_base = 'GAT'
+        v8 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v8.qry_base = 'TGA'
+        v9 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v9.qry_base = 'ATTCCT'
+        v10 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v10.ref_base = 'CGC'
+        v10.ref_end = 5
+        v11 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v11.ref_base = 'CGCGAA'
+        v11.ref_end = 8
+
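+        # Expected values look like (effect type, HGVS-style description,
+        # 0-based position of the first affected residue): e.g. v2 changes
+        # codon 2 from CGC to AGC (arg -> ser), reported as NONSYN R2S.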
+        variants = [
+            ([v1], ('SYN', '.', 1)),
+            ([v2], ('NONSYN', 'R2S', 1)),
+            ([v2, v1], ('NONSYN', 'R2S', 1)),
+            ([v3, v4], ('TRUNC', 'R2trunc', 1)),
+            ([v5], ('FSHIFT', 'R2fs', 1)),
+            ([v6], ('FSHIFT', 'R2fs', 1)),
+            ([v7], ('INS', 'R2_E3insD', 1)),
+            ([v8], ('TRUNC', 'R2trunc', 1)),
+            ([v9], ('INS', 'R2_E3insIP', 1)),
+            ([v10], ('DEL', 'R2del', 1)),
+            ([v11], ('DEL', 'R2_E3del', 1)),
+        ]
+
+        for variant_list, expected in variants:
+            self.assertEqual(expected, assembly_variants.AssemblyVariants._get_variant_effect(variant_list, ref_seq))
+
+
+    def test_filter_mummer_variants(self):
+        '''test filter_mummer_variants'''
+        ref_seq = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('12\tG\tT\t12\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        mummer_variants = {'contig': [[v1, v2], v3]}
+        assembly_variants.AssemblyVariants._filter_mummer_variants(mummer_variants, ref_seq)
+        expected = {'contig': [[v1, v2]]}
+        self.assertEqual(expected, mummer_variants)
+
+
+    def test_get_one_variant_for_one_contig_non_coding(self):
+        '''test _get_one_variant_for_one_contig_non_coding'''
+        refdata = reference_data.ReferenceData(
+            non_coding_fa=os.path.join(data_dir, 'assembly_variants_test_get_variants_non_coding.fa'),
+            metadata_tsv=os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv')
+        )
+
+        ref_sequence_name = 'non_coding'
+        refdata_var_dict = refdata.metadata[ref_sequence_name]
+
+        v0 = pymummer.variant.Variant(pymummer.snp.Snp('2\tT\tA\t2\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))
+
+        # ref has A at position 3, which is variant type. This gives contig the wild type C. Shouldn't report
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('3\tA\tC\t3\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))
+
+        # ref has T at position 5, which is wild type. This gives contig variant type A. Should report
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('5\tT\tA\t5\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))
+
+        meta0 = sequence_metadata.SequenceMetadata('non_coding\tn\tC3A\tid1\tref has variant type A')
+        meta2 = sequence_metadata.SequenceMetadata('non_coding\tn\tT5A\tid1\tref has wild type T')
+
+        mummer_variants = [v0, v1, v2]
+
+        expected_tuples = [
+            (1, 'n', 'T2A', 'SNP', [v0], set(), set()),   #0
+            None,                                     #1
+            (4, 'n', 'T5A', 'SNP', [v2], {meta2}, set()), #2
+        ]
+
+        expected_used_variants = [
+            set(),     #0
+            {meta0},   #1
+            {meta2},   #2
+        ]
+
+        assert len(mummer_variants) == len(expected_tuples) == len(expected_used_variants)
+
+        for i in range(len(mummer_variants)):
+            got_tuple, got_used_variants = assembly_variants.AssemblyVariants._get_one_variant_for_one_contig_non_coding(refdata_var_dict, mummer_variants[i])
+            self.assertEqual(expected_tuples[i], got_tuple)
+            self.assertEqual(expected_used_variants[i], got_used_variants)
+
+
+    def test_get_one_variant_for_one_contig_coding(self):
+        '''test _get_one_variant_for_one_contig_coding'''
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa'),
+            metadata_tsv=os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv')
+        )
+
+        ref_sequence_name = 'presence_absence'
+        ref_sequence = refdata.sequence(ref_sequence_name)
+        refdata_var_dict = refdata.metadata[ref_sequence_name]
+
+        v0 = pymummer.variant.Variant(pymummer.snp.Snp('6\tT\tA\t6\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('9\tA\tT\t9\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('18\tG\tT\t18\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('21\tC\tT\t21\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        v4 = pymummer.variant.Variant(pymummer.snp.Snp('7\tA\tT\t7\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        v5 = pymummer.variant.Variant(pymummer.snp.Snp('12\tA\tC\t11\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+
+        v6 = pymummer.variant.Variant(pymummer.snp.Snp('4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        self.assertTrue(v6.update_indel(pymummer.snp.Snp('5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+
+        v7 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tA\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        self.assertTrue(v7.update_indel(pymummer.snp.Snp('4\t.\tA\t5\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+
+        v8 = pymummer.variant.Variant(pymummer.snp.Snp('4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        self.assertTrue(v8.update_indel(pymummer.snp.Snp('5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v8.update_indel(pymummer.snp.Snp('6\tT\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+
+        v9 = pymummer.variant.Variant(pymummer.snp.Snp('4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        self.assertTrue(v9.update_indel(pymummer.snp.Snp('5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v9.update_indel(pymummer.snp.Snp('6\tT\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v9.update_indel(pymummer.snp.Snp('7\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v9.update_indel(pymummer.snp.Snp('8\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v9.update_indel(pymummer.snp.Snp('9\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+
+        v10 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tA\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        self.assertTrue(v10.update_indel(pymummer.snp.Snp('4\t.\tT\t5\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v10.update_indel(pymummer.snp.Snp('4\t.\tT\t6\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+
+        mummer_variants = [[v0], [v1], [v2], [v3], [v4], [v5], [v6], [v7], [v8], [v9], [v10]]
+
+        meta0 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)')
+        meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)')
+
+        expected_tuples = [
+            (1, 'p', 'D2E', 'NONSYN', [v0], {meta0}, set()),    #0
+            None,                                               #1
+            (5, 'p', 'M6I', 'NONSYN', [v2], set(), set()),      #2
+            (6, 'p', '.', 'SYN', [v3], set(), set()),           #3
+            (2, 'p', 'R3trunc', 'TRUNC', [v4], set(), {meta4}), #4
+            None,                                               #5
+            (1, 'p', 'D2fs', 'FSHIFT', [v6], set(), {meta0}),   #6
+            (1, 'p', 'D2fs', 'FSHIFT', [v7], set(), {meta0}),   #7
+            (1, 'p', 'D2del', 'DEL', [v8], set(), {meta0}),     #8
+            (1, 'p', 'D2_R3del', 'DEL', [v9], set(), {meta0}),  #9
+            (1, 'p', 'D2_R3insI', 'INS', [v10], set(), {meta0}) #10
+        ]
+
+        expected_used_variants = [
+            refdata_var_dict['p'][1], #0
+            refdata_var_dict['p'][2], #1
+            set(),                    #2
+            set(),                    #3
+            refdata_var_dict['p'][2], #4
+            refdata_var_dict['p'][3], #5
+            refdata_var_dict['p'][1], #6
+            refdata_var_dict['p'][1], #7
+            refdata_var_dict['p'][1], #8
+            refdata_var_dict['p'][1], #9
+            refdata_var_dict['p'][1], #10
+        ]
+
+        assert len(mummer_variants) == len(expected_tuples) == len(expected_used_variants)
+
+        for i in range(len(mummer_variants)):
+            got_tuple, got_used_variants = assembly_variants.AssemblyVariants._get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mummer_variants[i])
+            self.assertEqual(expected_tuples[i], got_tuple)
+            self.assertEqual(expected_used_variants[i], got_used_variants)
+
+
+    def test_get_remaining_known_ref_variants_amino_acids(self):
+        '''test _get_remaining_known_ref_variants with amino acids'''
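+        # A known variant should be reported here only if it was not already
+        # used and its position falls within the nucmer match coordinates
+        # (amino acid positions map to their codon's nucleotide coords).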
+        ref_var1 = sequence_metadata.SequenceMetadata('gene1\tp\tD2E\tid1\tfoo bar')
+        ref_var2 = sequence_metadata.SequenceMetadata('gene1\tp\tD3E\tid1\tfoo bar baz')
+        ref_var3 = sequence_metadata.SequenceMetadata('gene1\tp\tD3I\tid1\tfoo bar baz spam')
+        ref_var4 = sequence_metadata.SequenceMetadata('gene1\tp\tD10E\tid1\tfoo bar baz spam egg')
+        ref_var5 = sequence_metadata.SequenceMetadata('gene1\tp\tD14E\tid1\tfoo bar baz spam egg chips')
+        ref_var6 = sequence_metadata.SequenceMetadata('gene1\tp\tD15E\tid1\tfoo bar baz spam egg chips')
+        ref_var7 = sequence_metadata.SequenceMetadata('gene1\tp\tD40E\tid1\tfoo bar baz spam egg chips')
+
+        known_ref_variants = {
+            1: {ref_var1},
+            2: {ref_var2, ref_var3},
+            9: {ref_var4},
+            13: {ref_var5},
+            14: {ref_var6},
+            39: {ref_var7}
+        }
+
+        used_ref_variants = {ref_var3, ref_var5}
+
+        nucmer_coords = [
+            pyfastaq.intervals.Interval(6, 25),
+            pyfastaq.intervals.Interval(30, 100)
+        ]
+
+        expected = [(None, 'p', None, None, None, {x}, set()) for x in [ref_var2, ref_var6]]
+        got = assembly_variants.AssemblyVariants._get_remaining_known_ref_variants(known_ref_variants, used_ref_variants, nucmer_coords)
+        self.assertEqual(expected, got)
+
+
+    def test_get_remaining_known_ref_variants_nucleotides(self):
+        '''test _get_remaining_known_ref_variants with nucleotides'''
+        ref_var1 = sequence_metadata.SequenceMetadata('gene1\tn\tA2C\tid1\tfoo bar')
+        ref_var2 = sequence_metadata.SequenceMetadata('gene1\tn\tA3C\tid1\tfoo bar baz')
+        ref_var3 = sequence_metadata.SequenceMetadata('gene1\tn\tA3T\tid1\tfoo bar baz spam')
+        ref_var4 = sequence_metadata.SequenceMetadata('gene1\tn\tA10C\tid1\tfoo bar baz spam egg')
+        ref_var5 = sequence_metadata.SequenceMetadata('gene1\tn\tA14C\tid1\tfoo bar baz spam egg chips')
+        ref_var6 = sequence_metadata.SequenceMetadata('gene1\tn\tA15C\tid1\tfoo bar baz spam egg chips')
+        ref_var7 = sequence_metadata.SequenceMetadata('gene1\tn\tA40C\tid1\tfoo bar baz spam egg chips')
+
+        known_ref_variants = {
+            1: {ref_var1},
+            2: {ref_var2, ref_var3},
+            9: {ref_var4},
+            13: {ref_var5},
+            14: {ref_var6},
+            39: {ref_var7}
+        }
+
+        used_ref_variants = {ref_var3, ref_var5}
+
+        nucmer_coords = [
+            pyfastaq.intervals.Interval(2, 13),
+            pyfastaq.intervals.Interval(30, 100)
+        ]
+
+        expected = [(None, 'n', None, None, None, {x}, set()) for x in [ref_var2, ref_var4, ref_var7]]
+        got = assembly_variants.AssemblyVariants._get_remaining_known_ref_variants(known_ref_variants, used_ref_variants, nucmer_coords)
+        self.assertEqual(expected, got)
+
+
+    def test_get_variants_presence_absence(self):
+        '''test get_variants presence absence genes'''
+        meta1 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tid1\tref has wild type D, contig has var (GAT=D, GAA=E)')
+        meta2 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tid1\tref has variant type R, contig has wild (AGA=R, AGT=S)')
+        meta3 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD4E\tid1\tref has variant type E, contig has var (GAA=E, GAC=D)')
+        meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tA5D\tid1\tref has wild type A, contig has var (GCG=A, GAC=D)')
+        meta5 = sequence_metadata.SequenceMetadata('presence_absence\tp\tR13S\tid1\tref and qry have wild type')
+
+        metadata_tsv = 'tmp.test_get_variants_presence_absence.metadata.tsv'
+        with open(metadata_tsv, 'w') as f:
+            print(meta1, file=f)
+            print(meta2, file=f)
+            print(meta3, file=f)
+            print(meta4, file=f)
+            print(meta5, file=f)
+
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=os.path.join(data_dir, 'assembly_variants_test_get_variants_presence_absence.fa'),
+            metadata_tsv=metadata_tsv
+        )
+
+        os.unlink(metadata_tsv)
+
+        nucmer_snp_file = os.path.join(data_dir, 'assembly_variants_test_get_variants_presence_absence.snps')
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('9\tA\tT\t9\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))
+
+        nucmer_coords = {
+            'contig1': [pyfastaq.intervals.Interval(0, 30)],
+            'contig2': [pyfastaq.intervals.Interval(10, 41)],
+        }
+
+        expected = {
+            'contig1': [
+               (4, 'p', 'A5D', 'NONSYN', [v2, v3], {meta4}, set()),
+               (None, 'p', None, None, None, {meta1}, set()),
+               (None, 'p', None, None, None, {meta3}, set()),
+            ],
+            'contig2': [
+               (None, 'p', None, None, None, {meta3}, set()),
+               (None, 'p', None, None, None, {meta4}, set()),
+               (None, 'p', None, None, None, {meta5}, set()),
+            ],
+        }
+
+        a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file)
+        got = a_variants.get_variants('presence_absence', nucmer_coords)
+        self.assertEqual(expected, got)
+
+
+    def test_get_variants_variants_only(self):
+        '''test get_variants variants only'''
+        meta1 = sequence_metadata.SequenceMetadata('variants_only\tp\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)')
+        meta2 = sequence_metadata.SequenceMetadata('variants_only\tp\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)')
+        meta3 = sequence_metadata.SequenceMetadata('variants_only\tp\tD4E\tid1\tref has variant type E (GAA=E, GAC=D)')
+
+        metadata_tsv = 'tmp.test_get_variants_variants_only.metadata.tsv'
+        with open(metadata_tsv, 'w') as f:
+            print(meta1, file=f)
+            print(meta2, file=f)
+            print(meta3, file=f)
+
+        refdata = reference_data.ReferenceData(
+            variants_only_fa=os.path.join(data_dir, 'assembly_variants_test_get_variants_variants_only.fa'),
+            metadata_tsv=metadata_tsv
+        )
+
+        os.unlink(metadata_tsv)
+
+        nucmer_snp_file = os.path.join(data_dir, 'assembly_variants_test_get_variants_variants_only.snps')
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('9\tA\tT\t9\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
+
+        nucmer_coords = {
+            'contig1': [pyfastaq.intervals.Interval(0, 41)],
+            'contig2': [pyfastaq.intervals.Interval(10, 41)],
+        }
+
+        expected = {
+            'contig1': [
+                (4, 'p', 'A5D', 'NONSYN', [v2, v3], set(), set()),
+                (None, 'p', None, None, None, {meta1}, set()),
+                (None, 'p', None, None, None, {meta3}, set()),
+            ],
+            'contig2': [(None, 'p', None, None, None, {meta3}, set())],
+        }
+
+        a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file)
+        got = a_variants.get_variants('variants_only', nucmer_coords)
+        self.assertEqual(expected, got)
+
diff --git a/ariba/tests/best_seq_chooser_test.py b/ariba/tests/best_seq_chooser_test.py
new file mode 100644
index 0000000..80d427a
--- /dev/null
+++ b/ariba/tests/best_seq_chooser_test.py
@@ -0,0 +1,69 @@
+import unittest
+import sys
+import os
+import pyfastaq
+from ariba import best_seq_chooser, external_progs
+
+modules_dir = os.path.dirname(os.path.abspath(best_seq_chooser.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
+
+
+class TestBestSeqChooser(unittest.TestCase):
+    def test_total_alignment_score(self):
+        '''test _total_alignment_score'''
+        reads1 = os.path.join(data_dir, 'best_seq_chooser_total_alignment_score_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'best_seq_chooser_total_alignment_score_reads_2.fq')
+        ref = os.path.join(data_dir, 'best_seq_chooser_total_alignment_score_ref_seqs.fa')
+        chooser = best_seq_chooser.BestSeqChooser(
+            reads1,
+            reads2,
+            ref,
+            sys.stdout,
+            samtools_exe=extern_progs.exe('samtools'),
+            bowtie2_exe=extern_progs.exe('bowtie2'),
+        )
+        self.assertEqual(3000, chooser._total_alignment_score('1'))
+
+
+    def test_get_best_seq_by_alignment_score(self):
+        '''test _get_best_seq_by_alignment_score'''
+        reads1 = os.path.join(data_dir, 'best_seq_chooser_get_best_seq_by_alignment_score_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'best_seq_chooser_get_best_seq_by_alignment_score_reads_2.fq')
+        ref = os.path.join(data_dir, 'best_seq_chooser_get_best_seq_by_alignment_score_ref.fa')
+        chooser = best_seq_chooser.BestSeqChooser(
+            reads1,
+            reads2,
+            ref,
+            sys.stdout,
+            samtools_exe=extern_progs.exe('samtools'),
+            bowtie2_exe=extern_progs.exe('bowtie2'),
+        )
+        self.assertEqual('1', chooser._get_best_seq_by_alignment_score())
+
+
+    def test_best_seq(self):
+        '''test best_seq'''
+        reads1 = os.path.join(data_dir, 'best_seq_chooser_best_seq_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'best_seq_chooser_best_seq_reads_2.fq')
+        ref = os.path.join(data_dir, 'best_seq_chooser_best_seq_ref.fa')
+        expected_seq = pyfastaq.sequences.Fasta('1', ''.join([
+            'AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG',
+            'TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC',
+            'CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA',
+            'GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG',
+            'TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC',
+        ]))
+
+        tmp_file = 'tmp.best_seq.fa'
+        chooser = best_seq_chooser.BestSeqChooser(
+            reads1,
+            reads2,
+            ref,
+            sys.stdout,
+            samtools_exe=extern_progs.exe('samtools'),
+            bowtie2_exe=extern_progs.exe('bowtie2'),
+        )
+        got_seq = chooser.best_seq(tmp_file)
+        self.assertEqual(expected_seq, got_seq)
+        os.unlink(tmp_file)
diff --git a/ariba/tests/card_record_test.py b/ariba/tests/card_record_test.py
new file mode 100644
index 0000000..6058eac
--- /dev/null
+++ b/ariba/tests/card_record_test.py
@@ -0,0 +1,205 @@
+import unittest
+import os
+from ariba import card_record
+
+modules_dir = os.path.dirname(os.path.abspath(card_record.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestCardRecord(unittest.TestCase):
+    def test_ARO_id(self):
+        '''test _ARO_id'''
+        d = {'spam': 'eggs'}
+        self.assertEqual(None, card_record.CardRecord._ARO_id(d))
+        d['ARO_id'] = '123'
+        self.assertEqual('123', card_record.CardRecord._ARO_id(d))
+
+
+    def test_ARO_accession(self):
+        '''test _ARO_accession'''
+        d = {'spam': 'eggs'}
+        self.assertEqual(None, card_record.CardRecord._ARO_accession(d))
+        d['ARO_accession'] = '321'
+        self.assertEqual('321', card_record.CardRecord._ARO_accession(d))
+
+
+    def test_ARO_name(self):
+        '''test _ARO_name'''
+        d = {'spam': 'eggs'}
+        self.assertEqual(None, card_record.CardRecord._ARO_name(d))
+        d['ARO_name'] = 'Dave Lister'
+        self.assertEqual('Dave Lister', card_record.CardRecord._ARO_name(d))
+
+
+    def test_ARO_description(self):
+        '''test _ARO_description'''
+        d = {'spam': 'eggs'}
+        self.assertEqual(None, card_record.CardRecord._ARO_description(d))
+        d['ARO_description'] = 'Technician, Third Class'
+        self.assertEqual('Technician, Third Class', card_record.CardRecord._ARO_description(d))
+
+
+    def test_ARO_name_to_fasta_name(self):
+        '''test _ARO_name_to_fasta_name'''
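+        # From the cases below: a word containing an uppercase letter wins;
+        # otherwise the first three words are joined with underscores, and
+        # dots become underscores.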
+        tests = [
+            ('x', 'x'),
+            ('abcD foo bar match at the start', 'abcD'),
+            ('foo bar abcD match in the middle', 'abcD'),
+            ('match at the end abcD', 'abcD'),
+            ('use first three foo bar', 'use_first_three'),
+            ('remove.any.dots first three', 'remove_any_dots_first_three')
+        ]
+
+        for aro_name, expected in tests:
+            got = card_record.CardRecord._ARO_name_to_fasta_name(aro_name)
+            self.assertEqual(expected, got)
+
+
+    def test_dna_seqs_and_genbank_ids(self):
+        '''test _dna_seqs_and_genbank_ids'''
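+        # Records are returned as tuples of (sequence id, protein GI, genbank
+        # accession, fmin, fmax, dna sequence, protein sequence), and only
+        # once all of the required keys are present.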
+        d = {'spam': 'eggs'}
+        self.assertEqual([], card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+        d['model_sequences'] = {}
+        self.assertEqual([], card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+        d['model_sequences']['sequence'] = {}
+        self.assertEqual([], card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+        d['model_sequences']['sequence']['foo'] = {}
+        self.assertEqual([], card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+        d['model_sequences']['sequence']['foo']['dna_sequence'] = {}
+        self.assertEqual([], card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+        d['model_sequences']['sequence']['foo']['dna_sequence']['sequence'] = 'ACGT'
+        self.assertEqual([], card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+        d['model_sequences']['sequence']['foo']['dna_sequence']['accession'] = 'ABC123'
+        self.assertEqual([], card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+        d['model_sequences']['sequence']['foo']['dna_sequence']['fmin'] = '42'
+        self.assertEqual([], card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+        d['model_sequences']['sequence']['foo']['dna_sequence']['fmax'] = '4242'
+        self.assertEqual([], card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+        d['model_sequences']['sequence']['foo']['protein_sequence'] = {}
+        d['model_sequences']['sequence']['foo']['protein_sequence']['GI'] = '123456789'
+        self.assertEqual([], card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+        d['model_sequences']['sequence']['foo']['protein_sequence']['sequence'] = 'III'
+        expected = [('foo', '123456789', 'ABC123', '42', '4242', 'ACGT', 'III')]
+        self.assertEqual(expected, card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+
+        d['model_sequences']['sequence']['bar'] = {
+            'dna_sequence': {
+                'sequence': 'TTTT',
+                'fmin': '42',
+                'fmax': '4242',
+                'accession': 'BCD234',
+            },
+            'protein_sequence': {
+                'GI': '',
+                'sequence': ''
+            }
+        }
+
+        expected = [('bar', 'NA', 'BCD234', '42', '4242', 'TTTT', '')] + expected
+        self.assertEqual(expected, card_record.CardRecord._dna_seqs_and_genbank_ids(d))
+
+
+    def test_snps(self):
+        '''test snps'''
+        d = {'spam': 'eggs'}
+        self.assertEqual(set(), card_record.CardRecord._snps(d))
+        d['model_param'] = {}
+        self.assertEqual(set(), card_record.CardRecord._snps(d))
+        d['model_param']['snp'] = {}
+        self.assertEqual(set(), card_record.CardRecord._snps(d))
+        d['model_param']['snp']['param_value'] = {}
+        self.assertEqual(set(), card_record.CardRecord._snps(d))
+        d['model_param']['snp']['param_value'] = {
+            '1': 'I42L',
+            '2': 'S100T',
+        }
+
+
+    def test_get_data(self):
+        d = {
+            'ARO_id': '123',
+            'ARO_accession': '1234567',
+            'ARO_name': 'ARO_name1',
+            'ARO_description': 'ARO description that we want.',
+            'model_id': '1',
+            'model_name': 'Model_name1',
+            'model_type': 'protein homolog model',
+            'model_type_id': '12345',
+            'model_description': 'Models to detect proteins conferring antibiotic resistance, which include a reference protein sequence and a curated BLASTP cut-off.',
+            'model_sequences': {
+                'sequence': {
+                    '1234': {
+                        'protein_sequence': {
+                            'sequence': 'MCDE*',
+                            'GI': '229597524'
+                        },
+                        'dna_sequence': {
+                            'sequence': 'ATGTGCGATGAATAA',
+                            'strand': '+',
+                            'fmax': '1194',
+                            'fmin': '0',
+                            'accession': 'XX0000001'
+                        },
+                        'NCBI_taxonomy': {
+                            'NCBI_taxonomy_cvterm_id': '234567',
+                            'NCBI_taxonomy_id': '42',
+                            'NCBI_taxonomy_name': 'Genus1 species1'
+                        }
+                    }
+                }
+            },
+            'model_param': {
+                'blastp_evalue': {} # we're ignoring this, so make it empty for tests to save a few lines
+            },
+            'ARO_category': {
+                '36696': {
+                    'category_aro_description': 'Enzyme that catalyzes the inactivation of an antibiotic resulting in resistance.  Inactivation includes chemical modification, destruction, etc.',
+                    'category_aro_cvterm_id': '36696',
+                    'category_aro_accession': '3000557',
+                    'category_aro_name': 'antibiotic inactivation enzyme'
+                },
+                '36268': {
+                    'category_aro_description': 'Genes conferring resistance to beta-lactams.',
+                    'category_aro_cvterm_id': '36268',
+                    'category_aro_accession': '3000129',
+                    'category_aro_name': 'beta-lactam resistance gene'
+                }
+            },
+        }
+
+        expected = {
+            'ARO_id': '123',
+            'ARO_accession': '1234567',
+            'ARO_name': 'ARO_name1',
+            'ARO_description': 'ARO description that we want.',
+            'dna_seqs_and_ids': [(
+                '1234',
+                '229597524',
+                'XX0000001',
+                '0',
+                '1194',
+                'ATGTGCGATGAATAA',
+                'MCDE*'
+            )],
+            'snps': set(),
+        }
+
+        record = card_record.CardRecord(d)
+        got = record.get_data()
+        self.assertEqual(expected, got)
+
+        d['model_param'] = {
+            'snp': {
+                'param_value': {
+                    '1': 'I42L',
+                    '2': 'S100T',
+                }
+            }
+        }
+
+        expected['snps'] = {'I42L', 'S100T'}
+        record = card_record.CardRecord(d)
+        got = record.get_data()
+        self.assertEqual(expected, got)
+
diff --git a/ariba/tests/cdhit_test.py b/ariba/tests/cdhit_test.py
index dcb1aec..813dba4 100644
--- a/ariba/tests/cdhit_test.py
+++ b/ariba/tests/cdhit_test.py
@@ -1,10 +1,11 @@
 import unittest
 import os
 import filecmp
-from ariba import cdhit
+from ariba import cdhit, external_progs
 
 modules_dir = os.path.dirname(os.path.abspath(cdhit.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
 
 class TestCdhit(unittest.TestCase):
     def test_init_fail_infile_missing(self):
@@ -13,54 +14,48 @@ class TestCdhit(unittest.TestCase):
             r = cdhit.Runner('oopsnotafile', 'out')
 
 
-    def test_enumerate_fasta(self):
-        '''test _enumerate_fasta'''
-        infile = os.path.join(data_dir, 'cdhit_test_enumerate_fasta.in.fa')
-        expected_outfile = os.path.join(data_dir, 'cdhit_test_enumerate_fasta.out.fa')
-        tmpfile = 'tmp.test_enumerate_fasta.out.fa'
-        expected_dict = {'1': 'a', '2': 'b', '3': 'c'}
-        r = cdhit.Runner(infile, 'out')
-        got_dict = r._enumerate_fasta(infile, tmpfile)
-        self.assertTrue(filecmp.cmp(expected_outfile, tmpfile, shallow=False))
-        self.assertEqual(expected_dict, got_dict)
-        os.unlink(tmpfile)
-
-
     def test_get_ids(self):
         '''test _get_ids'''
         infile = os.path.join(data_dir, 'cdhit_test_get_ids.fa')
         expected = {'id1', 'id2', 'id3'}
-        r = cdhit.Runner(infile, 'out')
+        r = cdhit.Runner(infile, 'out', cd_hit_est=extern_progs.exe('cdhit'))
         got = r._get_ids(infile)
         self.assertEqual(expected, got)
 
 
     def test_parse_cluster_info_file(self):
         '''test _parse_cluster_info_file'''
-        infile = os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.in.fa')
-        r = cdhit.Runner(infile, 'out')
-        names_dict = {str(i): 'seq' + str(i) for i in range(1,5)}
-        cluster_representatives = {'1', '4'}
-        cluster_file = os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.out.fa.bak.clstr')
-        got_clusters, got_reps = r._parse_cluster_info_file(cluster_file, names_dict, cluster_representatives)
+        cluster_representatives = {'seq1', 'seq4', 'seq6'}
+        infile = os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.infile')
+        got_clusters = cdhit.Runner._parse_cluster_info_file(infile, cluster_representatives)
         expected_clusters = {
-            '0': {'seq1', 'seq2', 'seq3'},
-            '1': {'seq4'}
+            'seq1': {'seq1', 'seq2', 'seq3'},
+            'seq4': {'seq4'},
+            'seq6': {'seq5', 'seq6'},
         }
-        expected_reps = {'1': '0', '4': '1'}
         self.assertEqual(expected_clusters, got_clusters)
-        self.assertEqual(expected_reps, got_reps)
-
-
-    def test_rename_fasta(self):
-        '''test _rename_fasta'''
-        infile = os.path.join(data_dir, 'cdhit_test_rename_fasta.in.fa')
-        tmpfile = 'tmp.rename_fasta.out.fa'
-        expected = os.path.join(data_dir, 'cdhit_test_rename_fasta.out.fa')
-        names_dict = {'a': 'seq1', 'b': 'seq2', 'c': 'seq3'}
-        r = cdhit.Runner(infile, 'out')
-        r._rename_fasta(infile, tmpfile, names_dict)
-        self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
+
+
+    def test_rename_clusters(self):
+        '''test _rename_clusters'''
+        infile = os.path.join(data_dir, 'cdhit_test_rename_clusters.in.fa')
+        tmpfile = 'tmp.test_rename_clusters.out.fa'
+        expected_file = os.path.join(data_dir, 'cdhit_test_rename_clusters.expected.fa')
+
+        clusters_in = {
+            'seq.foo': {'seq.foo', 'seq'},
+            'seq.bar': {'seq.bar', 'seq3.spam'},
+            'seq4.eggs': {'seq4.eggs'}
+        }
+        expected_clusters = {
+            'seq.x': {'seq.foo', 'seq'},
+            'seq.x.2': {'seq.bar', 'seq3.spam'},
+            'seq4.x': {'seq4.eggs'}
+        }
+        got = cdhit.Runner._rename_clusters(clusters_in, infile, tmpfile)
+        self.assertEqual(expected_clusters, got)
+        self.assertTrue(filecmp.cmp(expected_file, tmpfile, shallow=False))
         os.unlink(tmpfile)
 
 
@@ -69,11 +64,11 @@ class TestCdhit(unittest.TestCase):
         infile = os.path.join(data_dir, 'cdhit_test_run.in.fa')
         expected_outfile = os.path.join(data_dir, 'cdhit_test_run.out.fa')
         tmpfile = 'tmp.cdhit_test_run.out.fa'
-        r = cdhit.Runner(infile, tmpfile)
+        r = cdhit.Runner(infile, tmpfile, cd_hit_est=extern_progs.exe('cdhit'))
         clusters = r.run()
         expected_clusters = {
-            '0': {'seq1', 'seq2', 'seq3'},
-            '1': {'seq4'},
+            'seq1.x': {'seq1', 'seq2', 'seq3'},
+            'seq4.x': {'seq4'},
         }
         self.assertEqual(clusters, expected_clusters)
         self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False))
@@ -85,13 +80,13 @@ class TestCdhit(unittest.TestCase):
         infile = os.path.join(data_dir, 'cdhit_test_fake_run.in.fa')
         expected_outfile = os.path.join(data_dir, 'cdhit_test_fake_run.out.fa')
         tmpfile = 'tmp.cdhit_test_fake_run.out.fa'
-        r = cdhit.Runner(infile, tmpfile)
+        r = cdhit.Runner(infile, tmpfile, cd_hit_est=extern_progs.exe('cdhit'))
         clusters = r.fake_run()
         expected_clusters = {
-            '0': {'seq1'},
-            '1': {'seq2'},
-            '2': {'seq3'},
-            '3': {'seq4'},
+            'seq1.x': {'seq1'},
+            'seq2.x': {'seq2'},
+            'seq3.x': {'seq3'},
+            'seq4.x': {'seq4'},
         }
         self.assertEqual(clusters, expected_clusters)
         self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False))
@@ -102,8 +97,51 @@ class TestCdhit(unittest.TestCase):
         '''test fake_run with non-unique names'''
         infile = os.path.join(data_dir, 'cdhit_test_fake_run.non-unique.in.fa')
         tmpfile = 'tmp.cdhit_test_fake_run.out.non-unique.fa'
-        r = cdhit.Runner(infile, tmpfile)
+        r = cdhit.Runner(infile, tmpfile, cd_hit_est=extern_progs.exe('cdhit'))
         with self.assertRaises(cdhit.Error):
             clusters = r.fake_run()
-        os.unlink(tmpfile)
 
+
+    def test_load_user_clusters_file_good_file(self):
+        '''test _load_user_clusters_file with good input file'''
+        infile = os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.good')
+        expected = {
+            'seq1': 'seq1',
+            'seq2': 'seq1',
+            'seq3': 'seq1',
+            'seq4': 'seq4',
+            'seq5': 'seq5',
+            'seq6': 'seq5',
+        }
+
+        got = cdhit.Runner._load_user_clusters_file(infile)
+        self.assertEqual(expected, got)
+
+
+    def test_load_user_clusters_file_bad_file(self):
+        '''test _load_user_clusters_file with bad input files'''
+        infiles = [
+            os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.bad1'),
+            os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.bad2'),
+            os.path.join(data_dir, 'cdhit_test_load_user_clusters_file.bad3')
+        ]
+        for filename in infiles:
+            with self.assertRaises(cdhit.Error):
+                cdhit.Runner._load_user_clusters_file(filename)
+
+
+    def test_run_get_clusters_from_file(self):
+        '''test run_get_clusters_from_file'''
+        fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.fa')
+        clusters_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.clusters')
+        expected_outfile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.out.fa')
+        tmpfile = 'tmp.cdhit_test_run_get_clusters_from_dict.out.fa'
+        r = cdhit.Runner(fa_infile, tmpfile, cd_hit_est=extern_progs.exe('cdhit'))
+        clusters = r.run_get_clusters_from_file(clusters_infile)
+        expected_clusters = {
+            'seq1.x': {'seq1', 'seq2'},
+            'seq3.x': {'seq3'},
+        }
+        self.assertEqual(clusters, expected_clusters)
+        self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False))
+        os.unlink(tmpfile)
diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py
index 8c60b75..6e959eb 100644
--- a/ariba/tests/cluster_test.py
+++ b/ariba/tests/cluster_test.py
@@ -6,10 +6,11 @@ import filecmp
 import pyfastaq
 import pysam
 import pymummer
-from ariba import cluster, flag
+from ariba import cluster, flag, reference_data
 
 modules_dir = os.path.dirname(os.path.abspath(cluster.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
+cluster.unittest = True
 
 
 def clean_cluster_dir(d, exclude=None):
@@ -31,809 +32,217 @@ def clean_cluster_dir(d, exclude=None):
                 os.unlink(full_path)
 
 
-def file2lines(filename):
-    f = pyfastaq.utils.open_file_read(filename)
-    lines = f.readlines()
-    pyfastaq.utils.close(f)
-    return lines
-
-
-def load_gene(filename):
-    file_reader = pyfastaq.sequences.file_reader(filename)
-    seq = None
-    for seq in file_reader:
-        pass
-    return seq
-
-
 class TestCluster(unittest.TestCase):
     def test_init_fail_files_missing(self):
         '''test init_fail_files_missing'''
+        refdata_fa = os.path.join(data_dir, 'cluster_test_init_refdata.fa')
+        refdata = reference_data.ReferenceData(presence_absence_fa=refdata_fa)
+
         dirs = [
-            'cluster_test_directorynotexist'
-            'cluster_test_init_no_genes_fa',
+            'cluster_test_init_no_refs_fa',
             'cluster_test_init_no_reads_1',
             'cluster_test_init_no_reads_2',
         ]
         dirs = [os.path.join(data_dir, d) for d in dirs]
         for d in dirs:
-            clean_cluster_dir(d)
+            tmpdir = 'tmp.cluster_test_init_fail_files_missing'
+            shutil.copytree(d, tmpdir)
             with self.assertRaises(cluster.Error):
-                c = cluster.Cluster(d, 'name')
-            clean_cluster_dir(d)
-
-
-    def test_get_read_counts(self):
-        '''test _get_read_counts pass'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_get_read_counts')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        self.assertEqual(2, c._get_read_counts())
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_read_counts_fail(self):
-        '''test _get_read_counts fail'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_get_read_counts_fail')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        with self.assertRaises(cluster.Error):
-            c._get_read_counts()
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_total_alignment_score(self):
-        '''test _get_total_alignment_score'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_get_total_alignment_score')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        got_score = c._get_total_alignment_score('1')
-        expected_score = 3000
-        self.assertEqual(got_score, expected_score)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_best_gene_by_alignment_score(self):
-        '''test _get_best_gene_by_alignment_score'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_get_best_gene_by_alignment_score')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        got_name = c._get_best_gene_by_alignment_score()
-        self.assertEqual(got_name, '1')
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_choose_best_gene(self):
-        '''test _choose_best_gene'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_choose_best_gene')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        expected_gene = pyfastaq.sequences.Fasta('1', ''.join([
-            'AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG',
-            'TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC',
-            'CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA',
-            'GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG',
-            'TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC',
-        ]))
-        expected_gene_fa = os.path.join(data_dir, 'cluster_test_choose_best_gene.gene.fa')
-        got = c._choose_best_gene()
-        self.assertEqual(got, expected_gene)
-        self.assertTrue(filecmp.cmp(expected_gene_fa, c.gene_fa, shallow=False))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_set_assembly_kmer(self):
-        '''test _set_assembly_kmer'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_set_assembly_kmer')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name', assembly_kmer=42)
-        self.assertEqual(c.assembly_kmer, 42)
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(os.path.join(data_dir, 'cluster_test_set_assembly_kmer'), 'name')
-        self.assertEqual(c.assembly_kmer, 5)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_assemble_with_spades(self):
-        '''test _assemble_with_spades'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_spades')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(os.path.join(data_dir, 'cluster_test_assemble_with_spades.gene.fa'), c.gene_fa)
-        c._assemble_with_spades(unittest=True)
-        self.assertEqual(c.status_flag.to_number(), 0)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_assemble_with_spades_fail(self):
-        '''test _assemble_with_spades handles spades fail'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_spades')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(os.path.join(data_dir, 'cluster_test_assemble_with_spades.gene.fa'), c.gene_fa)
-        c._assemble_with_spades()
-        self.assertEqual(c.status_flag.to_number(), 64)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_scaffold_with_sspace(self):
-        '''test _scaffold_with_sspace'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_scaffold_with_sspace')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(
-            os.path.join(data_dir, 'cluster_test_scaffold_with_sspace.contigs.fa'),
-            c.assembly_contigs
+                c = cluster.Cluster(tmpdir, 'name', refdata=refdata, total_reads=42, total_reads_bases=4242)
+            shutil.rmtree(tmpdir)
+
+
+    def test_number_of_reads_for_assembly(self):
+        '''Test _number_of_reads_for_assembly'''
+        # ref is 100bp long
+        ref_fa = os.path.join(data_dir, 'cluster_test_number_of_reads_for_assembly.ref.fa')
+        tests = [
+            (50, 1000, 10, 20, 40),
+            (50, 999, 10, 20, 42),
+            (50, 1000, 10, 10, 20),
+            (50, 1000, 10, 5, 10),
+        ]
+
+        for insert, bases, reads, coverage, expected in tests:
+            self.assertEqual(expected, cluster.Cluster._number_of_reads_for_assembly(ref_fa, insert, bases, reads, coverage))
+
+
+    def test_make_reads_for_assembly_proper_sample(self):
+        '''Test _make_reads_for_assembly when sampling from reads'''
+        reads_in1 = os.path.join(data_dir, 'cluster_test_make_reads_for_assembly.in1.fq')
+        reads_in2 = os.path.join(data_dir, 'cluster_test_make_reads_for_assembly.in2.fq')
+        expected_out1 = os.path.join(data_dir, 'cluster_test_make_reads_for_assembly.out1.fq')
+        expected_out2 = os.path.join(data_dir, 'cluster_test_make_reads_for_assembly.out2.fq')
+        reads_out1 = 'tmp.test_make_reads_for_assembly.reads.out1.fq'
+        reads_out2 = 'tmp.test_make_reads_for_assembly.reads.out2.fq'
+        reads_written = cluster.Cluster._make_reads_for_assembly(10, 20, reads_in1, reads_in2, reads_out1, reads_out2, random_seed=42)
+        self.assertEqual(14, reads_written)
+        self.assertTrue(filecmp.cmp(expected_out1, reads_out1, shallow=False))
+        self.assertTrue(filecmp.cmp(expected_out2, reads_out2, shallow=False))
+        os.unlink(reads_out1)
+        os.unlink(reads_out2)
+
+
+    def test_make_reads_for_assembly_symlinks(self):
+        '''Test _make_reads_for_assembly when it just makes symlinks'''
+        reads_in1 = os.path.join(data_dir, 'cluster_test_make_reads_for_assembly.in1.fq')
+        reads_in2 = os.path.join(data_dir, 'cluster_test_make_reads_for_assembly.in2.fq')
+        expected_out1 = os.path.join(data_dir, 'cluster_test_make_reads_for_assembly.out1.fq')
+        expected_out2 = os.path.join(data_dir, 'cluster_test_make_reads_for_assembly.out2.fq')
+        reads_out1 = 'tmp.test_make_reads_for_assembly.reads.out1.fq'
+        reads_out2 = 'tmp.test_make_reads_for_assembly.reads.out2.fq'
+        reads_written = cluster.Cluster._make_reads_for_assembly(20, 20, reads_in1, reads_in2, reads_out1, reads_out2, random_seed=42)
+        self.assertEqual(20, reads_written)
+        self.assertTrue(os.path.islink(reads_out1))
+        self.assertTrue(os.path.islink(reads_out2))
+        self.assertEqual(os.readlink(reads_out1), reads_in1)
+        self.assertEqual(os.readlink(reads_out2), reads_in2)
+        os.unlink(reads_out1)
+        os.unlink(reads_out2)
+
+
+    def test_full_run_choose_ref_fail(self):
+        '''test complete run of cluster when choosing ref seq fails'''
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail.presence_absence.fa')
         )
-        #shutil.copyfile(os.path.join(data_dir, 'cluster_test_scaffold_with_sspace.gene.fa'), c.gene_fa)
-        c._scaffold_with_sspace()
-        self.assertTrue(os.path.exists(c.scaffolder_scaffolds))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_gap_fill_with_gapfiller_no_gaps(self):
-        '''test _gap_fill_with_gapfiller no gaps'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(
-            os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa'),
-            c.scaffolder_scaffolds
+        tmpdir = 'tmp.test_full_run_choose_ref_fail'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail'), tmpdir)
+
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=2, total_reads_bases=108)
+        c.run()
+
+        expected = '\t'.join(['.', '.', '1088', '2', 'cluster_name'] + ['.'] * 24)
+        self.assertEqual([expected], c.report_lines)
+        self.assertTrue(c.status_flag.has('ref_seq_choose_fail'))
+        self.assertTrue(c.status_flag.has('assembly_fail'))
+        shutil.rmtree(tmpdir)
+
+
+    def test_full_run_assembly_fail(self):
+        '''test complete run of cluster when assembly fails'''
+        refdata = reference_data.ReferenceData(
+            non_coding_fa=os.path.join(data_dir, 'cluster_test_full_run_assembly_fail.noncoding.fa')
         )
-        c.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT')
-        c._gap_fill_with_gapfiller()
-        self.assertTrue(os.path.exists(c.gapfilled_scaffolds))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_gap_fill_with_gapfiller_with_gaps(self):
-        '''test _gap_fill_with_gapfiller with gaps'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(
-            os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa'),
-            c.scaffolder_scaffolds
+        tmpdir = 'tmp.test_full_run_assembly_fail'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_assembly_fail'), tmpdir)
+
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=4, total_reads_bases=304)
+        c.run()
+
+        expected = '\t'.join(['noncoding_ref_seq', 'non_coding', '64', '4', 'cluster_name'] + ['.'] * 24)
+        self.assertEqual([expected], c.report_lines)
+        self.assertFalse(c.status_flag.has('ref_seq_choose_fail'))
+        self.assertTrue(c.status_flag.has('assembly_fail'))
+        shutil.rmtree(tmpdir)
+
+
+    def test_full_run_ok_non_coding(self):
+        '''test complete run of cluster on a noncoding sequence'''
+        refdata = reference_data.ReferenceData(
+            non_coding_fa=os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding.fa'),
+            metadata_tsv=os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding.metadata.tsv')
         )
-        c.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT')
-        c._gap_fill_with_gapfiller()
-        self.assertTrue(os.path.exists(c.gapfilled_scaffolds))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_rename_scaffolds(self):
-        '''test _rename_scaffolds'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_rename_scaffolds')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT')
-        infile = os.path.join(data_dir, 'cluster_test_rename_scaffolds.in.fa')
-        outfile = os.path.join(data_dir, 'cluster_test_rename_scaffolds.out.fa')
-        tmpfile = 'tmp.fa'
-        c._rename_scaffolds(infile, tmpfile)
-        self.assertTrue(filecmp.cmp(outfile, tmpfile, shallow=False))
-        os.unlink(tmpfile)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_fix_contig_orientation(self):
-        '''test _fix_contig_orientation'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_fix_contig_orientation')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        scaffs_in = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.in.fa')
-        scaffs_out = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.out.fa')
-        shutil.copyfile(scaffs_in, c.gapfilled_scaffolds)
-        shutil.copyfile(os.path.join(data_dir, 'cluster_test_fix_contig_orientation.gene.fa'), c.gene_fa)
-        c._fix_contig_orientation()
-        self.assertTrue(filecmp.cmp(scaffs_out, c.final_assembly_fa, shallow=False))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_load_final_contigs(self):
-        '''test _load_final_contigs'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_load_final_contigs')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        contigs_file = os.path.join(data_dir, 'cluster_test_load_final_contigs.contigs.fa')
-        shutil.copyfile(contigs_file, c.final_assembly_fa)
-        c._load_final_contigs()
-        expected = {
-            'spam': pyfastaq.sequences.Fasta('spam', 'ACGT'),
-            'egg1': pyfastaq.sequences.Fasta('egg1', 'TGCA'),
-            'egg2': pyfastaq.sequences.Fasta('egg2', 'AAAA'),
-        }
-        self.assertEqual(expected, c.final_assembly)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_parse_assembly_vs_gene_coords(self):
-        '''test _parse_assembly_vs_gene_coords'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_parse_assembly_vs_gene_coords')
-        coords_file = os.path.join(data_dir, 'cluster_test_parse_assembly_vs_gene_coords.coords')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(coords_file, c.assembly_vs_gene_coords)
-        c.gene = pyfastaq.sequences.Fasta('gene', 'AAACCCGGGTTT')
-        c._parse_assembly_vs_gene_coords()
-        line1 = ['1', '1000', '1', '1000', '1000', '1000', '100.00', '1000', '1000', '1', '1', 'gene', 'contig1']
-        line2 = ['1', '240', '1', '240', '240', '240', '100.00', '1000', '580', '1', '1', 'gene', 'contig2']
-        line3 = ['661', '1000', '241', '580', '340', '340', '100.00', '1000', '580', '1', '1', 'gene', 'contig2']
-        expected = {
-            'contig1': [pymummer.alignment.Alignment('\t'.join(line1))],
-            'contig2': [pymummer.alignment.Alignment('\t'.join(line2)), pymummer.alignment.Alignment('\t'.join(line3))],
-        }
-        self.assertEqual(expected, c.nucmer_hits)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_parse_assembly_bam(self):
-        '''test _parse_assembly_bam'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_parse_assembly_bam')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        bam = os.path.join(data_dir, 'cluster_test_parse_assembly_bam.bam')
-        assembly_fa = os.path.join(data_dir, 'cluster_test_parse_assembly_bam.assembly.fa')
-        shutil.copyfile(bam, c.final_assembly_bam)
-        shutil.copy(assembly_fa, c.final_assembly_fa)
-        c._load_final_contigs()
-        c._parse_assembly_bam()
-        for e in ['scaff', 'soft_clipped', 'unmapped_mates']:
-            self.assertTrue(os.path.exists(c.final_assembly_bam + '.' + e))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_nucmer_hits_to_percent_identity(self):
-        '''test _nucmer_hits_to_percent_identity'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        hits = [
-            ['1', '10', '1', '10', '10', '10', '90.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
-            ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
-            ['1', '42', '1', '42', '42', '42', '42.42', '1000', '1000', '1', '1', 'gene', 'scaff2'],
-        ]
-        c.nucmer_hits = {
-            'scaff1': [
-                pymummer.alignment.Alignment('\t'.join(hits[0])),
-                pymummer.alignment.Alignment('\t'.join(hits[1])),
-            ],
-            'scaff2': [
-                pymummer.alignment.Alignment('\t'.join(hits[2])),
-            ]
-        }
-        expected = {'scaff1': round((90*10 + 100*34) / (10+34), 2), 'scaff2': 42.42}
-        c._nucmer_hits_to_percent_identity()
-        self.assertEqual(expected, c.percent_identities)
-
-
-    def test_nucmer_hits_to_scaff_coords(self):
-        '''test _nucmer_hits_to_scaff_coords'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        hits = [
-            ['1', '10', '1', '10', '10', '10', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
-            ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
-            ['50', '52', '50', '52', '3', '3', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
-            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff2'],
-        ]
-        c.nucmer_hits = {
-            'scaff1': [
-                pymummer.alignment.Alignment('\t'.join(hits[0])),
-                pymummer.alignment.Alignment('\t'.join(hits[1])),
-                pymummer.alignment.Alignment('\t'.join(hits[2])),
-            ],
-            'scaff2': [
-                pymummer.alignment.Alignment('\t'.join(hits[3])),
-            ]
-        }
-        got = c._nucmer_hits_to_scaff_coords()
-        expected = {
-            'scaff1': [
-                pyfastaq.intervals.Interval(0, 41),
-                pyfastaq.intervals.Interval(49, 51)
-            ],
-            'scaff2': [
-                pyfastaq.intervals.Interval(0, 41),
-            ]
-        }
-        self.assertEqual(got, expected)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_nucmer_hits_to_ref_coords(self):
-        '''test _nucmer_hits_to_ref_coords'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        hits = [
-            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'contig1'],
-            ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'gene', 'contig1'],
-            ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'gene', 'contig2'],
-        ]
-        c.nucmer_hits = {
-            'contig1': [
-                pymummer.alignment.Alignment('\t'.join(hits[0])),
-                pymummer.alignment.Alignment('\t'.join(hits[1])),
-            ],
-            'contig2': [
-                pymummer.alignment.Alignment('\t'.join(hits[2])),
-            ]
-        }
-        got_coords = c._nucmer_hits_to_ref_coords()
+
+        tmpdir = 'tmp.test_full_run_ok_non_coding'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding'), tmpdir)
+
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=72, total_reads_bases=3600)
+        c.run()
+
         expected = [
-            pyfastaq.intervals.Interval(0,41),
-            pyfastaq.intervals.Interval(99, 109),
-            pyfastaq.intervals.Interval(99, 141),
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t73\t73\tT\t19\t.\t19\tnoncoding1:n:A14T:.:ref has wild type, reads has variant so should report\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t0\t.\tn\t.\t0\tG61T\tSNP\t60\t60\tG\t120\t120\tT\t24\t.\t24\t.\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t0\t.\tn\t.\t0\t.82C\tINS\t81\t81\t.\t142\t142\tC\t23\t.\t23\t.\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t0\t.\tn\t.\t0\tT108.\tDEL\t107\t107\tT\t167\t167\t.\t17\t.\t17\t.\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t66\t66\tG\t19\t.\t19\tnoncoding1:n:A6G:.:variant in ref and reads so should report\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t15.4\t1\tSNP\tn\tG9T\t0\t.\t.\t9\t9\tG\t69\t69\tG\t19\t.\t19\tnoncoding1:n:G9T:.:wild type in ref and reads\tgeneric description of noncoding1'
         ]
-        self.assertEqual(got_coords, expected)
 
-        got_coords = c._nucmer_hits_to_ref_coords(contig='contig2')
+        self.assertEqual(expected, c.report_lines)
+        shutil.rmtree(tmpdir)
+
+
+    def test_full_run_ok_presence_absence(self):
+        '''test complete run of cluster on a presence absence gene'''
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence.fa'),
+            metadata_tsv=os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence.metadata.tsv'),
+        )
+
+        tmpdir = 'tmp.cluster_test_full_run_ok_presence_absence'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence'), tmpdir)
+
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=64, total_reads_bases=3200)
+        c.run()
+
         expected = [
-            pyfastaq.intervals.Interval(99, 109),
-        ]
-        self.assertEqual(got_coords, expected)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_nucmer_hits_to_gene_cov_per_contig(self):
-        '''test _nucmer_hits_to_gene_cov_per_contig'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        hits = [
-            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'contig1'],
-            ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'gene', 'contig1'],
-            ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'gene', 'contig2'],
-        ]
-        c.nucmer_hits = {
-            'contig1': [
-                pymummer.alignment.Alignment('\t'.join(hits[0])),
-                pymummer.alignment.Alignment('\t'.join(hits[1])),
-            ],
-            'contig2': [
-                pymummer.alignment.Alignment('\t'.join(hits[2])),
-            ]
-        }
-
-        expected = {'contig1': 85, 'contig2': 11}
-        self.assertEqual(expected, c._nucmer_hits_to_gene_cov_per_contig())
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_nucmer_hits_to_assembled_gene_sequences(self):
-        '''test _nucmer_hits_to_assembled_gene_sequences'''
-        ref_gene = pyfastaq.sequences.Fasta('ref_gene', 'ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAG')
-        assembly = {
-            'contig1': pyfastaq.sequences.Fasta('contig1', 'CATCTATGCTGCATCGATCACTGACGTATCATCATCAGCGTACTGACGTATTAGTTTGTAATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAGACGTCGTACTACTGTATATGCATCGATCTGAA'),
-            'contig2': pyfastaq.sequences.Fasta('contig2', 'AGTGATATCCTGCGATCTATAATTTTTTTCGCGGGATCTTGAACGCGACGATGTTCGATAATTCAATGCAAAGGAGCGACCCGCAAGTACACAGGACTGCAAA')
-        }
-
-        hits = [
-            ['1', '147', '61', '207', '147', '147', '100.00', '147', '239', '1', '1', 'ref_gene', 'contig1'],
-            ['18', '120', '103', '1', '103', '103', '100.00', '147', '103', '1', '-1', 'ref_gene', 'contig2']
-        ]
-        nucmer_hits = {
-            'contig1': [
-                pymummer.alignment.Alignment('\t'.join(hits[0])),
-            ],
-            'contig2': [
-                pymummer.alignment.Alignment('\t'.join(hits[1])),
-            ]
-        }
-
-        assembly_fasta = os.path.join(data_dir, 'cluster_test_nucmer_hits_to_assembled_gene_sequences.assembly.fa')
-        tmp_outfile = 'tmp.test_nucmer_hits_to_assembled_gene_sequences.out.fa'
-        expected_outfile = os.path.join(data_dir, 'cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa')
-        cluster.Cluster._nucmer_hits_to_assembled_gene_sequences(nucmer_hits, ref_gene, assembly, tmp_outfile)
-        self.assertTrue(filecmp.cmp(tmp_outfile, expected_outfile, shallow=False))
-        os.unlink(tmp_outfile)
-
-
-    def test_whole_gene_covered_by_nucmer_hits(self):
-        '''test _whole_gene_covered_by_nucmer_hits'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'ACGTGTGCAT')
-        hit1 = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'gene', 'contig1']
-        hit2 = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2']
-        hit3 = ['6', '10', '6', '10', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2']
-        nucmer_hits = [
-            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]},
-            {'contig2': [pymummer.alignment.Alignment('\t'.join(hit2))]},
-            {'contig2': [pymummer.alignment.Alignment('\t'.join(hit2)), pymummer.alignment.Alignment('\t'.join(hit3))]}
-        ]
-        expected = [True, False, True]
-        for i in range(len(nucmer_hits)):
-            c.nucmer_hits = nucmer_hits[i]
-            self.assertEqual(expected[i], c._whole_gene_covered_by_nucmer_hits())
-
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_gene_coverage_unique(self):
-        '''test _gene_coverage_unique'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'ACGTGTGCAT')
-        hit1 = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'gene', 'contig1']
-        hit2 = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2']
-        c.nucmer_hits = { 'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))] }
-        self.assertTrue(c._gene_coverage_unique())
-        c.nucmer_hits['contig2'] = [pymummer.alignment.Alignment('\t'.join(hit2))]
-        self.assertFalse(c._gene_coverage_unique())
-
-
-    def test_gene_covered_by_complete_contig_with_orf(self):
-        '''test _gene_covered_by_complete_contig_with_orf'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        gene_no_orf = pyfastaq.sequences.Fasta('gene', 'GATTGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        c.gene = gene
-        hit1 = ['1', '39', '1', '39', '39', '39', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
-        hit2 = ['1', '20', '1', '20', '20', '20', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
-        hit3 = ['21', '39', '21', '39', '19', '19', '100.00', '39', '39', '1', '1', 'gene', 'contig2']
-        nucmer_hits = [
-            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]},
-            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]},
-            {'contig2': [pymummer.alignment.Alignment('\t'.join(hit2))]},
-            {'contig2': [pymummer.alignment.Alignment('\t'.join(hit2)), pymummer.alignment.Alignment('\t'.join(hit3))]},
-        ]
-        expected = [True, False, False, False]
-        assemblies = [
-            {'contig1': gene},
-            {'contig1': gene_no_orf},
-            {'contig1': gene},
-            {'contig1': gene, 'contig2': pyfastaq.sequences.Fasta('contig2', 'ACGT')}
-        ]
-        assert len(expected) == len(nucmer_hits) == len(assemblies)
-        for i in range(len(expected)):
-            c.final_assembly = assemblies[i]
-            c.nucmer_hits = nucmer_hits[i]
-            self.assertEqual(c._gene_covered_by_complete_contig_with_orf(), expected[i])
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_gene_covered_by_at_least_one_full_length_contig(self):
-        '''test _gene_covered_by_at_least_one_full_length_contig'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        hit1 = ['1', '39', '1', '39', '39', '39', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
-        hit2 = ['1', '20', '1', '20', '20', '20', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
-        nucmer_hits = [
-            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]},
-            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit2))]},
-        ]
-        expected = [True, False]
-        assert len(expected) == len(nucmer_hits)
-        for i in range(len(expected)):
-            c.nucmer_hits = nucmer_hits[i]
-            self.assertEqual(c._gene_covered_by_at_least_one_full_length_contig(), expected[i])
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_mummer_variants(self):
-        '''test _get_mummer_variants'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        snp_file = os.path.join(data_dir, 'cluster_test_get_mummer_variants.none.snps')
-        shutil.copyfile(snp_file, c.assembly_vs_gene_coords + '.snps')
-        c._get_mummer_variants()
-        self.assertEqual(c.mummer_variants, {})
-
-        clean_cluster_dir(cluster_dir)
-        snp_file = os.path.join(data_dir, 'cluster_test_get_mummer_variants.snp.snps')
-        shutil.copyfile(snp_file, c.assembly_vs_gene_coords + '.snps')
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
-        v2 = pymummer.variant.Variant(pymummer.snp.Snp('42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig2'))
-        v3 = pymummer.variant.Variant(pymummer.snp.Snp('40\tT\tC\t40\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
-        v4 = pymummer.variant.Variant(pymummer.snp.Snp('2\tC\tG\t2\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
-        expected = {
-            'contig1': [[v4], [v3, v1]],
-            'contig2': [[v2]]
-        }
-        shutil.copyfile(snp_file, c.assembly_vs_gene_coords + '.snps')
-        c._get_mummer_variants()
-        self.assertEqual(c.mummer_variants, expected)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_filter_mummer_variants(self):
-        '''test filter_mummer_variants'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v2 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v3 = pymummer.variant.Variant(pymummer.snp.Snp('12\tG\tT\t12\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        c.mummer_variants = {'contig': [[v1, v2], v3]}
-        c._filter_mummer_variants()
-        expected = {'contig': [[v1, v2]]}
-        self.assertEqual(expected, c.mummer_variants)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_codon_start(self):
-        '''test _get_codon_start'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        tests = [
-            (0, 5, 3),
-            (0, 0, 0),
-            (0, 1, 0),
-            (0, 2, 0),
-            (1, 3, 1),
-            (2, 3, 2),
-            (3, 3, 3),
-            (3, 6, 6),
-            (3, 7, 6),
-            (3, 8, 6),
-        ]
-        for t in tests:
-            self.assertEqual(c._get_codon_start(t[0], t[1]), t[2])
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_variant_effect(self):
-        '''test _get_variant_effect'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v2 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v3 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v4 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v5 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v6 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tA\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v7 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v7.qry_base = 'GAT'
-        v8 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v8.qry_base = 'TGA'
-        v9 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v9.qry_base = 'ATTCCT'
-        v10 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v10.ref_base = 'CGC'
-        v10.ref_end = 5
-        v11 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v11.ref_base = 'CGCGAA'
-        v11.ref_end = 8
-
-        variants = [
-            ([v1], ('SYN', '.')),
-            ([v2], ('NONSYN', 'R2S')),
-            ([v2, v1], ('NONSYN', 'R2S')),
-            ([v3, v4], ('TRUNC', 'R2trunc')),
-            ([v5], ('FSHIFT', 'R2fs')),
-            ([v6], ('FSHIFT', 'R2fs')),
-            ([v7], ('INS', 'R2_E3insD')),
-            ([v8], ('TRUNC', 'R2trunc')),
-            ([v9], ('INS', 'R2_E3insIP')),
-            ([v10], ('DEL', 'R2del')),
-            ([v11], ('DEL', 'R2_E3del')),
-        ]
+            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t28\tC\t83\t83\tT\t22\t.\t22\tpresence_absence1:p:A10V:.:Ref has wild, reads have variant so report\tGeneric description of presence_absence1',
+            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t0\t.\tp\t.\t0\t.\tSYN\t53\t53\tT\t108\t108\tC\t32\t.\t32\t.\tGeneric description of presence_absence1',
 
-        for t in variants:
-            self.assertEqual(t[1], c._get_variant_effect(t[0]))
-
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_make_assembly_vcf(self):
-        '''test _make_assembly_vcf'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.final_assembly_fa = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.fa')
-        c.final_assembly_bam = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.bam')
-        expected_vcf = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.vcf')
-        expected_depths = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.read_depths.gz')
-        c._make_assembly_vcf()
-
-        def get_vcf_call_lines(fname):
-            with open(fname) as f:
-                lines = [x for x in f.readlines() if not x.startswith('#')]
-            return lines
-
-        expected_lines = get_vcf_call_lines(expected_vcf)
-        got_lines = get_vcf_call_lines(c.final_assembly_vcf)
-        self.assertEqual(expected_lines, got_lines)
-        self.assertEqual(file2lines(expected_depths), file2lines(c.final_assembly_read_depths))
-        clean_cluster_dir(cluster_dir)
-
-    def test_get_assembly_read_depths(self):
-        '''test _get_assembly_read_depths'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_get_assembly_read_depths.gz')
-        tests = [
-            ( ('ref1', 42), None ),
-            ( ('ref2', 1), None ),
-            ( ('ref1', 0), ('G', '.', 1, '1') ),
-            ( ('ref1', 2), ('T', 'A', 3, '2,1') ),
-            ( ('ref1', 3), ('C', 'A,G', 42, '21,11,10') ),
-            ( ('ref1', 4), ('C', 'AC', 41, '0,42') )
+            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t62\t64\tC;G;C\t18;17;17\t.;.;.\t18;17;17\tpresence_absence1:p:R3S:.:Ref and assembly have wild type\tGeneric description of presence_absence1',
+
+            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t15.0\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t68\t70\tG;C;G\t18;20;20\t.;.;.\t18;20;20\tpresence_absence1:p:I5A:.:Ref and reads have variant so report\tGeneric description of presence_absence1',
         ]
 
-        for t in tests:
-            self.assertEqual(c._get_assembly_read_depths(t[0][0], t[0][1]), t[1])
+        self.assertEqual(expected, c.report_lines)
+        shutil.rmtree(tmpdir)
+
+
+    def test_full_run_ok_variants_only_variant_not_present(self):
+        '''test complete run of cluster on a variants only gene when variant not present'''
+        refdata = reference_data.ReferenceData(
+            variants_only_fa=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa'),
+            metadata_tsv=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.not_present.metadata.tsv'),
+        )
+
+        tmpdir = 'tmp.cluster_test_full_run_ok_variants_only.not_present'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir)
 
-    def test_get_samtools_variant_positions(self):
-        '''test _get_samtools_variant_positions'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.final_assembly_vcf = os.path.join(data_dir, 'cluster_test_get_samtools_variant_positions.vcf')
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300)
+        c.run()
         expected = [
-            ('16__cat_2_M35190.scaffold.1', 92),
-            ('16__cat_2_M35190.scaffold.1', 179),
-            ('16__cat_2_M35190.scaffold.1', 263),
-            ('16__cat_2_M35190.scaffold.6', 93)
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1:p:R3S:.:Ref and assembly have wild type, so do not report\tGeneric description of variants_only1'
         ]
-        self.assertEqual(expected, c._get_samtools_variant_positions())
-
-
-    def test_get_samtools_variants(self):
-        '''test _get_samtools_variants'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.final_assembly_vcf = os.path.join(data_dir, 'cluster_test_get_samtools_variants.vcf')
-        c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_get_samtools_variants.read_depths.gz')
-        positions = [
-            ('16__cat_2_M35190.scaffold.1', 92),
-            ('16__cat_2_M35190.scaffold.1', 179),
-            ('16__cat_2_M35190.scaffold.1', 263),
-            ('16__cat_2_M35190.scaffold.6', 93)
-        ]
-        expected = {
-            '16__cat_2_M35190.scaffold.1': {
-                92: ('T', 'A', 123, '65,58'),
-                179: ('A', 'T', 86, '41,45'),
-                263: ('G', 'C', 97, '53,44'),
-            },
-            '16__cat_2_M35190.scaffold.6': {
-                93: ('T', 'G', 99, '56,43')
-            }
-        }
-
-        got = c._get_samtools_variants(positions)
-        self.assertEqual(expected, got)
-
-
-    def test_get_vcf_variant_counts(self):
-        '''test _get_vcf_variant_counts'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        hit = ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1']
-        c.nucmer_hits = {
-            'scaff1': [pymummer.alignment.Alignment('\t'.join(hit))]
-        }
-
-        c.final_assembly_vcf = os.path.join(data_dir, 'cluster_test_get_vcf_variant_counts.vcf')
-        c._get_vcf_variant_counts()
-        expected = {'scaff1': 1}
-        self.assertEqual(expected, c.vcf_variant_counts)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_make_report_lines_nonsynonymous(self):
-        '''test _make_report_lines'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'cluster_name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('8\tA\tG\t8\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-
-        nucmer_hit = ['1', '10', '1', '10', '10', '10', '90.00', '1000', '1000', '1', '1', 'gene', 'contig']
-        c.nucmer_hits = {'contig': [pymummer.alignment.Alignment('\t'.join(nucmer_hit))]}
-        c.mummer_variants = {'contig': [[v1]]}
-        c.percent_identities = {'contig': 92.42}
-        c.status_flag.set_flag(42)
-        c.assembled_ok = True
-        c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_make_report_lines.read_depths.gz')
-        c._make_report_lines()
-        expected = [[
-            'gene',
-            554,
-            2,
-            'cluster_name',
-            39,
-            10,
-            92.42,
-            'SNP',
-            'NONSYN',
-            'E3G',
-            8,
-            8,
-            'A',
-            'contig',
-            39,
-            8,
-            8,
-            'G',
-            '.',
-            '.',
-            '.'
-        ]]
-        self.assertEqual(expected, c.report_lines)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_make_report_lines_synonymous(self):
-        '''test _make_report_lines'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'cluster_name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-
-        nucmer_hit = ['1', '10', '1', '10', '10', '10', '90.00', '1000', '1000', '1', '1', 'gene', 'contig']
-        c.nucmer_hits = {'contig': [pymummer.alignment.Alignment('\t'.join(nucmer_hit))]}
-        c.mummer_variants = {'contig': [[v1]]}
-        c.percent_identities = {'contig': 92.42}
-        c.status_flag.set_flag(42)
-        c.assembled_ok = True
-        c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_make_report_lines.read_depths.gz')
-        c._make_report_lines()
-        expected = [[
-            'gene',
-            42,
-            2,
-            'cluster_name',
-            39,
-            10,
-            92.42,
-            'SNP',
-            'SYN',
-            '.',
-            6,
-            6,
-            'C',
-            'contig',
-            39,
-            6,
-            6,
-            'T',
-            42,
-            'G',
-            '22,20'
-        ]]
         self.assertEqual(expected, c.report_lines)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_make_report_lines_assembly_fail(self):
-        '''test _make_report_lines when assembly fails'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'cluster_name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        c.status_flag.set_flag(64)
-        c.assembled_ok = False
-        c._make_report_lines()
+        shutil.rmtree(tmpdir)
+
+
+    def test_full_run_ok_variants_only_variant_not_present_always_report(self):
+        '''test complete run of cluster on a variants only gene when variant not present but variant always reported'''
+        refdata = reference_data.ReferenceData(
+            variants_only_fa=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa'),
+            metadata_tsv=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv'),
+        )
+
+        tmpdir = 'tmp.cluster_test_full_run_ok_variants_only.not_present.always_report'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir)
+
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300)
+        c.run()
         expected = [
-            [
-                'gene',
-                64,
-                2,
-                'cluster_name',
-                39,
-            ] + ['.'] * 16
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1:p:R3S:.:Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1'
         ]
         self.assertEqual(expected, c.report_lines)
-        clean_cluster_dir(cluster_dir)
+        shutil.rmtree(tmpdir)
+
 
+    def test_full_run_ok_variants_only_variant_is_present(self):
+        '''test complete run of cluster on a variants only gene when variant is present'''
+        refdata = reference_data.ReferenceData(
+            variants_only_fa=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa'),
+            metadata_tsv=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.present.metadata.tsv'),
+        )
+
+        tmpdir = 'tmp.cluster_test_full_run_ok_variants_only.present'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir)
+
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300)
+        c.run()
+
+        expected = [
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1:p:R3S:.:Ref and assembly have wild type\tGeneric description of variants_only1',
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t15.3\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t71\t73\tG;C;G\t17;17;17\t.;.;.\t17;17;17\tvariants_only1:p:I5A:.:Ref and reads have variant so report\tGeneric description of variants_only1',
+        ]
+        self.assertEqual(expected, c.report_lines)
+        shutil.rmtree(tmpdir)
diff --git a/ariba/tests/clusters_test.py b/ariba/tests/clusters_test.py
index 3d9a6d5..60ec0c4 100644
--- a/ariba/tests/clusters_test.py
+++ b/ariba/tests/clusters_test.py
@@ -1,65 +1,101 @@
 import unittest
 import shutil
 import os
+import pickle
 import pysam
 import pyfastaq
 import filecmp
-from ariba import clusters
+from ariba import clusters, external_progs, reference_data, sequence_metadata
 
 modules_dir = os.path.dirname(os.path.abspath(clusters.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
+
+
+def file_to_list(infile):
+    f = pyfastaq.utils.open_file_read(infile)
+    lines = f.readlines()
+    pyfastaq.utils.close(f)
+    return lines
 
 
 class TestClusters(unittest.TestCase):
     def setUp(self):
         self.cluster_dir = 'tmp.Cluster'
+        self.refdata_dir = 'tmp.RefData'
+        os.mkdir(self.refdata_dir)
+        shutil.copyfile(os.path.join(data_dir, 'clusters_test_dummy_db.fa'), os.path.join(self.refdata_dir, 'refcheck.01.check_variants.non_coding.fa'))
+        with open(os.path.join(self.refdata_dir, 'info.txt'), 'w') as f:
+            print('genetic_code\t11', file=f)
+
+        with open(os.path.join(self.refdata_dir, 'cdhit.clusters.pickle'), 'wb') as f:
+            pickle.dump({'x': {'x'}}, f)
+
         reads1 = os.path.join(data_dir, 'clusters_test_dummy_reads_1.fq')
         reads2 = os.path.join(data_dir, 'clusters_test_dummy_reads_2.fq')
-        db = os.path.join(data_dir, 'clusters_test_dummy_db.fa')
-        self.clusters = clusters.Clusters(db, reads1, reads2, self.cluster_dir)
+        self.clusters = clusters.Clusters(self.refdata_dir, reads1, reads2, self.cluster_dir, extern_progs, clean=False)
 
 
     def tearDown(self):
         shutil.rmtree(self.cluster_dir)
+        shutil.rmtree(self.refdata_dir)
 
 
-    def test_sam_to_fastq(self):
-        '''test _sam_to_fastq'''
-        expected = [
-            pyfastaq.sequences.Fastq('read1/1', 'GTATGAGTAGATATAAAGTCCGGAACTGTGATCGGGGGCGATTTATTTACTGGCCGTCCC', 'GHIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII'),
-            pyfastaq.sequences.Fastq('read1/2', 'TCCCATACGTTGCAATCTGCAGACGCCACTCTTCCACGTCGGACGAACGCAACGTCAGGA', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGEDCBA')
-        ]
+    def test_load_reference_data_info_file(self):
+        '''test _load_reference_data_info_file'''
+        infile = os.path.join(data_dir, 'clusters_test_load_data_info_file')
+        expected = {'genetic_code': 11}
+        got = clusters.Clusters._load_reference_data_info_file(infile)
+        self.assertEqual(expected, got)
 
 
-        sam_reader = pysam.Samfile(os.path.join(data_dir, 'clusters_test_sam_to_fastq.bam'), "rb")
-        i = 0
-        for s in sam_reader.fetch(until_eof=True):
-            self.assertEqual(expected[i], self.clusters._sam_to_fastq(s))
-            i += 1
+    def test_load_reference_data_from_dir(self):
+        '''test _load_reference_data_from_dir'''
+        indir = os.path.join(data_dir, 'clusters_test_load_reference_data_from_dir')
+        got_refdata, got_clusters = clusters.Clusters._load_reference_data_from_dir(indir)
+        expected_seq_dicts = {
+            'variants_only': {'variants_only1': pyfastaq.sequences.Fasta('variants_only1', 'atggcgtgcgatgaataa')},
+            'presence_absence': {'presabs1': pyfastaq.sequences.Fasta('presabs1', 'atgatgatgagcccggcgatggaaggcggctag')},
+            'non_coding': {'noncoding1': pyfastaq.sequences.Fasta('noncoding1', 'ACGTA')},
+        }
+        self.assertEqual(expected_seq_dicts, got_refdata.seq_dicts)
+        self.assertEqual(11, got_refdata.genetic_code)
+
+        expected_metadata = {
+            'presabs1': {
+                '.': {sequence_metadata.SequenceMetadata('presabs1\t.\t.\t.\tpresabs1 description')},
+                'n': {},
+                'p': {}
+            },
+            'variants_only1': {
+                '.': set(),
+                'n': {},
+                'p': {1: {sequence_metadata.SequenceMetadata('variants_only1\tp\tC2I\t.\tdescription of variants_only1 C2I')}}
+            }
+        }
+        self.assertEqual(expected_metadata, got_refdata.metadata)
 
+        expected_clusters = {'key1': 1, 'key2': 2}
+        self.assertEqual(expected_clusters, got_clusters)
 
-    def test_sam_pair_to_insert(self):
-        '''test _sam_pair_to_insert'''
-        expected = [
-            None, # both unmapped
-            None, # read 1 unmapped
-            None, # read 2 unmapped
-            None, # mapped to different seqs
-            None, # same seqs, wrong orientation
-            660
-        ]
 
-        sam1 = None
-        i = 0
-        sam_reader = pysam.Samfile(os.path.join(data_dir, 'clusters_test_sam_pair_to_insert.bam'), 'rb')
-        for s in sam_reader.fetch(until_eof=True):
-            if sam1 is None:
-                sam1 = s
-                continue
+    def test_bam_to_clusters_reads_no_reads_map(self):
+        '''test _bam_to_clusters_reads when no reads map'''
+        clusters_dir = 'tmp.Cluster.test_bam_to_clusters_reads_no_reads_map'
+        reads1 = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads_no_reads_map_1.fq')
+        reads2 = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads_no_reads_map_2.fq')
+        ref = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.db.fa')
+        refdata = reference_data.ReferenceData(presence_absence_fa = ref)
+        c = clusters.Clusters(self.refdata_dir, reads1, reads2, clusters_dir, extern_progs, clean=False)
+        shutil.copyfile(os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads_no_reads_map.bam'), c.bam)
+        c._bam_to_clusters_reads()
 
-            self.assertEqual(self.clusters._sam_pair_to_insert(s, sam1), expected[i])
-            sam1 = None
-            i += 1
+        self.assertEqual({}, c.insert_hist.bins)
+        self.assertEqual({}, c.cluster_read_counts)
+        self.assertEqual({}, c.cluster_base_counts)
+        self.assertEqual(0, c.proper_pairs)
+
+        shutil.rmtree(clusters_dir)
 
 
     def test_bam_to_clusters_reads(self):
@@ -68,7 +104,8 @@ class TestClusters(unittest.TestCase):
         reads1 = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.reads_1.fq')
         reads2 = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.reads_2.fq')
         ref = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.db.fa')
-        c = clusters.Clusters(ref, reads1, reads2, clusters_dir)
+        refdata = reference_data.ReferenceData(presence_absence_fa = ref)
+        c = clusters.Clusters(self.refdata_dir, reads1, reads2, clusters_dir, extern_progs, clean=False)
         shutil.copyfile(os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.bam'), c.bam)
         c._bam_to_clusters_reads()
         expected = [
@@ -78,19 +115,14 @@ class TestClusters(unittest.TestCase):
             os.path.join(data_dir, 'clusters_test_bam_to_clusters.out.ref2.reads_2.fq'),
         ]
 
-        got = [
-            os.path.join(clusters_dir, 'Clusters/ref1/reads_1.fq'),
-            os.path.join(clusters_dir, 'Clusters/ref1/reads_2.fq'),
-            os.path.join(clusters_dir, 'Clusters/ref2/reads_1.fq'),
-            os.path.join(clusters_dir, 'Clusters/ref2/reads_2.fq'),
-        ]
-
-
-        for i in range(len(got)):
-            self.assertTrue(os.path.exists(got[i]))
-            self.assertTrue(filecmp.cmp(expected[i], got[i], shallow=False))
+        got_reads_store_lines = file_to_list(os.path.join(clusters_dir, 'read_store.gz'))
+        expected_reads_store_lines = file_to_list(os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.read_store.gz'))
 
+        self.assertEqual(expected_reads_store_lines, got_reads_store_lines)
         self.assertEqual({780:1}, c.insert_hist.bins)
+        self.assertEqual({'ref1': 4, 'ref2': 2}, c.cluster_read_counts)
+        self.assertEqual({'ref1': 240, 'ref2': 120}, c.cluster_base_counts)
+        self.assertEqual(1, c.proper_pairs)
 
         shutil.rmtree(clusters_dir)
 
@@ -121,28 +153,73 @@ class TestClusters(unittest.TestCase):
             def __init__(self, lines):
                 self.report_lines = lines
 
-        self.clusters.clusters = {
-            'gene1': FakeCluster([['gene1 line1']]),
-            'gene2': FakeCluster([['gene2 line2']])
+        clusters_dict = {
+            'gene1': FakeCluster(['gene1\tline1']),
+            'gene2': FakeCluster(['gene2\tline2'])
         }
 
-        self.clusters._write_reports()
-        expected = os.path.join(data_dir, 'clusters_test_write_report.tsv')
-        self.assertTrue(filecmp.cmp(expected, self.clusters.report_file_tsv, shallow=False))
-        self.assertTrue(os.path.exists(self.clusters.report_file_xls))
+        tmp_tsv = 'tmp.test_write_reports.tsv'
+        tmp_xls = 'tmp.test_write_reports.xls'
+        clusters.Clusters._write_reports(clusters_dict, tmp_tsv, tmp_xls)
 
+        expected = os.path.join(data_dir, 'clusters_test_write_report.tsv')
+        self.assertTrue(filecmp.cmp(expected, tmp_tsv, shallow=False))
+        self.assertTrue(os.path.exists(tmp_xls))
+        os.unlink(tmp_tsv)
+        os.unlink(tmp_xls)
+
+
+    def test_write_catted_assembled_seqs_fasta(self):
+        '''test _write_catted_assembled_seqs_fasta'''
+        seq1 = pyfastaq.sequences.Fasta('seq1', 'ACGT')
+        seq2 = pyfastaq.sequences.Fasta('seq2', 'TTTT')
+        seq3 = pyfastaq.sequences.Fasta('seq3', 'AAAA')
+        class FakeAssemblyCompare:
+            def __init__(self, assembled_seqs):
+                if assembled_seqs is not None:
+                    self.assembled_reference_sequences = {x.id: x for x in assembled_seqs}
 
-    def test_write_catted_assembled_genes_fasta(self):
-        '''test _write_catted_assembled_genes_fasta'''
         class FakeCluster:
-            def __init__(self, filename):
-                self.final_assembled_genes_fa = filename
+            def __init__(self, assembled_seqs):
+                self.assembly_compare = FakeAssemblyCompare(assembled_seqs)
 
         self.clusters.clusters = {
-            'gene1': FakeCluster(os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa')),
-            'gene2': FakeCluster(os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa')),
+            'gene1': FakeCluster([seq1, seq2]),
+            'gene2': FakeCluster([seq3]),
+            'gene3': FakeCluster(None),
         }
 
-        self.clusters._write_catted_assembled_genes_fasta()
+        tmp_file = 'tmp.test_write_catted_assembled_seqs_fasta.fa'
+        self.clusters._write_catted_assembled_seqs_fasta(tmp_file)
         expected = os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.expected.out.fa')
-        self.assertTrue(filecmp.cmp(expected, self.clusters.catted_assembled_genes_fasta, shallow=False))
+        self.assertTrue(filecmp.cmp(expected, tmp_file, shallow=False))
+        os.unlink(tmp_file)
+
+
+    def test_write_catted_genes_matching_refs_fasta(self):
+        '''test _write_catted_genes_matching_refs_fasta'''
+        seq1 = pyfastaq.sequences.Fasta('seq1', 'ACGT')
+        seq3 = pyfastaq.sequences.Fasta('seq3', 'AAAA')
+        class FakeAssemblyCompare:
+            def __init__(self, seq, seq_type, start, end):
+                self.gene_matching_ref = seq
+                self.gene_matching_ref_type = seq_type
+                self.gene_start_bases_added = start
+                self.gene_end_bases_added = end
+
+        class FakeCluster:
+            def __init__(self, seq, seq_type, start, end):
+                self.assembly_compare = FakeAssemblyCompare(seq, seq_type, start, end)
+
+        self.clusters.clusters = {
+            'gene1': FakeCluster(seq1, 'TYPE1', 1, 3),
+            'gene2': FakeCluster(None, None, None, None),
+            'gene3': FakeCluster(seq3, 'TYPE3', 4, 5),
+        }
+
+        tmp_file = 'tmp.test_write_catted_genes_matching_refs_fasta.fa'
+        self.clusters._write_catted_genes_matching_refs_fasta(tmp_file)
+        expected = os.path.join(data_dir, 'clusters_test_write_catted_genes_matching_refs_fasta.expected.out.fa')
+        self.assertTrue(filecmp.cmp(expected, tmp_file, shallow=False))
+        os.unlink(tmp_file)
+
diff --git a/ariba/tests/common_test.py b/ariba/tests/common_test.py
new file mode 100644
index 0000000..a3f7915
--- /dev/null
+++ b/ariba/tests/common_test.py
@@ -0,0 +1,22 @@
+import unittest
+import os
+import filecmp
+from ariba import common
+
+modules_dir = os.path.dirname(os.path.abspath(common.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestCommon(unittest.TestCase):
+    def test_cat_files(self):
+        '''test cat_files'''
+        infiles = [
+            os.path.join(data_dir, 'test_common_cat_files.in.1'),
+            os.path.join(data_dir, 'test_common_cat_files.in.2'),
+            os.path.join(data_dir, 'test_common_cat_files.in.3'),
+        ]
+        tmp_out = 'tmp.test.common_cat_files.out'
+        expected = os.path.join(data_dir, 'test_common_cat_files.out')
+        common.cat_files(infiles, tmp_out)
+        self.assertTrue(filecmp.cmp(expected, tmp_out, shallow=False))
+        os.unlink(tmp_out)
diff --git a/ariba/tests/data/aln_to_metadata_load_aln_file.in.fa b/ariba/tests/data/aln_to_metadata_load_aln_file.in.fa
new file mode 100644
index 0000000..4c7c0fe
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_load_aln_file.in.fa
@@ -0,0 +1,4 @@
+>seq1
+ABC-DE
+>seq2
+ABCQDE
diff --git a/ariba/tests/data/aln_to_metadata_load_vars_file_bad.1.tsv b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.1.tsv
new file mode 100644
index 0000000..6152c4a
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.1.tsv
@@ -0,0 +1,2 @@
+seq1	A42B	id1	description 1
+seq2	C43D	id2
diff --git a/ariba/tests/data/aln_to_metadata_load_vars_file_bad.2.tsv b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.2.tsv
new file mode 100644
index 0000000..da6dd35
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_load_vars_file_bad.2.tsv
@@ -0,0 +1,2 @@
+seq1	A42B	id1	description 1
+seq2	wrong_format	id2	description 2
diff --git a/ariba/tests/data/aln_to_metadata_load_vars_file_good.tsv b/ariba/tests/data/aln_to_metadata_load_vars_file_good.tsv
new file mode 100644
index 0000000..058b1de
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_load_vars_file_good.tsv
@@ -0,0 +1,3 @@
+seq1	A42B	id1	description 1
+seq2	C43D	id2	description 2
+seq2	E100F	id3	description 3
diff --git a/ariba/tests/data/aln_to_metadata_make_cluster_file.out b/ariba/tests/data/aln_to_metadata_make_cluster_file.out
new file mode 100644
index 0000000..529a3cc
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_make_cluster_file.out
@@ -0,0 +1 @@
+seq2	seq1	seq3
diff --git a/ariba/tests/data/aln_to_metadata_run_coding.in.fa b/ariba/tests/data/aln_to_metadata_run_coding.in.fa
new file mode 100644
index 0000000..c71f8c1
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_coding.in.fa
@@ -0,0 +1,10 @@
+>seq1
+ATG---GCTAATTAG
+>seq2
+ATG---GCTAATTAG
+>seq3
+ATGTTT---AATTAG
+>seq4
+ATGTTTTGTAATTAG
+>seq5
+ATGTTTGATAATTAG
diff --git a/ariba/tests/data/aln_to_metadata_run_coding.in.tsv b/ariba/tests/data/aln_to_metadata_run_coding.in.tsv
new file mode 100644
index 0000000..552e7a5
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_coding.in.tsv
@@ -0,0 +1,2 @@
+seq1	A2D	id1	description 1
+seq5	F2E	id2	description 2
diff --git a/ariba/tests/data/aln_to_metadata_run_coding.out.cluster b/ariba/tests/data/aln_to_metadata_run_coding.out.cluster
new file mode 100644
index 0000000..6df8ac7
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_coding.out.cluster
@@ -0,0 +1 @@
+seq3	seq1	seq2	seq4	seq5
diff --git a/ariba/tests/data/aln_to_metadata_run_coding.out.fa b/ariba/tests/data/aln_to_metadata_run_coding.out.fa
new file mode 100644
index 0000000..97d0f12
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_coding.out.fa
@@ -0,0 +1,10 @@
+>seq1
+ATGGCTAATTAG
+>seq2
+ATGGCTAATTAG
+>seq3
+ATGTTTAATTAG
+>seq4
+ATGTTTTGTAATTAG
+>seq5
+ATGTTTGATAATTAG
diff --git a/ariba/tests/data/aln_to_metadata_run_coding.out.tsv b/ariba/tests/data/aln_to_metadata_run_coding.out.tsv
new file mode 100644
index 0000000..ee957fa
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_coding.out.tsv
@@ -0,0 +1,7 @@
+seq1	p	A2D	id1	description 1
+seq2	p	A2D	id1	description 1
+seq4	p	C3D	id1	description 1
+seq5	p	A3D	id1	description 1
+seq5	p	F2E	id2	description 2
+seq3	p	F2E	id2	description 2
+seq4	p	F2E	id2	description 2
diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.in.fa b/ariba/tests/data/aln_to_metadata_run_noncoding.in.fa
new file mode 100644
index 0000000..2bc5657
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_noncoding.in.fa
@@ -0,0 +1,10 @@
+>seq1
+ATG---GCTAATTAG
+>seq2
+ATG---GCTAATTAG
+>seq3
+ATGTAT---AATTAG
+>seq4
+ATGTGTTGTAATTAG
+>seq5
+ATGTTTGATAATTAG
diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.in.tsv b/ariba/tests/data/aln_to_metadata_run_noncoding.in.tsv
new file mode 100644
index 0000000..3d32d77
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_noncoding.in.tsv
@@ -0,0 +1,2 @@
+seq1	C5T	id1	description 1
+seq5	A5T	id2	description 2
diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.out.cluster b/ariba/tests/data/aln_to_metadata_run_noncoding.out.cluster
new file mode 100644
index 0000000..aee4e5a
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_noncoding.out.cluster
@@ -0,0 +1 @@
+seq2	seq1	seq3	seq4	seq5
diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.out.fa b/ariba/tests/data/aln_to_metadata_run_noncoding.out.fa
new file mode 100644
index 0000000..e737be6
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_noncoding.out.fa
@@ -0,0 +1,10 @@
+>seq1
+ATGGCTAATTAG
+>seq2
+ATGGCTAATTAG
+>seq3
+ATGTATAATTAG
+>seq4
+ATGTGTTGTAATTAG
+>seq5
+ATGTTTGATAATTAG
diff --git a/ariba/tests/data/aln_to_metadata_run_noncoding.out.tsv b/ariba/tests/data/aln_to_metadata_run_noncoding.out.tsv
new file mode 100644
index 0000000..7ba82bf
--- /dev/null
+++ b/ariba/tests/data/aln_to_metadata_run_noncoding.out.tsv
@@ -0,0 +1,7 @@
+seq1	n	C5T	id1	description 1
+seq2	n	C5T	id1	description 1
+seq4	n	G8T	id1	description 1
+seq5	n	A8T	id1	description 1
+seq5	n	A5T	id2	description 2
+seq3	n	A5T	id2	description 2
+seq4	n	G5T	id2	description 2
diff --git a/ariba/tests/data/assembly_compare_parse_nucmer_coords_file.coords b/ariba/tests/data/assembly_compare_parse_nucmer_coords_file.coords
new file mode 100644
index 0000000..47c1894
--- /dev/null
+++ b/ariba/tests/data/assembly_compare_parse_nucmer_coords_file.coords
@@ -0,0 +1,7 @@
+a.fa b.fa
+NUCMER
+
+[S1]	[E1]	[S2]	[E2]	[LEN 1]	[LEN 2]	[% IDY]	[LEN R]	[LEN Q]	[FRM]	[TAGS]
+1	1000	1	1000	1000	1000	100.00	1000	1000	1	1	ref	contig1	[IDENTITY]
+1	240	1	240	240	240	100.00	1000	580	1	1	ref	contig2
+661	1000	241	580	340	340	100.00	1000	580	1	1	ref	contig2
diff --git a/ariba/tests/data/cluster_test_assemble_with_spades/reads_1.fq b/ariba/tests/data/assembly_test_assemble_with_spades_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_assemble_with_spades/reads_1.fq
rename to ariba/tests/data/assembly_test_assemble_with_spades_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_assemble_with_spades/reads_2.fq b/ariba/tests/data/assembly_test_assemble_with_spades_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_assemble_with_spades/reads_2.fq
rename to ariba/tests/data/assembly_test_assemble_with_spades_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_assemble_with_spades.gene.fa b/ariba/tests/data/assembly_test_assemble_with_spades_ref.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_assemble_with_spades.gene.fa
rename to ariba/tests/data/assembly_test_assemble_with_spades_ref.fa
diff --git a/ariba/tests/data/assembly_test_check_spades_log_file.log.bad b/ariba/tests/data/assembly_test_check_spades_log_file.log.bad
new file mode 100644
index 0000000..df88162
--- /dev/null
+++ b/ariba/tests/data/assembly_test_check_spades_log_file.log.bad
@@ -0,0 +1,5 @@
+line 1
+line 2
+
+== Error ==  system call for: "['/foo/bar/SPAdes-3.6.0-Linux/bin/spades', '/spam/eggs/K21/configs/config.info']" finished abnormally, err code: -7
+
diff --git a/ariba/tests/data/assembly_test_check_spades_log_file.log.good b/ariba/tests/data/assembly_test_check_spades_log_file.log.good
new file mode 100644
index 0000000..f0f0687
--- /dev/null
+++ b/ariba/tests/data/assembly_test_check_spades_log_file.log.good
@@ -0,0 +1,5 @@
+This is a dummy spades log file.
+
+It doesn't look like a real spades log file.
+
+But it doesn't contain any of the error lines that would make ariba stop.
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation.in.fa b/ariba/tests/data/assembly_test_fix_contig_orientation.in.fa
similarity index 77%
rename from ariba/tests/data/cluster_test_fix_contig_orientation.in.fa
rename to ariba/tests/data/assembly_test_fix_contig_orientation.in.fa
index 183e911..0b9605b 100644
--- a/ariba/tests/data/cluster_test_fix_contig_orientation.in.fa
+++ b/ariba/tests/data/assembly_test_fix_contig_orientation.in.fa
@@ -24,3 +24,10 @@ ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
+>match_both_strands
+CGCTTCGGTCCACCATGATGGAGCGCCATGTGATGGGATTTCCAACCCCGTTGTTTCAGG
+ACTCATGGCATTTACCACCGACAACCGTTTATAATCCATGAGCAAGGAATACAGTGGAGA
+CAGGATTGGTTGTATTGGACTGAATACATGCCCCACTGTTACCCCGAAAGTTAACACGTA
+TAGTTTAAATAAACTATGGGTACGTGTTAACTTTCGGGGTAACAGTGGGGCATGTATTCA
+GTCCAATACAACCAATCCTGTCTCCACTGTATTCCTTGCTCATGGATTATAAACGGTTGT
+CGGTGGTAAATGCCATGAGTCCTGAAACAACGGGGTTGGAAATCCCATCACATGGCGCTC
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation.out.fa b/ariba/tests/data/assembly_test_fix_contig_orientation.out.fa
similarity index 77%
rename from ariba/tests/data/cluster_test_fix_contig_orientation.out.fa
rename to ariba/tests/data/assembly_test_fix_contig_orientation.out.fa
index ce3a0b5..a1d0f7b 100644
--- a/ariba/tests/data/cluster_test_fix_contig_orientation.out.fa
+++ b/ariba/tests/data/assembly_test_fix_contig_orientation.out.fa
@@ -24,3 +24,10 @@ ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
+>match_both_strands
+CGCTTCGGTCCACCATGATGGAGCGCCATGTGATGGGATTTCCAACCCCGTTGTTTCAGG
+ACTCATGGCATTTACCACCGACAACCGTTTATAATCCATGAGCAAGGAATACAGTGGAGA
+CAGGATTGGTTGTATTGGACTGAATACATGCCCCACTGTTACCCCGAAAGTTAACACGTA
+TAGTTTAAATAAACTATGGGTACGTGTTAACTTTCGGGGTAACAGTGGGGCATGTATTCA
+GTCCAATACAACCAATCCTGTCTCCACTGTATTCCTTGCTCATGGATTATAAACGGTTGT
+CGGTGGTAAATGCCATGAGTCCTGAAACAACGGGGTTGGAAATCCCATCACATGGCGCTC
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation.gene.fa b/ariba/tests/data/assembly_test_fix_contig_orientation.ref.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_fix_contig_orientation.gene.fa
rename to ariba/tests/data/assembly_test_fix_contig_orientation.ref.fa
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa b/ariba/tests/data/assembly_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa
rename to ariba/tests/data/assembly_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa b/ariba/tests/data/assembly_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa
rename to ariba/tests/data/assembly_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller/reads_1.fq b/ariba/tests/data/assembly_test_gapfill_with_gapfiller_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_gapfill_with_gapfiller/reads_1.fq
rename to ariba/tests/data/assembly_test_gapfill_with_gapfiller_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller/reads_2.fq b/ariba/tests/data/assembly_test_gapfill_with_gapfiller_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_gapfill_with_gapfiller/reads_2.fq
rename to ariba/tests/data/assembly_test_gapfill_with_gapfiller_reads_2.fq
diff --git a/ariba/tests/data/assembly_test_has_gaps_to_fill.has_gaps.fa b/ariba/tests/data/assembly_test_has_gaps_to_fill.has_gaps.fa
new file mode 100644
index 0000000..14bf0bc
--- /dev/null
+++ b/ariba/tests/data/assembly_test_has_gaps_to_fill.has_gaps.fa
@@ -0,0 +1,2 @@
+>seq
+CATCATCATCATnCATAATATATATAT
diff --git a/ariba/tests/data/assembly_test_has_gaps_to_fill.no_gaps.fa b/ariba/tests/data/assembly_test_has_gaps_to_fill.no_gaps.fa
new file mode 100644
index 0000000..896d4d4
--- /dev/null
+++ b/ariba/tests/data/assembly_test_has_gaps_to_fill.no_gaps.fa
@@ -0,0 +1,4 @@
+>seq
+ACTATCCATGCATGCATACT
+>seq2
+CACACGTCAGTCAAG
diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam.assembly.fa b/ariba/tests/data/assembly_test_parse_assembly_bam.assembly.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_parse_assembly_bam.assembly.fa
rename to ariba/tests/data/assembly_test_parse_assembly_bam.assembly.fa
diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam.bam b/ariba/tests/data/assembly_test_parse_assembly_bam.bam
similarity index 100%
rename from ariba/tests/data/cluster_test_parse_assembly_bam.bam
rename to ariba/tests/data/assembly_test_parse_assembly_bam.bam
diff --git a/ariba/tests/data/cluster_test_rename_scaffolds.in.fa b/ariba/tests/data/assembly_test_rename_scaffolds.in.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_rename_scaffolds.in.fa
rename to ariba/tests/data/assembly_test_rename_scaffolds.in.fa
diff --git a/ariba/tests/data/assembly_test_rename_scaffolds.out.fa b/ariba/tests/data/assembly_test_rename_scaffolds.out.fa
new file mode 100644
index 0000000..5866be1
--- /dev/null
+++ b/ariba/tests/data/assembly_test_rename_scaffolds.out.fa
@@ -0,0 +1,6 @@
+>prefix.scaffold.1
+TACG
+>prefix.scaffold.2
+ACGT
+>prefix.scaffold.3
+CGTA
diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace.contigs.fa b/ariba/tests/data/assembly_test_scaffold_with_sspace_contigs.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_scaffold_with_sspace.contigs.fa
rename to ariba/tests/data/assembly_test_scaffold_with_sspace_contigs.fa
diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace/reads_1.fq b/ariba/tests/data/assembly_test_scaffold_with_sspace_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_scaffold_with_sspace/reads_1.fq
rename to ariba/tests/data/assembly_test_scaffold_with_sspace_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace/reads_2.fq b/ariba/tests/data/assembly_test_scaffold_with_sspace_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_scaffold_with_sspace/reads_2.fq
rename to ariba/tests/data/assembly_test_scaffold_with_sspace_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation/reads_1.fq b/ariba/tests/data/assembly_test_set_assembly_kmer_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_fix_contig_orientation/reads_1.fq
rename to ariba/tests/data/assembly_test_set_assembly_kmer_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation/reads_2.fq b/ariba/tests/data/assembly_test_set_assembly_kmer_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_fix_contig_orientation/reads_2.fq
rename to ariba/tests/data/assembly_test_set_assembly_kmer_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_get_mummer_variants.none.snps b/ariba/tests/data/assembly_variants_test_get_mummer_variants.none.snps
similarity index 100%
rename from ariba/tests/data/cluster_test_get_mummer_variants.none.snps
rename to ariba/tests/data/assembly_variants_test_get_mummer_variants.none.snps
diff --git a/ariba/tests/data/cluster_test_get_mummer_variants.snp.snps b/ariba/tests/data/assembly_variants_test_get_mummer_variants.snp.snps
similarity index 100%
rename from ariba/tests/data/cluster_test_get_mummer_variants.snp.snps
rename to ariba/tests/data/assembly_variants_test_get_mummer_variants.snp.snps
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv
new file mode 100644
index 0000000..f1e3583
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv
@@ -0,0 +1,11 @@
+presence_absence	p	D2E	id1	ref has wild type D (GAT=D, GAA=E)
+presence_absence	p	S3R	id1	ref has variant type R (AGA=R, AGT=S)
+presence_absence	p	D4E	id1	ref has variant type E (GAA=E, GAC=D)
+presence_absence	p	A5D	id1	ref has wild type A (GCG=A)
+variants_only	p	D2E	id1	ref has wild type D (GAT=D, GAA=E)
+variants_only	p	S3R	id1	ref has variant type R (AGA=R, AGT=S)
+variants_only	p	D4E	id1	ref has variant type E (GAA=E, GAC=D)
+variants_only	p	A5D	id1	ref has wild type A (GCG=A)
+non_coding	n	C3A	id1	ref has variant type A
+non_coding	n	T5A	id1	ref has wild type T
+non_coding	n	C6G	id1	ref has variant type G
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa
new file mode 100644
index 0000000..d60406b
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa
@@ -0,0 +1,2 @@
+>presence_absence
+ATGGATAGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv
new file mode 100644
index 0000000..f1e3583
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv
@@ -0,0 +1,11 @@
+presence_absence	p	D2E	id1	ref has wild type D (GAT=D, GAA=E)
+presence_absence	p	S3R	id1	ref has variant type R (AGA=R, AGT=S)
+presence_absence	p	D4E	id1	ref has variant type E (GAA=E, GAC=D)
+presence_absence	p	A5D	id1	ref has wild type A (GCG=A)
+variants_only	p	D2E	id1	ref has wild type D (GAT=D, GAA=E)
+variants_only	p	S3R	id1	ref has variant type R (AGA=R, AGT=S)
+variants_only	p	D4E	id1	ref has variant type E (GAA=E, GAC=D)
+variants_only	p	A5D	id1	ref has wild type A (GCG=A)
+non_coding	n	C3A	id1	ref has variant type A
+non_coding	n	T5A	id1	ref has wild type T
+non_coding	n	C6G	id1	ref has variant type G
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_non_coding.fa b/ariba/tests/data/assembly_variants_test_get_variants_non_coding.fa
new file mode 100644
index 0000000..bc1221d
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_variants_non_coding.fa
@@ -0,0 +1,2 @@
+>non_coding
+CTACTGACGTACTGATCGATCGTATGAA
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.fa b/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.fa
new file mode 100644
index 0000000..d60406b
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.fa
@@ -0,0 +1,2 @@
+>presence_absence
+ATGGATAGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.snps b/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.snps
new file mode 100644
index 0000000..9171478
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.snps
@@ -0,0 +1,3 @@
+9	A	T	9	x	x	42	42	x	x	presence_absence	contig1
+14	C	A	14	x	x	42	42	x	x	presence_absence	contig1
+15	G	C	15	x	x	42	42	x	x	presence_absence	contig1
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_variants_only.fa b/ariba/tests/data/assembly_variants_test_get_variants_variants_only.fa
new file mode 100644
index 0000000..0d2677e
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_variants_variants_only.fa
@@ -0,0 +1,2 @@
+>variants_only
+ATGGATAGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_variants_only.snps b/ariba/tests/data/assembly_variants_test_get_variants_variants_only.snps
new file mode 100644
index 0000000..c9e5c6f
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_variants_variants_only.snps
@@ -0,0 +1,3 @@
+9	A	T	9	x	x	42	42	x	x	variants_only	contig1
+14	C	A	14	x	x	42	42	x	x	variants_only	contig1
+15	G	C	15	x	x	42	42	x	x	variants_only	contig1
diff --git a/ariba/tests/data/cluster_test_choose_best_gene/reads_1.fq b/ariba/tests/data/best_seq_chooser_best_seq_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_choose_best_gene/reads_1.fq
rename to ariba/tests/data/best_seq_chooser_best_seq_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_choose_best_gene/reads_2.fq b/ariba/tests/data/best_seq_chooser_best_seq_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_choose_best_gene/reads_2.fq
rename to ariba/tests/data/best_seq_chooser_best_seq_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_choose_best_gene/genes.fa b/ariba/tests/data/best_seq_chooser_best_seq_ref.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_choose_best_gene/genes.fa
rename to ariba/tests/data/best_seq_chooser_best_seq_ref.fa
diff --git a/ariba/tests/data/best_seq_chooser_best_seq_ref.fa.fai b/ariba/tests/data/best_seq_chooser_best_seq_ref.fa.fai
new file mode 100644
index 0000000..ae52961
--- /dev/null
+++ b/ariba/tests/data/best_seq_chooser_best_seq_ref.fa.fai
@@ -0,0 +1,3 @@
+1	300	3	60	61
+2	279	311	60	61
+3	300	598	60	61
diff --git a/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_1.fq b/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_1.fq
rename to ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_2.fq b/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_2.fq
rename to ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/genes.fa b/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_ref.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/genes.fa
rename to ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_ref.fa
diff --git a/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_ref.fa.fai b/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_ref.fa.fai
new file mode 100644
index 0000000..ae52961
--- /dev/null
+++ b/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_ref.fa.fai
@@ -0,0 +1,3 @@
+1	300	3	60	61
+2	279	311	60	61
+3	300	598	60	61
diff --git a/ariba/tests/data/cluster_test_get_total_alignment_score/reads_1.fq b/ariba/tests/data/best_seq_chooser_total_alignment_score_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_get_total_alignment_score/reads_1.fq
rename to ariba/tests/data/best_seq_chooser_total_alignment_score_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_get_total_alignment_score/reads_2.fq b/ariba/tests/data/best_seq_chooser_total_alignment_score_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_get_total_alignment_score/reads_2.fq
rename to ariba/tests/data/best_seq_chooser_total_alignment_score_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_get_total_alignment_score/genes.fa b/ariba/tests/data/best_seq_chooser_total_alignment_score_ref_seqs.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_get_total_alignment_score/genes.fa
rename to ariba/tests/data/best_seq_chooser_total_alignment_score_ref_seqs.fa
diff --git a/ariba/tests/data/best_seq_chooser_total_alignment_score_ref_seqs.fa.fai b/ariba/tests/data/best_seq_chooser_total_alignment_score_ref_seqs.fa.fai
new file mode 100644
index 0000000..ae52961
--- /dev/null
+++ b/ariba/tests/data/best_seq_chooser_total_alignment_score_ref_seqs.fa.fai
@@ -0,0 +1,3 @@
+1	300	3	60	61
+2	279	311	60	61
+3	300	598	60	61
diff --git a/ariba/tests/data/cdhit_test_enumerate_fasta.in.fa b/ariba/tests/data/cdhit_test_enumerate_fasta.in.fa
deleted file mode 100644
index 85ca4cb..0000000
--- a/ariba/tests/data/cdhit_test_enumerate_fasta.in.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->a
-A
->b
-G
->c
-T
diff --git a/ariba/tests/data/cdhit_test_enumerate_fasta.out.fa b/ariba/tests/data/cdhit_test_enumerate_fasta.out.fa
deleted file mode 100644
index 4b36e89..0000000
--- a/ariba/tests/data/cdhit_test_enumerate_fasta.out.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->1
-A
->2
-G
->3
-T
diff --git a/ariba/tests/data/cdhit_test_fake_run.out.fa b/ariba/tests/data/cdhit_test_fake_run.out.fa
index 2a8bbc9..b3cdef0 100644
--- a/ariba/tests/data/cdhit_test_fake_run.out.fa
+++ b/ariba/tests/data/cdhit_test_fake_run.out.fa
@@ -1,4 +1,4 @@
->0
+>seq1.x
 TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
 GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
 AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
@@ -8,7 +8,7 @@ ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
 GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
 TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
 AACTCTATCGTAGGGTCGCA
->1
+>seq2.x
 TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
 GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
 AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
@@ -18,7 +18,7 @@ ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
 GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
 TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
 AACTCTATGTAGGGTCGCA
->2
+>seq3.x
 TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA
 GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
 AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA
@@ -28,7 +28,7 @@ ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
 GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
 TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA
 AACTCTATGTAGGGTCGCA
->3
+>seq4.x
 CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
 TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
 TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
diff --git a/ariba/tests/data/cdhit_test_load_user_clusters_file.bad1 b/ariba/tests/data/cdhit_test_load_user_clusters_file.bad1
new file mode 100644
index 0000000..5722176
--- /dev/null
+++ b/ariba/tests/data/cdhit_test_load_user_clusters_file.bad1
@@ -0,0 +1 @@
+seq1 seq1
diff --git a/ariba/tests/data/cdhit_test_load_user_clusters_file.bad2 b/ariba/tests/data/cdhit_test_load_user_clusters_file.bad2
new file mode 100644
index 0000000..38b9f39
--- /dev/null
+++ b/ariba/tests/data/cdhit_test_load_user_clusters_file.bad2
@@ -0,0 +1,2 @@
+seq1 seq2
+seq3 seq1
diff --git a/ariba/tests/data/cdhit_test_load_user_clusters_file.bad3 b/ariba/tests/data/cdhit_test_load_user_clusters_file.bad3
new file mode 100644
index 0000000..dfc59c9
--- /dev/null
+++ b/ariba/tests/data/cdhit_test_load_user_clusters_file.bad3
@@ -0,0 +1,2 @@
+seq1 seq2
+seq3 seq2
diff --git a/ariba/tests/data/cdhit_test_load_user_clusters_file.good b/ariba/tests/data/cdhit_test_load_user_clusters_file.good
new file mode 100644
index 0000000..7b6b6b2
--- /dev/null
+++ b/ariba/tests/data/cdhit_test_load_user_clusters_file.good
@@ -0,0 +1,3 @@
+seq1 seq2 seq3
+seq4
+seq5 seq6
diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.fa b/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.fa
deleted file mode 100644
index bf8b12c..0000000
--- a/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.fa
+++ /dev/null
@@ -1,40 +0,0 @@
->seq1
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
-GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
-ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
-AACTCTATCGTAGGGTCGCA
->seq2
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
-GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
-ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
-AACTCTATGTAGGGTCGCA
->seq3
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA
-GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC
-ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA
-AACTCTATGTAGGGTCGCA
->seq4
-CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
-TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
-TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
-TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG
-AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA
-CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG
-AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA
-GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA
-TGCGGTCAACAAGTCCAGGT
diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.renamed.fa b/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.renamed.fa
deleted file mode 100644
index 9f7eca5..0000000
--- a/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.renamed.fa
+++ /dev/null
@@ -1,40 +0,0 @@
->1
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
-GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
-ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
-AACTCTATCGTAGGGTCGCA
->2
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
-GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
-ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
-AACTCTATGTAGGGTCGCA
->3
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA
-GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC
-ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA
-AACTCTATGTAGGGTCGCA
->4
-CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
-TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
-TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
-TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG
-AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA
-CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG
-AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA
-GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA
-TGCGGTCAACAAGTCCAGGT
diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.infile b/ariba/tests/data/cdhit_test_parse_cluster_info_file.infile
new file mode 100644
index 0000000..548e060
--- /dev/null
+++ b/ariba/tests/data/cdhit_test_parse_cluster_info_file.infile
@@ -0,0 +1,6 @@
+0	500aa, >seq1... *
+0	499aa, >seq2... at 99.40%
+0	499aa, >seq3... at 98.40%
+1	500aa, >seq4... *
+2	300aa, >seq5... at 90.42%
+2	301aa, >seq6... *
diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa b/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa
deleted file mode 100644
index dba7562..0000000
--- a/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa
+++ /dev/null
@@ -1,20 +0,0 @@
->1
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
-GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
-ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
-AACTCTATCGTAGGGTCGCA
->4
-CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
-TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
-TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
-TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG
-AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA
-CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG
-AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA
-GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA
-TGCGGTCAACAAGTCCAGGT
diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa.bak.clstr b/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa.bak.clstr
deleted file mode 100644
index 1745159..0000000
--- a/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa.bak.clstr
+++ /dev/null
@@ -1,4 +0,0 @@
-0	500aa, >1... *
-0	499aa, >2... at 99.40%
-0	499aa, >3... at 98.40%
-1	500aa, >4... *
diff --git a/ariba/tests/data/cdhit_test_rename_clusters.expected.fa b/ariba/tests/data/cdhit_test_rename_clusters.expected.fa
new file mode 100644
index 0000000..bc44a99
--- /dev/null
+++ b/ariba/tests/data/cdhit_test_rename_clusters.expected.fa
@@ -0,0 +1,6 @@
+>seq.x
+AGCT
+>seq.x.2
+CCC
+>seq4.x
+AAA
diff --git a/ariba/tests/data/cdhit_test_rename_clusters.in.fa b/ariba/tests/data/cdhit_test_rename_clusters.in.fa
new file mode 100644
index 0000000..614c0bd
--- /dev/null
+++ b/ariba/tests/data/cdhit_test_rename_clusters.in.fa
@@ -0,0 +1,6 @@
+>seq.foo
+AGCT
+>seq.bar
+CCC
+>seq4.eggs
+AAA
diff --git a/ariba/tests/data/cdhit_test_rename_fasta.in.fa b/ariba/tests/data/cdhit_test_rename_fasta.in.fa
deleted file mode 100644
index 11d5e25..0000000
--- a/ariba/tests/data/cdhit_test_rename_fasta.in.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->a
-A
->b
-C
->c
-G
diff --git a/ariba/tests/data/cdhit_test_run.out.fa b/ariba/tests/data/cdhit_test_run.out.fa
index 12a02b2..74d8f08 100644
--- a/ariba/tests/data/cdhit_test_run.out.fa
+++ b/ariba/tests/data/cdhit_test_run.out.fa
@@ -1,4 +1,4 @@
->0
+>seq1.x
 TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
 GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
 AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
@@ -8,7 +8,7 @@ ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
 GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
 TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
 AACTCTATCGTAGGGTCGCA
->1
+>seq4.x
 CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
 TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
 TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
diff --git a/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.clusters b/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.clusters
new file mode 100644
index 0000000..5654ce1
--- /dev/null
+++ b/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.clusters
@@ -0,0 +1,2 @@
+seq1 seq2
+seq3
diff --git a/ariba/tests/data/cdhit_test_rename_fasta.out.fa b/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.fa
similarity index 54%
copy from ariba/tests/data/cdhit_test_rename_fasta.out.fa
copy to ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.fa
index 7ab3799..8f39be0 100644
--- a/ariba/tests/data/cdhit_test_rename_fasta.out.fa
+++ b/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.in.fa
@@ -1,6 +1,6 @@
 >seq1
-A
+ACGT
 >seq2
-C
+AAAA
 >seq3
-G
+CCCC
diff --git a/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.out.fa b/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.out.fa
new file mode 100644
index 0000000..424f9c3
--- /dev/null
+++ b/ariba/tests/data/cdhit_test_run_get_clusters_from_dict.out.fa
@@ -0,0 +1,4 @@
+>seq1.x
+ACGT
+>seq3.x
+CCCC
diff --git a/ariba/tests/data/cluster_test_assemble_with_spades/genes.fa b/ariba/tests/data/cluster_test_assemble_with_spades/genes.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_assemble_with_spades/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_choose_best_gene.gene.fa b/ariba/tests/data/cluster_test_choose_best_gene.gene.fa
deleted file mode 100644
index 6d31a2e..0000000
--- a/ariba/tests/data/cluster_test_choose_best_gene.gene.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->1
-AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG
-TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC
-CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA
-GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG
-TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC
diff --git a/ariba/tests/data/cluster_test_count_reads_1.fq b/ariba/tests/data/cluster_test_count_reads_1.fq
new file mode 100644
index 0000000..9d69178
--- /dev/null
+++ b/ariba/tests/data/cluster_test_count_reads_1.fq
@@ -0,0 +1,8 @@
+@read1
+ACGT
++
+HHHH
+@read2
+ACG
++
+:-)
diff --git a/ariba/tests/data/cluster_test_count_reads_2.fq b/ariba/tests/data/cluster_test_count_reads_2.fq
new file mode 100644
index 0000000..e458df7
--- /dev/null
+++ b/ariba/tests/data/cluster_test_count_reads_2.fq
@@ -0,0 +1,8 @@
+@read1
+ACGT
++
+HHHH
+@read2
+ACG
++
+:-D
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation/genes.fa b/ariba/tests/data/cluster_test_fix_contig_orientation/genes.fa
deleted file mode 100644
index 5d5102b..0000000
--- a/ariba/tests/data/cluster_test_fix_contig_orientation/genes.fa
+++ /dev/null
@@ -1,10 +0,0 @@
->gene
-ACTTACCGGTTCGGGGTCTAAACCAACCATTAAACTGCGACAACCATTCATCCTGGAGTA
-CGCTTCGGTCCACCATGATGGAGCGCCATGTGATGGGATTTCCAACCCCGTTGTTTCAGG
-ACTCATGGCATTTACCACCGACAACCGTTTATAATCCATGAGCAAGGAATACAGTGGAGA
-CAGGATTGGTTGTATTGGACTGAATACATGCCCCACTGTTACCCCGAAAGTTAACACGTA
-CCCATAGTTTATTTAAACTAGGCACTCCCGATCAGCCAAGACTTAAAAAGGGGGATAGGA
-ATATCAACGTAGTACTTCTCGGTTGATCCGTGTTTTTTAATCTAAAATATAATGTGTAGG
-CAGCTATCGTGCTAATCGTTGAAATGAGCAGGCGAAATGCCGTTTACAACGACGCTAAAC
-CTCCAAGTCGAATTAAGCCAAATTGTGCCTTCCATATGACCTCCACAGATTTGGGCTGGC
-ACTGTCAGCGTAGTTGCGCT
diff --git a/ariba/tests/data/cluster_test_full_run_assembly_fail.noncoding.fa b/ariba/tests/data/cluster_test_full_run_assembly_fail.noncoding.fa
new file mode 100644
index 0000000..7a82c89
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_assembly_fail.noncoding.fa
@@ -0,0 +1,8 @@
+>noncoding_ref_seq
+TTGGTTCCGTTCTTATTCTTCAGCTATTGATAGCAATGGTCCAACACATCGACCGACCCC
+CACAAAGACAGCAGACATCGATTGCTAAGGGGCCCGAAATTTGCTGGTCCGCACGAACAC
+GACTCCGCCAAATACGTATTCGTCCGCGCAGGATGTGGCGAACTCTCAGATGTTCGGCTT
+TTCTTTCGATCGAAGCGACTGGCCGGAGGCAGTCAGACGGCGATAGGAGTAGAATCCACC
+GGAGGGTCTGCCCTGCGGTCATCAGCTACCTTCAACCCTCAAGTCCTCGTCGCCATCCGA
+AAAAACCTTCCCCACGATAGCGTACATCACGCTTTGGTTACAGGAAGAATGAGGGATTCA
+AATGAAGATCCCATGTACACTGTAAAGGTCTTTCCGAGTA
diff --git a/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_1.fq b/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_1.fq
new file mode 100644
index 0000000..932a232
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_1.fq
@@ -0,0 +1,8 @@
+@non_coding:1:92:322/1
+GCCCGAAATTTGCTGGTCCGCACGAACACGACTCCGCCAAATACGTATTCGTCCGCGCAGGATGTGGCGAACTCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@non_coding:2:11:253/1
+TCTTATTCTTCAGCTATTGATAGCAATGGTCCAACACATCGACCGACCCCCACAAAGACAGCAGACATCGATTGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_2.fq b/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_2.fq
new file mode 100644
index 0000000..3fc3196
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_2.fq
@@ -0,0 +1,8 @@
+@non_coding:1:92:322/2
+TCGGAAAGACCTTTACAGTGTACATGGGATCTTCATTTGAATCCCTCATTCTTCCTGTAACCAAAGCGTGATGTAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@non_coding:2:11:253/2
+GATGTACGCTATCGTGGGGAAGGTTTTTTCGGATGGCGACGAGGACTTGAGGGTTGAAGGTAGCTGATGACCGCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_assembly_fail/references.fa b/ariba/tests/data/cluster_test_full_run_assembly_fail/references.fa
new file mode 100644
index 0000000..7a82c89
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_assembly_fail/references.fa
@@ -0,0 +1,8 @@
+>noncoding_ref_seq
+TTGGTTCCGTTCTTATTCTTCAGCTATTGATAGCAATGGTCCAACACATCGACCGACCCC
+CACAAAGACAGCAGACATCGATTGCTAAGGGGCCCGAAATTTGCTGGTCCGCACGAACAC
+GACTCCGCCAAATACGTATTCGTCCGCGCAGGATGTGGCGAACTCTCAGATGTTCGGCTT
+TTCTTTCGATCGAAGCGACTGGCCGGAGGCAGTCAGACGGCGATAGGAGTAGAATCCACC
+GGAGGGTCTGCCCTGCGGTCATCAGCTACCTTCAACCCTCAAGTCCTCGTCGCCATCCGA
+AAAAACCTTCCCCACGATAGCGTACATCACGCTTTGGTTACAGGAAGAATGAGGGATTCA
+AATGAAGATCCCATGTACACTGTAAAGGTCTTTCCGAGTA
diff --git a/ariba/tests/data/cluster_test_full_run_choose_ref_fail.presence_absence.fa b/ariba/tests/data/cluster_test_full_run_choose_ref_fail.presence_absence.fa
new file mode 100644
index 0000000..09fa61e
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_choose_ref_fail.presence_absence.fa
@@ -0,0 +1,2 @@
+>presence_absence_gene
+ATGAACCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_1.fq b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_1.fq
new file mode 100644
index 0000000..022d4f4
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_1.fq
@@ -0,0 +1,4 @@
+@read1/1
+CACTGACTTCACTTACTATCTACTGAATATACTTATCATCTACTCGATGCATGCATGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_2.fq b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_2.fq
new file mode 100644
index 0000000..022d4f4
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_2.fq
@@ -0,0 +1,4 @@
+@read1/1
+CACTGACTTCACTTACTATCTACTGAATATACTTATCATCTACTCGATGCATGCATGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_choose_ref_fail/references.fa b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/references.fa
new file mode 100644
index 0000000..bf3b272
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/references.fa
@@ -0,0 +1,4 @@
+>presence_absence_gene1
+ATGAACCCCGGGGTTTTTTAA
+>presence_absence_gene2
+ATGACCCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding.fa b/ariba/tests/data/cluster_test_full_run_ok_non_coding.fa
new file mode 100644
index 0000000..3278f9e
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding.fa
@@ -0,0 +1,6 @@
+>noncoding1
+CGTACGCGGGTGGAGACATGTACTCCACTCCCATACATCCCTAAGTTTGTCCCTAAGGCA
+GTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCAC
+>noncoding2
+TCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTTCCCAAGCGCGCT
+GCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCC
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv
new file mode 100644
index 0000000..97cfd32
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv
@@ -0,0 +1,5 @@
+noncoding1	.	.	.	generic description of noncoding1
+noncoding1	n	A6G	.	variant in ref and reads so should report
+noncoding1	n	G9T	.	wild type in ref and reads
+noncoding1	n	A14T	.	ref has wild type, reads has variant so should report
+noncoding1	n	A40C	.	ref has variant, reads has wild type
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_1.fq b/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_1.fq
new file mode 100644
index 0000000..4356e47
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_1.fq
@@ -0,0 +1,144 @@
+@noncoding1:1:77:136/1
+CATGTACTCCACTCCCATACATCACTAAGTTTGTCCCTAAGGCATTGCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:2:48:107/1
+CTGAGTGAAGCGACGTACGCGGGTGGTGACATGTACTCCACTCCCATACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:3:98:159/1
+TCACTAAGTTTGTCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:4:126:185/1
+CGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:5:26:85/1
+CGTAGCGTACTGAGTCTACTGACTGAGTGAAGCGACGTACGCGGGTGGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:6:85:145/1
+CCACTCCCATACATCACTAAGTTTGTCCCTAAGGCATTGCCCGCCGCCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:7:53:112/1
+TGAAGCGACGTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:8:110:170/1
+TCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:9:73:132/1
+GTGACATGTACTCCACTCCCATACATCACTAAGTTTGTCCCTAAGGCATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:10:51:110/1
+AGTGAAGCGACGTACGCGGGTGGTGACATGTACTCCACTCCCATACATCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:11:123:183/1
+GCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:12:63:122/1
+TACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:13:91:150/1
+CCATACATCACTAAGTTTGTCCCTAAGGCATTGCCCGCCGCCCACGAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:14:7:68/1
+GACTTGACGATCGTACGTACGTAGCGTACTGAGTCTACTGACTGAGTGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:15:104:163/1
+AGTTTGTCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGCGGTGAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:16:1:60/1
+CGTATCGACTTGACGATCGTACGTACGTAGCGTACTGAGTCTACTGACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:17:64:123/1
+ACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:18:128:185/1
+CCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:19:28:88/1
+TAGCGTACTGAGTCTACTGACTGAGTGAAGCGACGTACGCGGGTGGTGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:20:97:157/1
+ATCACTAAGTTTGTCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:21:22:81/1
+CGTACGTAGCGTACTGAGTCTACTGACTGAGTGAAGCGACGTACGCGGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:22:95:155/1
+ACATCACTAAGTTTGTCCCTAAGGCATTGCCCGCCGCCCACGAACGAACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:23:119:176/1
+CATTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:24:110:169/1
+TCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:25:110:170.dup.2/1
+TCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:26:57:117/1
+GCGACGTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:27:41:100/1
+CTACTGACTGAGTGAAGCGACGTACGCGGGTGGTGACATGTACTCCACTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:28:18:78/1
+CGTACGTACGTAGCGTACTGAGTCTACTGACTGAGTGAAGCGACGTACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:29:6:65/1
+CGACTTGACGATCGTACGTACGTAGCGTACTGAGTCTACTGACTGAGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:30:3:63/1
+TATCGACTTGACGATCGTACGTACGTAGCGTACTGAGTCTACTGACTGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:31:66:124/1
+GCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTCCCTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:32:62:122/1
+GTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:33:32:91/1
+GTACTGAGTCTACTGACTGAGTGAAGCGACGTACGCGGGTGGTGACATGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:34:28:86/1
+TAGCGTACTGAGTCTACTGACTGAGTGAAGCGACGTACGCGGGTGGTGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:35:3:64/1
+TATCGACTTGACGATCGTACGTACGTAGCGTACTGAGTCTACTGACTGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:36:120:181/1
+ATTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_2.fq b/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_2.fq
new file mode 100644
index 0000000..b3ba738
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_2.fq
@@ -0,0 +1,144 @@
+@noncoding1:1:77:136/2
+TGAATGTGATCTCGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:2:48:107/2
+GCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCAATGCCTTAGGGACAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:3:98:159/2
+CGTCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGATGGCGTTCCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:4:126:185/2
+AGATCCGCGCGAGAGTATATATCGCTCGTCGCTGATAGCTGCTCGCTCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:5:26:85/2
+TGGGCGGCGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:6:85:145/2
+GCTCGCTCGTGAATGTGATCTCGGATGGCGTTCCCTAAGCATCTCACCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:7:53:112/2
+CCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCAATGCCTTAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:8:110:170/2
+TATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:9:73:132/2
+TGTGATCTCGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:10:51:110/2
+TAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCAATGCCTTAGGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:11:123:183/2
+ATCCGCGCGAGAGTATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:12:63:122/2
+GATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:13:91:150/2
+TAGCTGCTCGCTCGTGAATGTGATCTCGGATGGCGTTCCCTAAGCATCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:14:7:68/2
+CTTAGGGACAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:15:104:163/2
+CGCTCGTCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGATGGCGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:16:1:60/2
+CAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACCCGCGTACGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:17:64:123/2
+GGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:18:128:185/2
+AGATCCGCGCGAGAGTATATATCGCTCGTCGCTGATAGCTGCTCGCTCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:19:28:88/2
+TCGTGGGCGGCGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:20:97:157/2
+TCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGATGGCGTTCCCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:21:22:81/2
+CGGCGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGGAGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:22:95:155/2
+GCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGATGGCGTTCCCTAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:23:119:176/2
+CGAGAGTATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGAATGTGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:24:110:169/2
+ATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:25:110:170.dup.2/2
+TATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:26:57:117/2
+CGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCAATGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:27:41:100/2
+ACCGCAGTTCGTTCGTGGGCGGCGGGCAATGCCTTAGGGACAAACTTAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:28:18:78/2
+CGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGGAGTACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:29:6:65/2
+AGGGACAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACCCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:30:3:63/2
+GGACAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACCCGCGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:31:66:124/2
+CGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:32:62:122/2
+GATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:33:32:91/2
+CGTTCGTGGGCGGCGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:34:28:86/2
+GTGGGCGGCGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGGGAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:35:3:64/2
+GGGACAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACCCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:36:120:181/2
+CCGCGCGAGAGTATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding/references.fa b/ariba/tests/data/cluster_test_full_run_ok_non_coding/references.fa
new file mode 100644
index 0000000..2a9253d
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding/references.fa
@@ -0,0 +1,6 @@
+>noncoding1
+CGTACGCGGGTGGAGACATGTACTCCACTCCCATACATCCCTAAGTTTGTCCCTAAGGCA
+GTGCCCGCCGCCCACGAACGAATGCGGTGAGATGCTTAGGGAACGCCTATCCGAGATCAC
+>noncoding2
+TCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTTCCCAAGCGCGCT
+GCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCC
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence.fa b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.fa
new file mode 100644
index 0000000..c7a01d8
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.fa
@@ -0,0 +1,5 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv
new file mode 100644
index 0000000..bc5a3d9
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv
@@ -0,0 +1,4 @@
+presence_absence1	.	.	.	Generic description of presence_absence1
+presence_absence1	p	R3S	.	Ref and assembly have wild type
+presence_absence1	p	A10V	.	Ref has wild, reads have variant so report
+presence_absence1	p	I5A	.	Ref and reads have variant so report
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_1.fq b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_1.fq
new file mode 100644
index 0000000..3d9db94
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_1.fq
@@ -0,0 +1,128 @@
+@presence_absence1:1:42:100/1
+CAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATGAAGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:2:95:155/1
+AACGCGCGAGCACCAACATCAGCCATATTAACGGCATTAGCGCGTGGGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:3:33:94/1
+TTCGACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:4:18:78/1
+ACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTAGTGATATGGATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:5:86:147/1
+AAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCATTAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:6:53:112/1
+GTAGTGATATGGATCGCGAAGCGATGACCCATGAAGTGACCGAACGCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:7:98:156/1
+GCGCGAGCACCAACATCAGCCATATTAACGGCATTAGCGCGTGGGAAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:8:29:87/1
+TATCTTCGACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:9:84:145/1
+TGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:10:35:94/1
+CGACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:11:9:66/1
+CTATCATCGACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:12:9:68/1
+CTATCATCGACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:13:82:142/1
+CATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:14:83:142/1
+ATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:15:54:114/1
+TAGTGATATGGATCGCGAAGCGATGACCCATGAAGTGACCGAACGCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:16:81:140/1
+CCATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:17:18:76/1
+ACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTAGTGATATGGATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:18:14:73/1
+ATCGACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTAGTGATATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:19:6:65/1
+ACTCTATCATCGACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:20:30:89/1
+ATCTTCGACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:21:77:136/1
+TGACCCATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:22:47:107/1
+GTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATGAAGTGACCGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:23:90:149/1
+GACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCATTAGCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:24:109:169/1
+AACATCAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:25:35:94.dup.2/1
+CGACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:26:74:133/1
+CGATGACCCATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:27:36:97/1
+GACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:28:39:99/1
+GCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATGAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:29:74:135/1
+CGATGACCCATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:30:40:99/1
+CTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATGAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:31:36:96/1
+GACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:32:40:101/1
+CTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATGAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_2.fq b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_2.fq
new file mode 100644
index 0000000..94dc98b
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_2.fq
@@ -0,0 +1,128 @@
+@presence_absence1:1:42:100/2
+ATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:2:95:155/2
+CGACTGATACGTCGACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:3:33:94/2
+TCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:4:18:78/2
+CGTTAATATGGCTGATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:5:86:147/2
+ACGTCGACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:6:53:112/2
+TAGTGTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:7:98:156/2
+ACGACTGATACGTCGACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:8:29:87/2
+CGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGTTCGGTCACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:9:84:145/2
+GTCGACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:10:35:94/2
+TCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:11:9:66/2
+TGATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTCATCGCTTCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:12:9:68/2
+GCTGATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTCATCGCTTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:13:82:142/2
+GACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:14:83:142/2
+GACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:15:54:114/2
+CGTAGTGTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:16:81:140/2
+CTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:17:18:76/2
+TTAATATGGCTGATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:18:14:73/2
+ATATGGCTGATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTCATCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:19:6:65/2
+GATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTCATCGCTTCGCGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:20:30:89/2
+CGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGTTCGGTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:21:77:136/2
+TGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTCCCACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:22:47:107/2
+TTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:23:90:149/2
+ATACGTCGACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:24:109:169/2
+TTTTGTGTGAGCTACGACTGATACGTCGACTCGTGTCGTTCAGCTACGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:25:35:94.dup.2/2
+TCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:26:74:133/2
+CGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTCCCACGCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:27:36:97/2
+CTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:28:39:99/2
+TGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:29:74:135/2
+GTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTCCCACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:30:40:99/2
+TGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:31:36:96/2
+TTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:32:40:101/2
+CATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence/references.fa b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/references.fa
new file mode 100644
index 0000000..c7a01d8
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/references.fa
@@ -0,0 +1,5 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.fa b/ariba/tests/data/cluster_test_full_run_ok_variants_only.fa
new file mode 100644
index 0000000..b99d42a
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.fa
@@ -0,0 +1,5 @@
+>variants_only1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>variants_only2
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv
new file mode 100644
index 0000000..7e193f6
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv
@@ -0,0 +1,2 @@
+variants_only1	.	.	.	Generic description of variants_only1
+variants_only1	p	R3S	.	Ref and assembly have wild type, but always report anyway
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv
new file mode 100644
index 0000000..de14a1b
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv
@@ -0,0 +1,2 @@
+variants_only1	.	.	.	Generic description of variants_only1
+variants_only1	p	R3S	.	Ref and assembly have wild type, so do not report
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv
new file mode 100644
index 0000000..621f2c9
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv
@@ -0,0 +1,3 @@
+variants_only1	.	.	.	Generic description of variants_only1
+variants_only1	p	R3S	.	Ref and assembly have wild type
+variants_only1	p	I5A	.	Ref and reads have variant so report
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_1.fq b/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_1.fq
new file mode 100644
index 0000000..9f18c62
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_1.fq
@@ -0,0 +1,132 @@
+@variants_only1:1:3:63/1
+CTACGAGCATGCGTCGCAGTCGACGACGTAGCAGCAGTCACGTAGCAGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:2:24:83/1
+GACGACGTAGCAGCAGTCACGTAGCAGCACGTATCAAATGGATCGCGAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:3:92:151/1
+CCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:4:64:122/1
+GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:5:95:154/1
+AACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:6:87:144/1
+AGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:7:109:168/1
+AACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:8:105:165/1
+CACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:9:56:116/1
+ATCAAATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:10:76:135/1
+ATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:11:19:79/1
+CAGTCGACGACGTAGCAGCAGTCACGTAGCAGCACGTATCAAATGGATCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:12:100:161/1
+GCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:13:40:101/1
+TCACGTAGCAGCACGTATCAAATGGATCGCGAAGCGATGACCCATGAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:14:51:112/1
+CACGTATCAAATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:15:74:135/1
+CGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:16:22:81/1
+TCGACGACGTAGCAGCAGTCACGTAGCAGCACGTATCAAATGGATCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:17:102:158/1
+GAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:18:92:151.dup.2/1
+CCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:19:102:162/1
+GAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:20:22:80/1
+TCGACGACGTAGCAGCAGTCACGTAGCAGCACGTATCAAATGGATCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:21:63:121/1
+GGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:22:96:155/1
+ACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:23:8:69/1
+AGCATGCGTCGCAGTCGACGACGTAGCAGCAGTCACGTAGCAGCACGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:24:8:67/1
+AGCATGCGTCGCAGTCGACGACGTAGCAGCAGTCACGTAGCAGCACGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:25:36:96/1
+GCAGTCACGTAGCAGCACGTATCAAATGGATCGCGAAGCGATGACCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:26:73:133/1
+GCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:27:41:100/1
+CACGTAGCAGCACGTATCAAATGGATCGCGAAGCGATGACCCATGAAGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:28:50:109/1
+GCACGTATCAAATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:29:47:106/1
+GCAGCACGTATCAAATGGATCGCGAAGCGATGACCCATGAAGCGACCGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:30:69:130/1
+CGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:31:54:113/1
+GTATCAAATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:32:38:97/1
+AGTCACGTAGCAGCACGTATCAAATGGATCGCGAAGCGATGACCCATGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:33:23:83/1
+CGACGACGTAGCAGCAGTCACGTAGCAGCACGTATCAAATGGATCGCGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_2.fq b/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_2.fq
new file mode 100644
index 0000000..89b713b
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_2.fq
@@ -0,0 +1,132 @@
+@variants_only1:1:3:63/2
+TGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGTCATCGCTTCGCGATCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:2:24:83/2
+AATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:3:92:151/2
+GCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGCTTTATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:4:64:122/2
+GCTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:5:95:154/2
+GTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGCTTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:6:87:144/2
+CGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGCTTTATTCCATGCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:7:109:168/2
+CTGGTAACGTGCTCGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:8:105:165/2
+GTAACGTGCTCGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:9:56:116/2
+GTCGCTGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:10:76:135/2
+TGCTTTGACTGCTGCTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:11:19:79/2
+CCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:12:100:161/2
+CGTGCTCGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:13:40:101/2
+CATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:14:51:112/2
+CTGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:15:74:135/2
+TGCTTTGACTGCTGCTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:16:22:81/2
+TGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:17:102:158/2
+GCTCGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:18:92:151.dup.2/2
+GCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGCTTTATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:19:102:162/2
+ACGTGCTCGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:20:22:80/2
+GCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:21:63:121/2
+CTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:22:96:155/2
+CGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGCTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:23:8:69/2
+GGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGTCATCGCTTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:24:8:67/2
+CTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGTCATCGCTTCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:25:36:96/2
+TTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:26:73:133/2
+CTTTGACTGCTGCTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:27:41:100/2
+ATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:28:50:109/2
+CTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:29:47:106/2
+TATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:30:69:130/2
+TGACTGCTGCTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCGCTAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:31:54:113/2
+GCTGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:32:38:97/2
+CTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:33:23:83/2
+AATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only/references.fa b/ariba/tests/data/cluster_test_full_run_ok_variants_only/references.fa
new file mode 100644
index 0000000..b99d42a
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only/references.fa
@@ -0,0 +1,5 @@
+>variants_only1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>variants_only2
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller.gene.fa b/ariba/tests/data/cluster_test_gapfill_with_gapfiller.gene.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_gapfill_with_gapfiller.gene.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller/genes.fa b/ariba/tests/data/cluster_test_gapfill_with_gapfiller/genes.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_gapfill_with_gapfiller/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_generic/genes.fa b/ariba/tests/data/cluster_test_generic/genes.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_generic/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_generic/reads_1.fq b/ariba/tests/data/cluster_test_generic/reads_1.fq
deleted file mode 100644
index 6ff1e12..0000000
--- a/ariba/tests/data/cluster_test_generic/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_generic/reads_2.fq b/ariba/tests/data/cluster_test_generic/reads_2.fq
deleted file mode 100644
index 2eb387f..0000000
--- a/ariba/tests/data/cluster_test_generic/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_get_read_counts/genes.fa b/ariba/tests/data/cluster_test_get_read_counts/genes.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_get_read_counts/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_get_read_counts/reads_1.fq b/ariba/tests/data/cluster_test_get_read_counts/reads_1.fq
deleted file mode 100644
index 6ff1e12..0000000
--- a/ariba/tests/data/cluster_test_get_read_counts/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_get_read_counts/reads_2.fq b/ariba/tests/data/cluster_test_get_read_counts/reads_2.fq
deleted file mode 100644
index 2eb387f..0000000
--- a/ariba/tests/data/cluster_test_get_read_counts/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_get_read_counts_fail/genes.fa b/ariba/tests/data/cluster_test_get_read_counts_fail/genes.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_get_read_counts_fail/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_get_read_counts_fail/reads_1.fq b/ariba/tests/data/cluster_test_get_read_counts_fail/reads_1.fq
deleted file mode 100644
index 46dd9a2..0000000
--- a/ariba/tests/data/cluster_test_get_read_counts_fail/reads_1.fq
+++ /dev/null
@@ -1,8 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
-@read2/1
-TCATCATA
-+
-:D:D:D:D
diff --git a/ariba/tests/data/cluster_test_get_read_counts_fail/reads_2.fq b/ariba/tests/data/cluster_test_get_read_counts_fail/reads_2.fq
deleted file mode 100644
index 2eb387f..0000000
--- a/ariba/tests/data/cluster_test_get_read_counts_fail/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_init_no_gene_fa/reads_1.fq b/ariba/tests/data/cluster_test_init_no_refs_fa/reads_1.fq
similarity index 100%
copy from ariba/tests/data/cluster_test_init_no_gene_fa/reads_1.fq
copy to ariba/tests/data/cluster_test_init_no_refs_fa/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_init_no_gene_fa/reads_2.fq b/ariba/tests/data/cluster_test_init_no_refs_fa/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_init_no_gene_fa/reads_2.fq
rename to ariba/tests/data/cluster_test_init_no_refs_fa/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_init_refdata.fa b/ariba/tests/data/cluster_test_init_refdata.fa
new file mode 100644
index 0000000..f98f0ac
--- /dev/null
+++ b/ariba/tests/data/cluster_test_init_refdata.fa
@@ -0,0 +1,2 @@
+>seq1
+ACGTACGT
diff --git a/ariba/tests/data/cluster_test_load_final_contigs.contigs.fa b/ariba/tests/data/cluster_test_load_final_contigs.contigs.fa
deleted file mode 100644
index a32ff99..0000000
--- a/ariba/tests/data/cluster_test_load_final_contigs.contigs.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->spam
-ACGT
->egg1
-TGCA
->egg2
-AAAA
diff --git a/ariba/tests/data/cluster_test_load_final_contigs/genes.fa b/ariba/tests/data/cluster_test_load_final_contigs/genes.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_load_final_contigs/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_load_final_contigs/reads_1.fq b/ariba/tests/data/cluster_test_load_final_contigs/reads_1.fq
deleted file mode 100644
index 6ff1e12..0000000
--- a/ariba/tests/data/cluster_test_load_final_contigs/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_load_final_contigs/reads_2.fq b/ariba/tests/data/cluster_test_load_final_contigs/reads_2.fq
deleted file mode 100644
index 2eb387f..0000000
--- a/ariba/tests/data/cluster_test_load_final_contigs/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_make_reads_for_assembly.in1.fq b/ariba/tests/data/cluster_test_make_reads_for_assembly.in1.fq
new file mode 100644
index 0000000..976e11d
--- /dev/null
+++ b/ariba/tests/data/cluster_test_make_reads_for_assembly.in1.fq
@@ -0,0 +1,40 @@
+@read1/1
+ACGT
++
+ABCD
+@read2/1
+ACGT
++
+ABCD
+@read3/1
+ACGT
++
+ABCD
+@read4/1
+ACGT
++
+ABCD
+@read5/1
+ACGT
++
+ABCD
+@read6/1
+ACGT
++
+ABCD
+@read7/1
+ACGT
++
+ABCD
+@read8/1
+ACGT
++
+ABCD
+@read9/1
+ACGT
++
+ABCD
+@read10/1
+ACGT
++
+ABCD
diff --git a/ariba/tests/data/cluster_test_make_reads_for_assembly.in2.fq b/ariba/tests/data/cluster_test_make_reads_for_assembly.in2.fq
new file mode 100644
index 0000000..a7f82df
--- /dev/null
+++ b/ariba/tests/data/cluster_test_make_reads_for_assembly.in2.fq
@@ -0,0 +1,40 @@
+@read1/2
+ACGTA
++
+DEFGH
+@read2/2
+ACGTA
++
+DEFGH
+@read3/2
+ACGTA
++
+DEFGH
+@read4/2
+ACGTA
++
+DEFGH
+@read5/2
+ACGTA
++
+DEFGH
+@read6/2
+ACGTA
++
+DEFGH
+@read7/2
+ACGTA
++
+DEFGH
+@read8/2
+ACGTA
++
+DEFGH
+@read9/2
+ACGTA
++
+DEFGH
+@read10/2
+ACGTA
++
+DEFGH
diff --git a/ariba/tests/data/cluster_test_make_reads_for_assembly.out1.fq b/ariba/tests/data/cluster_test_make_reads_for_assembly.out1.fq
new file mode 100644
index 0000000..0155caf
--- /dev/null
+++ b/ariba/tests/data/cluster_test_make_reads_for_assembly.out1.fq
@@ -0,0 +1,28 @@
+@read2/1
+ACGT
++
+ABCD
+@read3/1
+ACGT
++
+ABCD
+@read5/1
+ACGT
++
+ABCD
+@read6/1
+ACGT
++
+ABCD
+@read7/1
+ACGT
++
+ABCD
+@read8/1
+ACGT
++
+ABCD
+@read10/1
+ACGT
++
+ABCD
diff --git a/ariba/tests/data/cluster_test_make_reads_for_assembly.out2.fq b/ariba/tests/data/cluster_test_make_reads_for_assembly.out2.fq
new file mode 100644
index 0000000..dad30a6
--- /dev/null
+++ b/ariba/tests/data/cluster_test_make_reads_for_assembly.out2.fq
@@ -0,0 +1,28 @@
+@read2/2
+ACGTA
++
+DEFGH
+@read3/2
+ACGTA
++
+DEFGH
+@read5/2
+ACGTA
++
+DEFGH
+@read6/2
+ACGTA
++
+DEFGH
+@read7/2
+ACGTA
++
+DEFGH
+@read8/2
+ACGTA
++
+DEFGH
+@read10/2
+ACGTA
++
+DEFGH
diff --git a/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz b/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz
deleted file mode 100644
index 87c55ae..0000000
Binary files a/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz and /dev/null differ
diff --git a/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz.tbi b/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz.tbi
deleted file mode 100644
index 02adc59..0000000
Binary files a/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz.tbi and /dev/null differ
diff --git a/ariba/tests/data/cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa b/ariba/tests/data/cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa
deleted file mode 100644
index 87aa214..0000000
--- a/ariba/tests/data/cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa
+++ /dev/null
@@ -1,7 +0,0 @@
->ref_gene.1.147.contig1.61.207.+.complete
-ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAAT
-TATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT
-GCCAGTGGCATCTGTGTAAGCGCTTAG
->ref_gene.18.120.contig2.1.103.-
-TTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCG
-TTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT
diff --git a/ariba/tests/data/cluster_test_number_of_reads_for_assembly.ref.fa b/ariba/tests/data/cluster_test_number_of_reads_for_assembly.ref.fa
new file mode 100644
index 0000000..1548184
--- /dev/null
+++ b/ariba/tests/data/cluster_test_number_of_reads_for_assembly.ref.fa
@@ -0,0 +1,3 @@
+>ref
+TTTCTCGGTACCTCATCACGAGCCTCGTCCATACGCGTACCTTTAGAGGTTATGGACGTA
+TGGCTAGTACGTTGATGACAAAGTTGATGTCGGAGCCTAT
diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam/genes.fa b/ariba/tests/data/cluster_test_parse_assembly_bam/genes.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_parse_assembly_bam/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam/reads_1.fq b/ariba/tests/data/cluster_test_parse_assembly_bam/reads_1.fq
deleted file mode 100644
index 6ff1e12..0000000
--- a/ariba/tests/data/cluster_test_parse_assembly_bam/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam/reads_2.fq b/ariba/tests/data/cluster_test_parse_assembly_bam/reads_2.fq
deleted file mode 100644
index 2eb387f..0000000
--- a/ariba/tests/data/cluster_test_parse_assembly_bam/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords.coords b/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords.coords
deleted file mode 100644
index 57be109..0000000
--- a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords.coords
+++ /dev/null
@@ -1,7 +0,0 @@
-a.fa b.fa
-NUCMER
-
-[S1]	[E1]	[S2]	[E2]	[LEN 1]	[LEN 2]	[% IDY]	[LEN R]	[LEN Q]	[FRM]	[TAGS]
-1	1000	1	1000	1000	1000	100.00	1000	1000	1	1	gene	contig1	[IDENTITY]
-1	240	1	240	240	240	100.00	1000	580	1	1	gene	contig2	
-661	1000	241	580	340	340	100.00	1000	580	1	1	gene	contig2	
diff --git a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/genes.fa b/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/genes.fa
deleted file mode 100644
index 8d999fa..0000000
--- a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_1.fq b/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_1.fq
deleted file mode 100644
index 6ff1e12..0000000
--- a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_2.fq b/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_2.fq
deleted file mode 100644
index 2eb387f..0000000
--- a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_rename_scaffolds.out.fa b/ariba/tests/data/cluster_test_rename_scaffolds.out.fa
deleted file mode 100644
index d60043f..0000000
--- a/ariba/tests/data/cluster_test_rename_scaffolds.out.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->name_of_gene.scaffold.1
-TACG
->name_of_gene.scaffold.2
-ACGT
->name_of_gene.scaffold.3
-CGTA
diff --git a/ariba/tests/data/cluster_test_rename_scaffolds/genes.fa b/ariba/tests/data/cluster_test_rename_scaffolds/genes.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_rename_scaffolds/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_rename_scaffolds/reads_1.fq b/ariba/tests/data/cluster_test_rename_scaffolds/reads_1.fq
deleted file mode 100644
index 6ff1e12..0000000
--- a/ariba/tests/data/cluster_test_rename_scaffolds/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_rename_scaffolds/reads_2.fq b/ariba/tests/data/cluster_test_rename_scaffolds/reads_2.fq
deleted file mode 100644
index 2eb387f..0000000
--- a/ariba/tests/data/cluster_test_rename_scaffolds/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace.gene.fa b/ariba/tests/data/cluster_test_scaffold_with_sspace.gene.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_scaffold_with_sspace.gene.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace/genes.fa b/ariba/tests/data/cluster_test_scaffold_with_sspace/genes.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_scaffold_with_sspace/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_set_assembly_kmer/genes.fa b/ariba/tests/data/cluster_test_set_assembly_kmer/genes.fa
deleted file mode 100644
index 042775d..0000000
--- a/ariba/tests/data/cluster_test_set_assembly_kmer/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_set_assembly_kmer/reads_1.fq b/ariba/tests/data/cluster_test_set_assembly_kmer/reads_1.fq
deleted file mode 100644
index 192e5e7..0000000
--- a/ariba/tests/data/cluster_test_set_assembly_kmer/reads_1.fq
+++ /dev/null
@@ -1,12 +0,0 @@
-@1/1
-AGTGACGTA
-+
-III:DIIII
-@2/1
-ACGTGACGTA
-+
-II:-()IIII
-@3/1
-AACGTGACGTA
-+
-II;)IIIIIII
diff --git a/ariba/tests/data/cluster_test_set_assembly_kmer/reads_2.fq b/ariba/tests/data/cluster_test_set_assembly_kmer/reads_2.fq
deleted file mode 100644
index fc17f72..0000000
--- a/ariba/tests/data/cluster_test_set_assembly_kmer/reads_2.fq
+++ /dev/null
@@ -1,12 +0,0 @@
-@1/2
-ACGTG
-+
-IIIII
-@2/2
-ACAGTG
-+
-IIIIIII
-@3/2
-ACGTAGA
-+
-IIIIIII
diff --git a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_1.fq b/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_1.fq
deleted file mode 100644
index f62449d..0000000
--- a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read2/1
-CGGCTTAACGGCACTTTTCCACGCAAGTGTTGCTCTGAAAAGTTGGGACTTATGTCTTCC
-+
-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_2.fq b/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_2.fq
deleted file mode 100644
index 8ebade7..0000000
--- a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read2/2
-GGGCCCGTGGTAGTTAGACTAGAGGAATAGCTGAGAGTTGACATTTACGGTGGGAACAGC
-+
-IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/clusters_test_bam_to_clusters_reads.db.fa.fai b/ariba/tests/data/clusters_test_bam_to_clusters_reads.db.fa.fai
new file mode 100644
index 0000000..0ce6d4a
--- /dev/null
+++ b/ariba/tests/data/clusters_test_bam_to_clusters_reads.db.fa.fai
@@ -0,0 +1,3 @@
+ref1	1000	6	60	61
+ref2	1000	1029	60	61
+ref3	1000	2052	60	61
diff --git a/ariba/tests/data/clusters_test_bam_to_clusters_reads.read_store.gz b/ariba/tests/data/clusters_test_bam_to_clusters_reads.read_store.gz
new file mode 100644
index 0000000..02b0e0f
Binary files /dev/null and b/ariba/tests/data/clusters_test_bam_to_clusters_reads.read_store.gz differ
diff --git a/ariba/tests/data/clusters_test_bam_to_clusters_reads_no_reads_map.bam b/ariba/tests/data/clusters_test_bam_to_clusters_reads_no_reads_map.bam
new file mode 100644
index 0000000..89be29e
Binary files /dev/null and b/ariba/tests/data/clusters_test_bam_to_clusters_reads_no_reads_map.bam differ
diff --git a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_1.fq b/ariba/tests/data/clusters_test_bam_to_clusters_reads_no_reads_map_1.fq
similarity index 54%
rename from ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_1.fq
rename to ariba/tests/data/clusters_test_bam_to_clusters_reads_no_reads_map_1.fq
index c1510f4..b04e523 100644
--- a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_1.fq
+++ b/ariba/tests/data/clusters_test_bam_to_clusters_reads_no_reads_map_1.fq
@@ -1,8 +1,8 @@
 @read1/1
-GTATATGGTGGGTCGTCATGGAAGCAGTACCTATCAGCATAGCTGCACTACCCTACATGC
+CACAACCACAACACACACACACACTCATTTTTTTTTTTTTTTTCACACACACAACCAACA
 +
 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
 @read2/1
-CGGCTTAACGGCACTTTTCCACGCAAGTGTTGCTCTGAAAAGTTGGGACTTATGTCTTCC
+TGTGTGTGTGTGTGTGTGTTTTTTTTTTTTTTTTTTTTTTTAAAAAAAAAAAAAATCTCT
 +
 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_2.fq b/ariba/tests/data/clusters_test_bam_to_clusters_reads_no_reads_map_2.fq
similarity index 54%
rename from ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_2.fq
rename to ariba/tests/data/clusters_test_bam_to_clusters_reads_no_reads_map_2.fq
index 532fbc9..2e57937 100644
--- a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_2.fq
+++ b/ariba/tests/data/clusters_test_bam_to_clusters_reads_no_reads_map_2.fq
@@ -1,8 +1,8 @@
 @read1/2
-CTTCGAGTGCCCCAACACAAATTCGCATCCTCTAGGGGGGTTTTTCGTTTTGCGAGTATC
+TTTCTCATACTATATATATATATATATATATATATATTAATATATATTAATTATATATAA
 +
 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
 @read2/2
-GGGCCCGTGGTAGTTAGACTAGAGGAATAGCTGAGAGTTGACATTTACGGTGGGAACAGC
+GCGCGCGCGCGCGCCGGCGGCCGCGCGCGCGCGGCGCGGGGGGGGGGGGGGGGGGGGGAA
 +
 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/clusters_test_dummy_db.fa b/ariba/tests/data/clusters_test_dummy_db.fa
index e69de29..6c2305a 100644
--- a/ariba/tests/data/clusters_test_dummy_db.fa
+++ b/ariba/tests/data/clusters_test_dummy_db.fa
@@ -0,0 +1,2 @@
+>x
+CTACTGCATCGTAATCATCGTATACCATTGACTGCATCAATCTGCATCTATGCATCAAA
diff --git a/ariba/tests/data/clusters_test_load_data_info_file b/ariba/tests/data/clusters_test_load_data_info_file
new file mode 100644
index 0000000..498fb80
--- /dev/null
+++ b/ariba/tests/data/clusters_test_load_data_info_file
@@ -0,0 +1,5 @@
+input_presabs	/foo/bar/presence_absence.fa
+input_varonly	/foo/bar/variants_only.fa
+input_noncoding	/foo/bar/noncoding.fa
+input_metadata	/foo/bar/metadata.tsv
+genetic_code	11
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/cdhit.clusters.pickle b/ariba/tests/data/clusters_test_load_reference_data_from_dir/cdhit.clusters.pickle
new file mode 100644
index 0000000..2a2a480
Binary files /dev/null and b/ariba/tests/data/clusters_test_load_reference_data_from_dir/cdhit.clusters.pickle differ
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/info.txt b/ariba/tests/data/clusters_test_load_reference_data_from_dir/info.txt
new file mode 100644
index 0000000..498fb80
--- /dev/null
+++ b/ariba/tests/data/clusters_test_load_reference_data_from_dir/info.txt
@@ -0,0 +1,5 @@
+input_presabs	/foo/bar/presence_absence.fa
+input_varonly	/foo/bar/variants_only.fa
+input_noncoding	/foo/bar/noncoding.fa
+input_metadata	/foo/bar/metadata.tsv
+genetic_code	11
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.non_coding.fa b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.non_coding.fa
new file mode 100644
index 0000000..216a9da
--- /dev/null
+++ b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.non_coding.fa
@@ -0,0 +1,2 @@
+>noncoding1
+ACGTA
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.presence_absence.fa b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.presence_absence.fa
new file mode 100644
index 0000000..a083dc2
--- /dev/null
+++ b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.presence_absence.fa
@@ -0,0 +1,2 @@
+>presabs1
+atgatgatgagcccggcgatggaaggcggctag
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv
new file mode 100644
index 0000000..07c89d5
--- /dev/null
+++ b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.tsv
@@ -0,0 +1,2 @@
+variants_only1	p	C2I	.	description of variants_only1 C2I
+presabs1	.	.	.	presabs1 description
diff --git a/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.variants_only.fa b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.variants_only.fa
new file mode 100644
index 0000000..95fb726
--- /dev/null
+++ b/ariba/tests/data/clusters_test_load_reference_data_from_dir/refcheck.01.check_variants.variants_only.fa
@@ -0,0 +1,2 @@
+>variants_only1
+atggcgtgcgatgaataa
diff --git a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.expected.out.fa b/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.expected.out.fa
index d340bc2..a663981 100644
--- a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.expected.out.fa
+++ b/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.expected.out.fa
@@ -1,6 +1,6 @@
->gene1.1
+>seq1
 ACGT
->gene1.2
-CAT
->gene2
-GTGT
+>seq2
+TTTT
+>seq3
+AAAA
diff --git a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa b/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa
deleted file mode 100644
index 27aef24..0000000
--- a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->gene1.1
-ACGT
->gene1.2
-CAT
diff --git a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa b/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa
deleted file mode 100644
index 2697b9f..0000000
--- a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->gene2
-GTGT
diff --git a/ariba/tests/data/clusters_test_write_catted_genes_matching_refs_fasta.expected.out.fa b/ariba/tests/data/clusters_test_write_catted_genes_matching_refs_fasta.expected.out.fa
new file mode 100644
index 0000000..194d322
--- /dev/null
+++ b/ariba/tests/data/clusters_test_write_catted_genes_matching_refs_fasta.expected.out.fa
@@ -0,0 +1,4 @@
+>seq1.TYPE1.1.3
+ACGT
+>seq3.TYPE3.4.5
+AAAA
diff --git a/ariba/tests/data/clusters_test_write_report.tsv b/ariba/tests/data/clusters_test_write_report.tsv
index 382892c..dc8d91d 100644
--- a/ariba/tests/data/clusters_test_write_report.tsv
+++ b/ariba/tests/data/clusters_test_write_report.tsv
@@ -1,3 +1,3 @@
-#gene	flag	reads	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt	read_depth	alt_bases	ref_alt_depth
-gene1 line1
-gene2 line2
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+gene1	line1
+gene2	line2
diff --git a/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads.bam b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads.bam
new file mode 100644
index 0000000..916eaff
Binary files /dev/null and b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads.bam differ
diff --git a/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_1.fq b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_1.fq
new file mode 100644
index 0000000..5d5b849
--- /dev/null
+++ b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_1.fq
@@ -0,0 +1,16 @@
+@read1/1
+TCCACAGGATGGTGGTATACCTGAGGCCAAAGGATACAGATCTTGTGGGAAAGGTCCGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read2/1
+CATCTGACTGAACTCATCTGACTGACTACTATACTCAGTGCATGCATCGTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read3/1
+CATCTGACTGAACTCATCTGACTGACTACTATACTCAGTGCATGCATCGTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read4/1
+TCCACAGGATGGTGGTATACCTGAGGCCAAAGGATACAGATCTTGTGGGAAAGGTCCGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_2.fq b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_2.fq
new file mode 100644
index 0000000..b0580a4
--- /dev/null
+++ b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_2.fq
@@ -0,0 +1,16 @@
+@read1/2
+CTACTGACTGACTGATCGATCGATCATCTACTATCCTACTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read2/2
+ACCATAATGTTCTTAGGGCTTACCATAGAGGTACACTAAAAAGTGTTTCATCCACCTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read3/2
+CTACTGACTGACTGATCGATCGATCATCTACTATCCTACTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read4/2
+ACCATAATGTTCTTAGGGCTTACCATAGAGGTACACTAAAAAGTGTTTCATCCACCTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/clusters_test_sam_pair_to_insert.bam b/ariba/tests/data/mapping_test_sam_pair_to_insert.bam
similarity index 100%
rename from ariba/tests/data/clusters_test_sam_pair_to_insert.bam
rename to ariba/tests/data/mapping_test_sam_pair_to_insert.bam
diff --git a/ariba/tests/data/clusters_test_sam_to_fastq.bam b/ariba/tests/data/mapping_test_sam_to_fastq.bam
similarity index 100%
rename from ariba/tests/data/clusters_test_sam_to_fastq.bam
rename to ariba/tests/data/mapping_test_sam_to_fastq.bam
diff --git a/ariba/tests/data/mapping_test_smalt_reads_1.fq b/ariba/tests/data/mapping_test_smalt_reads_1.fq
deleted file mode 100644
index a786c43..0000000
--- a/ariba/tests/data/mapping_test_smalt_reads_1.fq
+++ /dev/null
@@ -1,28 +0,0 @@
-@1/1
-AGCCCTCCACAGGATGGTGGTATAC
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@2/1
-TAATGTTCTTAGGGCTTACCATAGA
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@3/1
-TCGGGTCTGTACAAGGACGGATGGT
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@4/1
-CCGCCGGGAAGTCCTTCTGTCGTGC
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@5/1
-CCTCCACAGGATGGTGGTATACCTG
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@6/1
-CAGTTGCATGACGTCATGCAGTCAT
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@7/1
-ACGCCGGGAAGTCCTTCTGTCGTGT
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/mapping_test_smalt_reads_2.fq b/ariba/tests/data/mapping_test_smalt_reads_2.fq
deleted file mode 100644
index 33e89bd..0000000
--- a/ariba/tests/data/mapping_test_smalt_reads_2.fq
+++ /dev/null
@@ -1,28 +0,0 @@
-@1/2
-ACCTTTCCCACAAGATCTGTATCCT
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@2/2
-CGAGTCTGCGCTTAGCTAAGGTGGA
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@3/2
-CGTACTGACTGACTGACGTACTGCA
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@4/2
-TTTTAGTGTACCTCTATGGTAAGCC
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@5/2
-TCTGCGCTTAGCTAAGGTGGATGAA
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@6/2
-AATGAGTATGATGAGTAATGGTATG
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@7/2
-ATTTAGTGTACCTCTATGGTAAGCC
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/mapping_test_smalt_ref.fa b/ariba/tests/data/mapping_test_smalt_ref.fa
deleted file mode 100644
index 18764e8..0000000
--- a/ariba/tests/data/mapping_test_smalt_ref.fa
+++ /dev/null
@@ -1,5 +0,0 @@
->ref
-TCCACAGGATGGTGGTATACCTGAGGCCAAAGGATACAGATCTTGTGGGAAAGGTCCGCC
-GGGAAGTCCTTCTGTCGTGCTTTTTATCGGGTCTGTACAAGGACGGATGGTTTCCGGCAT
-ACCATAATGTTCTTAGGGCTTACCATAGAGGTACACTAAAAAGTGTTTCATCCACCTTAG
-CTAAGCGCAG
diff --git a/ariba/tests/data/mapping_test_smalt_ref.fa.fai b/ariba/tests/data/mapping_test_smalt_ref.fa.fai
deleted file mode 100644
index 045a789..0000000
--- a/ariba/tests/data/mapping_test_smalt_ref.fa.fai
+++ /dev/null
@@ -1 +0,0 @@
-ref	190	5	60	61
diff --git a/ariba/tests/data/mapping_test_smalt_sorted.bam b/ariba/tests/data/mapping_test_smalt_sorted.bam
deleted file mode 100644
index e50c639..0000000
Binary files a/ariba/tests/data/mapping_test_smalt_sorted.bam and /dev/null differ
diff --git a/ariba/tests/data/mapping_test_smalt_unsorted.bam b/ariba/tests/data/mapping_test_smalt_unsorted.bam
deleted file mode 100644
index 40e3d8e..0000000
Binary files a/ariba/tests/data/mapping_test_smalt_unsorted.bam and /dev/null differ
diff --git a/ariba/tests/data/read_store_test_clean.in b/ariba/tests/data/read_store_test_clean.in
new file mode 100644
index 0000000..6cfcecb
--- /dev/null
+++ b/ariba/tests/data/read_store_test_clean.in
@@ -0,0 +1,10 @@
+cluster1	1	AAAA	ABCD
+cluster1	2	CCCC	IIII
+cluster2	1	AAAA	HIHI
+cluster2	2	CCCC	GGGG
+cluster2	3	GGGG	DEFG
+cluster2	4	TTTT	GFED
+cluster2	11	ACGT	CDEF
+cluster2	12	TGCA	CDEF
+cluster3	1	AGGG	ABCD
+cluster3	2	AGGG	ABCD
diff --git a/ariba/tests/data/read_store_test_compress_and_index_file.in b/ariba/tests/data/read_store_test_compress_and_index_file.in
new file mode 100644
index 0000000..e1b0556
--- /dev/null
+++ b/ariba/tests/data/read_store_test_compress_and_index_file.in
@@ -0,0 +1,10 @@
+cluster1	1	ACGT	ABCD
+cluster1	2	ACGT	ABCD
+cluster2	1	ACGT	ABCD
+cluster2	2	ACGT	ABCD
+cluster2	3	ACGT	ABCD
+cluster2	4	ACGT	ABCD
+cluster2	11	ACGT	ABCD
+cluster2	12	ACGT	ABCD
+cluster3	1	ACGT	ABCD
+cluster3	2	ACGT	ABCD
diff --git a/ariba/tests/data/read_store_test_get_reads.in b/ariba/tests/data/read_store_test_get_reads.in
new file mode 100644
index 0000000..6cfcecb
--- /dev/null
+++ b/ariba/tests/data/read_store_test_get_reads.in
@@ -0,0 +1,10 @@
+cluster1	1	AAAA	ABCD
+cluster1	2	CCCC	IIII
+cluster2	1	AAAA	HIHI
+cluster2	2	CCCC	GGGG
+cluster2	3	GGGG	DEFG
+cluster2	4	TTTT	GFED
+cluster2	11	ACGT	CDEF
+cluster2	12	TGCA	CDEF
+cluster3	1	AGGG	ABCD
+cluster3	2	AGGG	ABCD
diff --git a/ariba/tests/data/read_store_test_get_reads.reads_1.fq b/ariba/tests/data/read_store_test_get_reads.reads_1.fq
new file mode 100644
index 0000000..1f6a519
--- /dev/null
+++ b/ariba/tests/data/read_store_test_get_reads.reads_1.fq
@@ -0,0 +1,12 @@
+@1/1
+AAAA
++
+HIHI
+@3/1
+GGGG
++
+DEFG
+@11/1
+ACGT
++
+CDEF
diff --git a/ariba/tests/data/read_store_test_get_reads.reads_2.fq b/ariba/tests/data/read_store_test_get_reads.reads_2.fq
new file mode 100644
index 0000000..2270fd1
--- /dev/null
+++ b/ariba/tests/data/read_store_test_get_reads.reads_2.fq
@@ -0,0 +1,12 @@
+@1/2
+CCCC
++
+GGGG
+@3/2
+TTTT
++
+GFED
+@11/2
+TGCA
++
+CDEF
diff --git a/ariba/tests/data/read_store_test_sort_file.in b/ariba/tests/data/read_store_test_sort_file.in
new file mode 100644
index 0000000..497d629
--- /dev/null
+++ b/ariba/tests/data/read_store_test_sort_file.in
@@ -0,0 +1,10 @@
+cluster2	1	ACGT	ABCD
+cluster3	2	ACGT	ABCD
+cluster2	2	ACGT	ABCD
+cluster2	3	ACGT	ABCD
+cluster2	4	ACGT	ABCD
+cluster2	11	ACGT	ABCD
+cluster2	12	ACGT	ABCD
+cluster1	2	ACGT	ABCD
+cluster1	1	ACGT	ABCD
+cluster3	1	ACGT	ABCD
diff --git a/ariba/tests/data/read_store_test_sort_file.out b/ariba/tests/data/read_store_test_sort_file.out
new file mode 100644
index 0000000..e1b0556
--- /dev/null
+++ b/ariba/tests/data/read_store_test_sort_file.out
@@ -0,0 +1,10 @@
+cluster1	1	ACGT	ABCD
+cluster1	2	ACGT	ABCD
+cluster2	1	ACGT	ABCD
+cluster2	2	ACGT	ABCD
+cluster2	3	ACGT	ABCD
+cluster2	4	ACGT	ABCD
+cluster2	11	ACGT	ABCD
+cluster2	12	ACGT	ABCD
+cluster3	1	ACGT	ABCD
+cluster3	2	ACGT	ABCD
diff --git a/ariba/tests/data/ref_preparer_test_write_info_file.out b/ariba/tests/data/ref_preparer_test_write_info_file.out
new file mode 100644
index 0000000..7cc5a6c
--- /dev/null
+++ b/ariba/tests/data/ref_preparer_test_write_info_file.out
@@ -0,0 +1,5 @@
+input_presabs	presabs.fa
+input_varonly	None
+input_noncoding	None
+input_metadata	metadata.tsv
+genetic_code	1
diff --git a/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv b/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv
new file mode 100644
index 0000000..4f90cbf
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv
@@ -0,0 +1,9 @@
+non_coding_1	.	.	.	non_coding_1 description1
+non_coding_1	.	.	.	should be in output because this field is here
+non_coding_1	n	C5A	id1	dna variant ok
+presence_absence_1	.	.	.	should be in output because this field is here
+presence_absence_2	n	T4G	id2	dna variant ok
+presence_absence_3	p	R3S	.	amino acid variant ok
+variants_only_1	.	.	.	should be kept as a generic description of variants_only_1
+variants_only_1	p	S2T	.	amino acid variant ok
+variants_only_1	n	T4A	.	dna variant ok
diff --git a/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv b/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv
new file mode 100644
index 0000000..031a8d7
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv
@@ -0,0 +1,20 @@
+non_coding_1	.	.	.	non_coding_1 description1
+non_coding_1	.	.	.	should be in output because this field is here
+non_coding_1	p	L2K	.	should be removed because this is non-coding, but variant is protein
+non_coding_1	n	C5A	id1	dna variant ok
+non_coding_not_in_fasta	.	.	.	should be removed from tsv because not in fasta
+presence_absence_1	.	.	.	.
+presence_absence_1	.	.	.	should be in output because this field is here
+presence_absence_2	n	T4G	id2	dna variant ok
+presence_absence_2	n	A4G	.	dna variant not ok
+presence_absence_3	p	R3S	.	amino acid variant ok
+presence_absence_3	p	I3S	.	amino acid variant not ok
+presence_absence_not_in_fasta	.	.	.	should be removed from tsv because not in fasta
+variants_only_1	n	T4A	.	dna variant ok
+variants_only_1	n	C4G	.	dna variant not ok
+variants_only_1	p	S2T	.	amino acid variant ok
+variants_only_1	p	I2L	.	amino acid variant not ok
+variants_only_1	.	.	.	should be kept as a generic description of variants_only_1
+variants_only_1	.	.	.	.
+variants_only_not_in_fasta	.	.	.	should be removed from tsv because not in fasta
+variants_only_no_good_variants	n	A4G	.	dna variant not ok
diff --git a/ariba/tests/data/reference_data_filter_bad_data_non_coding.expected.fa b/ariba/tests/data/reference_data_filter_bad_data_non_coding.expected.fa
new file mode 100644
index 0000000..000b42f
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_non_coding.expected.fa
@@ -0,0 +1,2 @@
+>non_coding_1
+AAAAAAAGAAGGAAAA
diff --git a/ariba/tests/data/reference_data_filter_bad_data_non_coding.in.fa b/ariba/tests/data/reference_data_filter_bad_data_non_coding.in.fa
new file mode 100644
index 0000000..000b42f
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_non_coding.in.fa
@@ -0,0 +1,2 @@
+>non_coding_1
+AAAAAAAGAAGGAAAA
diff --git a/ariba/tests/data/reference_data_filter_bad_data_presence_absence.expected.fa b/ariba/tests/data/reference_data_filter_bad_data_presence_absence.expected.fa
new file mode 100644
index 0000000..077de1b
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_presence_absence.expected.fa
@@ -0,0 +1,6 @@
+>presence_absence_1
+ATGTCTTAA
+>presence_absence_2
+ATGTTTTAA
+>presence_absence_3
+ATGTTTCGTTAA
diff --git a/ariba/tests/data/reference_data_filter_bad_data_presence_absence.in.fa b/ariba/tests/data/reference_data_filter_bad_data_presence_absence.in.fa
new file mode 100644
index 0000000..077de1b
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_presence_absence.in.fa
@@ -0,0 +1,6 @@
+>presence_absence_1
+ATGTCTTAA
+>presence_absence_2
+ATGTTTTAA
+>presence_absence_3
+ATGTTTCGTTAA
diff --git a/ariba/tests/data/reference_data_filter_bad_data_variants_only.expected.fa b/ariba/tests/data/reference_data_filter_bad_data_variants_only.expected.fa
new file mode 100644
index 0000000..974509c
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_variants_only.expected.fa
@@ -0,0 +1,2 @@
+>variants_only_1
+ATGTCCTGTTAG
diff --git a/ariba/tests/data/reference_data_filter_bad_data_variants_only.in.fa b/ariba/tests/data/reference_data_filter_bad_data_variants_only.in.fa
new file mode 100644
index 0000000..d68032c
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_variants_only.in.fa
@@ -0,0 +1,6 @@
+>variants_only_1
+ATGTCCTGTTAG
+>variants_only_should_be_removed
+ATGTCCTGTTAG
+>variants_only_no_good_variants
+ATGTTTTAG
diff --git a/ariba/tests/data/reference_data_get_filename b/ariba/tests/data/reference_data_get_filename
new file mode 100644
index 0000000..ce01362
--- /dev/null
+++ b/ariba/tests/data/reference_data_get_filename
@@ -0,0 +1 @@
+hello
diff --git a/ariba/tests/data/reference_data_init.tsv b/ariba/tests/data/reference_data_init.tsv
new file mode 100644
index 0000000..1e8f1a6
--- /dev/null
+++ b/ariba/tests/data/reference_data_init.tsv
@@ -0,0 +1,4 @@
+gene1	n	A42G	.	free text
+gene1	n	A42T	.	free text2
+gene1	n	G13T	.	confers killer rabbit resistance
+gene2	p	I42L	.	removes tardigrade's space-living capability
diff --git a/ariba/tests/data/cluster_test_init_no_gene_fa/reads_1.fq b/ariba/tests/data/reference_data_init_empty.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_init_no_gene_fa/reads_1.fq
rename to ariba/tests/data/reference_data_init_empty.fa
diff --git a/ariba/tests/data/reference_data_init_presence_absence.fa b/ariba/tests/data/reference_data_init_presence_absence.fa
new file mode 100644
index 0000000..3f4671c
--- /dev/null
+++ b/ariba/tests/data/reference_data_init_presence_absence.fa
@@ -0,0 +1,4 @@
+>gene1
+CATTCCTAGCGTCGTCTATCGTCG
+>gene2
+AAAAACCCCGGGGTTTT
diff --git a/ariba/tests/data/reference_data_keep_seqs_from_dict.fa b/ariba/tests/data/reference_data_keep_seqs_from_dict.fa
new file mode 100644
index 0000000..045a3bd
--- /dev/null
+++ b/ariba/tests/data/reference_data_keep_seqs_from_dict.fa
@@ -0,0 +1,2 @@
+>seq1
+acgt
diff --git a/ariba/tests/data/reference_data_keep_seqs_from_dict.log b/ariba/tests/data/reference_data_keep_seqs_from_dict.log
new file mode 100644
index 0000000..067ea0b
--- /dev/null
+++ b/ariba/tests/data/reference_data_keep_seqs_from_dict.log
@@ -0,0 +1 @@
+seq2 has no variants. Removing.
diff --git a/ariba/tests/data/reference_data_load_fasta_file.fa b/ariba/tests/data/reference_data_load_fasta_file.fa
new file mode 100644
index 0000000..6b27dae
--- /dev/null
+++ b/ariba/tests/data/reference_data_load_fasta_file.fa
@@ -0,0 +1,2 @@
+>seq1
+ACGT
diff --git a/ariba/tests/data/reference_data_load_metadata_tsv.tsv b/ariba/tests/data/reference_data_load_metadata_tsv.tsv
new file mode 100644
index 0000000..3551863
--- /dev/null
+++ b/ariba/tests/data/reference_data_load_metadata_tsv.tsv
@@ -0,0 +1,3 @@
+gene1	n	A42G	.	free text
+gene1	n	G13T	.	confers killer rabbit resistance
+gene2	p	I42L	.	removes tardigrade's space-living capability
diff --git a/ariba/tests/data/reference_data_make_catted_fasta.expected.fa b/ariba/tests/data/reference_data_make_catted_fasta.expected.fa
new file mode 100644
index 0000000..b2dd9a3
--- /dev/null
+++ b/ariba/tests/data/reference_data_make_catted_fasta.expected.fa
@@ -0,0 +1,6 @@
+>pa1
+ATGTTTTAA
+>vonly1
+ATGTTTTAA
+>nc1
+CACCATACTGCATCT
diff --git a/ariba/tests/data/reference_data_make_catted_fasta.noncoding.fa b/ariba/tests/data/reference_data_make_catted_fasta.noncoding.fa
new file mode 100644
index 0000000..2fee033
--- /dev/null
+++ b/ariba/tests/data/reference_data_make_catted_fasta.noncoding.fa
@@ -0,0 +1,2 @@
+>nc1
+CACCATACTGCATCT
diff --git a/ariba/tests/data/reference_data_make_catted_fasta.presence_absence.fa b/ariba/tests/data/reference_data_make_catted_fasta.presence_absence.fa
new file mode 100644
index 0000000..9caa1f6
--- /dev/null
+++ b/ariba/tests/data/reference_data_make_catted_fasta.presence_absence.fa
@@ -0,0 +1,2 @@
+>pa1
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_make_catted_fasta.variants_only.fa b/ariba/tests/data/reference_data_make_catted_fasta.variants_only.fa
new file mode 100644
index 0000000..1b0ae3c
--- /dev/null
+++ b/ariba/tests/data/reference_data_make_catted_fasta.variants_only.fa
@@ -0,0 +1,2 @@
+>vonly1
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_remove_bad_genes.in.fa b/ariba/tests/data/reference_data_remove_bad_genes.in.fa
new file mode 100644
index 0000000..c6b2868
--- /dev/null
+++ b/ariba/tests/data/reference_data_remove_bad_genes.in.fa
@@ -0,0 +1,10 @@
+>g1
+ACGTG
+>g2
+ACGCGTACGTATCGACGTATCTGACGTACGTAGTACCGTACGTACGTAATCACGTAGTACTGACTGAGTCGTCAGTCAGCTGTAGTACGTAGCACATATA
+>g3
+GAGGAGCCG
+>g4
+ATGTAACCT
+>g5
+ATGCCTTAA
diff --git a/ariba/tests/data/reference_data_rename_sequences.noncoding.fa b/ariba/tests/data/reference_data_rename_sequences.noncoding.fa
new file mode 100644
index 0000000..a8b5192
--- /dev/null
+++ b/ariba/tests/data/reference_data_rename_sequences.noncoding.fa
@@ -0,0 +1,4 @@
+>noncoding1
+AAAA
+>noncoding1 blah
+CCCC
diff --git a/ariba/tests/data/reference_data_rename_sequences.presence_absence.fa b/ariba/tests/data/reference_data_rename_sequences.presence_absence.fa
new file mode 100644
index 0000000..d9a7ab5
--- /dev/null
+++ b/ariba/tests/data/reference_data_rename_sequences.presence_absence.fa
@@ -0,0 +1,10 @@
+>pres_abs1 foo bar spam eggs
+ACGT
+>pres_abs1 blah
+AAAA
+>pres'abs1
+CCCC
+>pres_abs2
+TTTT
+>pres!abs3
+GGGG
diff --git a/ariba/tests/data/reference_data_rename_sequences.variants_only.fa b/ariba/tests/data/reference_data_rename_sequences.variants_only.fa
new file mode 100644
index 0000000..87d7377
--- /dev/null
+++ b/ariba/tests/data/reference_data_rename_sequences.variants_only.fa
@@ -0,0 +1,8 @@
+>var_only1 hello
+AAAA
+>var:only1 boo
+CCCC
+>var_only1
+GGGG
+>var_only2
+TTTT
diff --git a/ariba/tests/data/reference_data_rename_sequences_metadata.tsv b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
new file mode 100644
index 0000000..6d43433
--- /dev/null
+++ b/ariba/tests/data/reference_data_rename_sequences_metadata.tsv
@@ -0,0 +1,11 @@
+noncoding1	.	.	.	original name "noncoding1"
+noncoding1 blah	.	.	.	original name "noncoding1 blah"
+pres_abs1 foo bar spam eggs	.	.	.	original name "pres_abs1 foo bar spam eggs"
+pres_abs1 blah	.	.	.	original name "pres_abs1 blah"
+pres'abs1	.	.	.	original name "pres'abs1"
+pres_abs2	.	.	.	original name "pres_abs2"
+pres!abs3	.	.	.	original name "pres!abs3"
+var_only1 hello	.	.	.	original name "var_only1 hello"
+var:only1 boo	.	.	.	original name "var:only1 boo"
+var_only1	.	.	.	original name "var_only1"
+var_only2	.	.	.	original name "var_only2"
diff --git a/ariba/tests/data/reference_data_sequence.presence_absence.fa b/ariba/tests/data/reference_data_sequence.presence_absence.fa
new file mode 100644
index 0000000..3ea7b0a
--- /dev/null
+++ b/ariba/tests/data/reference_data_sequence.presence_absence.fa
@@ -0,0 +1,2 @@
+>pa
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_sequence_length.presence_absence.fa b/ariba/tests/data/reference_data_sequence_length.presence_absence.fa
new file mode 100644
index 0000000..3ea7b0a
--- /dev/null
+++ b/ariba/tests/data/reference_data_sequence_length.presence_absence.fa
@@ -0,0 +1,2 @@
+>pa
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_sequence_type.noncoding.fa b/ariba/tests/data/reference_data_sequence_type.noncoding.fa
new file mode 100644
index 0000000..92270e6
--- /dev/null
+++ b/ariba/tests/data/reference_data_sequence_type.noncoding.fa
@@ -0,0 +1,2 @@
+>noncoding
+AAAAAA
diff --git a/ariba/tests/data/reference_data_sequence_type.presence_absence.fa b/ariba/tests/data/reference_data_sequence_type.presence_absence.fa
new file mode 100644
index 0000000..3ea7b0a
--- /dev/null
+++ b/ariba/tests/data/reference_data_sequence_type.presence_absence.fa
@@ -0,0 +1,2 @@
+>pa
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_sequence_type.variants_only.fa b/ariba/tests/data/reference_data_sequence_type.variants_only.fa
new file mode 100644
index 0000000..acf4730
--- /dev/null
+++ b/ariba/tests/data/reference_data_sequence_type.variants_only.fa
@@ -0,0 +1,2 @@
+>var_only
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.noncoding.fa b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.noncoding.fa
new file mode 100644
index 0000000..601a75d
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.noncoding.fa
@@ -0,0 +1,2 @@
+>non_coding
+AAGTCATGCATCTA
diff --git a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.pres_abs.fa b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.pres_abs.fa
new file mode 100644
index 0000000..09fa61e
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.pres_abs.fa
@@ -0,0 +1,2 @@
+>presence_absence_gene
+ATGAACCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.var_only.fa b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.var_only.fa
new file mode 100644
index 0000000..65d15d3
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.var_only.fa
@@ -0,0 +1,2 @@
+>var_only_gene
+ATGAACCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv
new file mode 100644
index 0000000..6f1defa
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv
@@ -0,0 +1,12 @@
+var_only_gene	n	A8T	.	ref has wild type A
+var_only_gene	n	G9C	.	ref has variant C instead of G
+var_only_gene	p	G4I	.	ref has wild type F
+var_only_gene	p	F6I	.	ref has wild type F
+var_only_gene	p	P3Q	.	ref has wild type P
+var_only_gene	p	I5V	.	ref has variant V instead of I
+presence_absence_gene	n	A4G	.	ref has wild type A
+presence_absence_gene	n	A6C	.	ref has variant C instead of A
+presence_absence_gene	p	N2I	.	ref has wild type N
+presence_absence_gene	p	A4G	.	ref has variant G instead of A
+non_coding	n	A2C	.	ref has wild type A
+non_coding	n	C4T	.	ref has variant T instead of C
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit.clusters.tsv b/ariba/tests/data/reference_data_test_cluster_with_cdhit.clusters.tsv
new file mode 100644
index 0000000..902d2fa
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit.clusters.tsv
@@ -0,0 +1,3 @@
+presence_absence1.p	presence_absence1	presence_absence2
+presence_absence3.p	presence_absence3	presence_absence4
+noncoding1.n	noncoding1
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit.expected_representatives.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit.expected_representatives.fa
new file mode 100644
index 0000000..4497fcb
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit.expected_representatives.fa
@@ -0,0 +1,10 @@
+>presence_absence1.p
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence3.p
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
+>noncoding1.n
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit.non_coding.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit.non_coding.fa
new file mode 100644
index 0000000..3008b6e
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit.non_coding.fa
@@ -0,0 +1,5 @@
+>noncoding1
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit.presence_absence.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit.presence_absence.fa
new file mode 100644
index 0000000..4541dff
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit.presence_absence.fa
@@ -0,0 +1,10 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence3
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
+>presence_absence4
+ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.clusters b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.clusters
new file mode 100644
index 0000000..20ea4d0
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.clusters
@@ -0,0 +1,4 @@
+noncoding1
+noncoding2
+presence_absence1 presence_absence3 presence_absence4
+presence_absence2
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.clusters.tsv b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.clusters.tsv
new file mode 100644
index 0000000..a549633
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.clusters.tsv
@@ -0,0 +1,4 @@
+presence_absence1.p	presence_absence1	presence_absence3	presence_absence4
+presence_absence2.p	presence_absence2
+noncoding1.n	noncoding1
+noncoding2.n	noncoding2
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.expected_representatives.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.expected_representatives.fa
new file mode 100644
index 0000000..d287c87
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.expected_representatives.fa
@@ -0,0 +1,16 @@
+>presence_absence1.p
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2.p
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>noncoding1.n
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
+>noncoding2.n
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.non_coding.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.non_coding.fa
new file mode 100644
index 0000000..9d54da5
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.non_coding.fa
@@ -0,0 +1,10 @@
+>noncoding1
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
+>noncoding2
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.presence_absence.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.presence_absence.fa
new file mode 100644
index 0000000..4541dff
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit_clusters_in_file.presence_absence.fa
@@ -0,0 +1,10 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence3
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
+>presence_absence4
+ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.clusters.tsv b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.clusters.tsv
new file mode 100644
index 0000000..f61ea8f
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.clusters.tsv
@@ -0,0 +1,6 @@
+presence_absence1.p	presence_absence1
+presence_absence2.p	presence_absence2
+presence_absence3.p	presence_absence3
+presence_absence4.p	presence_absence4
+noncoding1.n	noncoding1
+noncoding2.n	noncoding2
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.expected_representatives.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.expected_representatives.fa
new file mode 100644
index 0000000..afc9b89
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.expected_representatives.fa
@@ -0,0 +1,20 @@
+>noncoding1.n
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
+>noncoding2.n
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
+>presence_absence1.p
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2.p
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence3.p
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
+>presence_absence4.p
+ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.non_coding.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.non_coding.fa
new file mode 100644
index 0000000..9d54da5
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.non_coding.fa
@@ -0,0 +1,10 @@
+>noncoding1
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
+>noncoding2
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.presence_absence.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.presence_absence.fa
new file mode 100644
index 0000000..4541dff
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit_nocluster.presence_absence.fa
@@ -0,0 +1,10 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence3
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
+>presence_absence4
+ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/reference_data_test_remove_bad_genes.log b/ariba/tests/data/reference_data_test_remove_bad_genes.log
new file mode 100644
index 0000000..a1223f6
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_remove_bad_genes.log
@@ -0,0 +1,5 @@
+g1 Remove: too short. Length: 5
+g2 Remove: too long. Length: 100
+g3 Does not look like a gene (tried both strands and all reading frames) GAGGAGCCG
+g4 Does not look like a gene (tried both strands and all reading frames) ATGTAACCT
+g5 Made g5 into gene. strand=+, frame=0
diff --git a/ariba/tests/data/reference_data_test_rename_sequences.out b/ariba/tests/data/reference_data_test_rename_sequences.out
new file mode 100644
index 0000000..d47d87c
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_rename_sequences.out
@@ -0,0 +1,8 @@
+noncoding1 blah	noncoding1_1
+pres!abs3	pres_abs3
+pres'abs1	pres_abs1
+pres_abs1 blah	pres_abs1_1
+pres_abs1 foo bar spam eggs	pres_abs1_2
+var:only1 boo	var_only1
+var_only1	var_only1_1
+var_only1 hello	var_only1_2
diff --git a/ariba/tests/data/reference_data_test_write_cluster_allocation_file.expected b/ariba/tests/data/reference_data_test_write_cluster_allocation_file.expected
new file mode 100644
index 0000000..ae974b0
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_write_cluster_allocation_file.expected
@@ -0,0 +1,4 @@
+seq1	seq2
+seq3	seq4	seq5
+seq6
+seq10	seq42
diff --git a/ariba/tests/data/reference_data_test_write_seqs_to_fasta.expected.fa b/ariba/tests/data/reference_data_test_write_seqs_to_fasta.expected.fa
new file mode 100644
index 0000000..adcd5e6
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_write_seqs_to_fasta.expected.fa
@@ -0,0 +1,6 @@
+>seq1
+ATGAACCCCGGGGTTTTTTAA
+>seq4
+ATGAACCCCGGGGTTTTTTAA
+>seq5
+ATGAACCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/reference_data_test_write_seqs_to_fasta.in.fa b/ariba/tests/data/reference_data_test_write_seqs_to_fasta.in.fa
new file mode 100644
index 0000000..bf5df89
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_write_seqs_to_fasta.in.fa
@@ -0,0 +1,10 @@
+>seq1
+ATGAACCCCGGGGTTTTTTAA
+>seq2
+ATGAACCCCGGGGTTTTTTAA
+>seq3
+ATGAACCCCGGGGTTTTTTAA
+>seq4
+ATGAACCCCGGGGTTTTTTAA
+>seq5
+ATGAACCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/cdhit_test_rename_fasta.out.fa b/ariba/tests/data/reference_data_write_dict_of_sequences.fa
similarity index 50%
rename from ariba/tests/data/cdhit_test_rename_fasta.out.fa
rename to ariba/tests/data/reference_data_write_dict_of_sequences.fa
index 7ab3799..3becfa8 100644
--- a/ariba/tests/data/cdhit_test_rename_fasta.out.fa
+++ b/ariba/tests/data/reference_data_write_dict_of_sequences.fa
@@ -1,6 +1,4 @@
 >seq1
-A
+ACGT
 >seq2
-C
->seq3
-G
+GGGG
diff --git a/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv b/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv
new file mode 100644
index 0000000..7baa402
--- /dev/null
+++ b/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv
@@ -0,0 +1,2 @@
+gene1	.	.	.	has anybody got a bottle of orange juice?
+gene2	.	.	.	we didn't burn him
diff --git a/ariba/tests/data/reference_data_write_metadata_tsv.tsv b/ariba/tests/data/reference_data_write_metadata_tsv.tsv
new file mode 100644
index 0000000..4143803
--- /dev/null
+++ b/ariba/tests/data/reference_data_write_metadata_tsv.tsv
@@ -0,0 +1,2 @@
+gene2	.	.	.	we didn't burn him
+gene1	.	.	.	has anybody got a bottle of orange juice?
diff --git a/ariba/tests/data/reference_data_write_metadata_tsv_presence_absence.fa b/ariba/tests/data/reference_data_write_metadata_tsv_presence_absence.fa
new file mode 100644
index 0000000..1f14442
--- /dev/null
+++ b/ariba/tests/data/reference_data_write_metadata_tsv_presence_absence.fa
@@ -0,0 +1,4 @@
+>gene1
+CCTACTATCGCGTCTGCTG
+>gene2
+CGCAGCAGCCGACAGAGAGA
diff --git a/ariba/tests/data/report_filter_test_init_bad.tsv b/ariba/tests/data/report_filter_test_init_bad.tsv
new file mode 100644
index 0000000..f93b0f5
--- /dev/null
+++ b/ariba/tests/data/report_filter_test_init_bad.tsv
@@ -0,0 +1,4 @@
+#ef_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id3:baz	free_text3
diff --git a/ariba/tests/data/report_filter_test_init_good.tsv b/ariba/tests/data/report_filter_test_init_good.tsv
new file mode 100644
index 0000000..c98baf8
--- /dev/null
+++ b/ariba/tests/data/report_filter_test_init_good.tsv
@@ -0,0 +1,5 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	10.5	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	10.5	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.2	1300	12.4	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id3:spam	free_text3
+cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	20.2	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:v:I42L:id4:eggs	free_text3
diff --git a/ariba/tests/data/report_filter_test_load_report_bad.tsv b/ariba/tests/data/report_filter_test_load_report_bad.tsv
new file mode 100644
index 0000000..553e60f
--- /dev/null
+++ b/ariba/tests/data/report_filter_test_load_report_bad.tsv
@@ -0,0 +1,4 @@
+#ef_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id1:bar	free_text2
+cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id1:foo	free_text3
diff --git a/ariba/tests/data/report_filter_test_load_report_good.tsv b/ariba/tests/data/report_filter_test_load_report_good.tsv
new file mode 100644
index 0000000..1165ea4
--- /dev/null
+++ b/ariba/tests/data/report_filter_test_load_report_good.tsv
@@ -0,0 +1,5 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.2	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.2	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.2	1300	22.2	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id3:spam	free_text3
+cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	33.3	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:v:I42L:id4:eggs	free_text3
diff --git a/ariba/tests/data/report_filter_test_run.expected.tsv b/ariba/tests/data/report_filter_test_run.expected.tsv
new file mode 100644
index 0000000..a35a0cf
--- /dev/null
+++ b/ariba/tests/data/report_filter_test_run.expected.tsv
@@ -0,0 +1,6 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.4	1	SNP	n	A51G	1	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster2	variants_only	179	20000	cluster2	1042	1042	99.0	cluster2.scaffold.1	1442	13.5	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id3:baz	free_text3
+cluster4	variants_only	179	20000	cluster4	1042	1042	99.0	cluster4.scaffold.1	1442	14.6	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	free_text3
+cluster5	presence_absence	528	1874	cluster5	1188	1097	92.43	cluster5.scaffold.1	2218	20.0	0	.	p	.	0	E89G	NONSYN	65	265	A;A	766	766	G;C	88;90	.;.	87;90	.	.
+cluster5	presence_absence	528	1874	cluster5	1188	1097	92.43	cluster5.scaffold.1	2218	20.0	0	.	p	.	0	Q37fs	FSHIFT	109	109	A	634	634	.	67	.	67	.	.
diff --git a/ariba/tests/data/report_filter_test_run.in.tsv b/ariba/tests/data/report_filter_test_run.in.tsv
new file mode 100644
index 0000000..f701c3a
--- /dev/null
+++ b/ariba/tests/data/report_filter_test_run.in.tsv
@@ -0,0 +1,9 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+cluster1	non_coding	27	10000	cluster1	1000	0	99.42	cluster1.scaffold.1	1300	12.4	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	12.4	1	SNP	n	A51G	1	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster2	variants_only	179	20000	cluster2	1042	1042	99.0	cluster2.scaffold.1	1442	13.5	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id3:baz	free_text3
+cluster3	variants_only	179	20000	cluster3	1042	1042	89.0	cluster2.scaffold.1	1442	13.5	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id4:spam	free_text3
+cluster4	variants_only	179	20000	cluster4	1042	1042	99.0	cluster4.scaffold.1	1442	14.6	1	SNP	p	I42L	1	I42L	SYN	112	112	C	442	442	T	300	.	290	a:n:I42L:id5:eggs	free_text3
+cluster5	presence_absence	528	1874	cluster5	1188	1097	92.43	cluster5.scaffold.1	2218	20.0	0	.	p	.	0	E89G	NONSYN	65	265	A;A	766	766	G;C	88;90	.;.	87;90	.	.
+cluster5	presence_absence	528	1874	cluster5	1188	1097	92.43	cluster5.scaffold.1	2218	20.0	0	.	p	.	0	Q37fs	FSHIFT	109	109	A	634	634	.	67	.	67	.	.
+cluster5	presence_absence	528	1874	cluster5	1188	1097	92.43	cluster5.scaffold.1	2218	20.0	0	.	p	.	0	E89G	NONSYN	265	265	A;A	766	766	G;C	88;90	.;.	87;90	.	.
diff --git a/ariba/tests/data/report_filter_test_write_report.tsv b/ariba/tests/data/report_filter_test_write_report.tsv
new file mode 100644
index 0000000..11b3ab4
--- /dev/null
+++ b/ariba/tests/data/report_filter_test_write_report.tsv
@@ -0,0 +1,4 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	42.4	1	SNP	n	C42T	0	.	.	42	42	C	142	142	C	500	.	500	a:n:C42T:id1:foo	free_text
+cluster1	non_coding	27	10000	cluster1	1000	999	99.42	cluster1.scaffold.1	1300	42.4	1	SNP	n	A51G	0	.	.	51	51	C	151	151	C	542	.	542	a:n:A51G:id2:bar	free_text2
+cluster2	variants_only	179	20000	cluster2	1042	1042	42.42	cluster2.scaffold.1	1442	42.4	1	SNP	p	I42L	1	I42L	NONSYN	112	112	C	442	442	T	300	.	290	a:v:I42L:id3:baz	free_text3
diff --git a/ariba/tests/data/samtools_variants_test_get_depths_at_position.bam b/ariba/tests/data/samtools_variants_test_get_depths_at_position.bam
new file mode 100644
index 0000000..a2c29b2
Binary files /dev/null and b/ariba/tests/data/samtools_variants_test_get_depths_at_position.bam differ
diff --git a/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa b/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa
new file mode 100644
index 0000000..dc03a6e
--- /dev/null
+++ b/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa
@@ -0,0 +1,18 @@
+>ref
+AGTGCCTTTTAGACTAGACGCACTCTTCTTGCTGAGAAACTAGAGCTGTCGCTCCAAGAG
+GAGTTAAAAAGCAGAACCTGGACCACAGTTCCTGAAGAATACCGTGTATGTACTGCAGCC
+GGTGTACCTGTCTGGACCCTATGCTCGCGAAAACGGACTCATTACTGCAGGGTTGCACCG
+CCGTGCTCGGAGGGAGGTCAGTCCCCGGTGAGATCTACACCGGCTGACACCACCCTTTCC
+CACCCAGTGAAGTACCTTCAGCCAAAGGACGAGGCTAGTAACAAAATTCTGCGATGTGCG
+TGGAGCGCTACTAAATGGCCCGTAGTACGGCCCACTACAGCGTACCTTTTGGGCACTACA
+ATTACCTCCGGTATTTGCTTAGATCATTGCAGTAAGGACTCATAAGAAACCTTCCTGTCA
+TAGCACCCCGCAGTGCCACAGAAATGGAGTTTTGTGTGAATTTGATAAGGACGGCACTCC
+GCAGTACCCAACCCATGAGTATCTATGGCCATTGTTGATTGGAGCCCTTATCAGTGTCTT
+AATCCATAATCGGATTATGTCGACCCGTTCTAGTTATATTGCATTCCTAACCCTGGTCCT
+GGGTGCCTTGCATTCCACGAAGAGCTGCAGAAATTTCGCGCACATCAGACACGAACACGC
+CAAACCCGTATCTACCGCACCAACCGGCCTCGCTGACTAGGGCATAATGCGGTGGGATGG
+CAACTGTGTCCTTTTTCGTAGTATCGACTGATATATGGATGCACTCCGCGGTCGTTTGAG
+AGCGGACGGATCACTAGGACATTTGCGGTGGGTTTTAGGCATTGACCGAGCTAGTCCATG
+TTTTTCCATGACGGGTGTGTCGATCAATTACAGCGGTTCCACGATCGAGAAGCACTATCG
+TCTCGGATATTGACCTGTAAGCTGGGAGATCTCCACCAACAGTATTGGGATACGTGGTCC
+CACCGGTAGTAGGATCGCTCCTGCCCGAACGACTAGTTAA
diff --git a/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa.fai b/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa.fai
new file mode 100644
index 0000000..3207483
--- /dev/null
+++ b/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa.fai
@@ -0,0 +1 @@
+ref	1000	5	60	61
diff --git a/ariba/tests/data/cluster_test_get_assembly_read_depths.gz b/ariba/tests/data/samtools_variants_test_get_read_depths.gz
similarity index 100%
rename from ariba/tests/data/cluster_test_get_assembly_read_depths.gz
rename to ariba/tests/data/samtools_variants_test_get_read_depths.gz
diff --git a/ariba/tests/data/cluster_test_get_assembly_read_depths.gz.tbi b/ariba/tests/data/samtools_variants_test_get_read_depths.gz.tbi
similarity index 100%
rename from ariba/tests/data/cluster_test_get_assembly_read_depths.gz.tbi
rename to ariba/tests/data/samtools_variants_test_get_read_depths.gz.tbi
diff --git a/ariba/tests/data/cluster_test_get_samtools_variant_positions.vcf b/ariba/tests/data/samtools_variants_test_get_variant_positions_from_vcf.vcf
similarity index 100%
rename from ariba/tests/data/cluster_test_get_samtools_variant_positions.vcf
rename to ariba/tests/data/samtools_variants_test_get_variant_positions_from_vcf.vcf
diff --git a/ariba/tests/data/cluster_test_get_samtools_variants.read_depths.gz b/ariba/tests/data/samtools_variants_test_get_variants.read_depths.gz
similarity index 100%
rename from ariba/tests/data/cluster_test_get_samtools_variants.read_depths.gz
rename to ariba/tests/data/samtools_variants_test_get_variants.read_depths.gz
diff --git a/ariba/tests/data/cluster_test_get_samtools_variants.read_depths.gz.tbi b/ariba/tests/data/samtools_variants_test_get_variants.read_depths.gz.tbi
similarity index 100%
rename from ariba/tests/data/cluster_test_get_samtools_variants.read_depths.gz.tbi
rename to ariba/tests/data/samtools_variants_test_get_variants.read_depths.gz.tbi
diff --git a/ariba/tests/data/cluster_test_get_samtools_variants.vcf b/ariba/tests/data/samtools_variants_test_get_variants.vcf
similarity index 100%
rename from ariba/tests/data/cluster_test_get_samtools_variants.vcf
rename to ariba/tests/data/samtools_variants_test_get_variants.vcf
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.fa b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.fa
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.fa.fai b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa.fai
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.fa.fai
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa.fai
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.bam b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.bam
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.bam
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.bam
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.read_depths.gz b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.read_depths.gz
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.read_depths.gz.tbi b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz.tbi
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.read_depths.gz.tbi
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz.tbi
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.vcf b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.vcf
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf
diff --git a/ariba/tests/data/samtools_variants_test_total_depth_per_contig b/ariba/tests/data/samtools_variants_test_total_depth_per_contig
new file mode 100644
index 0000000..3031306
--- /dev/null
+++ b/ariba/tests/data/samtools_variants_test_total_depth_per_contig
@@ -0,0 +1,7 @@
+scaff1	1	C	.	15	15
+scaff1	2	A	.	16	16
+scaff1	3	T	.	17	17
+scaff1	4	G,T	.	19	17,2
+scaff2	1	A	.	20	20
+scaff2	2	A	.	22	22
+scaff2	10	T	.	30	30
diff --git a/ariba/tests/data/cluster_test_get_vcf_variant_counts.vcf b/ariba/tests/data/samtools_variants_test_variants_in_coords.vcf
similarity index 100%
rename from ariba/tests/data/cluster_test_get_vcf_variant_counts.vcf
rename to ariba/tests/data/samtools_variants_test_variants_in_coords.vcf
diff --git a/ariba/tests/data/summary_sample_test_column_names_tuples.tsv b/ariba/tests/data/summary_sample_test_column_names_tuples.tsv
new file mode 100644
index 0000000..4684c42
--- /dev/null
+++ b/ariba/tests/data/summary_sample_test_column_names_tuples.tsv
@@ -0,0 +1,8 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:.:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	0	SNP	.	.	.	G15T	SNP	15	15	G	85	85	T	17	.	17	.	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
+variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:.:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_column_summary_data.tsv b/ariba/tests/data/summary_sample_test_column_summary_data.tsv
new file mode 100644
index 0000000..cf7e5b9
--- /dev/null
+++ b/ariba/tests/data/summary_sample_test_column_summary_data.tsv
@@ -0,0 +1,8 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	0	SNP	.	.	.	G15T	SNP	15	15	G	85	85	T	17	.	17	.	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
+variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_load_file.in.tsv b/ariba/tests/data/summary_sample_test_load_file.in.tsv
new file mode 100644
index 0000000..524d334
--- /dev/null
+++ b/ariba/tests/data/summary_sample_test_load_file.in.tsv
@@ -0,0 +1,7 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non:coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non:coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
+variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_non_synon_variants.tsv b/ariba/tests/data/summary_sample_test_non_synon_variants.tsv
new file mode 100644
index 0000000..b8f5753
--- /dev/null
+++ b/ariba/tests/data/summary_sample_test_non_synon_variants.tsv
@@ -0,0 +1,8 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	0	SNP	.	.	.	G15T	SNP	15	15	G	85	85	T	17	.	17	.	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
+variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_sample_test_var_groups.tsv b/ariba/tests/data/summary_sample_test_var_groups.tsv
new file mode 100644
index 0000000..056296a
--- /dev/null
+++ b/ariba/tests/data/summary_sample_test_var_groups.tsv
@@ -0,0 +1,7 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding1.scaffold.1	279	35.4	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n	120	120	98.33	noncoding2.scaffold.1	279	35.4	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	generic description of noncoding2
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.1	267	35.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+presence_absence1	presence_absence	27	88	cluster.p	96	96	98.96	presence_absence1.scaffold.2	267	35.1	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	Generic description of presence_absence2
+variants_only1	variants_only	27	64	cluster.v	90	90	100.0	variants_only1.scaffold.1	260	42.4	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1:p:S5T:id4:Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
index ff5aaf5..6ec23ec 100644
--- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
+++ b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
@@ -1,4 +1,3 @@
-#gene	flag	reads	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt	read_depth	alt_bases	ref_alt_depth
-gene1	27	42	1	822	822	100.0	.	.	.	.	.	.	gene1.scaffold.1	1490	.	.	.	.	.	.
-gene2	15	44	2	780	780	100.0	.	.	.	.	.	.	gene2.scaffold.2	1124	.	.	.	.	.	.
-gene2	15	46	2	780	770	99.0	.	.	.	.	.	.	gene2.scaffold.3	1097	.	.	.	.	.	.
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
index 3d7bfb7..322f965 100644
--- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
+++ b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
@@ -1,3 +1,5 @@
-#gene	flag	reads	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt	read_depth	alt_bases	ref_alt_depth
-gene1	27	142	1	822	822	100.0	.	.	.	.	.	.	gene1.scaffold.1	1490	.	.	.	.	.	.
-gene3	27	144	3	750	750	98.93	.	.	.	.	.	.	gene3.scaffold.1	1047	.	.	.	.	.	.
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id3:variant in ref and reads so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	51.1	0	SNP	p	A10V	.	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:.:Ref has wild, reads have variant so report	Generic description of presence_absence1
+variants_only1	variants_only	64	12	variants_only1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv
new file mode 100644
index 0000000..2d06842
--- /dev/null
+++ b/ariba/tests/data/summary_test_get_all_cluster_names.1.tsv
@@ -0,0 +1,3 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	cluster.p.1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id2:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv
new file mode 100644
index 0000000..0058b23
--- /dev/null
+++ b/ariba/tests/data/summary_test_get_all_cluster_names.2.tsv
@@ -0,0 +1,5 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	cluster.p.2	96	96	98.96	presence_absence1.scaffold.1	267	51.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+variants_only1	variants_only	64	12	cluster.v.1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_get_all_var_groups.1.tsv b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv
new file mode 100644
index 0000000..e3465e4
--- /dev/null
+++ b/ariba/tests/data/summary_test_get_all_var_groups.1.tsv
@@ -0,0 +1,3 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	cluster.p.1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id4:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_get_all_var_groups.2.tsv b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv
new file mode 100644
index 0000000..0058b23
--- /dev/null
+++ b/ariba/tests/data/summary_test_get_all_var_groups.2.tsv
@@ -0,0 +1,5 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	cluster.n.1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	cluster.p.2	96	96	98.96	presence_absence1.scaffold.1	267	51.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+variants_only1	variants_only	64	12	cluster.v.1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_load_file.in.tsv b/ariba/tests/data/summary_test_load_file.in.tsv
deleted file mode 100644
index e78caca..0000000
--- a/ariba/tests/data/summary_test_load_file.in.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-#gene	flag	reads	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt	read_depth	alt_bases	ref_alt_depth
-gene1	27	42	1	822	822	100.0	.	.	.	.	.	.	gene1.scaffold.1	1490	.	.	.	.	.	.
-gene2	15	44	2	780	780	100.0	.	.	.	.	.	.	gene2.scaffold.2	1124	.	.	.	.	.	.
-gene2	15	46	2	780	770	99.0	.	.	.	.	.	.	gene2.scaffold.3	1097	.	.	.	.	.	.
-gene3	187	48	3	750	750	98.93	SNP	SYN	.	318	318	C	gene3.scaffold.1	1047	319	319	G	.	.	.
diff --git a/ariba/tests/data/summary_test_load_input_files.1.tsv b/ariba/tests/data/summary_test_load_input_files.1.tsv
new file mode 100644
index 0000000..e1bc25f
--- /dev/null
+++ b/ariba/tests/data/summary_test_load_input_files.1.tsv
@@ -0,0 +1,3 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	10.0	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	20.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id2:Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_load_input_files.2.tsv b/ariba/tests/data/summary_test_load_input_files.2.tsv
new file mode 100644
index 0000000..ff47b22
--- /dev/null
+++ b/ariba/tests/data/summary_test_load_input_files.2.tsv
@@ -0,0 +1,5 @@
+#ref_name	ref_type	flag	reads	cluster	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	ctg_cov	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1:n:A14T:id1:ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	50.1	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1:n:A6G:id2:variant in ref and reads so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	51.1	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1:p:A10V:id3:Ref has wild, reads have variant so report	Generic description of presence_absence1
+variants_only1	variants_only	64	12	variants_only1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_newick_from_dist_matrix.distances b/ariba/tests/data/summary_test_newick_from_dist_matrix.distances
new file mode 100644
index 0000000..c3ffce3
--- /dev/null
+++ b/ariba/tests/data/summary_test_newick_from_dist_matrix.distances
@@ -0,0 +1,4 @@
+file1	file2	file3
+file1	3	2
+file2	0	1
+file3	1	0
diff --git a/ariba/tests/data/summary_test_newick_from_dist_matrix.tre b/ariba/tests/data/summary_test_newick_from_dist_matrix.tre
new file mode 100644
index 0000000..a3f5ac3
--- /dev/null
+++ b/ariba/tests/data/summary_test_newick_from_dist_matrix.tre
@@ -0,0 +1 @@
+(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);
diff --git a/ariba/tests/data/summary_test_write_distance_matrix.distances b/ariba/tests/data/summary_test_write_distance_matrix.distances
new file mode 100644
index 0000000..c3ffce3
--- /dev/null
+++ b/ariba/tests/data/summary_test_write_distance_matrix.distances
@@ -0,0 +1,4 @@
+file1	file2	file3
+file1	3	2
+file2	0	1
+file3	1	0
diff --git a/ariba/tests/data/summary_test_write_tsv.out.tsv b/ariba/tests/data/summary_test_write_tsv.out.tsv
deleted file mode 100644
index 4bb564a..0000000
--- a/ariba/tests/data/summary_test_write_tsv.out.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-#filename	gene1	gene3
-file2	1	3
-file3	2	4
diff --git a/ariba/tests/data/test_common_cat_files.in.1 b/ariba/tests/data/test_common_cat_files.in.1
new file mode 100644
index 0000000..d037497
--- /dev/null
+++ b/ariba/tests/data/test_common_cat_files.in.1
@@ -0,0 +1,2 @@
+file1 line1
+file1 line2
diff --git a/ariba/tests/data/test_common_cat_files.in.2 b/ariba/tests/data/test_common_cat_files.in.2
new file mode 100644
index 0000000..65a38a3
--- /dev/null
+++ b/ariba/tests/data/test_common_cat_files.in.2
@@ -0,0 +1,3 @@
+file2 line1
+file2 line2
+file2 line3
diff --git a/ariba/tests/data/test_common_cat_files.in.3 b/ariba/tests/data/test_common_cat_files.in.3
new file mode 100644
index 0000000..5c07b44
--- /dev/null
+++ b/ariba/tests/data/test_common_cat_files.in.3
@@ -0,0 +1 @@
+file3 line1
diff --git a/ariba/tests/data/test_common_cat_files.out b/ariba/tests/data/test_common_cat_files.out
new file mode 100644
index 0000000..cca27f6
--- /dev/null
+++ b/ariba/tests/data/test_common_cat_files.out
@@ -0,0 +1,6 @@
+file1 line1
+file1 line2
+file2 line1
+file2 line2
+file2 line3
+file3 line1
diff --git a/ariba/tests/data/vfdb_parser_test_run.in.fa b/ariba/tests/data/vfdb_parser_test_run.in.fa
new file mode 100644
index 0000000..fcb43d2
--- /dev/null
+++ b/ariba/tests/data/vfdb_parser_test_run.in.fa
@@ -0,0 +1,6 @@
+>VF123(gi:1234) (abcD) foobar description1 [abc] [genus1 species1]
+AAAA
+>VF234(gi:2345) (efgH) spam eggs description2 [abc] [genus2 species2]
+CCCC
+>seq1 blah
+ACGT
diff --git a/ariba/tests/data/vfdb_parser_test_run.out.fa b/ariba/tests/data/vfdb_parser_test_run.out.fa
new file mode 100644
index 0000000..720ef52
--- /dev/null
+++ b/ariba/tests/data/vfdb_parser_test_run.out.fa
@@ -0,0 +1,6 @@
+>abcD.VF123(gi:1234).genus1_species1
+AAAA
+>efgH.VF234(gi:2345).genus2_species2
+CCCC
+>seq1 blah
+ACGT
diff --git a/ariba/tests/data/vfdb_parser_test_run.out.tsv b/ariba/tests/data/vfdb_parser_test_run.out.tsv
new file mode 100644
index 0000000..242514c
--- /dev/null
+++ b/ariba/tests/data/vfdb_parser_test_run.out.tsv
@@ -0,0 +1,2 @@
+abcD.VF123(gi:1234).genus1_species1	.	.	.	foobar description1 [abc]
+efgH.VF234(gi:2345).genus2_species2	.	.	.	spam eggs description2 [abc]
diff --git a/ariba/tests/external_progs_test.py b/ariba/tests/external_progs_test.py
new file mode 100644
index 0000000..590b086
--- /dev/null
+++ b/ariba/tests/external_progs_test.py
@@ -0,0 +1,9 @@
+import unittest
+import os
+from ariba import external_progs
+
+class TestExternalProgs(unittest.TestCase):
+    def test_external_progs_ok(self):
+        '''Test that external programs are found'''
+        external_progs.ExternalProgs(verbose=True)  # looks up each required program; the test fails if one is missing
+
diff --git a/ariba/tests/faidx_test.py b/ariba/tests/faidx_test.py
index ce83af6..fa88387 100644
--- a/ariba/tests/faidx_test.py
+++ b/ariba/tests/faidx_test.py
@@ -1,10 +1,11 @@
 import unittest
 import filecmp
 import os
-from ariba import faidx
+from ariba import faidx, external_progs
 
 modules_dir = os.path.dirname(os.path.abspath(faidx.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
 
 
 class TestFaidx(unittest.TestCase):
@@ -13,6 +14,6 @@ class TestFaidx(unittest.TestCase):
         infile = os.path.join(data_dir, 'faidx_test_write_fa_subset.in.fa')
         expected = os.path.join(data_dir, 'faidx_test_write_fa_subset.out.fa')
         tmpfile = 'tmp.test_write_fa_subset.out.fa'
-        faidx.write_fa_subset(['seq1', 'seq3', 'seq4'], infile, tmpfile)
+        faidx.write_fa_subset(['seq1', 'seq3', 'seq4'], infile, tmpfile, samtools_exe=extern_progs.exe('samtools'))
         self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
         os.unlink(tmpfile)
diff --git a/ariba/tests/flag_test.py b/ariba/tests/flag_test.py
index 3053c53..95801fa 100644
--- a/ariba/tests/flag_test.py
+++ b/ariba/tests/flag_test.py
@@ -24,7 +24,7 @@ class TestFlag(unittest.TestCase):
     def test_add(self):
         '''Test add'''
         f = flag.Flag()
-        expected = [1, 3, 7, 15, 31, 63, 127, 255, 511, 1023]
+        expected = [1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047]
         for i in range(len(flag.flags_in_order)):
             f.add(flag.flags_in_order[i])
             self.assertEqual(f.to_number(), expected[i])
@@ -41,16 +41,17 @@ class TestFlag(unittest.TestCase):
         '''Test to_long_str'''
         f = flag.Flag(13)
         expected = '\n'.join([
-            '[X] gene_assembled',
-            '[ ] gene_assembled_into_one_contig',
-            '[X] gene_region_assembled_twice',
-            '[X] complete_orf',
+            '[X] assembled',
+            '[ ] assembled_into_one_contig',
+            '[X] region_assembled_twice',
+            '[X] complete_gene',
             '[ ] unique_contig',
             '[ ] scaffold_graph_bad',
             '[ ] assembly_fail',
             '[ ] variants_suggest_collapsed_repeat',
             '[ ] hit_both_strands',
             '[ ] has_nonsynonymous_variants',
+            '[ ] ref_seq_choose_fail',
         ])
 
         self.assertEqual(expected, f.to_long_string())
diff --git a/ariba/tests/mapping_test.py b/ariba/tests/mapping_test.py
index fee1021..786ffe8 100644
--- a/ariba/tests/mapping_test.py
+++ b/ariba/tests/mapping_test.py
@@ -2,10 +2,12 @@ import unittest
 import os
 import shutil
 import pysam
-from ariba import mapping
+import pyfastaq
+from ariba import mapping, external_progs
 
 modules_dir = os.path.dirname(os.path.abspath(mapping.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
 
 
 # different smalt versions output slightly different BAMs. Some columns
@@ -23,14 +25,22 @@ def get_sam_columns(bamfile):
 
 
 class TestMapping(unittest.TestCase):
-    def test_bowtie2_in_path(self):
-        '''Test that bowtie2 is in the user's path'''
-        assert(shutil.which('bowtie2') is not None)
+    def test_bowtie2_index(self):
+        '''test bowtie2_index'''
+        tmp_ref = 'tmp.test_bowtie2_index.ref.fa'
+        with open(tmp_ref, 'w') as f:
+            print('>ref', file=f)
+            print('ATCATACTACTCATACTGACTCATCATCATCATGACGTATG', file=f)
 
+        tmp_out = 'tmp.test_bowtie2_index.ref.out'
+        mapping.bowtie2_index(tmp_ref, tmp_out, bowtie2=extern_progs.exe('bowtie2'))
 
-    def test_samtools_in_path(self):
-        '''Test that samtools is in the user's path'''
-        assert(shutil.which('samtools') is not None)
+        expected_files = [tmp_out + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]
+        for filename in expected_files:
+            self.assertTrue(os.path.exists(filename))
+            os.unlink(filename)
+
+        os.unlink(tmp_ref)
 
 
     def test_run_bowtie2(self):
@@ -40,54 +50,62 @@ class TestMapping(unittest.TestCase):
         reads1 = os.path.join(data_dir, 'mapping_test_bowtie2_reads_1.fq')
         reads2 = os.path.join(data_dir, 'mapping_test_bowtie2_reads_2.fq')
         out_prefix = 'tmp.out.bowtie2'
-        mapping.run_bowtie2(reads1, reads2, ref, out_prefix)
+        mapping.run_bowtie2(
+            reads1,
+            reads2,
+            ref,
+            out_prefix,
+            samtools=extern_progs.exe('samtools'),
+            bowtie2=extern_progs.exe('bowtie2'),
+        )
         expected = get_sam_columns(os.path.join(data_dir, 'mapping_test_bowtie2_unsorted.bam'))
         got = get_sam_columns(out_prefix + '.bam')
         self.assertListEqual(expected, got)
         os.unlink(out_prefix + '.bam')
 
 
+    def test_run_bowtie2_remove_both_unmapped(self):
+        '''Test run_bowtie2 unsorted remove both unmapped'''
+        self.maxDiff = None
+        ref = os.path.join(data_dir, 'mapping_test_bowtie2_ref.fa')
+        reads1 = os.path.join(data_dir, 'mapping_test_bowtie2_remove_both_unmapped_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'mapping_test_bowtie2_remove_both_unmapped_reads_2.fq')
+        out_prefix = 'tmp.out.bowtie2_remove_both_unmapped'
+        mapping.run_bowtie2(
+            reads1,
+            reads2,
+            ref,
+            out_prefix,
+            samtools=extern_progs.exe('samtools'),
+            bowtie2=extern_progs.exe('bowtie2'),
+            remove_both_unmapped=True,
+        )
+        expected = get_sam_columns(os.path.join(data_dir, 'mapping_test_bowtie2_remove_both_unmapped_reads.bam'))
+        got = get_sam_columns(out_prefix + '.bam')
+        self.assertListEqual(expected, got)
+        os.unlink(out_prefix + '.bam')
+
+
     def test_run_bowtie2_and_sort(self):
         '''Test run_bowtie2 sorted'''
         ref = os.path.join(data_dir, 'mapping_test_bowtie2_ref.fa')
         reads1 = os.path.join(data_dir, 'mapping_test_bowtie2_reads_1.fq')
         reads2 = os.path.join(data_dir, 'mapping_test_bowtie2_reads_2.fq')
         out_prefix = 'tmp.out.bowtie2'
-        mapping.run_bowtie2(reads1, reads2, ref, out_prefix, sort=True)
+        mapping.run_bowtie2(
+            reads1,
+            reads2,
+            ref,
+            out_prefix,
+            sort=True,
+            samtools=extern_progs.exe('samtools'),
+            bowtie2=extern_progs.exe('bowtie2'),
+        )
         expected = get_sam_columns(os.path.join(data_dir, 'mapping_test_bowtie2_sorted.bam'))
         got = get_sam_columns(out_prefix + '.bam')
         self.assertListEqual(expected, got)
         os.unlink(out_prefix + '.bam')
         os.unlink(out_prefix + '.bam.bai')
-        os.unlink(out_prefix + '.unsorted.bam')
-
-
-    #def test_run_smalt(self):
-    #    '''Test run_smalt unsorted'''
-    #    ref = os.path.join(data_dir, 'mapping_test_smalt_ref.fa')
-    #    reads1 = os.path.join(data_dir, 'mapping_test_smalt_reads_1.fq')
-    #    reads2 = os.path.join(data_dir, 'mapping_test_smalt_reads_2.fq')
-    #    out_prefix = 'tmp.out.smalt'
-    #    mapping.run_smalt(reads1, reads2, ref, out_prefix)
-    #    expected = get_sam_columns(os.path.join(data_dir, 'mapping_test_smalt_unsorted.bam'))
-    #    got = get_sam_columns(out_prefix + '.bam')
-    #    self.assertListEqual(expected, got)
-    #    os.unlink(out_prefix + '.bam')
-
-
-    #def test_run_smalt_and_sort(self):
-    #    '''Test run_smalt sorted'''
-    #    ref = os.path.join(data_dir, 'mapping_test_smalt_ref.fa')
-    #    reads1 = os.path.join(data_dir, 'mapping_test_smalt_reads_1.fq')
-    #    reads2 = os.path.join(data_dir, 'mapping_test_smalt_reads_2.fq')
-    #    out_prefix = 'tmp.out.smalt'
-    #    mapping.run_smalt(reads1, reads2, ref, out_prefix, sort=True)
-    #    expected = get_sam_columns(os.path.join(data_dir, 'mapping_test_smalt_sorted.bam'))
-    #    got = get_sam_columns(out_prefix + '.bam')
-    #    self.assertListEqual(expected, got)
-    #    os.unlink(out_prefix + '.bam')
-    #    os.unlink(out_prefix + '.bam.bai')
-    #    os.unlink(out_prefix + '.unsorted.bam')
 
 
     def test_get_total_alignment_score(self):
@@ -97,3 +115,45 @@
         got = mapping.get_total_alignment_score(bam)
         self.assertEqual(got, expected)
 
+
+    def test_sam_to_fastq(self):
+        '''test sam_to_fastq'''
+        expected = [
+            pyfastaq.sequences.Fastq('read1/1', 'GTATGAGTAGATATAAAGTCCGGAACTGTGATCGGGGGCGATTTATTTACTGGCCGTCCC', 'GHIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII'),
+            pyfastaq.sequences.Fastq('read1/2', 'TCCCATACGTTGCAATCTGCAGACGCCACTCTTCCACGTCGGACGAACGCAACGTCAGGA', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGEDCBA')
+        ]
+
+
+        sam_reader = pysam.Samfile(os.path.join(data_dir, 'mapping_test_sam_to_fastq.bam'), "rb")
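+        # each BAM record should convert back to the matching FASTQ entry, in input order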
+        i = 0
+        for s in sam_reader.fetch(until_eof=True):
+            self.assertEqual(expected[i], mapping.sam_to_fastq(s))
+            i += 1
+
+
+    def test_sam_pair_to_insert(self):
+        '''test sam_pair_to_insert'''
+        expected = [
+            None, # both unmapped
+            None, # read 1 unmapped
+            None, # read 2 unmapped
+            None, # mapped to different seqs
+            None, # same seqs, wrong orientation
+            660
+        ]
+
+        sam1 = None
+        i = 0
+        sam_reader = pysam.Samfile(os.path.join(data_dir, 'mapping_test_sam_pair_to_insert.bam'), 'rb')
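+        # records arrive in mate pairs: buffer the first mate, then test the pair together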
+        for s in sam_reader.fetch(until_eof=True):
+            if sam1 is None:
+                sam1 = s
+                continue
+
+            self.assertEqual(mapping.sam_pair_to_insert(s, sam1), expected[i])
+            sam1 = None
+            i += 1
+
+
diff --git a/ariba/tests/read_store_test.py b/ariba/tests/read_store_test.py
new file mode 100644
index 0000000..842eed5
--- /dev/null
+++ b/ariba/tests/read_store_test.py
@@ -0,0 +1,81 @@
+import unittest
+import sys
+import os
+import shutil
+import filecmp
+import pyfastaq
+from ariba import read_store
+
+modules_dir = os.path.dirname(os.path.abspath(read_store.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+def file_to_list(infile):
+    f = pyfastaq.utils.open_file_read(infile)
+    lines = f.readlines()
+    pyfastaq.utils.close(f)
+    return lines
+
+
+class TestReadStore(unittest.TestCase):
+    def test_sort_file(self):
+        '''test _sort_file'''
+        infile = os.path.join(data_dir, 'read_store_test_sort_file.in')
+        expected = os.path.join(data_dir, 'read_store_test_sort_file.out')
+        tmpfile = 'tmp.read_store_test_sort_file.out'
+        read_store.ReadStore._sort_file(infile, tmpfile)
+        self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
+    def test_compress_and_index_file(self):
+        '''Test _compress_and_index_file'''
+        infile = os.path.join(data_dir, 'read_store_test_compress_and_index_file.in')
+        tmpfile = 'tmp.test_compress_and_index_file.in'
+        tmpfile_gz = 'tmp.test_compress_and_index_file.in.gz'
+        shutil.copyfile(infile, tmpfile)
+        read_store.ReadStore._compress_and_index_file(tmpfile)
+        self.assertTrue(os.path.exists(tmpfile_gz))
+        expected_lines = file_to_list(infile)
+        got_lines = file_to_list(tmpfile_gz)
+        self.assertEqual(expected_lines, got_lines)
+        self.assertTrue(os.path.exists(tmpfile_gz + '.tbi'))
+        os.unlink(tmpfile)
+        os.unlink(tmpfile_gz)
+        os.unlink(tmpfile_gz + '.tbi')
+
+
+    def test_get_reads(self):
+        '''Test get_reads'''
+        infile = os.path.join(data_dir, 'read_store_test_get_reads.in')
+        expected1 = os.path.join(data_dir, 'read_store_test_get_reads.reads_1.fq')
+        expected2 = os.path.join(data_dir, 'read_store_test_get_reads.reads_2.fq')
+        outprefix = 'tmp.read_store_test_get_reads'
+        reads1 = outprefix + '.reads_1.fq'
+        reads2 = outprefix + '.reads_2.fq'
+        rstore = read_store.ReadStore(infile, outprefix)
+        rstore.get_reads('cluster2', reads1, reads2)
+        self.assertTrue(filecmp.cmp(expected1, reads1))
+        self.assertTrue(filecmp.cmp(expected2, reads2))
+        os.unlink(outprefix + '.gz')
+        os.unlink(outprefix + '.gz.tbi')
+        os.unlink(reads1)
+        os.unlink(reads2)
+
+
+    def test_clean(self):
+        '''Test clean'''
+        infile = os.path.join(data_dir, 'read_store_test_clean.in')
+        outprefix = 'tmp.read_store_test_clean'
+        self.assertFalse(os.path.exists(outprefix))
+        self.assertFalse(os.path.exists(outprefix + '.gz'))
+        self.assertFalse(os.path.exists(outprefix + '.gz.tbi'))
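+        # constructing the store should create the bgzipped archive and tabix index, but no plain-text file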
+        rstore = read_store.ReadStore(infile, outprefix)
+        self.assertFalse(os.path.exists(outprefix))
+        self.assertTrue(os.path.exists(outprefix + '.gz'))
+        self.assertTrue(os.path.exists(outprefix + '.gz.tbi'))
+        rstore.clean()
+        self.assertFalse(os.path.exists(outprefix))
+        self.assertFalse(os.path.exists(outprefix + '.gz'))
+        self.assertFalse(os.path.exists(outprefix + '.gz.tbi'))
diff --git a/ariba/tests/ref_preparer_test.py b/ariba/tests/ref_preparer_test.py
new file mode 100644
index 0000000..06feb9b
--- /dev/null
+++ b/ariba/tests/ref_preparer_test.py
@@ -0,0 +1,123 @@
+import unittest
+import sys
+import os
+import filecmp
+from ariba import external_progs, ref_preparer
+
+modules_dir = os.path.dirname(os.path.abspath(ref_preparer.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestRefPreparer(unittest.TestCase):
+    def test_get_ref_files_ref_prefix(self):
+        '''test _get_ref_files using ref_prefix'''
+        ref_prefix = os.path.abspath('tmp.test.ref_preparer')
+        presabs = ref_prefix + '.presence_absence.fa'
+        varonly = ref_prefix + '.variants_only.fa'
+        noncoding = ref_prefix + '.noncoding.fa'
+        metadata = ref_prefix + '.metadata.tsv'
+
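+        # make empty placeholder files, then delete them one at a time to check which are optional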
+        for filename in (presabs, varonly, noncoding, metadata):
+            with open(filename, 'w') as f:
+                pass
+
+        got = ref_preparer.RefPreparer._get_ref_files(ref_prefix, None, None, None, None)
+        expected = {
+            'presabs': presabs,
+            'varonly': varonly,
+            'noncoding': noncoding,
+            'metadata': metadata,
+        }
+
+        self.assertEqual(expected, got)
+
+        os.unlink(metadata)
+        got = ref_preparer.RefPreparer._get_ref_files(ref_prefix, None, None, None, None)
+        expected['metadata'] = None
+        self.assertEqual(expected, got)
+
+        os.unlink(presabs)
+        got = ref_preparer.RefPreparer._get_ref_files(ref_prefix, None, None, None, None)
+        expected['presabs'] = None
+        self.assertEqual(expected, got)
+
+        os.unlink(varonly)
+        got = ref_preparer.RefPreparer._get_ref_files(ref_prefix, None, None, None, None)
+        expected['varonly'] = None
+        self.assertEqual(expected, got)
+
+        os.unlink(noncoding)
+        with self.assertRaises(ref_preparer.Error):
+            ref_preparer.RefPreparer._get_ref_files(ref_prefix, None, None, None, None)
+
+
+    def test_get_ref_files_naming_each_file(self):
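+        '''test _get_ref_files using named files'''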
+        ref_prefix = os.path.abspath('tmp.test.ref_preparer')
+        presabs = ref_prefix + '.presence_absence.fa'
+        varonly = ref_prefix + '.variants_only.fa'
+        noncoding = ref_prefix + '.noncoding.fa'
+        metadata = ref_prefix + '.metadata.tsv'
+        not_a_file = 'notafile'
+        self.assertFalse(os.path.exists(not_a_file))
+
+        for filename in (presabs, varonly, noncoding, metadata):
+            with open(filename, 'w') as f:
+                pass
+
+        got = ref_preparer.RefPreparer._get_ref_files(None, presabs, varonly, noncoding, metadata)
+        expected = {
+            'presabs': presabs,
+            'varonly': varonly,
+            'noncoding': noncoding,
+            'metadata': metadata,
+        }
+
+        self.assertEqual(expected, got)
+
+        got = ref_preparer.RefPreparer._get_ref_files(None, presabs, varonly, noncoding, None)
+        expected['metadata'] = None
+        self.assertEqual(expected, got)
+
+        got = ref_preparer.RefPreparer._get_ref_files(None, None, varonly, noncoding, None)
+        expected['presabs'] = None
+        self.assertEqual(expected, got)
+
+        got = ref_preparer.RefPreparer._get_ref_files(None, None, None, noncoding, None)
+        expected['varonly'] = None
+        self.assertEqual(expected, got)
+
+        with self.assertRaises(ref_preparer.Error):
+            got = ref_preparer.RefPreparer._get_ref_files(None, None, None, None, None)
+
+        with self.assertRaises(ref_preparer.Error):
+            got = ref_preparer.RefPreparer._get_ref_files(None, not_a_file, None, noncoding, None)
+
+        with self.assertRaises(ref_preparer.Error):
+            got = ref_preparer.RefPreparer._get_ref_files(None, presabs, not_a_file, None, None)
+
+        with self.assertRaises(ref_preparer.Error):
+            got = ref_preparer.RefPreparer._get_ref_files(None, presabs, None, not_a_file, None)
+
+        os.unlink(presabs)
+        os.unlink(varonly)
+        os.unlink(noncoding)
+        os.unlink(metadata)
+
+
+    def test_write_info_file(self):
+        '''test _write_info_file'''
+        extern_progs = external_progs.ExternalProgs()
+        refprep = ref_preparer.RefPreparer(extern_progs, genetic_code=1)
+        refprep.filenames = {
+            'presabs': 'presabs.fa',
+            'varonly': None,
+            'noncoding': None,
+            'metadata': 'metadata.tsv',
+        }
+        tmpfile = 'tmp.test_write_info_file.out'
+        refprep._write_info_file(tmpfile)
+        expected = os.path.join(data_dir, 'ref_preparer_test_write_info_file.out')
+        self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
+        os.unlink(tmpfile)
diff --git a/ariba/tests/refcheck_test.py b/ariba/tests/refcheck_test.py
deleted file mode 100644
index bc13e01..0000000
--- a/ariba/tests/refcheck_test.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import unittest
-import os
-import filecmp
-import pyfastaq
-from ariba import refcheck
-
-modules_dir = os.path.dirname(os.path.abspath(refcheck.__file__))
-data_dir = os.path.join(modules_dir, 'tests', 'data')
-
-
-class TestRefcheck(unittest.TestCase):
-    def test_check_pass(self):
-        '''test check file OK'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_ok.fa')
-        c = refcheck.Checker(infile)
-        self.assertEqual(c.run(), (True, None, None))
-
-
-    def test_check_file_fail_not_gene(self):
-        '''test check file fail not a gene'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_not_gene.fa')
-        c = refcheck.Checker(infile)
-        seq = pyfastaq.sequences.Fasta('gene1', 'TTGTGATGA')
-        self.assertEqual(c.run(), (False, 'Not a gene', seq))
-
-
-    def test_check_file_fail_too_short(self):
-        '''test check file fail short gene'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_too_short.fa')
-        c = refcheck.Checker(infile, min_length=10)
-        seq = pyfastaq.sequences.Fasta('gene1', 'TTGTGGTGA')
-        self.assertEqual(c.run(), (False, 'Too short', seq))
-
-
-    def test_check_file_fail_too_long(self):
-        '''test check file fail long gene'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_too_long.fa')
-        c = refcheck.Checker(infile, max_length=6)
-        seq = pyfastaq.sequences.Fasta('gene1', 'TTGTGGTGA')
-        self.assertEqual(c.run(), (False, 'Too long', seq))
-
-
-    def test_check_file_fail_spades_in_name(self):
-        '''test check file with sequence that has spaces in its name'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_spaces_in_name.fa')
-        c = refcheck.Checker(infile, min_length=3)
-        seq = pyfastaq.sequences.Fasta('gene foo', 'TTGTGGTGA')
-        self.assertEqual(c.run(), (False, 'Name has spaces', seq))
-
-
-    def test_check_file_fail_duplicate_name(self):
-        '''test check file with sequence that has two genes with the same name'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_duplicate_name.fa')
-        c = refcheck.Checker(infile, min_length=3)
-        seq = pyfastaq.sequences.Fasta('gene1', 'TTGTGGTGA')
-        self.assertEqual(c.run(), (False, 'Duplicate name', seq))
-
-
-    def test_check_run_with_outfiles(self):
-        '''test run when making output files'''
-        infile = os.path.join(data_dir, 'refcheck_test_fix_in.fa')
-        tmp_prefix = 'tmp.refcheck_test_fix.out'
-        c = refcheck.Checker(infile, min_length=10, max_length=25, outprefix=tmp_prefix)
-        c.run()
-        for x in ['fa', 'log', 'rename', 'removed.fa']:
-            expected = os.path.join(data_dir, 'refcheck_test_fix_out.' + x)
-            got = tmp_prefix + '.' + x
-            self.assertTrue(filecmp.cmp(expected, got, shallow=False))
-            os.unlink(got)
diff --git a/ariba/tests/reference_data_test.py b/ariba/tests/reference_data_test.py
new file mode 100644
index 0000000..cb64077
--- /dev/null
+++ b/ariba/tests/reference_data_test.py
@@ -0,0 +1,679 @@
+import unittest
+import filecmp
+import os
+import pyfastaq
+from ariba import reference_data, sequence_metadata
+
+modules_dir = os.path.dirname(os.path.abspath(reference_data.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestReferenceData(unittest.TestCase):
+    def test_init_fails(self):
+        '''Test __init__ fails when it should'''
+
+        with self.assertRaises(reference_data.Error):
+            ref_data = reference_data.ReferenceData()
+
+        presence_absence_bad  = os.path.join(data_dir, 'reference_data_init_presence_absence_bad.fa')
+
+        with self.assertRaises(reference_data.Error):
+            ref_data = reference_data.ReferenceData(presence_absence_fa=presence_absence_bad)
+
+        empty_fasta = os.path.join(data_dir, 'reference_data_init_empty.fa')
+
+        with self.assertRaises(reference_data.Error):
+            ref_data = reference_data.ReferenceData(presence_absence_fa=empty_fasta)
+
+
+    def test_init_ok(self):
+        '''Test init with good input'''
+        tsv_file = os.path.join(data_dir, 'reference_data_init.tsv')
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_init_presence_absence.fa')
+        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\t.\tfree text')
+        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\t.\tfree text2')
+        meta3 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\t.\tconfers killer rabbit resistance')
+        meta4 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\t.\tremoves tardigrade's space-living capability")
+
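+        # variant positions are stored zero-based, so A42G is keyed on 41 and G13T on 12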
+        expected_metadata = {
+            'gene1': {
+                'n': {12: {meta3}, 41: {meta1, meta2}},
+                'p': {},
+                '.': set(),
+            },
+            'gene2': {
+                'n': {},
+                'p': {41: {meta4}},
+                '.': set(),
+            }
+        }
+        ref_data = reference_data.ReferenceData(presence_absence_fa=presence_absence_fa, metadata_tsv=tsv_file)
+        self.assertEqual(expected_metadata, ref_data.metadata)
+
+        expected_seqs_dict = {
+            'presence_absence': {
+                'gene1': pyfastaq.sequences.Fasta('gene1', 'CATTCCTAGCGTCGTCTATCGTCG'),
+                'gene2': pyfastaq.sequences.Fasta('gene2', 'AAAAACCCCGGGGTTTT')
+            },
+            'variants_only': {},
+            'non_coding': {},
+        }
+
+        self.assertEqual(expected_seqs_dict, ref_data.seq_dicts)
+
+
+    def test_dict_keys_intersection(self):
+        '''Test dict_keys_intersection'''
+        d1 = {'a': 1, 'b':2, 'c': 42}
+        d2 = {'a': 42}
+        d3 = {'a': 11, 'b': 'xyz'}
+        self.assertEqual({'a'}, reference_data.ReferenceData._dict_keys_intersection([d1, d2, d3]))
+
+
+    def test_get_filename(self):
+        '''Test _get_filename'''
+        file_that_exists_abs = os.path.join(data_dir, 'reference_data_get_filename')
+        file_that_exists_rel = os.path.relpath(file_that_exists_abs)
+        self.assertEqual(file_that_exists_abs, reference_data.ReferenceData._get_filename(file_that_exists_rel))
+        self.assertIsNone(reference_data.ReferenceData._get_filename(None))
+
+        with self.assertRaises(reference_data.Error):
+            reference_data.ReferenceData._get_filename('thisisnotafilesoshouldthrowerror,unlessyoujustmadeitwhichseemslikeanoddthingtodoandyoudeservethefailingtest')
+
+
+    def test_load_metadata_tsv(self):
+        '''Test _load_metadata_tsv'''
+        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\t.\tfree text')
+        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\t.\tconfers killer rabbit resistance')
+        meta3 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\t.\tremoves tardigrade's space-living capability")
+        expected = {
+            'gene1': {
+                'n': {12: {meta2}, 41: {meta1}},
+                'p': {},
+                '.': set(),
+            },
+            'gene2': {
+                'n': {},
+                'p': {41: {meta3}},
+                '.': set(),
+            }
+        }
+
+        tsv_file = os.path.join(data_dir, 'reference_data_load_metadata_tsv.tsv')
+        self.assertEqual(expected, reference_data.ReferenceData._load_metadata_tsv(tsv_file))
+
+
+    def test_load_fasta_file(self):
+        '''Test _load_fasta_file'''
+        expected = {'seq1': pyfastaq.sequences.Fasta('seq1', 'ACGT')}
+        filename = os.path.join(data_dir, 'reference_data_load_fasta_file.fa')
+        got = reference_data.ReferenceData._load_fasta_file(filename)
+        self.assertEqual(expected, got)
+
+
+    def test_find_gene_in_seqs(self):
+        '''Test _find_gene_in_seqs'''
+        seqs_dict = {
+            'dict1': {'name1': 'seq1', 'name2': 'seq2'},
+            'dict2': {'name3': 'seq3'}
+        }
+        self.assertEqual(None, reference_data.ReferenceData._find_gene_in_seqs('name42', seqs_dict))
+        self.assertEqual('dict1', reference_data.ReferenceData._find_gene_in_seqs('name1', seqs_dict))
+        self.assertEqual('dict1', reference_data.ReferenceData._find_gene_in_seqs('name2', seqs_dict))
+        self.assertEqual('dict2', reference_data.ReferenceData._find_gene_in_seqs('name3', seqs_dict))
+
+
+    def test_write_metadata_tsv(self):
+        '''Test _write_metadata_tsv'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_write_metadata_tsv_presence_absence.fa')
+        metadata_tsv_in = os.path.join(data_dir, 'reference_data_write_metadata_tsv.tsv')
+        metadata_tsv_expected = os.path.join(data_dir, 'reference_data_write_metadata_tsv.expected.tsv')
+        tmp_tsv = 'tmp.test_write_metadata_tsv.out.tsv'
+        ref_data = reference_data.ReferenceData(presence_absence_fa=presence_absence_fa, metadata_tsv=metadata_tsv_in)
+        ref_data._write_metadata_tsv(ref_data.metadata, tmp_tsv)
+        self.assertTrue(filecmp.cmp(metadata_tsv_expected, tmp_tsv, shallow=False))
+        os.unlink(tmp_tsv)
+
+
+    def test_write_dict_of_sequences(self):
+        '''Test _write_dict_of_sequences'''
+        d = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ACGT'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'GGGG'),
+        }
+        tmp_file = 'tmp.test_write_dict_of_sequences.fa'
+        reference_data.ReferenceData._write_dict_of_sequences(d, tmp_file)
+        expected = os.path.join(data_dir, 'reference_data_write_dict_of_sequences.fa')
+        self.assertTrue(filecmp.cmp(expected, tmp_file, shallow=False))
+        os.unlink(tmp_file)
+
+
+    def test_filter_bad_variant_data(self):
+        '''Test _filter_bad_variant_data'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_presence_absence.in.fa')
+        expected_presence_absence_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_presence_absence.expected.fa')
+        variants_only_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_variants_only.in.fa')
+        expected_variants_only_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_variants_only.expected.fa')
+        non_coding_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_non_coding.in.fa')
+        expected_non_coding_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_non_coding.expected.fa')
+        metadata_tsv = os.path.join(data_dir, 'reference_data_filter_bad_data_metadata.in.tsv')
+        expected_tsv = os.path.join(data_dir, 'reference_data_filter_bad_data_metadata.expected.tsv')
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            variants_only_fa=variants_only_fa,
+            non_coding_fa=non_coding_fa,
+            metadata_tsv=metadata_tsv
+        )
+
+        outprefix = 'tmp.test_filter_bad_variant_data'
+        refdata._filter_bad_variant_data(outprefix, set(), set())
+
+        self.assertTrue(filecmp.cmp(expected_tsv, outprefix + '.tsv'))
+        self.assertTrue(filecmp.cmp(expected_variants_only_fa, outprefix + '.variants_only.fa'))
+        self.assertTrue(filecmp.cmp(expected_presence_absence_fa, outprefix + '.presence_absence.fa'))
+        self.assertTrue(filecmp.cmp(expected_non_coding_fa, outprefix + '.non_coding.fa'))
+        os.unlink(outprefix + '.tsv')
+        os.unlink(outprefix + '.variants_only.fa')
+        os.unlink(outprefix + '.presence_absence.fa')
+        os.unlink(outprefix + '.non_coding.fa')
+        os.unlink(outprefix + '.log')
+
+
+    def test_try_to_get_gene_seq(self):
+        '''Test _try_to_get_gene_seq'''
+        tests = [
+            (pyfastaq.sequences.Fasta('x', 'ACGTG'), None, 'Remove: too short. Length: 5'),
+            (pyfastaq.sequences.Fasta('x', 'A' * 100), None, 'Remove: too long. Length: 100'),
+            (pyfastaq.sequences.Fasta('x', 'GAGGAGCCG'), None, 'Does not look like a gene (tried both strands and all reading frames) GAGGAGCCG'),
+            (pyfastaq.sequences.Fasta('x', 'ATGTAACCT'), None, 'Does not look like a gene (tried both strands and all reading frames) ATGTAACCT'),
+            (pyfastaq.sequences.Fasta('x', 'ATGCCTTAA'), pyfastaq.sequences.Fasta('x', 'ATGCCTTAA'), 'Made x into gene. strand=+, frame=0')
+        ]
+
+        for seq, got_seq, message in tests:
+            self.assertEqual((got_seq, message), reference_data.ReferenceData._try_to_get_gene_seq(seq, 6, 99))
+
+
+    def test_remove_bad_genes(self):
+        '''Test _remove_bad_genes'''
+        presence_absence_fasta = os.path.join(data_dir, 'reference_data_remove_bad_genes.in.fa')
+        refdata = reference_data.ReferenceData(presence_absence_fa=presence_absence_fasta, max_gene_length=99)
+        tmp_log = 'tmp.test_remove_bad_genes.log'
+
+        expected_removed = {'g1', 'g2', 'g3', 'g4'}
+        got_removed = refdata._remove_bad_genes(refdata.seq_dicts['presence_absence'], tmp_log)
+        self.assertEqual(expected_removed, got_removed)
+
+        expected_dict = {
+            'g5': pyfastaq.sequences.Fasta('g5', 'ATGCCTTAA')
+        }
+        self.assertEqual(expected_dict, refdata.seq_dicts['presence_absence'])
+        expected_log = os.path.join(data_dir, 'reference_data_test_remove_bad_genes.log')
+        self.assertTrue(filecmp.cmp(expected_log, tmp_log, shallow=False))
+        os.unlink(tmp_log)
+
+
+    def test_new_seq_name(self):
+        '''Test _new_seq_name'''
+        tests = [
+            ('name', 'name'),
+            ('name ', 'name'),
+            ('name xyz', 'name'),
+            ('name_a', 'name_a'),
+            ('name.a', 'name.a'),
+            ('name-a', 'name-a'),
+            ('name spam eggs foo', 'name'),
+            ('name!', 'name_'),
+            ('name:foo', 'name_foo'),
+            ('name:!@foo', 'name___foo'),
+        ]
+
+        for name, expected in tests:
+            self.assertEqual(expected, reference_data.ReferenceData._new_seq_name(name))
+
+
+    def test_seq_names_to_rename_dict(self):
+        '''Test _seq_names_to_rename_dict'''
+        names = {'foo', 'foo abc', 'foo xyz', 'bar!', 'bar:', 'spam abc', 'eggs'}
+        got = reference_data.ReferenceData._seq_names_to_rename_dict(names)
+        expected = {
+            'foo abc': 'foo_1',
+            'foo xyz': 'foo_2',
+            'bar!': 'bar_',
+            'bar:': 'bar__1',
+            'spam abc': 'spam'
+        }
+        self.assertEqual(expected, got)
+
+
+    def test_rename_names_in_seq_dicts(self):
+        '''Test _rename_names_in_seq_dicts'''
+        rename_dict = {
+            'pa abc': 'pa',
+            'pa 1': 'pa_1',
+            'vo:': 'vo_',
+        }
+        seqs_dict = {
+            'presence_absence': {
+                'pa abc': pyfastaq.sequences.Fasta('pa abc', 'AAAA'),
+                'pa 1': pyfastaq.sequences.Fasta('pa 1', 'CCC'),
+            },
+            'variants_only': {
+                'vo:': pyfastaq.sequences.Fasta('vo:', 'GGG'),
+            },
+            'non_coding': {
+                'nonc': pyfastaq.sequences.Fasta('nonc', 'TTT'),
+            }
+        }
+
+        got = reference_data.ReferenceData._rename_names_in_seq_dicts(seqs_dict, rename_dict)
+        expected = {
+            'presence_absence': {
+                'pa': pyfastaq.sequences.Fasta('pa', 'AAAA'),
+                'pa_1': pyfastaq.sequences.Fasta('pa_1', 'CCC'),
+            },
+            'variants_only': {
+                'vo_': pyfastaq.sequences.Fasta('vo_', 'GGG'),
+            },
+            'non_coding': {
+                'nonc': pyfastaq.sequences.Fasta('nonc', 'TTT'),
+            }
+        }
+        self.assertEqual(expected, seqs_dict)
+
+
+    def test_rename_metadata_set(self):
+        '''Test _rename_metadata_set'''
+        metaset = {
+            sequence_metadata.SequenceMetadata('foo 1\t.\t.\t.\tdescription'),
+            sequence_metadata.SequenceMetadata('foo 1\tp\tI42L\t.\tspam eggs')
+        }
+
+        expected = {
+            sequence_metadata.SequenceMetadata('new_name\t.\t.\t.\tdescription'),
+            sequence_metadata.SequenceMetadata('new_name\tp\tI42L\t.\tspam eggs')
+        }
+        got = reference_data.ReferenceData._rename_metadata_set(metaset, 'new_name')
+        self.assertEqual(expected, got)
+
+
+    def test_rename_names_in_metadata(self):
+        '''Test _rename_names_in_metadata'''
+        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\t.\tfree text')
+        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\t.\tfree text2')
+        meta3 = sequence_metadata.SequenceMetadata('gene1\t.\t.\t.\tfree text3')
+        meta4 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\t.\tconfers killer rabbit resistance')
+        meta5 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\t.\tremoves tardigrade's space-living capability")
+        meta1rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42G\t.\tfree text')
+        meta2rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tA42T\t.\tfree text2')
+        meta3rename = sequence_metadata.SequenceMetadata('new_gene1\t.\t.\t.\tfree text3')
+        meta4rename = sequence_metadata.SequenceMetadata('new_gene1\tn\tG13T\t.\tconfers killer rabbit resistance')
+
+        metadata = {
+            'gene1': {
+                'n': {12: {meta4}, 41: {meta1, meta2}},
+                'p': {},
+                '.': {meta3},
+            },
+            'gene2': {
+                'n': {},
+                'p': {41: {meta5}},
+                '.': set(),
+            }
+        }
+
+        expected = {
+            'new_gene1': {
+                'n': {12: {meta4rename}, 41: {meta1rename, meta2rename}},
+                'p': {},
+                '.': {meta3rename},
+            },
+            'gene2': {
+                'n': {},
+                'p': {41: {meta5}},
+                '.': set(),
+            }
+        }
+
+        rename_dict = {'gene1': 'new_gene1'}
+        got = reference_data.ReferenceData._rename_names_in_metadata(metadata, rename_dict)
+        self.assertEqual(expected, got)
+
+
+    def test_rename_sequences(self):
+        '''Test rename_sequences'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_rename_sequences.presence_absence.fa')
+        variants_only_fa = os.path.join(data_dir, 'reference_data_rename_sequences.variants_only.fa')
+        noncoding_fa = os.path.join(data_dir, 'reference_data_rename_sequences.noncoding.fa')
+        metadata_tsv = os.path.join(data_dir, 'reference_data_rename_sequences_metadata.tsv')
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            variants_only_fa=variants_only_fa,
+            non_coding_fa=noncoding_fa,
+            metadata_tsv=metadata_tsv
+        )
+        tmp_out = 'tmp.test_rename_sequences.out'
+        refdata.rename_sequences(tmp_out)
+        expected_file = os.path.join(data_dir, 'reference_data_test_rename_sequences.out')
+        self.assertTrue(filecmp.cmp(expected_file, tmp_out, shallow=False))
+        os.unlink(tmp_out)
+
+        meta1 = sequence_metadata.SequenceMetadata('noncoding1\t.\t.\t.\toriginal name "noncoding1"')
+        meta2 = sequence_metadata.SequenceMetadata('noncoding1_1\t.\t.\t.\toriginal name "noncoding1 blah"')
+        meta3 = sequence_metadata.SequenceMetadata('pres_abs1_2\t.\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"')
+        meta4 = sequence_metadata.SequenceMetadata('pres_abs1_1\t.\t.\t.\toriginal name "pres_abs1 blah"')
+        meta5 = sequence_metadata.SequenceMetadata('pres_abs1\t.\t.\t.\toriginal name "pres\'abs1"')
+        meta6 = sequence_metadata.SequenceMetadata('pres_abs2\t.\t.\t.\toriginal name "pres_abs2"')
+        meta7 = sequence_metadata.SequenceMetadata('pres_abs3\t.\t.\t.\toriginal name "pres!abs3"')
+        meta8 = sequence_metadata.SequenceMetadata('var_only1_2\t.\t.\t.\toriginal name "var_only1 hello"')
+        meta9 = sequence_metadata.SequenceMetadata('var_only1\t.\t.\t.\toriginal name "var:only1 boo"')
+        meta10 = sequence_metadata.SequenceMetadata('var_only1_1\t.\t.\t.\toriginal name "var_only1"')
+        meta11 = sequence_metadata.SequenceMetadata('var_only2\t.\t.\t.\toriginal name "var_only2"')
+
+        expected_meta = {
+            'noncoding1': {'n': {}, 'p': {}, '.': {meta1}},
+            'noncoding1_1': {'n': {}, 'p': {}, '.': {meta2}},
+            'pres_abs1_2': {'n': {}, 'p': {}, '.': {meta3}},
+            'pres_abs1_1': {'n': {}, 'p': {}, '.': {meta4}},
+            'pres_abs1': {'n': {}, 'p': {}, '.': {meta5}},
+            'pres_abs2': {'n': {}, 'p': {}, '.': {meta6}},
+            'pres_abs3': {'n': {}, 'p': {}, '.': {meta7}},
+            'var_only1_2': {'n': {}, 'p': {}, '.': {meta8}},
+            'var_only1': {'n': {}, 'p': {}, '.': {meta9}},
+            'var_only1_1': {'n': {}, 'p': {}, '.': {meta10}},
+            'var_only2': {'n': {}, 'p': {}, '.': {meta11}},
+        }
+
+        self.assertEqual(expected_meta, refdata.metadata)
+
+        expected_seqs_dict = {
+            'non_coding': {
+                'noncoding1': pyfastaq.sequences.Fasta('noncoding1', 'AAAA'),
+                'noncoding1_1': pyfastaq.sequences.Fasta('noncoding1_1', 'CCCC'),
+            },
+            'presence_absence': {
+                'pres_abs1_2': pyfastaq.sequences.Fasta('pres_abs1_2', 'ACGT'),
+                'pres_abs1_1': pyfastaq.sequences.Fasta('pres_abs1_1', 'AAAA'),
+                'pres_abs1': pyfastaq.sequences.Fasta('pres_abs1', 'CCCC'),
+                'pres_abs2': pyfastaq.sequences.Fasta('pres_abs2', 'TTTT'),
+                'pres_abs3': pyfastaq.sequences.Fasta('pres_abs3', 'GGGG'),
+            },
+            'variants_only': {
+                'var_only1_2': pyfastaq.sequences.Fasta('var_only1_2', 'AAAA'),
+                'var_only1': pyfastaq.sequences.Fasta('var_only1', 'CCCC'),
+                'var_only1_1': pyfastaq.sequences.Fasta('var_only1_1', 'GGGG'),
+                'var_only2': pyfastaq.sequences.Fasta('var_only2', 'TTTT'),
+            }
+        }
+
+        self.assertEqual(expected_seqs_dict, refdata.seq_dicts)
+
+
+    def test_make_catted_fasta(self):
+        '''Test make_catted_fasta'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_make_catted_fasta.presence_absence.fa')
+        variants_only_fa = os.path.join(data_dir, 'reference_data_make_catted_fasta.variants_only.fa')
+        noncoding_fa = os.path.join(data_dir, 'reference_data_make_catted_fasta.noncoding.fa')
+        expected_fa = os.path.join(data_dir, 'reference_data_make_catted_fasta.expected.fa')
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            variants_only_fa=variants_only_fa,
+            non_coding_fa=noncoding_fa
+        )
+        tmp_out = 'tmp.test.make_catted_fasta.out.fa'
+        refdata.make_catted_fasta(tmp_out)
+        self.assertTrue(filecmp.cmp(expected_fa, tmp_out, shallow=False))
+        os.unlink(tmp_out)
+
+
+    def test_sequence_type(self):
+        '''Test sequence_type'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_sequence_type.presence_absence.fa')
+        variants_only_fa = os.path.join(data_dir, 'reference_data_sequence_type.variants_only.fa')
+        noncoding_fa = os.path.join(data_dir, 'reference_data_sequence_type.noncoding.fa')
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            variants_only_fa=variants_only_fa,
+            non_coding_fa=noncoding_fa
+        )
+
+        tests = [
+            ('pa', 'presence_absence'),
+            ('var_only', 'variants_only'),
+            ('noncoding', 'non_coding'),
+            ('not_there', None)
+        ]
+
+        for name, expected in tests:
+            self.assertEqual(expected, refdata.sequence_type(name))
+
+
+    def test_sequence(self):
+        '''Test sequence'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_sequence.presence_absence.fa')
+        expected = pyfastaq.sequences.Fasta('pa', 'ATGTTTTAA')
+        refdata = reference_data.ReferenceData(presence_absence_fa=presence_absence_fa)
+        self.assertEqual(expected, refdata.sequence('pa'))
+
+
+    def test_sequence_length(self):
+        '''Test sequence_length'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_sequence_length.presence_absence.fa')
+        refdata = reference_data.ReferenceData(presence_absence_fa=presence_absence_fa)
+        self.assertEqual(9, refdata.sequence_length('pa'))
+
+
+    def test_all_non_wild_type_variants(self):
+        '''Test all_non_wild_type_variants'''
+        tsv_file = os.path.join(data_dir, 'reference_data_test_all_non_wild_type_variants.tsv')
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_test_all_non_wild_type_variants.ref.pres_abs.fa')
+        variants_only_fa = os.path.join(data_dir, 'reference_data_test_all_non_wild_type_variants.ref.var_only.fa')
+        noncoding_fa = os.path.join(data_dir, 'reference_data_test_all_non_wild_type_variants.ref.noncoding.fa')
+
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            variants_only_fa=variants_only_fa,
+            non_coding_fa=noncoding_fa,
+            metadata_tsv=tsv_file
+        )
+
+        v1 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tA8T\t.\tref has wild type A')
+        v2 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tG9C\t.\tref has variant C instead of G')
+        v3 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tP3Q\t.\tref has wild type P')
+        v4 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tG4I\t.\tref has wild type F')
+        v5 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tI5V\t.\tref has variant V instead of I')
+        v6 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tF6I\t.\tref has wild type F')
+        p1 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA4G\t.\tref has wild type A')
+        p2 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA6C\t.\tref has variant C instead of A')
+        p3 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tN2I\t.\tref has wild type N')
+        p4 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tA4G\t.\tref has variant G instead of A')
+        n1 = sequence_metadata.SequenceMetadata('non_coding\tn\tA2C\t.\tref has wild type A')
+        n2 = sequence_metadata.SequenceMetadata('non_coding\tn\tC4T\t.\tref has variant T instead of C')
+
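+        # expected results are keyed by variant type ('n'/'p') and then by 0-based position (e.g. A8T -> key 7)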
+        var_only_expected = {
+            'n': {7: {v1}, 8: {v2}},
+            'p': {2: {v3}, 3: {v4}, 4: {v5}, 5: {v6}}
+        }
+
+        pres_abs_expected = {
+            'n': {3: {p1}, 5: {p2}},
+            'p': {1: {p3}, 3: {p4}},
+        }
+
+        non_coding_expected = {
+            'n': {1: {n1}, 3: {n2}},
+            'p': {}
+        }
+
+        self.assertEqual(var_only_expected, refdata.all_non_wild_type_variants('var_only_gene'))
+        self.assertEqual(pres_abs_expected, refdata.all_non_wild_type_variants('presence_absence_gene'))
+        self.assertEqual(non_coding_expected, refdata.all_non_wild_type_variants('non_coding'))
+        self.assertEqual({'n': {}, 'p': {}}, refdata.all_non_wild_type_variants('not_a_known_sequence'))
+
+
+    def test_write_cluster_allocation_file(self):
+        '''Test write_cluster_allocation_file'''
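+        # clusters maps sequence type -> {representative name: set of member names};
+        # None means there are no sequences of that type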
+        clusters = {
+            'presence_absence': {
+                'seq1': {'seq1', 'seq2'},
+                'seq3': {'seq3', 'seq4', 'seq5'},
+                'seq6': {'seq6'}
+            },
+            'non_coding' : {
+                'seq10': {'seq42'}
+            },
+            'variants_only': None
+        }
+        tmpfile = 'tmp.test_write_cluster_allocation_file.out'
+        reference_data.ReferenceData.write_cluster_allocation_file(clusters, tmpfile)
+        expected_file = os.path.join(data_dir, 'reference_data_test_write_cluster_allocation_file.expected')
+        self.assertTrue(filecmp.cmp(expected_file, tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
+    def test_cluster_with_cdhit(self):
+        '''Test cluster_with_cdhit'''
+        inprefix = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit')
+        presence_absence_fa = inprefix + '.presence_absence.fa'
+        non_coding_fa = inprefix + '.non_coding.fa'
+
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            non_coding_fa=non_coding_fa,
+        )
+
+        outprefix = 'tmp.test_cluster_with_cdhit'
+
+        expected = {
+            'non_coding': {
+                'noncoding1.n': {'noncoding1'}
+            },
+            'presence_absence': {
+                'presence_absence1.p': {'presence_absence1', 'presence_absence2'},
+                'presence_absence3.p': {'presence_absence4', 'presence_absence3'}
+            },
+            'variants_only': None,
+        }
+
+        got = refdata.cluster_with_cdhit(inprefix, outprefix)
+        self.assertEqual(expected, got)
+        expected_seqs = {}
+        expected_cluster_reps_fa = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit.expected_representatives.fa')
+        pyfastaq.tasks.file_to_dict(expected_cluster_reps_fa, expected_seqs)
+        got_seqs = {}
+        pyfastaq.tasks.file_to_dict(outprefix + '.cluster_representatives.fa', got_seqs)
+        self.assertEqual(expected_seqs, got_seqs)
+
+        expected_clusters_file = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit.clusters.tsv')
+        got_clusters_file = outprefix + '.clusters.tsv'
+        self.assertTrue(filecmp.cmp(expected_clusters_file, got_clusters_file, shallow=False))
+
+        os.unlink(got_clusters_file)
+        os.unlink(outprefix + '.cluster_representatives.fa')
+        os.unlink(outprefix + '.non_coding.cdhit')
+        os.unlink(outprefix + '.presence_absence.cdhit')
+
+
+    def test_cluster_with_cdhit_clusters_in_file(self):
+        '''Test cluster_with_cdhit with clusters read from file'''
+        inprefix = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_clusters_in_file')
+        presence_absence_fa = inprefix + '.presence_absence.fa'
+        non_coding_fa = inprefix + '.non_coding.fa'
+        clusters_file = inprefix + '.clusters'
+
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            non_coding_fa=non_coding_fa,
+        )
+
+        outprefix = 'tmp.test_cluster_with_cdhit_clusters_in_file'
+
+        expected = {
+            'non_coding': {
+                'noncoding1.n': {'noncoding1'},
+                'noncoding2.n': {'noncoding2'}
+            },
+            'presence_absence': {
+                'presence_absence1.p': {'presence_absence1', 'presence_absence3', 'presence_absence4'},
+                'presence_absence2.p': {'presence_absence2'}
+            },
+            'variants_only': None,
+        }
+
+        got = refdata.cluster_with_cdhit(inprefix, outprefix, clusters_file=clusters_file)
+        self.assertEqual(expected, got)
+
+        expected_cluster_reps_fa = inprefix + '.expected_representatives.fa'
+        expected_seqs = {}
+        pyfastaq.tasks.file_to_dict(expected_cluster_reps_fa, expected_seqs)
+        got_seqs = {}
+        pyfastaq.tasks.file_to_dict(outprefix + '.cluster_representatives.fa', got_seqs)
+        self.assertEqual(expected_seqs, got_seqs)
+
+        expected_clusters_file = inprefix + '.clusters.tsv'
+        got_clusters_file = outprefix + '.clusters.tsv'
+        self.assertTrue(filecmp.cmp(expected_clusters_file, got_clusters_file, shallow=False))
+
+        os.unlink(got_clusters_file)
+        os.unlink(outprefix + '.cluster_representatives.fa')
+        os.unlink(outprefix + '.non_coding.cdhit')
+        os.unlink(outprefix + '.presence_absence.cdhit')
+
+
+    def test_cluster_with_cdhit_nocluster(self):
+        '''Test cluster_with_cdhit with nocluster option'''
+        inprefix = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit_nocluster')
+        presence_absence_fa = inprefix + '.presence_absence.fa'
+        non_coding_fa = inprefix + '.non_coding.fa'
+        clusters_file = inprefix + '.clusters'
+
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            non_coding_fa=non_coding_fa,
+        )
+
+        outprefix = 'tmp.test_cluster_with_cdhit_nocluster'
+
+        expected = {
+            'non_coding': {
+                'noncoding1.n': {'noncoding1'},
+                'noncoding2.n': {'noncoding2'}
+            },
+            'presence_absence': {
+                'presence_absence1.p': {'presence_absence1'},
+                'presence_absence2.p': {'presence_absence2'},
+                'presence_absence3.p': {'presence_absence3'},
+                'presence_absence4.p': {'presence_absence4'},
+            },
+            'variants_only': None,
+        }
+
+        got = refdata.cluster_with_cdhit(inprefix, outprefix, nocluster=True)
+        self.assertEqual(expected, got)
+
+        expected_cluster_reps_fa = inprefix + '.expected_representatives.fa'
+        expected_seqs = {}
+        pyfastaq.tasks.file_to_dict(expected_cluster_reps_fa, expected_seqs)
+        got_seqs = {}
+        pyfastaq.tasks.file_to_dict(outprefix + '.cluster_representatives.fa', got_seqs)
+        self.assertEqual(expected_seqs, got_seqs)
+
+        expected_clusters_file = inprefix + '.clusters.tsv'
+        got_clusters_file = outprefix + '.clusters.tsv'
+        self.assertTrue(filecmp.cmp(expected_clusters_file, got_clusters_file, shallow=False))
+
+        os.unlink(got_clusters_file)
+        os.unlink(outprefix + '.cluster_representatives.fa')
+        os.unlink(outprefix + '.non_coding.cdhit')
+        os.unlink(outprefix + '.presence_absence.cdhit')
+
+
+    def test_write_seqs_to_fasta(self):
+        '''Test write_seqs_to_fasta'''
+        refdata = reference_data.ReferenceData(presence_absence_fa=os.path.join(data_dir, 'reference_data_test_write_seqs_to_fasta.in.fa'))
+        expected_outfile = os.path.join(data_dir, 'reference_data_test_write_seqs_to_fasta.expected.fa')
+        tmpfile = 'tmp.test.reference_data.write_seqs_to_fasta.out.fa'
+        refdata.write_seqs_to_fasta(tmpfile, {'seq1', 'seq4', 'seq5'})
+        self.assertTrue(filecmp.cmp(expected_outfile, tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
diff --git a/ariba/tests/report_filter_test.py b/ariba/tests/report_filter_test.py
new file mode 100644
index 0000000..7725de6
--- /dev/null
+++ b/ariba/tests/report_filter_test.py
@@ -0,0 +1,335 @@
+import unittest
+import os
+import filecmp
+import pyfastaq
+from ariba import flag, report_filter, report
+
+modules_dir = os.path.dirname(os.path.abspath(report_filter.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestReportFilter(unittest.TestCase):
+    def test_init_good_file(self):
+        '''test __init__ on good input file'''
+        infile = os.path.join(data_dir, 'report_filter_test_init_good.tsv')
+        rf = report_filter.ReportFilter(infile=infile)
+        line1 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142', 'C', '500', '.', '500', 'a:n:C42T:id1:foo', 'free_text'])
+        line2 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id2:bar', 'free_text2'])
+        line3 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '12.4', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id3:spam', 'free_text3'])
+        line4 = '\t'.join(['cluster2', 'variants_only', '179', '20000', 'cluster2', '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '20.2', '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', '.', '290', 'a:v:I42L:id4:eggs', 'free_text3'])
+
+        expected = {
+            'cluster1': {
+                'cluster1.scaffold.1': [report_filter.ReportFilter._report_line_to_dict(line1), report_filter.ReportFilter._report_line_to_dict(line2)],
+                'cluster1.scaffold.2': [report_filter.ReportFilter._report_line_to_dict(line3)],
+            },
+            'cluster2': {
+                'cluster2.scaffold.1': [report_filter.ReportFilter._report_line_to_dict(line4)]
+            }
+        }
+
+        self.assertEqual(expected, rf.report)
+
+
+    def test_init_bad_file(self):
+        '''test __init__ on bad input file'''
+        infile = os.path.join(data_dir, 'report_filter_test_init_bad.tsv')
+        with self.assertRaises(report_filter.Error):
+            rf = report_filter.ReportFilter(infile=infile)
+
+
+    def test_report_line_to_dict(self):
+        '''Test _report_line_to_dict'''
+        line = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t999\t23.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\ta:n:C42T:id1:foo\tfree text'
+        expected = {
+            'ref_name':           'cluster1',
+            'ref_type':           'non_coding',
+            'flag':               flag.Flag(27),
+            'reads':              10000,
+            'cluster':            'cluster1',
+            'ref_len':            1000,
+            'ref_base_assembled': 999,
+            'pc_ident':           99.42,
+            'ctg':                'cluster1.scaffold.1',
+            'ctg_len':            999,
+            'ctg_cov':            23.2,
+            'known_var':          '1',
+            'var_type':           'SNP',
+            'var_seq_type':       'n',
+            'known_var_change':   'C42T',
+            'has_known_var':      '0',
+            'ref_ctg_change':     '.',
+            'ref_ctg_effect':     '.',
+            'ref_start':          42,
+            'ref_end':            42,
+            'ref_nt':             'C',
+            'ctg_start':          142,
+            'ctg_end':            142,
+            'ctg_nt':             'C',
+            'smtls_total_depth':  '500',
+            'smtls_alt_nt':       '.',
+            'smtls_alt_depth':    '500',
+            'var_description':    'a:n:C42T:id1:foo',
+            'free_text':          'free text',
+        }
+
+        self.assertEqual(expected, report_filter.ReportFilter._report_line_to_dict(line))
+
+        bad_line = '\t'.join(line.split('\t')[:-1])
+        self.assertEqual(None, report_filter.ReportFilter._report_line_to_dict(bad_line))
+
+
+    def test_dict_to_report_line(self):
+        '''Test _dict_to_report_line'''
+        report_dict = {
+            'ref_name':           'cluster1',
+            'ref_type':           'non_coding',
+            'flag':               flag.Flag(27),
+            'reads':              10000,
+            'cluster':            'cluster1',
+            'ref_len':            1000,
+            'ref_base_assembled': 999,
+            'pc_ident':           99.42,
+            'ctg':                'cluster1.scaffold.1',
+            'ctg_len':            1300,
+            'ctg_cov':            42.4,
+            'known_var':          '1',
+            'var_type':           'SNP',
+            'var_seq_type':       'n',
+            'known_var_change':   'C42T',
+            'has_known_var':      '0',
+            'ref_ctg_change':     '.',
+            'ref_ctg_effect':     '.',
+            'ref_start':          42,
+            'ref_end':            42,
+            'ref_nt':             'C',
+            'ctg_start':          142,
+            'ctg_end':            142,
+            'ctg_nt':             'C',
+            'smtls_total_depth':  '500',
+            'smtls_alt_nt':       '.',
+            'smtls_alt_depth':    '500',
+            'var_description':    'a:n:C42T:id1:foo',
+            'free_text':          'free text',
+        }
+
+        expected = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t1300\t42.4\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\ta:n:C42T:id1:foo\tfree text'
+        self.assertEqual(expected, report_filter.ReportFilter._dict_to_report_line(report_dict))
+
+
+    def test_load_report(self):
+        '''Test _load_report'''
+        good_infile = os.path.join(data_dir, 'report_filter_test_load_report_good.tsv')
+        bad_infile = os.path.join(data_dir, 'report_filter_test_load_report_bad.tsv')
+
+        line1 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '12.2', '1', 'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142', 'C', '500', '.', '500', 'a:n:C42T:id1:foo', 'free_text'])
+        line2 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '12.2', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id2:bar', 'free_text2'])
+        line3 = '\t'.join(['cluster1', 'non_coding', '27', '10000', 'cluster1', '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '22.2', '1', 'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151', 'C', '542', '.', '542', 'a:n:A51G:id3:spam', 'free_text3'])
+        line4 = '\t'.join(['cluster2', 'variants_only', '179', '20000', 'cluster2', '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '33.3', '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', '.', '290', 'a:v:I42L:id4:eggs', 'free_text3'])
+
+        expected = {
+            'cluster1': {
+                'cluster1.scaffold.1': [report_filter.ReportFilter._report_line_to_dict(line1), report_filter.ReportFilter._report_line_to_dict(line2)],
+                'cluster1.scaffold.2': [report_filter.ReportFilter._report_line_to_dict(line3)],
+            },
+            'cluster2': {
+                'cluster2.scaffold.1': [report_filter.ReportFilter._report_line_to_dict(line4)]
+            }
+        }
+
+        got = report_filter.ReportFilter._load_report(good_infile)
+        self.maxDiff = None
+        self.assertEqual(expected, got)
+        with self.assertRaises(report_filter.Error):
+            report_filter.ReportFilter._load_report(bad_infile)
+
+
+    def test_report_dict_passes_non_essential_filters_known_vars(self):
+        '''Test _report_dict_passes_non_essential_filters with known vars'''
+        tests = [
+            ('.', '.', True, True),
+            ('.', '.', False, True),
+            ('0', '0', True, True),
+            ('0', '0', False, True),
+            ('1', '0', True, False),
+            ('1', '1', True, True),
+            ('1', '0', False, True),
+            ('1', '1', False, True),
+        ]
+
+        for known_var, has_known_var, ignore_not_has_known_variant, expected in tests:
+            d = {'known_var': known_var, 'has_known_var': has_known_var}
+            rf = report_filter.ReportFilter(ignore_not_has_known_variant=ignore_not_has_known_variant)
+            self.assertEqual(expected, rf._report_dict_passes_non_essential_filters(d))
+
+
+    def test_report_dict_passes_non_essential_filters_synonymous(self):
+        '''Test _report_dict_passes_non_essential_filters with synonymous AA changes'''
+        tests = [
+            ('.', True, True),
+            ('.', False, True),
+            ('SNP', True, True),
+            ('SNP', False, True),
+            ('SYN', True, False),
+            ('SYN', False, True),
+        ]
+
+        for var, remove_synonymous_snps, expected in tests:
+            d = {'known_var': '1', 'ref_ctg_effect': var, 'has_known_var': '1'}
+            rf = report_filter.ReportFilter(remove_synonymous_snps=remove_synonymous_snps)
+            self.assertEqual(expected, rf._report_dict_passes_non_essential_filters(d))
+
+
+    def test_report_dict_passes_essential_filters(self):
+        '''Test _report_dict_passes_essential_filters'''
+        line1 = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        line2 = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t0\t98.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        line3 = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        tests = [
+            (report_filter.ReportFilter._report_line_to_dict(line1), True),
+            (report_filter.ReportFilter._report_line_to_dict(line2), False),
+            (report_filter.ReportFilter._report_line_to_dict(line3), False),
+        ]
+
+        for test_dict, expected in tests:
+            rf = report_filter.ReportFilter()
+            self.assertEqual(expected, rf._report_dict_passes_essential_filters(test_dict))
+
+
+    def test_flag_passes_filter(self):
+        '''Test _flag_passes_filter'''
+        rf = report_filter.ReportFilter()
+        exclude_flags = ['assembly_fail', 'ref_seq_choose_fail']
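+        # a flag should fail the filter as soon as any excluded flag bit is set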
+        f = flag.Flag()
+        self.assertTrue(rf._flag_passes_filter(f, exclude_flags))
+        f.add('assembled')
+        self.assertTrue(rf._flag_passes_filter(f, exclude_flags))
+        f = flag.Flag()
+        f.add('assembly_fail')
+        self.assertFalse(rf._flag_passes_filter(f, exclude_flags))
+        f = flag.Flag()
+        f.add('ref_seq_choose_fail')
+        self.assertFalse(rf._flag_passes_filter(f, exclude_flags))
+
+
+    def test_filter_list_of_dicts_all_fail(self):
+        '''Test _filter_list_of_dicts where all fail'''
+        rf = report_filter.ReportFilter()
+        line1 = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t88.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        line2 = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        dict1 = report_filter.ReportFilter._report_line_to_dict(line1)
+        dict2 = report_filter.ReportFilter._report_line_to_dict(line2)
+        got = rf._filter_list_of_dicts([dict1, dict2])
+        self.assertEqual([], got)
+
+
+    def test_filter_list_of_dicts_with_essential(self):
+        '''Test _filter_list_of_dicts with an essential line but all others fail'''
+        rf = report_filter.ReportFilter(ignore_not_has_known_variant=True)
+        line1 = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        line2 = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        dict1 = report_filter.ReportFilter._report_line_to_dict(line1)
+        dict2 = report_filter.ReportFilter._report_line_to_dict(line2)
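+        # line1 passes only the essential filters, so all 17 variant-related
+        # columns in the expected output are reset to '.'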
+        expected_line = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t400\t12.2\t' + '\t'.join(['.'] * 17) + '\tfree text'
+        expected = [report_filter.ReportFilter._report_line_to_dict(expected_line)]
+        assert expected != [None]
+        got = rf._filter_list_of_dicts([dict1, dict2])
+        self.assertEqual(expected, got)
+
+
+    def test_filter_list_of_dicts_with_pass(self):
+        '''Test _filter_list_of_dicts with a line that passes'''
+        rf = report_filter.ReportFilter(ignore_not_has_known_variant=True)
+        line1 = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t500\t12.1\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        line2 = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t500\t12.1\t1\tSNP\tn\tC46T\t1\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C46T\tfree text'
+        line3 = 'cluster1\tnon_coding\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t500\t12.1\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
+        dict1 = report_filter.ReportFilter._report_line_to_dict(line1)
+        dict2 = report_filter.ReportFilter._report_line_to_dict(line2)
+        dict3 = report_filter.ReportFilter._report_line_to_dict(line3)
+        got = rf._filter_list_of_dicts([dict1, dict2, dict3])
+        self.assertEqual([dict2], got)
+
+
+    def test_remove_all_after_first_frameshift(self):
+        '''Test _remove_all_after_first_frameshift'''
+        self.assertEqual([], report_filter.ReportFilter._remove_all_after_first_frameshift([]))
+        line1 = 'cluster1\tpresence_absence\t528\t1874\tcluster1\t1188\t1097\t92.43\tcluster1.scaffold.1\t2218\t42.42\t0\t.\tp\t.\t0\tE89G\tNONSYN\t65\t265\tA;A\t766\t766\tG;C\t88;90\t.;.\t87;90\t.\t.'
+        line2 = 'cluster1\tpresence_absence\t528\t1874\tcluster1\t1188\t1097\t92.43\tcluster1.scaffold.1\t2218\t42.42\t0\t.\tp\t.\t0\tQ37fs\tFSHIFT\t109\t109\tA\t634\t634\t.\t67\t.\t67\t.\t.'
+        line3 = 'cluster1\tpresence_absence\t528\t1874\tcluster1\t1188\t1097\t92.43\tcluster1.scaffold.1\t2218\t42.42\t0\t.\tp\t.\t0\tE89G\tNONSYN\t265\t265\tA;A\t766\t766\tG;C\t88;90\t.;.\t87;90\t.\t.'
+        dict1 = report_filter.ReportFilter._report_line_to_dict(line1)
+        dict2 = report_filter.ReportFilter._report_line_to_dict(line2)
+        dict3 = report_filter.ReportFilter._report_line_to_dict(line3)
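+        # the FSHIFT line itself should be kept; only lines after it are dropped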
+        self.assertEqual([dict1], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1]))
+        self.assertEqual([dict1, dict2], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1, dict2]))
+        self.assertEqual([dict2], report_filter.ReportFilter._remove_all_after_first_frameshift([dict2, dict3]))
+        self.assertEqual([dict1, dict2], report_filter.ReportFilter._remove_all_after_first_frameshift([dict1, dict2, dict3]))
+
+
+    def test_filter_dicts(self):
+        '''Test _filter_dicts'''
+        rf = report_filter.ReportFilter(min_ref_base_assembled=10, ignore_not_has_known_variant=True)
+        ref_2_dict = {x: '.' for x in report.columns}
+        ref_2_dict['pc_ident'] = 91.0
+        ref_2_dict['ref_base_assembled'] = 10
+        ref_2_dict['has_known_var'] = '0'
+        ref_2_dict['flag'] = flag.Flag(27)
+
+        rf.report = {
+            'ref1': {
+                'ref1.scaff1': [
+                    {'flag': flag.Flag(27), 'pc_ident': 91.0, 'ref_base_assembled': 9, 'known_var': '1', 'has_known_var': '1'},
+                    {'flag': flag.Flag(27), 'pc_ident': 89.0, 'ref_base_assembled': 10, 'known_var': '1', 'has_known_var': '1'},
+                    {'flag': flag.Flag(27), 'pc_ident': 90.0, 'ref_base_assembled': 11, 'known_var': '1', 'has_known_var': '0'},
+                    {'flag': flag.Flag(27), 'pc_ident': 90.0, 'ref_base_assembled': 11, 'known_var': '1', 'has_known_var': '1'},
+                ]
+            },
+            'ref2': {
+                'ref2.scaff1': [
+                    ref_2_dict
+                ]
+            },
+            'ref3': {
+                'ref3.scaff1': [
+                    {'flag': flag.Flag(27), 'pc_ident': 84.0, 'ref_base_assembled': 10, 'known_var': '1', 'has_known_var': '0'},
+                ]
+            },
+            'ref4': {
+                'ref4.scaff1': [
+                    {'flag': flag.Flag(64), 'pc_ident': '.', 'ref_base_assembled': '.', 'known_var': '.', 'has_known_var': '.'},
+                ]
+            }
+        }
+
+        expected = {
+            'ref1': {
+                'ref1.scaff1': [
+                    {'flag': flag.Flag(27), 'pc_ident': 90.0, 'ref_base_assembled': 11, 'known_var': '1', 'has_known_var': '1'},
+                ]
+            },
+            'ref2': {
+                'ref2.scaff1': [ref_2_dict]
+            }
+        }
+
+        rf._filter_dicts()
+        self.assertEqual(expected, rf.report)
+
+
+    def test_write_report_tsv(self):
+        '''Test write_report_tsv'''
+        infile = os.path.join(data_dir, 'report_filter_test_write_report.tsv')
+        tmpfile = 'tmp.test.report_filter.write_report.tsv'
+        rf = report_filter.ReportFilter(infile=infile)
+        rf._write_report_tsv(tmpfile)
+        self.assertTrue(filecmp.cmp(tmpfile, infile, shallow=False))
+        os.unlink(tmpfile)
+
+
+    def test_run(self):
+        '''Test run'''
+        infile = os.path.join(data_dir, 'report_filter_test_run.in.tsv')
+        expected_file = os.path.join(data_dir, 'report_filter_test_run.expected.tsv')
+        tmpprefix = 'tmp.test.report_filter.run.out'
+        rf = report_filter.ReportFilter(infile=infile)
+        rf.run(tmpprefix)
+        self.assertTrue(filecmp.cmp(expected_file, tmpprefix + '.tsv', shallow=False))
+        os.unlink(tmpprefix + '.tsv')
+
diff --git a/ariba/tests/samtools_variants_test.py b/ariba/tests/samtools_variants_test.py
new file mode 100644
index 0000000..5460cf8
--- /dev/null
+++ b/ariba/tests/samtools_variants_test.py
@@ -0,0 +1,173 @@
+import unittest
+import os
+import pyfastaq
+import pymummer
+from ariba import samtools_variants, external_progs
+
+modules_dir = os.path.dirname(os.path.abspath(samtools_variants.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
+
+
+def file2lines(filename):
+    f = pyfastaq.utils.open_file_read(filename)
+    lines = f.readlines()
+    pyfastaq.utils.close(f)
+    return lines
+
+
+class TestSamtoolsVariants(unittest.TestCase):
+    def test_make_vcf_and_read_depths_files(self):
+        '''test _make_vcf_and_read_depths_files'''
+        ref = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa')
+        bam = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.bam')
+        expected_vcf = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf')
+        expected_depths = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz')
+        tmp_prefix = 'tmp.test_make_vcf_and_read_depths_files'
+        sv = samtools_variants.SamtoolsVariants(
+            ref,
+            bam,
+            tmp_prefix,
+            samtools_exe=extern_progs.exe('samtools'),
+            bcftools_exe=extern_progs.exe('bcftools')
+        )
+        sv._make_vcf_and_read_depths_files()
+
+        def get_vcf_call_lines(fname):
+            with open(fname) as f:
+                lines = [x for x in f.readlines() if not x.startswith('#')]
+            return lines
+
+        expected_lines = get_vcf_call_lines(expected_vcf)
+        got_lines = get_vcf_call_lines(sv.vcf_file)
+
+        # need to check that the VCF lines look the same. Column 8 (INFO) is
+        # ;-delimited and its entries can be in any order.
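+        # e.g. INFO strings 'DP=42;AF=0.5' and 'AF=0.5;DP=42' should compare equal,
+        # hence the set comparison below (DP/AF are purely illustrative names).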
+        self.assertEqual(len(expected_lines), len(got_lines))
+
+        for i in range(len(expected_lines)):
+            expected = expected_lines[i].split('\t')
+            got = got_lines[i].split('\t')
+            self.assertEqual(len(expected), len(got))
+            self.assertEqual(expected[:7], got[:7])
+            self.assertEqual(expected[-2:], got[-2:])
+            exp_set = set(expected[7].split(';'))
+            got_set = set(got[7].split(';'))
+            self.assertEqual(exp_set, got_set)
+
+
+        # samtools 1.2 and 1.3 output is not consistent in the final column,
+        # so ignore it in the comparison.
+        expected_lines = file2lines(expected_depths)
+        got_lines = file2lines(sv.read_depths_file)
+        self.assertEqual(len(expected_lines), len(got_lines))
+
+        for i in range(len(expected_lines)):
+            self.assertEqual(expected_lines[i].split('\t')[:-1], got_lines[i].split('\t')[:-1])
+
+        os.unlink(sv.vcf_file)
+        os.unlink(sv.read_depths_file)
+        os.unlink(sv.read_depths_file + '.tbi')
+
+
+    def test_get_read_depths(self):
+        '''test _get_read_depths'''
+        read_depths_file = os.path.join(data_dir, 'samtools_variants_test_get_read_depths.gz')
+
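+        # each case is ((sequence name, position), expected); expected is None for
+        # positions absent from the file, otherwise (ref base, alt base(s), depth, comma-separated depths)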
+        tests = [
+            ( ('ref1', 42), None ),
+            ( ('ref2', 1), None ),
+            ( ('ref1', 0), ('G', '.', 1, '1') ),
+            ( ('ref1', 2), ('T', 'A', 3, '2,1') ),
+            ( ('ref1', 3), ('C', 'A,G', 42, '21,11,10') ),
+            ( ('ref1', 4), ('C', 'AC', 41, '0,42') )
+        ]
+
+        for (name, position), expected in tests:
+            self.assertEqual(expected, samtools_variants.SamtoolsVariants._get_read_depths(read_depths_file, name, position))
+
+
+    def test_get_variant_positions_from_vcf(self):
+        '''test _get_variant_positions_from_vcf'''
+        vcf_file = os.path.join(data_dir, 'samtools_variants_test_get_variant_positions_from_vcf.vcf')
+
+        expected = [
+            ('16__cat_2_M35190.scaffold.1', 92),
+            ('16__cat_2_M35190.scaffold.1', 179),
+            ('16__cat_2_M35190.scaffold.1', 263),
+            ('16__cat_2_M35190.scaffold.6', 93)
+        ]
+        self.assertEqual(expected, samtools_variants.SamtoolsVariants._get_variant_positions_from_vcf(vcf_file))
+
+
+    def test_get_variants(self):
+        '''test _get_variants'''
+        vcf_file = os.path.join(data_dir, 'samtools_variants_test_get_variants.vcf')
+        read_depths_file = os.path.join(data_dir, 'samtools_variants_test_get_variants.read_depths.gz')
+        positions = [
+            ('16__cat_2_M35190.scaffold.1', 92),
+            ('16__cat_2_M35190.scaffold.1', 179),
+            ('16__cat_2_M35190.scaffold.1', 263),
+            ('16__cat_2_M35190.scaffold.6', 93)
+        ]
+        expected = {
+            '16__cat_2_M35190.scaffold.1': {
+                92: ('T', 'A', 123, '65,58'),
+                179: ('A', 'T', 86, '41,45'),
+                263: ('G', 'C', 97, '53,44'),
+            },
+            '16__cat_2_M35190.scaffold.6': {
+                93: ('T', 'G', 99, '56,43')
+            }
+        }
+
+        got = samtools_variants.SamtoolsVariants._get_variants(vcf_file, read_depths_file, positions=positions)
+        self.assertEqual(expected, got)
+
+
+    def test_total_depth_per_contig(self):
+        '''test total_depth_per_contig'''
+        infile = os.path.join(data_dir, 'samtools_variants_test_total_depth_per_contig')
+        expected = {'scaff1': 67, 'scaff2': 72}
+        got = samtools_variants.SamtoolsVariants.total_depth_per_contig(infile)
+        self.assertEqual(expected, got)
+
+
+    def test_variants_in_coords(self):
+        '''test variants_in_coords'''
+        vcf_file = os.path.join(data_dir, 'samtools_variants_test_variants_in_coords.vcf')
+
+        hit = ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1']
+        nucmer_hits = {
+            'scaff1': [pyfastaq.intervals.Interval(0, 41)]
+        }
+
+        got = samtools_variants.SamtoolsVariants.variants_in_coords(nucmer_hits, vcf_file)
+        self.assertEqual(1, got)
+
+
+    def test_get_depths_at_position(self):
+        '''test get_depths_at_position'''
+        bam = os.path.join(data_dir, 'samtools_variants_test_get_depths_at_position.bam')
+        ref_fa = os.path.join(data_dir, 'samtools_variants_test_get_depths_at_position.ref.fa')
+        tmp_prefix = 'tmp.test_get_depths_at_position'
+        samtools_vars = samtools_variants.SamtoolsVariants(
+            ref_fa,
+            bam,
+            tmp_prefix,
+            samtools_exe=extern_progs.exe('samtools'),
+            bcftools_exe=extern_progs.exe('bcftools')
+        )
+        samtools_vars.run()
+        tests = [
+            (('ref', 425), ('C', 'T', 31, '18,13')),
+            (('not_a_ref', 10), ('ND', 'ND', 'ND', 'ND')),
+            (('ref', 1000000000), ('ND', 'ND', 'ND', 'ND'))
+        ]
+        for (ref, pos), expected in tests:
+            got = samtools_vars.get_depths_at_position(ref, pos)
+            self.assertEqual(expected, got)
+
+        os.unlink(samtools_vars.vcf_file)
+        os.unlink(samtools_vars.read_depths_file)
+        os.unlink(samtools_vars.read_depths_file + '.tbi')
diff --git a/ariba/tests/sequence_metadata_test.py b/ariba/tests/sequence_metadata_test.py
new file mode 100644
index 0000000..ed4fe47
--- /dev/null
+++ b/ariba/tests/sequence_metadata_test.py
@@ -0,0 +1,89 @@
+import unittest
+import os
+import pyfastaq
+from ariba import sequence_metadata, sequence_variant
+
+modules_dir = os.path.dirname(os.path.abspath(sequence_metadata.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestSequenceMetadata(unittest.TestCase):
+    def test_init_fails_on_bad_lines(self):
+        '''Test init fails on bad lines'''
+        lines = [
+            'only one column. There can NOT be only one\n',
+            'two\tcolumns is not enough\n',
+            'three\tcolumns\tis still not enough\n',
+            'four\tcolumns\tis\tis also not enough\n',
+            'six\tcolumns\tis\tone\ttoo\tmany\n',
+        ]
+
+        for line in lines:
+            with self.assertRaises(sequence_metadata.Error):
+                sequence_metadata.SequenceMetadata(line)
+
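+        # a bad variant type column ('x' rather than 'n' or 'p') should raise from sequence_variant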
+        lines = [
+            'gene\tx\tI42L\tid\tfoo\n',
+        ]
+
+        for line in lines:
+            with self.assertRaises(sequence_variant.Error):
+                sequence_metadata.SequenceMetadata(line)
+
+
+    def test_init_on_good_input(self):
+        '''test init ok on good input'''
+        data = sequence_metadata.SequenceMetadata('gene\tn\tI42L\tid\tspam spam wonderful spam')
+        self.assertEqual(data.name, 'gene')
+        self.assertEqual(data.variant_type, 'n')
+        self.assertEqual(data.variant.wild_value, 'I')
+        self.assertEqual(data.variant.variant_value, 'L')
+        self.assertEqual(data.variant.identifier, 'id')
+        self.assertEqual(data.free_text, 'spam spam wonderful spam')
+
+
+    def test_str(self):
+        '''test __str__'''
+        lines = [
+            'gene1\tn\tA42G\tid1\tspam',
+            'gene2\t.\t.\t.\t.',
+            'gene3\t.\t.\t.\teggs',
+            'gene4\tp\tI42K\tid\tthis mutation kills tardigrades',
+        ]
+
+        for line in lines:
+            self.assertEqual(line, str(sequence_metadata.SequenceMetadata(line)))
+
+
+    def test_has_variant(self):
+        '''test has_variant'''
+        tests = [
+            ('gene1\t.\t.\t.\t.', False),
+            ('gene1\tn\tA2T\t.\t.', True),
+            ('gene1\tn\tT2A\t.\t.', False),
+            ('gene1\tp\tI2Y\t.\t.', True),
+            ('gene1\tp\tY2I\t.\t.', False),
+        ]
+
+        seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA') # translation: MYC*
+
+        for line, expected in tests:
+            metadata = sequence_metadata.SequenceMetadata(line)
+            self.assertEqual(expected, metadata.has_variant(seq))
+
+
+    def test_to_string(self):
+        '''test to_string'''
+        lines = [
+            ('gene1', 'n', 'A42G', 'id1', 'spam'),
+            ('gene2', '.', '.', '.', '.'),
+            ('gene3', '.', '.', '.', 'eggs'),
+            ('gene4', 'p', 'I42K', 'id', 'this mutation kills tardigrades'),
+        ]
+
+        for line in lines:
+            m = sequence_metadata.SequenceMetadata('\t'.join(line))
+            for separator in ('_', '\t'):
+                expected = separator.join(line)
+                self.assertEqual(expected, m.to_string(separator=separator))
+
diff --git a/ariba/tests/sequence_variant_test.py b/ariba/tests/sequence_variant_test.py
new file mode 100644
index 0000000..1c19915
--- /dev/null
+++ b/ariba/tests/sequence_variant_test.py
@@ -0,0 +1,100 @@
+import unittest
+import os
+import pyfastaq
+from ariba import sequence_variant
+
+modules_dir = os.path.dirname(os.path.abspath(sequence_variant.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestSequenceVariant(unittest.TestCase):
+    def test_init_fails_on_bad_variant_strings(self):
+        '''Test init fails on bad variant strings'''
+        bad_variants = [
+            'x',
+            'x1',
+            '1x',
+            '1x1',
+            'I42K43',
+            'I-1K',
+        ]
+
+        for var in bad_variants:
+            with self.assertRaises(sequence_variant.Error):
+                v = sequence_variant.Variant('p', var, '.')
+
+
+    def test_init_ok(self):
+        '''Test init ok'''
+        variants = [('I42K', '.'), ('i42k', 'id1'), ('I42k', 'id2'), ('i42K', 'id3')]
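+        # variant strings are parsed case-insensitively and positions are stored 0-based (I42K -> 41)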
+
+        for var, identifier in variants:
+            aa_var = sequence_variant.Variant('p', var, identifier)
+            self.assertEqual(41, aa_var.position)
+            self.assertEqual('I', aa_var.wild_value)
+            self.assertEqual('K', aa_var.variant_value)
+            if identifier == '.':
+                self.assertIsNone(aa_var.identifier)
+            else:
+                self.assertEqual(identifier, aa_var.identifier)
+
+
+    def test_init_str(self):
+        '''Test init ok and str'''
+        variants = ['I42K', 'i42k', 'I42k', 'i42K']
+        expected = 'I42K'
+
+        for var in variants:
+            self.assertEqual(expected, str(sequence_variant.Variant('p', var, '.')))
+
+
+    def test_sanity_check_against_seq_no_translate(self):
+        '''test sanity_check_against_seq with translate False'''
+        seq = 'BrissSpecialStvff'
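+        # position 3 of the sequence is 'i'; both I3K and K3I pass, so the check
+        # appears to accept a case-insensitive match of either the wild-type or the variant value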
+        tests = [
+            ('I3K', True),
+            ('K3I', True),
+            ('A2b', False),
+            ('x1000y', False)
+        ]
+
+        for var, expected in tests:
+            variant = sequence_variant.Variant('p', var, '.')
+            self.assertEqual(expected, variant.sanity_check_against_seq(seq))
+
+
+    def test_sanity_check_against_seq_translate(self):
+        '''test sanity_check_against_seq with translate True'''
+        seq = 'AGTACGACGTAC'  # translates to STTY
+        tests = [
+            ('S1X', True),
+            ('x1s', True),
+            ('a1y', False),
+            ('x5y', False)
+        ]
+
+        for var, expected in tests:
+            variant = sequence_variant.Variant('p', var, '.')
+            self.assertEqual(expected, variant.sanity_check_against_seq(seq, translate_seq=True))
+
+
+    def test_has_variant(self):
+        '''test has_variant'''
+        seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA') # translation: MYC*
+        tests = [
+            (sequence_variant.Variant('n', 'A2T', '.'), True),
+            (sequence_variant.Variant('n', 'T2A', '.'), False),
+            (sequence_variant.Variant('p', 'I2Y', '.'), True),
+            (sequence_variant.Variant('p', 'Y2I', '.'), False),
+        ]
+
+        for var, expected in tests:
+            self.assertEqual(expected, var.has_variant(seq))
+
+
+    def test_nucleotide_range(self):
+        '''test nucleotide_range'''
+        sv = sequence_variant.Variant('n', 'A2T', '.')
+        self.assertEqual((1, 1), sv.nucleotide_range())
+        sv = sequence_variant.Variant('p', 'I42L', '.')
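+        # protein position 42 (1-based) covers 0-based nucleotides 3*41 to 3*41+2, i.e. (123, 125)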
+        self.assertEqual((123, 125), sv.nucleotide_range())
diff --git a/ariba/tests/summary_cluster_test.py b/ariba/tests/summary_cluster_test.py
new file mode 100644
index 0000000..4219af7
--- /dev/null
+++ b/ariba/tests/summary_cluster_test.py
@@ -0,0 +1,424 @@
+import unittest
+import copy
+import filecmp
+import os
+from ariba import flag, summary_cluster
+
+modules_dir = os.path.dirname(os.path.abspath(summary_cluster.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+class TestSummaryCluster(unittest.TestCase):
+    def test_line2dict(self):
+        '''Test _line2dict'''
+        line = 'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:var_group1:ref has wild type, foo bar\tsome free text'
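+        # note: 'var_group' below is taken from the fourth ':'-separated field of var_description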
+
+        expected = {
+            'ref_name': 'refname',
+            'ref_type': 'reftype',
+            'flag': flag.Flag(19),
+            'reads': 78,
+            'cluster': 'cluster',
+            'ref_len': 120,
+            'ref_base_assembled': 120,
+            'pc_ident': 98.33,
+            'ctg': 'ctg_name',
+            'ctg_len': 279,
+            'ctg_cov': '24.4',
+            'known_var': '1',
+            'var_type': 'SNP',
+            'var_seq_type': 'n',
+            'known_var_change': 'A14T',
+            'has_known_var': '1',
+            'ref_ctg_change': 'A14T',
+            'ref_ctg_effect': 'SNP',
+            'ref_start': 13,
+            'ref_end': 13,
+            'ref_nt': 'A',
+            'ctg_start': 84,
+            'ctg_end': 84,
+            'ctg_nt': 'T',
+            'smtls_total_depth': '17',
+            'smtls_alt_nt': '.',
+            'smtls_alt_depth': '17',
+            'var_description': 'noncoding1:n:A14T:var_group1:ref has wild type, foo bar',
+            'var_group': 'var_group1',
+            'free_text': 'some free text'
+        }
+
+        self.assertEqual(summary_cluster.SummaryCluster.line2dict(line), expected)
+
+
+    def test_add_data_dict(self):
+        '''Test add_data_dict'''
+        cluster = summary_cluster.SummaryCluster()
+        self.assertTrue(cluster.name is None)
+        line1 = 'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line2 = 'refname\treftype\t19\t78\tcluster2\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text'
+        line3 = 'refname2\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text'
+        data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
+        data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
+        data_dict3 = summary_cluster.SummaryCluster.line2dict(line3)
+        cluster.add_data_dict(data_dict1)
+        self.assertEqual(cluster.name, data_dict1['cluster'])
+        self.assertEqual(cluster.data,[data_dict1])
+        with self.assertRaises(summary_cluster.Error):
+            cluster.add_data_dict(data_dict2)
+
+        with self.assertRaises(summary_cluster.Error):
+            cluster.add_data_dict(data_dict3)
+
+
+    def test_pc_id_of_longest(self):
+        '''Test pc_id_of_longest'''
+        cluster = summary_cluster.SummaryCluster()
+        self.assertTrue(cluster.name is None)
+        line1 = 'refname\treftype\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line2 = 'refname\treftype\t19\t78\tcluster\t120\t119\t98.20\tctg_name2\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+        line3 = 'refname\treftype\t19\t78\tcluster\t120\t114\t98.32\tctg_name3\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+        data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
+        data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
+        data_dict3 = summary_cluster.SummaryCluster.line2dict(line3)
+        cluster.add_data_dict(data_dict1)
+        cluster.add_data_dict(data_dict2)
+        cluster.add_data_dict(data_dict3)
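+        # ctg_name2 has the most reference bases assembled (119 of 120), so its
+        # percent identity (98.20) is the one reported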
+        self.assertEqual(98.2, cluster.pc_id_of_longest())
+
+
+    def test_to_cluster_summary_assembled(self):
+        '''Test _to_cluster_summary_assembled'''
+        line = 'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text'
+        data_dict = summary_cluster.SummaryCluster.line2dict(line)
+
+        tests = [
+            ('non_coding', 0, 'no'),
+            ('non_coding', 64, 'no'),
+            ('non_coding', 1024, 'no'),
+            ('non_coding', 1, 'fragmented'),
+            ('non_coding', 3, 'yes_nonunique'),
+            ('non_coding', 19, 'yes'),
+            ('non_coding', 23, 'yes_nonunique'),
+            ('non_coding', 51, 'yes_nonunique'),
+            ('non_coding', 147, 'yes_nonunique'),
+            ('non_coding', 275, 'yes_nonunique'),
+            ('presence_absence', 0, 'no'),
+            ('presence_absence', 64, 'no'),
+            ('presence_absence', 1024, 'no'),
+            ('presence_absence', 1, 'fragmented'),
+            ('presence_absence', 11, 'yes_nonunique'),
+            ('presence_absence', 27, 'yes'),
+            ('presence_absence', 29, 'fragmented'),
+            ('presence_absence', 59, 'yes_nonunique'),
+            ('presence_absence', 155, 'yes_nonunique'),
+            ('presence_absence', 283, 'yes_nonunique'),
+        ]
+
+        for seq_type, f, expected in tests:
+            cluster = summary_cluster.SummaryCluster()
+            data_dict['ref_type'] = seq_type
+            data_dict['flag'] = flag.Flag(f)
+            cluster.add_data_dict(data_dict)
+            self.assertEqual(expected, cluster._to_cluster_summary_assembled())
+
+
+    def test_has_known_variant(self):
+        '''Test _has_known_variant'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
+            'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
+        ]
+
+        dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines]
+        expected = [True, False, False, False, False]
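+        # only the first line has both known_var == '1' and has_known_var == '1'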
+        assert len(dicts) == len(expected)
+
+        for i in range(len(dicts)):
+            self.assertEqual(expected[i], summary_cluster.SummaryCluster._has_known_variant(dicts[i]))
+
+
+    def test_has_any_known_variant(self):
+        '''Test _has_any_known_variant'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
+            'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
+        ]
+
+        expected = ['yes', 'no', 'no', 'no', 'no']
+        assert len(lines) == len(expected)
+
+        for i in range(len(lines)):
+            data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+            cluster = summary_cluster.SummaryCluster()
+            cluster.add_data_dict(data_dict)
+            self.assertEqual(expected[i], cluster._has_any_known_variant())
+
+
+    def test_has_nonsynonymous(self):
+        '''Test _has_nonsynonymous'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
+            'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
+        ]
+
+        dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines]
+        expected = [False, True, False, True, True, True]
+        assert len(dicts) == len(expected)
+
+        for i in range(len(dicts)):
+            self.assertEqual(expected[i], summary_cluster.SummaryCluster._has_nonsynonymous(dicts[i]))
+
+
+    def test_has_any_nonsynonymous(self):
+        '''Test _has_any_nonsynonymous'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:N_ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+        ]
+
+        expected = ['no', 'yes', 'no', 'yes', 'yes']
+        assert len(lines) == len(expected)
+
+        for i in range(len(lines)):
+            data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+            cluster = summary_cluster.SummaryCluster()
+            cluster.add_data_dict(data_dict)
+            self.assertEqual(expected[i], cluster._has_any_nonsynonymous())
+
+
+    def test_has_novel_nonsynonymous(self):
+        '''Test _has_novel_nonsynonymous'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
+            'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
+        ]
+
+        dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines]
+        expected = [False, False, True, True, True]
+        assert len(dicts) == len(expected)
+
+        for i in range(len(dicts)):
+            self.assertEqual(expected[i], summary_cluster.SummaryCluster._has_novel_nonsynonymous(dicts[i]))
+
+
+    def test_has_any_novel_nonsynonymous(self):
+        '''Test _has_any_novel_nonsynonymous'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\t.\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tMULTIPLE\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.',
+            'refname\tpresence_absence\t528\t2814\tcluster\t1188\t1009\t90.49\tctg_name\t2470\t141.8\t0\t.\tp\t.\t0\t.\tINDELS\t594\t594\tC;T\t1195\t1195\t.;C\t207;204\t.;.\t207;204\t.\t.'
+        ]
+
+        expected = ['no', 'no', 'yes', 'yes', 'yes']
+        assert len(lines) == len(expected)
+
+        for i in range(len(lines)):
+            data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+            cluster = summary_cluster.SummaryCluster()
+            cluster.add_data_dict(data_dict)
+            self.assertEqual(expected[i], cluster._has_any_novel_nonsynonymous())
+
+
+    def test_to_cluster_summary_has_known_nonsynonymous(self):
+        '''Test _to_cluster_summary_has_known_nonsynonymous'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+        ]
+
+        expected = ['yes', 'yes', 'no', 'no', 'no']
+
+        for i in range(len(lines)):
+            data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+            cluster = summary_cluster.SummaryCluster()
+            cluster.add_data_dict(data_dict)
+            for assembled_summary in ['yes', 'fragmented', 'yes_nonunique']:
+                self.assertEqual(expected[i], cluster._to_cluster_summary_has_known_nonsynonymous(assembled_summary))
+            self.assertEqual('NA', cluster._to_cluster_summary_has_known_nonsynonymous('no'))
+
+
+    def test_to_cluster_summary_has_novel_nonsynonymous(self):
+        '''Test _to_cluster_summary_has_novel_nonsynonymous'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+        ]
+
+        expected = ['no', 'no', 'no', 'yes', 'yes']
+
+        for i in range(len(lines)):
+            data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+            cluster = summary_cluster.SummaryCluster()
+            cluster.add_data_dict(data_dict)
+            for assembled_summary in ['yes', 'fragmented', 'yes_nonunique']:
+                self.assertEqual(expected[i], cluster._to_cluster_summary_has_novel_nonsynonymous(assembled_summary))
+            self.assertEqual('NA', cluster._to_cluster_summary_has_novel_nonsynonymous('no'))
+
+
+    def test_to_cluster_summary_has_nonsynonymous(self):
+        '''Test _to_cluster_summary_has_nonsynonymous'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\t.\tn\t.\t.\t.\tMULTIPLE\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+        ]
+
+        expected = ['no', 'yes', 'no', 'yes', 'yes']
+
+        for i in range(len(lines)):
+            data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+            cluster = summary_cluster.SummaryCluster()
+            cluster.add_data_dict(data_dict)
+            for assembled_summary in ['yes', 'fragmented', 'yes_nonunique']:
+                self.assertEqual(expected[i], cluster._to_cluster_summary_has_nonsynonymous(assembled_summary))
+            self.assertEqual('NA', cluster._to_cluster_summary_has_nonsynonymous('no'))
+
+
+    def test_get_nonsynonymous_var(self):
+        '''Test _get_nonsynonymous_var'''
+        d = {
+            'ref_name': 'ref',
+            'var_type': '.',
+            'known_var_change': '.',
+            'has_known_var': '.',
+            'known_var': '0',
+            'ref_ctg_change': '.',
+            'ref_ctg_effect': '.',
+            'var_seq_type': '.',
+            'var_group': '.',
+        }
+
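+        # With no variant information in the dict, there is nothing to report.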
+        self.assertEqual(None, summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
+
+        d['var_type'] = 'p'
+        d['known_var'] = '1'
+        d['has_known_var'] = '1'
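+        # A known variant flagged as present but with no change string is
+        # inconsistent input, so an Error is expected.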
+        with self.assertRaises(summary_cluster.Error):
+            summary_cluster.SummaryCluster._get_nonsynonymous_var(d)
+
+        d['known_var_change'] = 'I42L'
+        self.assertEqual(('ref', 'I42L', 'ungrouped', None), summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
+
+        d['var_group'] = 'vgroup'
+        self.assertEqual(('ref', 'I42L', 'grouped', 'vgroup'), summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
+        d['var_group'] = '.'
+
+        d['ref_ctg_change'] = 'P43Q'
+        with self.assertRaises(summary_cluster.Error):
+            summary_cluster.SummaryCluster._get_nonsynonymous_var(d)
+
+        d['known_var_change'] = '.'
+        self.assertEqual(('ref', 'P43Q', 'novel', None), summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
+
+        d['ref_ctg_change'] = '.'
+        with self.assertRaises(summary_cluster.Error):
+            summary_cluster.SummaryCluster._get_nonsynonymous_var(d)
+
+        d['ref_ctg_effect'] = 'MULTIPLE'
+        self.assertEqual(('ref', 'MULTIPLE', 'novel', None), summary_cluster.SummaryCluster._get_nonsynonymous_var(d))
+
+
+    def test_has_resistance(self):
+        '''Test _has_resistance'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+        ]
+
+        expected = ['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no']
+
+        for i in range(len(lines)):
+            data_dict = summary_cluster.SummaryCluster.line2dict(lines[i])
+            cluster = summary_cluster.SummaryCluster()
+            cluster.add_data_dict(data_dict)
+            for assembled_summary in ['yes', 'yes_nonunique']:
+                self.assertEqual(expected[i], cluster._has_resistance(assembled_summary))
+            for assembled_summary in ['no', 'fragmented']:
+                self.assertEqual('no', cluster._has_resistance(assembled_summary))
+
+
+    def test_has_var_groups(self):
+        '''Test has_var_groups'''
+        lines = [
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSYN\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id1:ref has wild type, foo bar\tsome free text',
+            'refname\tnon_coding\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tn\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id2:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id3:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id4:ref has wild type, foo bar\tsome free text',
+            'refname\tpresence_absence\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id5:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id6:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tp\tA14T\t0\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text',
+            'refname\tvariants_only\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tSNP\tp\tA14T\t.\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1:n:A14T:id7:ref has wild type, foo bar\tsome free text',
+        ]
+        dicts = [summary_cluster.SummaryCluster.line2dict(line) for line in lines]
+        cluster = summary_cluster.SummaryCluster()
+        for d in dicts:
+            cluster.add_data_dict(d)
+        got = cluster.has_var_groups()
+        expected = {'id1', 'id3', 'id6'}
+        self.assertEqual(expected, got)
+
+
+    def test_column_summary_data(self):
+        '''Test column_summary_data'''
+        line1 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs'
+        line2 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text'
+
+        data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
+        data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
+        cluster = summary_cluster.SummaryCluster()
+        cluster.add_data_dict(data_dict1)
+        cluster.add_data_dict(data_dict2)
+        expected = {
+            'assembled': 'yes',
+            'has_res': 'yes',
+            'ref_seq': 'ref1',
+            'novel_var': 'no',
+            'known_var': 'yes',
+            'pct_id': '98.33',
+        }
+        got = cluster.column_summary_data()
+        self.assertEqual(expected, got)
+
+
+    def test_non_synon_variants(self):
+        '''Test non_synon_variants'''
+        line1 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnon_coding1:n:A14T:id1:foo_bar\tspam eggs'
+        line2 = 'ref1\tnon_coding\t531\t78\tcluster1\t120\t95\t98.42\tctg_name\t279\t24.4\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tsome free text'
+
+        data_dict1 = summary_cluster.SummaryCluster.line2dict(line1)
+        data_dict2 = summary_cluster.SummaryCluster.line2dict(line2)
+        cluster = summary_cluster.SummaryCluster()
+        cluster.add_data_dict(data_dict1)
+        cluster.add_data_dict(data_dict2)
+        got = cluster.non_synon_variants()
+        expected = {('ref1', 'A14T', 'grouped', 'id1')}
+        self.assertEqual(expected, got)
diff --git a/ariba/tests/summary_sample_test.py b/ariba/tests/summary_sample_test.py
new file mode 100644
index 0000000..3c5b2be
--- /dev/null
+++ b/ariba/tests/summary_sample_test.py
@@ -0,0 +1,104 @@
+import unittest
+import os
+from ariba import summary_cluster, summary_sample
+
+modules_dir = os.path.dirname(os.path.abspath(summary_sample.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestSummarySample(unittest.TestCase):
+    def test_load_file(self):
+        '''Test _load_file'''
+        infile = os.path.join(data_dir, 'summary_sample_test_load_file.in.tsv')
+        with open(infile) as f:
+            lines = [x.rstrip() for x in f]
+
+        dicts = [summary_cluster.SummaryCluster.line2dict(x) for x in lines[1:]]
+        cluster1 = summary_cluster.SummaryCluster()
+        cluster1.add_data_dict(dicts[0])
+        cluster1.add_data_dict(dicts[1])
+        cluster1.add_data_dict(dicts[2])
+        cluster2 = summary_cluster.SummaryCluster()
+        cluster2.add_data_dict(dicts[3])
+        cluster2.add_data_dict(dicts[4])
+        cluster3 = summary_cluster.SummaryCluster()
+        cluster3.add_data_dict(dicts[5])
+
+        expected = {
+            'cluster.n': cluster1,
+            'cluster.p': cluster2,
+            'cluster.v': cluster3
+        }
+
+        got = summary_sample.SummarySample._load_file(infile, 90)
+        self.assertEqual(expected, got)
+
+
+    def test_column_summary_data(self):
+        '''Test _column_summary_data'''
+        infile = os.path.join(data_dir, 'summary_sample_test_column_summary_data.tsv')
+        sample_summary = summary_sample.SummarySample(infile)
+        sample_summary.clusters = sample_summary._load_file(infile, 90)
+        expected = {
+            'cluster.n': {
+                'assembled': 'yes',
+                'has_res': 'yes',
+                'ref_seq': 'noncoding1',
+                'known_var': 'yes',
+                'novel_var': 'yes',
+                'pct_id': '98.33'
+            },
+            'cluster.p': {
+                'assembled': 'yes',
+                'has_res': 'yes',
+                'ref_seq': 'presence_absence1',
+                'known_var': 'yes',
+                'novel_var': 'no',
+                'pct_id': '98.96'
+            },
+            'cluster.v': {
+                'assembled': 'yes',
+                'has_res': 'yes',
+                'ref_seq': 'variants_only1',
+                'known_var': 'yes',
+                'novel_var': 'no',
+                'pct_id': '100.0'
+            }
+        }
+        self.maxDiff = None
+        got = sample_summary._column_summary_data()
+        self.assertEqual(expected, got)
+
+
+    def test_var_groups(self):
+        '''test _var_groups'''
+        infile = os.path.join(data_dir, 'summary_sample_test_var_groups.tsv')
+        sample_summary = summary_sample.SummarySample(infile)
+        sample_summary.clusters = sample_summary._load_file(infile, 90)
+        got = sample_summary._var_groups()
+        expected = {
+            'cluster.n': {'id1', 'id2'},
+            'cluster.p': {'id3'},
+            'cluster.v': {'id4'}
+        }
+        self.assertEqual(expected, got)
+
+
+    def test_variant_column_names_tuples(self):
+        '''Test _variant_column_names_tuples'''
+        infile = os.path.join(data_dir, 'summary_sample_test_column_names_tuples.tsv')
+        sample_summary = summary_sample.SummarySample(infile)
+        sample_summary.clusters = sample_summary._load_file(infile, 90)
+        sample_summary.column_summary_data = sample_summary._column_summary_data()
+        expected = {
+            'cluster.v': {('variants_only1', 'S5T', 'ungrouped', None)},
+            'cluster.n': {
+                ('noncoding1', 'A6G', 'grouped', 'id2'),
+                ('noncoding1', 'A14T', 'ungrouped', None),
+                ('noncoding1', 'G15T', 'novel', None)
+             },
+            'cluster.p': {('presence_absence1', 'A10V', 'grouped', 'id3')}
+        }
+        got = sample_summary._variant_column_names_tuples()
+        self.assertEqual(expected, got)
+
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py
index 5e27416..b748862 100644
--- a/ariba/tests/summary_test.py
+++ b/ariba/tests/summary_test.py
@@ -2,12 +2,12 @@ import unittest
 import copy
 import filecmp
 import os
-from ariba import summary, flag
+from ariba import flag, summary, summary_cluster, summary_sample
 
 modules_dir = os.path.dirname(os.path.abspath(summary.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
 
-class TestSummry(unittest.TestCase):
+class TestSummary(unittest.TestCase):
     def test_init(self):
         '''Test init'''
         fofn = os.path.join(data_dir, 'summary_test_init.fofn')
@@ -19,73 +19,110 @@ class TestSummry(unittest.TestCase):
         self.assertEqual(s.filenames, ['file42', 'file1', 'file2'])
 
 
+    def test_determine_cluster_cols(self):
+        col_strings = [
+            'assembled,has_res,ref_seq,pct_id,known_var,novel_var',
+            'ref_seq,pct_id,known_var,novel_var',
+            'assembled,pct_id,known_var,novel_var',
+            'assembled',
+            '',
+            None,
+        ]
 
-    def test_line2dict(self):
-        '''Test _line2dict'''
-        line = '\t'.join(['gene1', '187', '42', '3', '750', '750', '98.93', 'SNP', 'SYN', '.', '66', '66', 'A', 'gene1.scaffold.1', '1047', '67', '67', 'C', '42', 'A', '22,20'])
-        s = summary.Summary('out', filenames=['spam', 'eggs'])
-        expected = {
-            'gene': 'gene1',
-            'flag':  flag.Flag(187),
-            'reads': 42,
-            'cluster': '3',
-            'gene_len': 750,
-            'assembled': 750,
-            'pc_ident': 98.93,
-            'var_type': 'SNP',
-            'var_effect': 'SYN',
-            'new_aa': '.',
-            'gene_start': 66,
-            'gene_end': 66,
-            'gene_nt': 'A',
-            'scaffold': 'gene1.scaffold.1',
-            'scaff_len': 1047,
-            'scaff_start': 67,
-            'scaff_end': 67,
-            'scaff_nt': 'C',
-            'read_depth': 42,
-            'alt_bases': 'A',
-            'ref_alt_depth': '22,20'
-        }
-        self.assertEqual(s._line2dict(line), expected)
-
-
-    def test_load_file(self):
-        '''Test _load_file'''
-        s = summary.Summary('out', filenames=['spam', 'eggs'])
-        infile = os.path.join(data_dir, 'summary_test_load_file.in.tsv')
-
-        lines = [
-            ['gene1', '27', '42', '1', '822', '822', '100.0', '.', '.', '.', '.', '.', '.', 'gene1.scaffold.1', '1490', '.', '.', '.', '.', '.', '.'],
-            ['gene2', '15', '44', '2', '780', '780', '100.0', '.', '.', '.', '.', '.', '.', 'gene2.scaffold.2', '1124', '.', '.', '.', '.', '.', '.'],
-            ['gene2', '15', '46', '2', '780', '770', '99.0', '.', '.', '.', '.', '.', '.', 'gene2.scaffold.3', '1097', '.', '.', '.', '.', '.', '.'],
-            ['gene3', '187', '48', '3', '750', '750', '98.93', 'SNP', 'SYN', '.', '318', '318', 'C', 'gene3.scaffold.1', '1047', '319', '319', 'G', '.', '.', '.']
-]
-        dicts = [s._line2dict('\t'.join(x)) for x in lines]
-        expected = {'gene1': [dicts[0]], 'gene2': dicts[1:3], 'gene3': [dicts[3]]}
-        got = s._load_file(infile)
-        self.assertEqual(expected, got)
+        expected = [
+            {'assembled': True, 'has_res': True, 'ref_seq': True, 'pct_id': True, 'known_var': True, 'novel_var': True},
+            {'assembled': False, 'has_res': False, 'ref_seq': True, 'pct_id': True, 'known_var': True, 'novel_var': True},
+            {'assembled': True, 'has_res': False, 'ref_seq': False, 'pct_id': True, 'known_var': True, 'novel_var': True},
+            {'assembled': True, 'has_res': False, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False},
+            {'assembled': False, 'has_res': False, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False},
+            {'assembled': False, 'has_res': False, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False},
+        ]
 
+        assert len(col_strings) == len(expected)
 
-    def test_to_summary_number(self):
-        '''Test _to_summary_number'''
-        s = summary.Summary('out', filenames=['spam', 'eggs'])
-        tests = [
-            (0, 0),
-            (64, 0),
-            (7, 1),
-            (259, 1),
-            (15, 2),
-            (539, 3),
-            (27, 4),
+        for i in range(len(col_strings)):
+            self.assertEqual(expected[i], summary.Summary._determine_cluster_cols(col_strings[i]))
+
+
+    def test_determine_var_cols(self):
+        col_strings = [
+            'groups,grouped,ungrouped,novel',
+            'groups,grouped,ungrouped',
+            'grouped,novel',
+            'ungrouped,novel',
+            'grouped',
+            'ungrouped',
+            'novel',
+            ''
         ]
 
-        for t in tests:
-            l = [{'flag': flag.Flag(t[0]), 'assembled': 42, 'pc_ident': 99}]
-            self.assertEqual(s._to_summary_number(l), t[1])
+        expected = [
+            {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': True},
+            {'groups': True, 'grouped': True, 'ungrouped': True, 'novel': False},
+            {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': True},
+            {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': True},
+            {'groups': False, 'grouped': True, 'ungrouped': False, 'novel': False},
+            {'groups': False, 'grouped': False, 'ungrouped': True, 'novel': False},
+            {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': True},
+            {'groups': False, 'grouped': False, 'ungrouped': False, 'novel': False},
+        ]
+
+        assert len(col_strings) == len(expected)
+
+        for i in range(len(col_strings)):
+            self.assertEqual(expected[i], summary.Summary._determine_var_cols(col_strings[i]))
+
+
+    def test_load_input_files(self):
+        '''Test _load_input_files'''
+        file1 = os.path.join(data_dir, 'summary_test_load_input_files.1.tsv')
+        file2 = os.path.join(data_dir, 'summary_test_load_input_files.2.tsv')
+        sample1 = summary_sample.SummarySample(file1)
+        sample2 = summary_sample.SummarySample(file2)
+        sample1.run()
+        sample2.run()
+        got = summary.Summary._load_input_files([file1, file2], 90)
+        expected = {file1: sample1, file2: sample2}
+        self.assertEqual(expected, got)
+
+
+    def test_get_all_cluster_names(self):
+        '''Test _get_all_cluster_names'''
+        file1 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.1.tsv')
+        file2 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.2.tsv')
+        samples = summary.Summary._load_input_files([file1, file2], 90)
+        got = summary.Summary._get_all_cluster_names(samples)
+        expected = {'cluster.n.1', 'cluster.v.1', 'cluster.p.1', 'cluster.p.2'}
+        self.assertEqual(expected, got)
 
-        l = [{'flag': flag.Flag(27), 'assembled': 42, 'pc_ident': 89}]
-        self.assertEqual(s._to_summary_number(l), 0)
+
+    def test_get_all_variant_columns(self):
+        '''Test _get_all_variant_columns'''
+        file1 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.1.tsv')
+        file2 = os.path.join(data_dir, 'summary_test_get_all_cluster_names.2.tsv')
+        samples = summary.Summary._load_input_files([file1, file2], 90)
+        got = summary.Summary._get_all_variant_columns(samples)
+        expected = {
+            'cluster.p.2': {('presence_absence1', 'A10V', 'grouped', 'id3')},
+            'cluster.n.1': {('noncoding1', 'A6G', 'grouped', 'id2'), ('noncoding1', 'A14T', 'grouped', 'id1')},
+            'cluster.p.1': {('presence_absence1', 'A10V', 'grouped', 'id2')},
+        }
+        self.assertEqual(expected, got)
+
+
+    def test_get_all_var_groups(self):
+        '''test _get_all_var_groups'''
+        file1 = os.path.join(data_dir, 'summary_test_get_all_var_groups.1.tsv')
+        file2 = os.path.join(data_dir, 'summary_test_get_all_var_groups.2.tsv')
+        samples = summary.Summary._load_input_files([file1, file2], 90)
+        got = summary.Summary._get_all_var_groups(samples)
+        expected = {
+            'cluster.p.1': {'id4'},
+            'cluster.p.2': {'id3'},
+            'cluster.v.1': set(),
+            'cluster.n.1': {'id1', 'id2'}
+        }
+        self.assertEqual(expected, got)
 
 
     def test_gather_output_rows(self):
@@ -94,62 +131,316 @@ class TestSummry(unittest.TestCase):
             os.path.join(data_dir, 'summary_test_gather_output_rows.in.1.tsv'),
             os.path.join(data_dir, 'summary_test_gather_output_rows.in.2.tsv')
         ]
-        s = summary.Summary('out', filenames=infiles)
-        s._gather_output_rows()
-        expected = [
-            ['filename', 'gene1', 'gene2', 'gene3'],
-            [infiles[0], 4, 2, 0],
-            [infiles[1], 4, 0, 4],
+        s = summary.Summary('out', filenames=infiles, variant_cols=None)
+        s.samples = summary.Summary._load_input_files(infiles, 90)
+        expected = {
+            infiles[0]: {
+                'noncoding1': {
+                    'assembled': 'yes',
+                    'has_res': 'yes',
+                    'ref_seq': 'noncoding1',
+                    'known_var': 'yes',
+                    'novel_var': 'no',
+                    'pct_id': '98.33',
+                },
+                'presence_absence1': {
+                    'assembled': 'yes',
+                    'has_res': 'yes',
+                    'ref_seq': 'presence_absence1',
+                    'known_var': 'no',
+                    'novel_var': 'yes',
+                    'pct_id': '98.96',
+                },
+                'variants_only1': {
+                    'assembled': 'no',
+                    'has_res': 'no',
+                    'ref_seq': 'NA',
+                    'known_var': 'NA',
+                    'novel_var': 'NA',
+                    'pct_id': 'NA',
+                }
+            },
+            infiles[1]: {
+                'noncoding1': {
+                    'assembled': 'yes',
+                    'has_res': 'yes',
+                    'ref_seq': 'noncoding1',
+                    'known_var': 'yes',
+                    'novel_var': 'no',
+                    'pct_id': '98.33',
+                },
+                'presence_absence1': {
+                    'assembled': 'yes',
+                    'has_res': 'yes',
+                    'ref_seq': 'presence_absence1',
+                    'pct_id': '98.96',
+                    'known_var': 'no',
+                    'novel_var': 'yes',
+                },
+                'variants_only1': {
+                    'assembled': 'no',
+                    'has_res': 'no',
+                    'ref_seq': 'NA',
+                    'known_var': 'NA',
+                    'novel_var': 'NA',
+                    'pct_id': 'NA',
+                }
+            },
+        }
+        got = s._gather_output_rows()
+        self.assertEqual(expected, got)
+
+        s.var_columns['groups'] = True
+        expected[infiles[0]]['noncoding1']['vgroup.id1'] = 'yes'
+        expected[infiles[0]]['noncoding1']['vgroup.id3'] = 'no'
+        expected[infiles[1]]['noncoding1']['vgroup.id1'] = 'yes'
+        expected[infiles[1]]['noncoding1']['vgroup.id3'] = 'yes'
+        got = s._gather_output_rows()
+        self.assertEqual(expected, got)
+
+
+        s.var_columns['grouped'] = True
+        s.var_columns['ungrouped'] = True
+        expected[infiles[0]]['noncoding1']['noncoding1.A14T'] = 'yes'
+        expected[infiles[0]]['noncoding1']['noncoding1.A6G'] = 'no'
+        expected[infiles[1]]['noncoding1']['noncoding1.A14T'] = 'yes'
+        expected[infiles[1]]['noncoding1']['noncoding1.A6G'] = 'yes'
+        got = s._gather_output_rows()
+        self.assertEqual(expected, got)
+
+        s.var_columns['novel'] = True
+        expected[infiles[0]]['presence_absence1']['presence_absence1.A10V'] = 'yes'
+        expected[infiles[1]]['presence_absence1']['presence_absence1.A10V'] = 'yes'
+        got = s._gather_output_rows()
+        self.assertEqual(expected, got)
+
+        for filename in expected:
+            del expected[filename]['noncoding1']['vgroup.id1']
+            del expected[filename]['noncoding1']['vgroup.id3']
+            for gene_type in expected[filename]:
+                del expected[filename][gene_type]['ref_seq']
+
+        s = summary.Summary('out', filenames=infiles, cluster_cols='assembled,has_res,pct_id,known_var,novel_var', variant_cols='ungrouped,grouped,novel')
+        s.samples = summary.Summary._load_input_files(infiles, 90)
+        s.include_all_variant_columns = True
+        got = s._gather_output_rows()
+        self.assertEqual(expected, got)
+
+
+    def test_to_matrix(self):
+        '''Test _to_matrix'''
+        rows = {
+            'file1': {
+                'cluster.n.1': {
+                    'assembled': 'yes',
+                    'has_res': 'yes',
+                    'ref_seq': 'noncoding1',
+                    'known_var': 'yes',
+                    'novel_var': 'no',
+                    'pct_id': '98.33',
+                    'noncoding1.A14T': 'yes'
+                },
+                'cluster.p.1': {
+                    'assembled': 'yes',
+                    'has_res': 'yes',
+                    'ref_seq': 'presence_absence1',
+                    'known_var': 'yes',
+                    'novel_var': 'no',
+                    'pct_id': '98.96',
+                    'presence_absence1.I42L': 'yes'
+                },
+                'cluster.v.1': {
+                    'assembled': 'yes',
+                    'has_res': 'yes',
+                    'ref_seq': 'varonly1',
+                    'known_var': 'no',
+                    'novel_var': 'no',
+                    'pct_id': '99.42',
+                }
+            },
+            'file2': {
+                'cluster.n.1': {
+                    'assembled': 'yes',
+                    'has_res': 'yes',
+                    'ref_seq': 'noncoding1',
+                    'known_var': 'no',
+                    'novel_var': 'no',
+                    'pct_id': '98.33',
+                    'noncoding1.A14T': 'no'
+                },
+                'cluster.p.1': {
+                    'assembled': 'yes',
+                    'has_res': 'yes',
+                    'ref_seq': 'presence_absence1',
+                    'pct_id': '98.96',
+                    'known_var': 'no',
+                    'novel_var': 'no',
+                    'presence_absence1.I42L': 'no'
+                },
+                'cluster.v.1': {
+                    'assembled': 'no',
+                    'has_res': 'NA',
+                    'ref_seq': 'NA',
+                    'known_var': 'NA',
+                    'novel_var': 'NA',
+                    'pct_id': 'NA',
+                }
+            },
+        }
+        filenames = ['file1', 'file2']
+        cluster_cols = {'assembled': True, 'has_res': True, 'ref_seq': False, 'pct_id': False, 'known_var': False, 'novel_var': False}
+        got_phandango_header, got_csv_header, got_lines = summary.Summary._to_matrix(filenames, rows, cluster_cols)
+        expected_phandango_header = ['name', 'cluster.n.1.assembled.:o1', 'cluster.n.1.has_res.:o1', 'cluster.n.1.noncoding1.A14T:o1', 'cluster.p.1.assembled.:o1', 'cluster.p.1.has_res.:o1', 'cluster.p.1.presence_absence1.I42L:o1', 'cluster.v.1.assembled.:o1', 'cluster.v.1.has_res.:o1']
+        expected_csv_header = ['name', 'cluster.n.1.assembled', 'cluster.n.1.has_res', 'cluster.n.1.noncoding1.A14T', 'cluster.p.1.assembled', 'cluster.p.1.has_res', 'cluster.p.1.presence_absence1.I42L', 'cluster.v.1.assembled', 'cluster.v.1.has_res']
+        expected_lines = [
+            ['file1', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes'],
+            ['file2', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'no', 'NA']
         ]
-        self.assertEqual(expected, s.rows_out)
+        self.assertEqual(expected_phandango_header, got_phandango_header)
+        self.assertEqual(expected_csv_header, got_csv_header)
+        self.assertEqual(expected_lines, got_lines)
 
 
-    def test_filter_output_rows_filter_true(self):
-        '''Test _filter_output_rows'''
-        s = summary.Summary('out', filenames=['spam', 'eggs'])
-        s.rows_out = [
-            ['filename', 'gene1', 'gene2', 'gene3'],
-            ['file1', 0, 0, 0],
-            ['file2', 1, 0, 3],
-            ['file3', 2, 0, 4],
+    def test_filter_matrix_rows(self):
+        '''Test _filter_matrix_rows'''
+        matrix = [
+            ['yes', 'yes'],
+            ['yes', 'no'],
+            ['no', 'no'],
+            ['yes_nonunique', 'no'],
+            ['NA', 'no'],
+            ['no', 'NA'],
+            ['NA', 'NA']
         ]
 
         expected = [
-            ['filename', 'gene1', 'gene3'],
-            ['file2', 1, 3],
-            ['file3', 2, 4],
+            ['yes', 'yes'],
+            ['yes', 'no'],
+            ['yes_nonunique', 'no'],
         ]
+        got = summary.Summary._filter_matrix_rows(matrix)
+        self.assertEqual(expected, got)
 
-        s._filter_output_rows()
-        self.assertEqual(s.rows_out, expected)
 
+    def test_filter_matrix_columns(self):
+        '''Test _filter_matrix_columns'''
+        matrix = [
+            ['yes', 'yes', 'no', 'yes_nonunique', 'NA', 'no', 'NA'],
+            ['yes', 'no', 'no', 'no', 'no', 'NA', 'NA']
+        ]
+        phandango_header = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7']
+        csv_header = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']
 
-    def test_filter_output_rows_filter_false(self):
-        '''Test _filter_output_rows'''
-        s = summary.Summary('out', filenames=['spam', 'eggs'], filter_output=False)
-        rows_out = [
-            ['filename', 'gene1', 'gene2', 'gene3'],
-            ['file1', 0, 0, 0],
-            ['file2', 1, 0, 3],
-            ['file3', 2, 0, 4],
+        got_phandango_header, got_csv_header, got_matrix = summary.Summary._filter_matrix_columns(matrix, phandango_header, csv_header)
+        expected_phandango_header = ['p1', 'p2', 'p4']
+        expected_csv_header = ['h1', 'h2', 'h4']
+        expected_matrix = [
+            ['yes', 'yes', 'yes_nonunique'],
+            ['yes', 'no', 'no'],
         ]
+        self.assertEqual(expected_phandango_header, got_phandago_header)
+        self.assertEqual(expected_csv_header, got_csv_header)
+        self.assertEqual(expected_matrix, got_matrix)
 
-        s.rows_out = copy.copy(rows_out)
 
-        s._filter_output_rows()
-        self.assertEqual(s.rows_out, rows_out)
+    def test_add_phandango_colour_columns(self):
+        '''Test _add_phandango_colour_columns'''
+        header = ['head1', 'head2:o1', 'head3:o1', 'head4', 'head5:o1']
+        matrix = [
+            ['yes', 'yes', 'yes_nonunique', 'yes', 'no'],
+            ['yes', 'yes_nonunique', 'no', 'yes', 'NA'],
+            ['yes', 'no', 'NA', 'yes', 'yes'],
+            ['yes', 'NA', 'yes', 'yes', 'yes_nonunique'],
+        ]
+
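+        # Each ':o1' column gains a companion ':colour' column; the hex codes
+        # in the expected matrix encode yes, yes_nonunique, no and NA.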
+        expected_header = ['head1', 'head2', 'head2:colour', 'head3', 'head3:colour', 'head4', 'head5', 'head5:colour']
+        expected_matrix = [
+            ['yes', 'yes', '#1f78b4', 'yes_nonunique', '#a6cee3', 'yes', 'no', '#33a02c'],
+            ['yes', 'yes_nonunique', '#a6cee3', 'no', '#33a02c', 'yes', 'NA', '#b2df8a'],
+            ['yes', 'no', '#33a02c', 'NA', '#b2df8a', 'yes', 'yes', '#1f78b4'],
+            ['yes', 'NA', '#b2df8a', 'yes', '#1f78b4', 'yes', 'yes_nonunique', '#a6cee3'],
+        ]
+        got_header, got_matrix = summary.Summary._add_phandango_colour_columns(header, matrix)
+        self.assertEqual(expected_header, got_header)
+        self.assertEqual(expected_matrix, got_matrix)
 
 
-    def test_write_tsv(self):
-        '''Test _write_tsv'''
-        tmp_out = 'tmp.out.tsv'
-        s = summary.Summary(tmp_out, filenames=['spam', 'eggs'])
-        s.rows_out = [
-            ['filename', 'gene1', 'gene3'],
-            ['file2', 1, 3],
-            ['file3', 2, 4],
+    def test_matrix_to_csv(self):
+        '''Test _matrix_to_csv'''
+        matrix = [
+            ['line1_1', 'line1_2'],
+            ['line2_1', 'line2_2'],
         ]
-        s._write_tsv()
-        expected = os.path.join(data_dir, 'summary_test_write_tsv.out.tsv')
-        self.assertTrue(filecmp.cmp(tmp_out, expected, shallow=False))
-        os.unlink(tmp_out)
+        header = ['head1', 'head2']
+        tmpfile = 'tmp.test.matrix_to_csv.csv'
+        summary.Summary._matrix_to_csv(matrix, header, tmpfile)
+        with open(tmpfile) as f:
+            got = f.read()
+
+        expected = 'head1,head2\nline1_1,line1_2\nline2_1,line2_2\n'
+        self.assertEqual(expected, got)
+        os.unlink(tmpfile)
+
+
+    def test_distance_score_between_values(self):
+        '''Test _distance_score_between_values'''
+        tests = [
+            (('no', 'no'), 0),
+            (('no', 'yes'), 1),
+            (('no', 'yes_nonunique'), 1),
+            (('no', 'fragmented'), 1),
+            (('yes', 'no'), 1),
+            (('yes', 'yes'), 0),
+            (('yes', 'yes_nonunique'), 1),
+            (('yes', 'fragmented'), 1),
+            (('yes_nonunique', 'no'), 1),
+            (('yes_nonunique', 'yes'), 1),
+            (('yes_nonunique', 'yes_nonunique'), 0),
+            (('yes_nonunique', 'fragmented'), 1),
+            (('fragmented', 'no'), 1),
+            (('fragmented', 'yes'), 1),
+            (('fragmented', 'yes_nonunique'), 1),
+            (('fragmented', 'fragmented'), 0),
+            (('NA', 'no'), 0),
+            (('NA', 'yes'), 1),
+            (('NA', 'yes_nonunique'), 1),
+            (('NA', 'fragmented'), 1),
+        ]
+
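+        # The score should be symmetric, and 'NA' should behave like 'no'.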
+        for (val1, val2), expected in tests:
+            self.assertEqual(expected, summary.Summary._distance_score_between_values(val1, val2))
+            self.assertEqual(expected, summary.Summary._distance_score_between_values(val2, val1))
+
+
+    def test_distance_score_between_lists(self):
+        '''Test _distance_score_between_lists'''
+        list1 = ['NA', 'no', 'yes']
+        list2 = ['NA', 'no', 'no']
+        self.assertEqual(1, summary.Summary._distance_score_between_lists(list1, list2))
+
+
+    def test_write_distance_matrix(self):
+        '''Test _write_distance_matrix'''
+        rows = [
+            ['file1', 'no', 'yes', 'no'],
+            ['file2', 'yes', 'no', 'yes'],
+            ['file3', 'no', 'no', 'yes'],
+        ]
+
+        tmp_distances = 'tmp.test.write_distance_matrix.distances'
+        summary.Summary._write_distance_matrix(rows, tmp_distances)
+        expected = os.path.join(data_dir, 'summary_test_write_distance_matrix.distances')
+        self.assertTrue(filecmp.cmp(expected, tmp_distances, shallow=False))
+        os.unlink(tmp_distances)
+
+
+    def test_newick_from_dist_matrix(self):
+        '''Test _newick_from_dist_matrix'''
+        tmp_tree = 'tmp.test.newick_from_dist_matrix.tre'
+        dist_file = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.distances')
+        summary.Summary._newick_from_dist_matrix(dist_file, tmp_tree)
+        expected = os.path.join(data_dir, 'summary_test_newick_from_dist_matrix.tre')
+        self.assertTrue(filecmp.cmp(expected, tmp_tree, shallow=False))
+        os.unlink(tmp_tree)
+
diff --git a/ariba/tests/versions_test.py b/ariba/tests/versions_test.py
new file mode 100644
index 0000000..3609e37
--- /dev/null
+++ b/ariba/tests/versions_test.py
@@ -0,0 +1,8 @@
+import unittest
+from ariba import versions
+
+class TestVersions(unittest.TestCase):
+    def test_get_all_versions(self):
+        '''Test get_all_versions'''
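+        # Smoke test only: passing None (falsy) for raise_error stops
+        # get_all_versions from calling sys.exit on unsatisfied dependencies.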
+        versions.get_all_versions(None)
+
diff --git a/ariba/tests/vfdb_parser_test.py b/ariba/tests/vfdb_parser_test.py
new file mode 100644
index 0000000..934e729
--- /dev/null
+++ b/ariba/tests/vfdb_parser_test.py
@@ -0,0 +1,60 @@
+import unittest
+import filecmp
+import os
+from ariba import vfdb_parser
+
+modules_dir = os.path.dirname(os.path.abspath(vfdb_parser.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestVfdbParser(unittest.TestCase):
+    def test_fa_header_to_name_pieces(self):
+        '''test _fa_header_to_name_pieces'''
+        tests = [
+            ('VF123(gi:1234) (abcD) foobar description [abc] [genus species]', ('VF123(gi:1234)', 'abcD', 'foobar description [abc]', 'genus species')),
+            ('F123(gi:1234) (abcD) foobar description [abc] [genus species]', None), # no V at start
+            ('VF123(gi:1234) (abcD) foobar description [abc]', None),  # missing genus species
+            ('VF123(gi:1234) abcD foobar description [abc] [genus species]', None), # brackets missing around abcD
+        ]
+
+        for header, expected in tests:
+            got = vfdb_parser.VfdbParser._fa_header_to_name_pieces(header)
+            self.assertEqual(expected, got)
+
+
+    def test_fa_header_to_name_and_metadata(self):
+        '''test _fa_header_to_name_and_metadata'''
+        headers = [
+            'VF123(gi:1234) (abcD) foobar description [abc] [genus species]',
+            'F123(gi:1234) (abcD) foobar description [abc] [genus species]', # no V at start
+            'VF123(gi:1234) (abcD) foobar description [abc]', # missing genus species
+            'VF123(gi:1234) abcD foobar description [abc] [genus species]', # brackets missing around abcD
+        ]
+
+        expected = [
+            ('abcD.VF123(gi:1234).genus_species', 'foobar description [abc]'),
+            (headers[1], None),
+            (headers[2], None),
+            (headers[3], None),
+        ]
+
+        assert len(headers) == len(expected)
+        for i in range(len(headers)):
+            got = vfdb_parser.VfdbParser._fa_header_to_name_and_metadata(headers[i])
+            self.assertEqual(expected[i], got)
+
+
+    def test_run(self):
+        '''test run'''
+        infile = os.path.join(data_dir, 'vfdb_parser_test_run.in.fa')
+        expected_tsv = os.path.join(data_dir, 'vfdb_parser_test_run.out.tsv')
+        expected_fa = os.path.join(data_dir, 'vfdb_parser_test_run.out.fa')
+        outprefix = 'tmp.vfdb_parser_test_run'
+        got_tsv = outprefix + '.metadata.tsv'
+        got_fa = outprefix + '.presence_absence.fa'
+        vp = vfdb_parser.VfdbParser(infile, outprefix)
+        vp.run()
+        self.assertTrue(filecmp.cmp(expected_tsv, got_tsv, shallow=False))
+        self.assertTrue(filecmp.cmp(expected_fa, got_fa, shallow=False))
+        os.unlink(got_tsv)
+        os.unlink(got_fa)
diff --git a/ariba/versions.py b/ariba/versions.py
new file mode 100644
index 0000000..a49b3e7
--- /dev/null
+++ b/ariba/versions.py
@@ -0,0 +1,66 @@
+import sys
+from distutils.version import LooseVersion
+from ariba import external_progs
+from ariba import __version__ as ariba_version
+
+
+package_min_versions = {
+    'openpyxl': '1.6.2',
+    'pyfastaq': '3.12.0',
+    'pysam': '0.8.1',
+    'pymummer' : '0.7.1',
+}
+
+package_max_versions = {
+    'pysam': '0.8.3',
+}
+
+
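+# Build a version report covering ARIBA itself, its external programs and its
+# Python dependencies, checking each package against the bounds declared above.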
+def get_all_versions(raise_error=True):
+    extern_progs = external_progs.ExternalProgs(fail_on_error=False)
+
+    report_lines = [
+        'ARIBA version: ' + ariba_version,
+        '\nExternal dependencies:',
+        '\n'.join(extern_progs.version_report),
+        '\nExternal dependencies OK: ' + str(extern_progs.all_deps_ok),
+        '\nPython version:',
+        str(sys.version),
+        '\nPython packages:',
+    ]
+
+    python_packages_ok = True
+
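+    # Import each package by name; exec/eval are used so the package name can
+    # be taken from the list. Anything that fails to import (or lacks
+    # __version__/__file__) is reported as NOT_FOUND instead of raising.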
+    for package in ['ariba', 'openpyxl', 'pyfastaq', 'pymummer', 'pysam']:
+        try:
+            exec('import ' + package)
+            version = eval(package + '.__version__')
+            path = eval(package + '.__file__')
+        except Exception:
+            version = 'NOT_FOUND'
+            path = 'NOT_FOUND'
+            python_packages_ok = False
+
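+        # Compare against the declared bounds; LooseVersion copes with
+        # non-numeric components in version strings.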
+        if version != 'NOT_FOUND':
+            if package in package_min_versions and LooseVersion(version) < package_min_versions[package]:
+                version += '... THIS IS TOO LOW. Needs >= ' + package_min_versions[package]
+                python_packages_ok = False
+            elif package in package_max_versions and LooseVersion(version) > package_max_versions[package]:
+                version += '... THIS IS TOO HIGH. Needs <= ' + package_max_versions[package]
+                python_packages_ok = False
+
+        report_lines.append(package + '\t' + version + '\t' + path)
+
+    all_ok = extern_progs.all_deps_ok and python_packages_ok
+
+    report_lines.extend([
+        '\nPython packages OK: ' + str(python_packages_ok),
+        '\nEverything looks OK: ' + str(all_ok),
+    ])
+
+    if raise_error and not all_ok:
+        print(*report_lines, sep='\n', file=sys.stderr)
+        print('Some dependencies not satisfied. Cannot continue.', file=sys.stderr)
+        sys.exit(1)
+
+    return extern_progs, report_lines
diff --git a/ariba/vfdb_parser.py b/ariba/vfdb_parser.py
new file mode 100644
index 0000000..9e0dab8
--- /dev/null
+++ b/ariba/vfdb_parser.py
@@ -0,0 +1,46 @@
+import re
+import pyfastaq
+
+class Error (Exception): pass
+
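+# Matches VFDB FASTA headers of the form (as exercised by the tests above):
+#   VF123(gi:1234) (geneName) free text description [foo] [genus species]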
+name_regex = re.compile(r'^(?P<vfdb_id>V\S+\)) \((?P<name>\S+)\) (?P<description>.*\]) \[(?P<genus_etc>.*)\]$')
+
+class VfdbParser:
+    def __init__(self, infile, outprefix):
+        self.infile = infile
+        self.outprefix = outprefix
+
+
+    @classmethod
+    def _fa_header_to_name_pieces(cls, fa_header):
+        m = name_regex.search(fa_header)
+        if m is None:
+            return None
+        else:
+            return tuple([m.group(x) for x in ['vfdb_id', 'name', 'description', 'genus_etc']])
+
+
+    @staticmethod
+    def _fa_header_to_name_and_metadata(fa_header):
+        name_data = VfdbParser._fa_header_to_name_pieces(fa_header)
+        if name_data is None:
+            return fa_header, None
+        else:
+            vfdb_id, name, description, genus_etc = name_data
+            return name + '.' + vfdb_id + '.' + genus_etc.replace(' ', '_'), description
+
+
+    def run(self):
+        file_reader = pyfastaq.sequences.file_reader(self.infile)
+        fa_out = pyfastaq.utils.open_file_write(self.outprefix + '.presence_absence.fa')
+        tsv_out = pyfastaq.utils.open_file_write(self.outprefix + '.metadata.tsv')
+
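+        # Sequences with a parseable header are renamed to
+        # name.vfdb_id.genus_species and get a metadata line; anything else
+        # passes through with its original name and no metadata.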
+        for seq in file_reader:
+            seq.id, description = self._fa_header_to_name_and_metadata(seq.id)
+            if description is not None:
+                print(seq.id, '.', '.', '.', description, sep='\t', file=tsv_out)
+            print(seq, file=fa_out)
+
+        pyfastaq.utils.close(fa_out)
+        pyfastaq.utils.close(tsv_out)
+
diff --git a/install_dependencies.sh b/install_dependencies.sh
new file mode 100755
index 0000000..5267d31
--- /dev/null
+++ b/install_dependencies.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+set -e
+set -x
+
+start_dir=$(pwd)
+
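+# Pinned versions of the external tools that ARIBA runs.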
+BCFTOOLS_VERSION=1.3
+BOWTIE2_VERSION=2.2.8
+CDHIT_VERSION=4.6.5
+SAMTOOLS_VERSION=1.3
+MUMMER_VERSION=3.23
+SPADES_VERSION=3.6.0
+
+BCFTOOLS_DOWNLOAD_URL="https://github.com/samtools/bcftools/releases/download/1.3/bcftools-${BCFTOOLS_VERSION}.tar.bz2"
+BOWTIE2_DOWNLOAD_URL="http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/${BOWTIE2_VERSION}/bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip"
+CDHIT_DOWNLOAD_URL="https://github.com/weizhongli/cdhit/archive/V${CDHIT_VERSION}.tar.gz"
+SAMTOOLS_DOWNLOAD_URL="https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2"
+MUMMER_DOWNLOAD_URL="http://downloads.sourceforge.net/project/mummer/mummer/${MUMMER_VERSION}/MUMmer${MUMMER_VERSION}.tar.gz"
+SPADES_DOWNLOAD_URL="http://spades.bioinf.spbau.ru/release${SPADES_VERSION}/SPAdes-${SPADES_VERSION}-Linux.tar.gz"
+
+
+# Make an install location
+if [ ! -d 'build' ]; then
+  mkdir build
+fi
+cd build
+build_dir=$(pwd)
+
+# DOWNLOAD ALL THE THINGS
+download () {
+  url=$1
+  download_location=$2
+
+  if [ -e "$download_location" ]; then
+    echo "Skipping download of $url, $download_location already exists"
+  else
+    echo "Downloading $url to $download_location"
+    wget "$url" -O "$download_location"
+  fi
+}
+
+
+# --------------- bcftools -----------------
+cd $build_dir
+download $BCFTOOLS_DOWNLOAD_URL "bcftools-${BCFTOOLS_VERSION}.tar.bz2"
+bcftools_dir="$build_dir/bcftools-${BCFTOOLS_VERSION}"
+tar -xjf bcftools-${BCFTOOLS_VERSION}.tar.bz2
+cd $bcftools_dir
+make
+
+
+# --------------- bowtie2 ------------------
+cd $build_dir
+download $BOWTIE2_DOWNLOAD_URL "bowtie2-${BOWTIE2_VERSION}.zip"
+bowtie2_dir="$build_dir/bowtie2-${BOWTIE2_VERSION}"
+unzip -n bowtie2-${BOWTIE2_VERSION}.zip
+
+
+# --------------- cdhit --------------------
+cd $build_dir
+download $CDHIT_DOWNLOAD_URL "cdhit-${CDHIT_VERSION}.tar.gz"
+tar -zxf cdhit-${CDHIT_VERSION}.tar.gz
+cdhit_dir="$build_dir/cdhit-${CDHIT_VERSION}"
+cd $cdhit_dir
+make
+
+
+# --------------- samtools -----------------
+cd $build_dir
+download $SAMTOOLS_DOWNLOAD_URL "samtools-${SAMTOOLS_VERSION}.tar.bz2"
+samtools_dir="$build_dir/samtools-${SAMTOOLS_VERSION}"
+tar -xjf samtools-${SAMTOOLS_VERSION}.tar.bz2
+cd $samtools_dir
+make
+
+
+# --------------- mummer ------------------
+cd $build_dir
+download $MUMMER_DOWNLOAD_URL "MUMmer${MUMMER_VERSION}.tar.gz"
+mummer_dir="$build_dir/MUMmer${MUMMER_VERSION}"
+tar -zxf MUMmer${MUMMER_VERSION}.tar.gz
+cd $mummer_dir
+make
+
+
+# --------------- spades -----------------
+cd $build_dir
+download $SPADES_DOWNLOAD_URL "SPAdes-${SPADES_VERSION}-Linux.tar.gz"
+spades_dir="$build_dir/SPAdes-${SPADES_VERSION}-Linux/bin"
+tar -zxf SPAdes-${SPADES_VERSION}-Linux.tar.gz
+
+
+cd $start_dir
+
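+# Prepend a directory to PATH, unless it is already there.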
+update_path () {
+  new_dir=$1
+  if [[ ! "$PATH" =~ (^|:)"${new_dir}"(:|$) ]]; then
+    export PATH=${new_dir}:${PATH}
+  fi
+}
+
+update_path ${bcftools_dir}
+update_path ${bowtie2_dir}
+update_path ${cdhit_dir}
+update_path ${mummer_dir}
+update_path ${samtools_dir}
+update_path ${spades_dir}
+
+
+# -------------- R packages ---------------
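+# ape supplies the tree-building step used by 'ariba summary' when turning
+# its distance matrix into a Newick tree (see _newick_from_dist_matrix above).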
+mkdir -p ~/R/libs
+echo "R_LIBS=~/R/libs" > ~/.Renviron
+wget https://cran.r-project.org/src/contrib/Archive/ape/ape_3.1.tar.gz
+R CMD INSTALL ape_3.1.tar.gz
+
diff --git a/scripts/ariba b/scripts/ariba
index 7a4debe..28d1678 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -1,23 +1,29 @@
 #!/usr/bin/env python3
 
-import argparse
 import sys
 
-
 tasks = {
-    'refcheck': 'Check or fix input genes FASTA',
+    'getref': 'Download reference data',
+    'prepareref': 'Prepare reference data for running the pipeline',
+    'reportfilter': 'Filter report.tsv file',
     'run': 'Run the ARIBA local assembly pipeline',
     'summary': 'Summarise multiple reports made by "run"',
     'flag': 'Translate the meaning of a flag output by the pipeline',
+    'aln2meta': 'Make metadata input to prepareref, using multi-alignment and SNPs',
+    'test': 'Run on small test dataset',
     'version': 'Print version and exit',
 }
 
 
 ordered_tasks = [
-    'refcheck',
+    'getref',
+    'prepareref',
     'run',
+    'reportfilter',
     'summary',
     'flag',
+    'aln2meta',
+    'test',
     'version',
 ]
 
diff --git a/setup.py b/setup.py
index 39ab459..ccc1607 100644
--- a/setup.py
+++ b/setup.py
@@ -7,9 +7,10 @@ from setuptools import setup, find_packages
 
 setup(
     name='ariba',
-    version='0.6.0',
+    version='1.0.0',
     description='ARIBA: Antibiotic Resistance Identification By Assembly',
     packages = find_packages(),
+    package_data={'ariba': ['test_run_data/*']},
     author='Martin Hunt',
     author_email='path-help at sanger.ac.uk',
     url='https://github.com/sanger-pathogens/ariba',
@@ -17,10 +18,10 @@ setup(
     test_suite='nose.collector',
     tests_require=['nose >= 1.3'],
     install_requires=[
-        'openpyxl',
-        'pyfastaq >= 3.10.0',
-        'pysam >= 0.8.1',
-        'pymummer>=0.6.1'
+        'openpyxl >= 1.6.2',
+        'pyfastaq >= 3.12.0',
+        'pysam >= 0.8.1, <= 0.8.3',
+        'pymummer >= 0.7.1',
     ],
     license='GPLv3',
     classifiers=[

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ariba.git
