[med-svn] [gubbins] 01/05: Imported Upstream version 1.4.9

Andreas Tille tille at debian.org
Thu Apr 21 09:39:24 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository gubbins.

commit 747b52f8e6f6f23e0cae69504c325120c4e61669
Author: Andreas Tille <tille at debian.org>
Date:   Wed Apr 20 19:53:47 2016 +0200

    Imported Upstream version 1.4.9
---
 CHANGELOG                                          |  50 ++++
 README.md                                          |   5 +
 VERSION                                            |   2 +-
 homebrew/gubbins.rb                                |  35 ---
 install_dependencies.sh                            |  16 +-
 python/gubbins/PreProcessFasta.py                  |  96 +++++++
 python/gubbins/ValidateFastaAlignment.py           |  82 ++++++
 python/gubbins/common.py                           | 149 +----------
 .../tests/data/alignment_with_no_variation.aln     |  10 -
 .../tests/data/all_unique_sequence_names.fa        |   2 +
 .../tests/data/pairwise.aln.snp_sites.aln_expected |   2 +-
 .../data/preprocessfasta/all_same_sequence.aln     |   8 +
 .../data/preprocessfasta/expected_missing_data.aln |   4 +
 .../expected_multiple_duplicates.aln               |   4 +
 .../preprocessfasta/expected_one_duplicate.aln     |   6 +
 .../tests/data/preprocessfasta/missing_data.aln    |   8 +
 .../data/preprocessfasta/multiple_duplicates.aln   |   8 +
 .../tests/data/preprocessfasta/no_duplicates.aln   |   8 +
 .../tests/data/preprocessfasta/one_duplicate.aln   |   8 +
 python/gubbins/tests/data/valid_alignment.aln      |   2 +
 .../gubbins/tests/test_alignment_python_methods.py |   5 -
 python/gubbins/tests/test_external_dependancies.py | 286 ++++-----------------
 python/gubbins/tests/test_fastml.py                |   0
 python/gubbins/tests/test_pre_process_fasta.py     |  85 ++++++
 python/gubbins/tests/test_validate_fasta_input.py  |  46 ++--
 python/scripts/run_gubbins.py                      |   2 +-
 26 files changed, 485 insertions(+), 444 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index f2902ce..cfeba0b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,53 @@
+v1.4.9 - 15 Apr 2016
+------
+If sequences are 100% identical, filter out duplicates.
+Refactor Python FASTA validation to separate file.
+
+v1.4.8 - 12 Apr 2016
+------
+Fix to lookup of CPU instruction sets to decide which versions of RAxML are supported. Provided by aaronk.
+
+v1.4.7 - 29 Feb 2016
+------
+Fix allocation from stack bug (from jeromekelleher). For large numbers of SNPs allocating from the stack caused a segfault. 
+This fixes the issue by allocating from the heap with malloc.
+
+v1.4.6 - 29 Feb 2016
+------
+Added back midpoint rerooting as the default. Was the default up to v1.3.5 (25/6/15) and was removed by mistake. 
+It didnt effect the recombination detection, just the tree didnt look as pretty at the end.
+
+v1.4.5 - 13 Jan 2016
+------
+Removed the install.sh file which gets regenerated automatically when the code is built, 
+but if its there already it can cause problems on some platforms.
+
+v1.4.4 - 9 Dec 2015
+------
+Debian reported that Gubbins failed to build on one platform. These changes fix this bug and 
+cleaned up some other build issues to make it easier to compile on OSX from source.
+
+v1.4.3 - 19 Nov 2015
+------
+The Debian directory was removed because it now lives in its own repository in the Debian world.
+
+v1.4.2 - 2 Sept 2015
+------
+If filenames are too long, RAxML crashes. To avoid this a hard limit was introduced, and Gubbins exits with an error message.
+
+v1.4.1 - 7 Jul 2015
+------
+Update Brew installation instructions and python dependancy versions.
+
+v1.4.0 - 1 Jul 2015
+------
+Update from python 2 to python 3. This is to allow for inclusion in Debian (because they dont accept python 2 anymore). 
+Also detect fastml version and modify parameters as required.
+
+
+
+Changes prior to May 2015
+=========
 gubbins (1.3.4~trusty1) trusty; urgency=low
 
   * Identify blocks at end of genome
diff --git a/README.md b/README.md
index ddcceb1..8bdc5af 100644
--- a/README.md
+++ b/README.md
@@ -125,3 +125,8 @@ Data from the paper
 ===================
 * ftp://ftp.sanger.ac.uk/pub/project/pathogens/gubbins/PMEN1.aln.gz
 * ftp://ftp.sanger.ac.uk/pub/project/pathogens/gubbins/ST239.aln.gz
+* 
+
+Midpoint rerooting
+==================
+From version 1.3.5 (25/6/15) to version 1.4.6 (29/2/16) trees were not midpoint rerooted by default. This doesnt have any effect on the recombination detection, but the output trees may not look as expected. Users are advised to upgrade to the latest version.
diff --git a/VERSION b/VERSION
index be05bba..4ea2b1f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.4.7
+1.4.9
diff --git a/homebrew/gubbins.rb b/homebrew/gubbins.rb
deleted file mode 100644
index 0267066..0000000
--- a/homebrew/gubbins.rb
+++ /dev/null
@@ -1,35 +0,0 @@
-require 'formula'
-
-class Gubbins < Formula
-  homepage 'https://github.com/sanger-pathogens/gubbins'
-  url 'https://github.com/sanger-pathogens/gubbins/archive/v0.6.tar.gz'
-  sha1 '0a4e75fb511de2270f4ad16390fb090d6ddb6ad7'
-  head 'https://github.com/sanger-pathogens/gubbins.git'
-
-  depends_on :autoconf
-  depends_on :automake
-  depends_on :libtool
-  depends_on 'raxml'
-  depends_on 'fasttree'
-  depends_on 'fastml'
-  depends_on :python
-  
-  depends_on LanguageModuleDependency.new :python, "biopython", "Bio"
-  
-  def install
-    inreplace "src/Makefile.am", "-lrt", "" if OS.mac? # no librt for osx
-
-    system "autoreconf -i"
-    system "./configure",
-           "--disable-debug",
-           "--disable-dependency-tracking",
-           "--prefix=#{prefix}"
-
-    system "make","DESTDIR=#{prefix}", "install"    
-  end
-
-  test do
-    system "gubbins"
-    system "run_gubbins.py"
-  end
-end
diff --git a/install_dependencies.sh b/install_dependencies.sh
old mode 100644
new mode 100755
index e8b834c..5ba857a
--- a/install_dependencies.sh
+++ b/install_dependencies.sh
@@ -6,11 +6,11 @@ set -e
 start_dir=$(pwd)
 
 RAXML_VERSION="8.1.21"
-FASTML_VERSION="2.3"
-FASTTREE_VERSION="2.1.8"
+FASTML_VERSION="3.1"
+FASTTREE_VERSION="2.1.9"
 
 RAXML_DOWNLOAD_URL="https://github.com/stamatak/standard-RAxML/archive/v${RAXML_VERSION}.tar.gz"
-FASTML_DOWNLOAD_URL="https://github.com/sanger-pathogens/fastml/archive/v${FASTML_VERSION}.tar.gz"
+FASTML_DOWNLOAD_URL="http://fastml.tau.ac.il/source/FastML.v${FASTML_VERSION}.tgz"
 FASTTREE_DOWNLOAD_URL="http://www.microbesonline.org/fasttree/FastTree-${FASTTREE_VERSION}.c"
 
 # Make an install location
@@ -68,14 +68,22 @@ fi
 cd $build_dir
 
 ## FASTML
-fastml_dir=$(pwd)/"fastml-${FASTML_VERSION}"
+fastml_dir=$(pwd)/"FastML.v${FASTML_VERSION}"
+
 if [ ! -d $fastml_dir ]; then
   tar xzf fastml-${FASTML_VERSION}.tgz
+  ls -al
 fi
 cd $fastml_dir
 if [ -e "${fastml_dir}/programs/fastml/fastml" ]; then
   echo "Already build FASTML; skipping build"
 else
+  sed -i 's/getopt/fastml_getopt/g' libs/phylogeny/phylogeny.vcxproj
+  sed -i 's/getopt/fastml_getopt/g' libs/phylogeny/phylogeny.vcproj
+  mv libs/phylogeny/getopt.h libs/phylogeny/fastml_getopt.h
+  mv libs/phylogeny/getopt.c libs/phylogeny/fastml_getopt.c
+  mv libs/phylogeny/getopt1.c libs/phylogeny/fastml_getopt1.c
+
   make
 fi
 
diff --git a/python/gubbins/PreProcessFasta.py b/python/gubbins/PreProcessFasta.py
new file mode 100644
index 0000000..1dbbc85
--- /dev/null
+++ b/python/gubbins/PreProcessFasta.py
@@ -0,0 +1,96 @@
+import os
+import hashlib
+from Bio import AlignIO
+from Bio.Align import MultipleSeqAlignment
+from collections import defaultdict
+
+class PreProcessFasta(object):
+  def __init__(self, input_filename, verbose = False, filter_percentage = 25):
+      self.input_filename = input_filename
+      self.verbose = verbose
+      self.filter_percentage = filter_percentage
+  
+  def hash_sequences(self):
+      sequence_hash_to_taxa = defaultdict(list)
+      with open(self.input_filename) as input_handle:
+          alignments = AlignIO.parse(input_handle, "fasta")
+          for alignment in alignments:
+              for record in alignment:
+                  sequence_hash = hashlib.md5()
+                  sequence_hash.update(str(record.seq).encode('utf-8') )
+                  hash_of_sequence  = sequence_hash.digest()
+                  sequence_hash_to_taxa[hash_of_sequence].append(record.id)
+
+                  if self.verbose:
+                      print("Sample "+ str(record.id) + " has a hash of " +str(hash_of_sequence) )
+      input_handle.close() 
+      return sequence_hash_to_taxa
+      
+  def calculate_sequences_missing_data_percentage(self):
+      sequences_to_missing_data = defaultdict(list)
+      with open(self.input_filename) as input_handle:
+          alignments = AlignIO.parse(input_handle, "fasta")
+          for alignment in alignments:
+              for record in alignment:
+                  number_of_gaps = 0
+                  number_of_gaps += record.seq.count('n')
+                  number_of_gaps += record.seq.count('N')
+                  number_of_gaps += record.seq.count('-')
+                  sequence_length = len(record.seq)
+                  
+                  if sequence_length == 0:
+                    sequences_to_missing_data[record.id] = 100
+                    if self.verbose:
+                        print("Sample "+ str(record.id) + " has no sequence " )
+                  else:
+                    per_missing_data = (number_of_gaps*100/sequence_length)
+                    sequences_to_missing_data[record.id] = per_missing_data
+                    if self.verbose:
+                        print("Sample "+ str(record.id) + " has missing data percentage of " + str(per_missing_data))  
+
+      input_handle.close()
+      return sequences_to_missing_data
+      
+  def taxa_missing_too_much_data(self):
+      taxa_to_remove = []
+      for taxa, percentage_missing in self.calculate_sequences_missing_data_percentage().items():
+          if(percentage_missing > self.filter_percentage):
+            taxa_to_remove.append(taxa)
+            print("Excluded sequence " +taxa + " because it had " + str(percentage_missing) +" percentage missing data while a maximum of "+ str(self.filter_percentage) +" is allowed")
+
+      return taxa_to_remove
+
+  def taxa_of_duplicate_sequences(self):
+     taxa_to_remove = []
+     for sequence_hash, taxa in sorted(self.hash_sequences().items()):
+         if len(taxa) > 1:
+             taxon_to_keep = taxa.pop()
+             for taxon in taxa:
+                 print("Sequences in "+ taxon + " and " + taxon_to_keep + " are identical, removing " + taxon  + " from analysis")
+                 taxa_to_remove.append(taxon)
+     
+     return taxa_to_remove
+
+  def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename):
+      taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
+      
+      with open(self.input_filename) as input_handle:
+          with open(output_filename, "w+") as output_handle:
+              alignments = AlignIO.parse(input_handle, "fasta")
+              output_alignments = []
+              
+              number_of_included_alignments = 0
+              for alignment in alignments:
+                  for record in alignment:
+                      
+                      if record.id not in taxa_to_remove:
+                          output_alignments.append(record)
+                          number_of_included_alignments += 1
+              
+              if number_of_included_alignments <= 1:
+                  sys.exit("Not enough sequences are left after removing duplicates.Please check you input data.")
+              
+              AlignIO.write(MultipleSeqAlignment(output_alignments), output_handle, "fasta")
+              output_handle.close()
+          input_handle.close()
+      return taxa_to_remove
diff --git a/python/gubbins/ValidateFastaAlignment.py b/python/gubbins/ValidateFastaAlignment.py
new file mode 100644
index 0000000..7761e9a
--- /dev/null
+++ b/python/gubbins/ValidateFastaAlignment.py
@@ -0,0 +1,82 @@
+import os
+import re
+import sys
+from Bio import AlignIO
+from collections import Counter
+from Bio.Align import MultipleSeqAlignment
+
+class ValidateFastaAlignment(object):
+    def __init__(self, input_filename):
+      self.input_filename = input_filename
+
+    def is_input_fasta_file_valid(self):
+      try:
+          if not self.does_each_sequence_have_the_same_length():
+              print("Each sequence must be the same length")
+              return False
+          if not self.are_sequence_names_unique():
+              print("All sequence names in the fasta file must be unique")
+              return False
+          if not self.does_each_sequence_have_a_name_and_genomic_data():
+              print("Each sequence must have a name and some genomic data")
+              return False
+      except:
+          return False
+    
+      return True
+    
+    def does_each_sequence_have_a_name_and_genomic_data(self):
+      with  open(self.input_filename, "r") as input_handle:
+        alignments = AlignIO.parse(input_handle, "fasta")
+        number_of_sequences = 0
+        for alignment in alignments:
+            for record in alignment:
+                number_of_sequences +=1
+                if record.name is None or record.name == "":
+                  print("Error with the input FASTA file: One of the sequence names is blank")
+                  return False
+                if record.seq is None or record.seq == "":
+                  print("Error with the input FASTA file: One of the sequences is empty")
+                  return False
+                if re.search('[^ACGTNacgtn-]', str(record.seq))  != None:
+                  print("Error with the input FASTA file: One of the sequences contains odd characters, only ACGTNacgtn- are permitted")
+                  return False
+        if number_of_sequences <= 3:
+          print("Error with input FASTA file: you need more than 3 sequences to build a meaningful tree")
+          return False
+        input_handle.close()
+      return True
+    
+    def does_each_sequence_have_the_same_length(self):
+      try:
+        with open(self.input_filename) as input_handle:
+    
+          alignments = AlignIO.parse(input_handle, "fasta")
+          sequence_length = -1
+          for alignment in alignments:
+              for record in alignment:
+                 if sequence_length == -1:
+                   sequence_length = len(record.seq)
+                 elif sequence_length != len(record.seq):
+                   print("Error with the input FASTA file: The sequences dont have the same lengths this isnt an alignment: "+record.name)
+                   return False
+          input_handle.close()
+      except:
+        print("Unexpected error:", sys.exc_info()[0])
+        print("Error with the input FASTA file: It is in the wrong format so check its an alignment")
+        return False
+      return True
+    
+    def are_sequence_names_unique(self):
+      with open(self.input_filename) as input_handle:
+        alignments = AlignIO.parse(input_handle, "fasta")
+        sequence_names = []
+        for alignment in alignments:
+            for record in alignment:
+                sequence_names.append(record.name)
+          
+        if [k for k,v in list(Counter(sequence_names).items()) if v>1] != []:
+          return False
+        input_handle.close()
+      return True
+      
\ No newline at end of file
diff --git a/python/gubbins/common.py b/python/gubbins/common.py
index 0cc8c2c..7b45b9b 100644
--- a/python/gubbins/common.py
+++ b/python/gubbins/common.py
@@ -23,7 +23,6 @@ from Bio import SeqIO
 from Bio.Align import MultipleSeqAlignment
 from Bio.Seq import Seq
 from io import StringIO
-from collections import Counter
 import argparse
 import dendropy
 from dendropy.calculate import treecompare
@@ -36,6 +35,8 @@ import sys
 import tempfile
 import time
 from gubbins.Fastml import Fastml
+from gubbins.PreProcessFasta import PreProcessFasta
+from gubbins.ValidateFastaAlignment import ValidateFastaAlignment
 
 class GubbinsError(Exception):
   def __init__(self, value,message):
@@ -81,26 +82,6 @@ class GubbinsCommon():
 
 
   @staticmethod
-  def does_fasta_contain_variation(alignment_filename):
-    with open(alignment_filename, "r") as input_handle:
-      alignments = AlignIO.parse(input_handle, "fasta")
-      first_sequence = ""
-      
-      for index, alignment in enumerate(alignments):
-        for record_index, record in enumerate(alignment):
-      
-          if record_index == 0:
-            first_sequence = record.seq
-      
-          if str(record.seq) != str(first_sequence):
-            input_handle.close()
-            return 1
-            
-      input_handle.close()
-    return 0
-
-
-  @staticmethod
   def does_file_exist(alignment_filename, file_type_msg):
     if(not os.path.exists(alignment_filename)):
       print(GubbinsError('','Cannot access the input '+file_type_msg+'. Check its been entered correctly'))
@@ -111,7 +92,7 @@ class GubbinsCommon():
   def choose_executable(list_of_executables):
     flags = []
     if os.path.exists('/proc/cpuinfo'):
-      output = subprocess.Popen('grep flags /proc/cpuinfo', stdout = subprocess.PIPE, shell=True).communicate()[0]
+      output = subprocess.Popen('grep flags /proc/cpuinfo', stdout = subprocess.PIPE, shell=True).communicate()[0].decode("utf-8")
       flags =  output.split()
     
     for executable in list_of_executables:
@@ -171,7 +152,7 @@ class GubbinsCommon():
     if self.args.converge_method not in ['weighted_robinson_foulds','robinson_foulds','recombination']:
       sys.exit("Please choose weighted_robinson_foulds, robinson_foulds or recombination for the --converge_method option")
 
-    if(GubbinsCommon.does_file_exist(self.args.alignment_filename, 'Alignment File') == 0 or GubbinsCommon.is_input_fasta_file_valid(self.args.alignment_filename) == 0 ):
+    if(GubbinsCommon.does_file_exist(self.args.alignment_filename, 'Alignment File') == 0 or not ValidateFastaAlignment(self.args.alignment_filename).is_input_fasta_file_valid() ):
        sys.exit("There is a problem with your input fasta file so nothing can be done until you fix it")
        
     if(self.args.starting_tree is not None and self.args.starting_tree != "" and (GubbinsCommon.does_file_exist(self.args.starting_tree, 'Starting Tree') == 0 or GubbinsCommon.is_input_starting_tree_valid(self.args.starting_tree) )):
@@ -195,7 +176,10 @@ class GubbinsCommon():
 
     # put a filtered copy into a temp directory and work from that
     temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
-    taxa_removed = GubbinsCommon.filter_out_alignments_with_too_much_missing_data(self.args.alignment_filename, temp_working_dir+"/"+starting_base_filename, self.args.filter_percentage, self.args.verbose)
+    
+    pre_process_fasta = PreProcessFasta(self.args.alignment_filename,self.args.verbose,self.args.filter_percentage)
+    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(temp_working_dir+"/"+starting_base_filename)
+    
     self.args.alignment_filename = temp_working_dir+"/"+starting_base_filename
 
     # If a starting tree has been provided make sure that taxa filtered out in the previous step are removed from the tree
@@ -308,7 +292,6 @@ class GubbinsCommon():
         print(fastml_command)
         fastml_command_suffix = ''
 
-
       try:
         subprocess.check_call(fastml_command+fastml_command_suffix, shell=True)
       except:
@@ -318,10 +301,9 @@ class GubbinsCommon():
       shutil.copyfile(starting_base_filename+".start", starting_base_filename+".gaps.snp_sites.aln")
       GubbinsCommon.reinsert_gaps_into_fasta_file(current_tree_name+'.seq.joint.txt', starting_base_filename +".gaps.vcf", starting_base_filename+".gaps.snp_sites.aln")
 
-      if(GubbinsCommon.does_file_exist(starting_base_filename+".gaps.snp_sites.aln", 'Alignment File') == 0 or GubbinsCommon.is_input_fasta_file_valid(starting_base_filename+".gaps.snp_sites.aln") == 0 ):
+      if(GubbinsCommon.does_file_exist(starting_base_filename+".gaps.snp_sites.aln", 'Alignment File') == 0 or not ValidateFastaAlignment(starting_base_filename+".gaps.snp_sites.aln").is_input_fasta_file_valid() ):
          sys.exit("There is a problem with your FASTA file after running FASTML. Please check this intermediate file is valid: "+ str(starting_base_filename)+".gaps.snp_sites.aln")
 
-
       if self.args.verbose > 0:
         print(int(time.time()))
 
@@ -736,26 +718,6 @@ class GubbinsCommon():
         sequence_names.append(record.id)
       handle.close()
     return sequence_names
-
-  @staticmethod
-  def is_input_fasta_file_valid(input_filename):
-    try:
-      if GubbinsCommon.does_each_sequence_have_the_same_length(input_filename) == 0:
-        print("Each sequence must be the same length")
-        return 0
-      if GubbinsCommon.are_sequence_names_unique(input_filename) == 0:
-        print("All sequence names in the fasta file must be unique")
-        return 0
-      if GubbinsCommon.does_each_sequence_have_a_name_and_genomic_data(input_filename) == 0:
-        print("Each sequence must have a name and some genomic data")
-        return 0
-      if GubbinsCommon.does_fasta_contain_variation(input_filename) == 0:
-        print("All of the input sequences contain the same data")
-        return 0
-    except:
-      return 0
-
-    return 1
     
   @staticmethod
   def is_input_starting_tree_valid(starting_tree):
@@ -768,99 +730,6 @@ class GubbinsCommon():
       return 0
 
     return 1
-  
-
-  @staticmethod
-  def does_each_sequence_have_a_name_and_genomic_data(input_filename):
-    with  open(input_filename, "r") as input_handle:
-      alignments = AlignIO.parse(input_handle, "fasta")
-      number_of_sequences = 0
-      for alignment in alignments:
-          for record in alignment:
-              number_of_sequences +=1
-              if record.name is None or record.name == "":
-                print("Error with the input FASTA file: One of the sequence names is blank")
-                return 0
-              if record.seq is None or record.seq == "":
-                print("Error with the input FASTA file: One of the sequences is empty")
-                return 0
-              if re.search('[^ACGTNacgtn-]', str(record.seq))  != None:
-                print("Error with the input FASTA file: One of the sequences contains odd characters, only ACGTNacgtn- are permitted")
-                return 0
-      if number_of_sequences <= 3:
-        print("Error with input FASTA file: you need more than 3 sequences to build a meaningful tree")
-        return 0
-      input_handle.close()
-    return 1
-    
-    
-  @staticmethod
-  def does_each_sequence_have_the_same_length(input_filename):
-    try:
-      with open(input_filename) as input_handle:
-      
-        alignments = AlignIO.parse(input_handle, "fasta")
-        sequence_length = -1
-        for alignment in alignments:
-            for record in alignment:
-               if sequence_length == -1:
-                 sequence_length = len(record.seq)
-               elif sequence_length != len(record.seq):
-                 print("Error with the input FASTA file: The sequences dont have the same lengths this isnt an alignment: "+record.name)
-                 return 0
-        input_handle.close()
-    except:
-      print("Error with the input FASTA file: It is in the wrong format so check its an alignment")
-      return 0
-    return 1
-
-  @staticmethod
-  def are_sequence_names_unique(input_filename):
-    with open(input_filename) as input_handle:
-      alignments = AlignIO.parse(input_handle, "fasta")
-      sequence_names = []
-      for alignment in alignments:
-          for record in alignment:
-              sequence_names.append(record.name)
-              
-      if [k for k,v in list(Counter(sequence_names).items()) if v>1] != []:
-        return 0
-      input_handle.close()
-    return 1
-
-  @staticmethod
-  def filter_out_alignments_with_too_much_missing_data(input_filename, output_filename, filter_percentage,verbose):
-    with open(input_filename) as input_handle:
-      with open(output_filename, "w+") as output_handle:
-        alignments = AlignIO.parse(input_handle, "fasta")
-        output_alignments = []
-        taxa_removed = []
-        number_of_included_alignments = 0
-        for alignment in alignments:
-            for record in alignment:
-              number_of_gaps = 0
-              number_of_gaps += record.seq.count('n')
-              number_of_gaps += record.seq.count('N')
-              number_of_gaps += record.seq.count('-')
-              sequence_length = len(record.seq)
-        
-              if sequence_length == 0:
-                taxa_removed.append(record.id)
-                print("Excluded sequence " + record.id + " because there werent enough bases in it")
-              elif((number_of_gaps*100/sequence_length) <= filter_percentage):
-                output_alignments.append(record)
-                number_of_included_alignments += 1
-              else:
-                taxa_removed.append(record.id)
-                print("Excluded sequence " + record.id + " because it had " + str(number_of_gaps*100/sequence_length) +" percentage gaps while a maximum of "+ str(filter_percentage) +" is allowed")
-        
-        if number_of_included_alignments <= 1:
-          sys.exit("Too many sequences have been excluded so theres no data left to work with. Please increase the -f parameter")
-        
-        AlignIO.write(MultipleSeqAlignment(output_alignments), output_handle, "fasta")
-        output_handle.close()
-      input_handle.close()
-    return taxa_removed
 
   @staticmethod
   def filter_out_removed_taxa_from_tree_and_return_new_file(starting_tree, temp_working_dir, taxa_removed):
diff --git a/python/gubbins/tests/data/alignment_with_no_variation.aln b/python/gubbins/tests/data/alignment_with_no_variation.aln
deleted file mode 100644
index a79a17d..0000000
--- a/python/gubbins/tests/data/alignment_with_no_variation.aln
+++ /dev/null
@@ -1,10 +0,0 @@
->seq1
-aaaaaaaaaa
->seq2
-aaaaaaaaaa
->seq3
-aaaaaaaaaa
->seq4
-aaaaaaaaaa
->seq5
-aaaaaaaaaa
\ No newline at end of file
diff --git a/python/gubbins/tests/data/all_unique_sequence_names.fa b/python/gubbins/tests/data/all_unique_sequence_names.fa
index 0bc2b83..64bfe5e 100644
--- a/python/gubbins/tests/data/all_unique_sequence_names.fa
+++ b/python/gubbins/tests/data/all_unique_sequence_names.fa
@@ -4,3 +4,5 @@ AAAAA
 CCCCC
 >sequence3
 TTTTT
+>sequence4
+GGGGG
diff --git a/python/gubbins/tests/data/pairwise.aln.snp_sites.aln_expected b/python/gubbins/tests/data/pairwise.aln.snp_sites.aln_expected
index a36fd8c..f62b996 100644
--- a/python/gubbins/tests/data/pairwise.aln.snp_sites.aln_expected
+++ b/python/gubbins/tests/data/pairwise.aln.snp_sites.aln_expected
@@ -3,4 +3,4 @@ AAA
 >sequence2
 GGG
 >N1
-TTT
+AAA
diff --git a/python/gubbins/tests/data/preprocessfasta/all_same_sequence.aln b/python/gubbins/tests/data/preprocessfasta/all_same_sequence.aln
new file mode 100644
index 0000000..1425bb2
--- /dev/null
+++ b/python/gubbins/tests/data/preprocessfasta/all_same_sequence.aln
@@ -0,0 +1,8 @@
+>sample1
+AAAAAAA
+>sample2
+AAAAAAA
+>sample3
+AAAAAAA
+>sample4
+AAAAAAA
diff --git a/python/gubbins/tests/data/preprocessfasta/expected_missing_data.aln b/python/gubbins/tests/data/preprocessfasta/expected_missing_data.aln
new file mode 100644
index 0000000..45230fc
--- /dev/null
+++ b/python/gubbins/tests/data/preprocessfasta/expected_missing_data.aln
@@ -0,0 +1,4 @@
+>sample1
+AAAAAAA
+>sample3
+TTTTTTT
diff --git a/python/gubbins/tests/data/preprocessfasta/expected_multiple_duplicates.aln b/python/gubbins/tests/data/preprocessfasta/expected_multiple_duplicates.aln
new file mode 100644
index 0000000..d6f7d4b
--- /dev/null
+++ b/python/gubbins/tests/data/preprocessfasta/expected_multiple_duplicates.aln
@@ -0,0 +1,4 @@
+>sample3
+AAAAAAA
+>sample4
+GGGGGGG
diff --git a/python/gubbins/tests/data/preprocessfasta/expected_one_duplicate.aln b/python/gubbins/tests/data/preprocessfasta/expected_one_duplicate.aln
new file mode 100644
index 0000000..3e7f5a7
--- /dev/null
+++ b/python/gubbins/tests/data/preprocessfasta/expected_one_duplicate.aln
@@ -0,0 +1,6 @@
+>sample2
+CCCCCCC
+>sample3
+AAAAAAA
+>sample4
+GGGGGGG
diff --git a/python/gubbins/tests/data/preprocessfasta/missing_data.aln b/python/gubbins/tests/data/preprocessfasta/missing_data.aln
new file mode 100644
index 0000000..83c6613
--- /dev/null
+++ b/python/gubbins/tests/data/preprocessfasta/missing_data.aln
@@ -0,0 +1,8 @@
+>sample1
+AAAAAAA
+>sample2
+CCCNNCC
+>sample3
+TTTTTTT
+>sample4
+G--GGGG
diff --git a/python/gubbins/tests/data/preprocessfasta/multiple_duplicates.aln b/python/gubbins/tests/data/preprocessfasta/multiple_duplicates.aln
new file mode 100644
index 0000000..aef0b4e
--- /dev/null
+++ b/python/gubbins/tests/data/preprocessfasta/multiple_duplicates.aln
@@ -0,0 +1,8 @@
+>sample1
+AAAAAAA
+>sample2
+GGGGGGG
+>sample3
+AAAAAAA
+>sample4
+GGGGGGG
diff --git a/python/gubbins/tests/data/preprocessfasta/no_duplicates.aln b/python/gubbins/tests/data/preprocessfasta/no_duplicates.aln
new file mode 100644
index 0000000..5950e6b
--- /dev/null
+++ b/python/gubbins/tests/data/preprocessfasta/no_duplicates.aln
@@ -0,0 +1,8 @@
+>sample1
+AAAAAAA
+>sample2
+CCCCCCC
+>sample3
+TTTTTTT
+>sample4
+GGGGGGG
diff --git a/python/gubbins/tests/data/preprocessfasta/one_duplicate.aln b/python/gubbins/tests/data/preprocessfasta/one_duplicate.aln
new file mode 100644
index 0000000..a4f9875
--- /dev/null
+++ b/python/gubbins/tests/data/preprocessfasta/one_duplicate.aln
@@ -0,0 +1,8 @@
+>sample1
+AAAAAAA
+>sample2
+CCCCCCC
+>sample3
+AAAAAAA
+>sample4
+GGGGGGG
diff --git a/python/gubbins/tests/data/valid_alignment.aln b/python/gubbins/tests/data/valid_alignment.aln
index 0f0205b..649b03c 100644
--- a/python/gubbins/tests/data/valid_alignment.aln
+++ b/python/gubbins/tests/data/valid_alignment.aln
@@ -4,3 +4,5 @@ AAAAAAAAAA
 CCCCCCCCCC
 >sequence3
 GGGGGGGGGG
+>sequence4
+TTTTTTTTTT
diff --git a/python/gubbins/tests/test_alignment_python_methods.py b/python/gubbins/tests/test_alignment_python_methods.py
index 27b69f3..400512a 100644
--- a/python/gubbins/tests/test_alignment_python_methods.py
+++ b/python/gubbins/tests/test_alignment_python_methods.py
@@ -19,11 +19,6 @@ class TestAlignmentPythonMethods(unittest.TestCase):
   def test_get_sequence_names_from_alignment(self):
     assert  common.GubbinsCommon.get_sequence_names_from_alignment('gubbins/tests/data/small_alignment.aln') == ['sequence1','sequence2','sequence3','sequence4','sequence5']
 
-  def test_filter_out_alignments_with_too_much_missing_data(self):
-    common.GubbinsCommon.filter_out_alignments_with_too_much_missing_data('gubbins/tests/data/alignment_with_too_much_missing_data.aln', 'gubbins/tests/data/alignment_with_too_much_missing_data.aln.actual', 5,0)
-    assert filecmp.cmp('gubbins/tests/data/alignment_with_too_much_missing_data.aln.actual','gubbins/tests/data/alignment_with_too_much_missing_data.aln.expected')
-    os.remove('gubbins/tests/data/alignment_with_too_much_missing_data.aln.actual')
-
   def test_reinsert_gaps_into_fasta_file(self):
     common.GubbinsCommon.reinsert_gaps_into_fasta_file('gubbins/tests/data/gaps_to_be_reinserted.aln', 'gubbins/tests/data/gaps_to_be_reinserted.vcf', 'gubbins/tests/data/gaps_to_be_reinserted.aln.actual')
     assert filecmp.cmp('gubbins/tests/data/gaps_to_be_reinserted.aln.actual','gubbins/tests/data/gaps_to_be_reinserted.aln.expected')
diff --git a/python/gubbins/tests/test_external_dependancies.py b/python/gubbins/tests/test_external_dependancies.py
old mode 100755
new mode 100644
index a4382a8..3188aa5
--- a/python/gubbins/tests/test_external_dependancies.py
+++ b/python/gubbins/tests/test_external_dependancies.py
@@ -25,25 +25,11 @@ class TestExternalDependancies(unittest.TestCase):
     assert re.match('.*/ls$', common.GubbinsCommon.which('ls -alrt')) != None
     assert common.GubbinsCommon.which('non_existant_program') == None
     
-    
   def test_change_window_size(self):
-    parser = argparse.ArgumentParser(description='Iteratively detect recombinations')
-    parser.add_argument('alignment_filename',       help='Multifasta alignment file')
-    parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting')
-    parser.add_argument('--starting_tree',    '-s', help='Starting tree')
-    parser.add_argument('--use_time_stamp',   '-u', action='count', help='Use a time stamp in file names')
-    parser.add_argument('--verbose',          '-v', action='count', help='Turn on debugging',default = 0)
-    parser.add_argument('--no_cleanup',       '-n', action='count', help='Dont cleanup intermediate files')
-    parser.add_argument('--tree_builder',     '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
-    parser.add_argument('--iterations',       '-i', help='Maximum No. of iterations, default is 5', type=int,  default = 5)
-    parser.add_argument('--min_snps',         '-m', help='Min SNPs to identify a recombination block, default is 3', type=int,  default = 3)
-    parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int,  default = 15)
-    parser.add_argument('--prefix',           '-p', help='Add a prefix to the final output filenames')
-    parser.add_argument('--threads',          '-c', help='Number of threads to run with RAXML, but only if a PTHREADS version is available', type=int,  default = 1)
-    parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
-    parser.add_argument('--version',                action='version', version=str(pkg_resources.get_distribution("gubbins").version))
+    parser = self.base_arg_parse()
     parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 80)
     parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 1000)
+    
     gubbins_runner  = common.GubbinsCommon(parser.parse_args(["--prefix", "different_prefix",'gubbins/tests/data/multiple_recombinations.aln']))
     gubbins_runner.parse_and_run()
     
@@ -56,34 +42,10 @@ class TestExternalDependancies(unittest.TestCase):
     assert os.path.exists('different_prefix.branch_base_reconstruction.embl')
     assert os.path.exists('different_prefix.final_tree.tre')
 
-    os.remove('different_prefix.summary_of_snp_distribution.vcf')
-    os.remove('different_prefix.node_labelled.tre')
-    os.remove('different_prefix.recombination_predictions.embl')
-    os.remove('different_prefix.per_branch_statistics.csv')
-    os.remove('different_prefix.filtered_polymorphic_sites.fasta')
-    os.remove('different_prefix.filtered_polymorphic_sites.phylip')
-    os.remove('different_prefix.recombination_predictions.gff')
-    os.remove('different_prefix.branch_base_reconstruction.embl')
-    os.remove('different_prefix.final_tree.tre')
+    self.cleanup()
   
   def test_rename_final_output(self):
-    parser = argparse.ArgumentParser(description='Iteratively detect recombinations')
-    parser.add_argument('alignment_filename',       help='Multifasta alignment file')
-    parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting')
-    parser.add_argument('--starting_tree',    '-s', help='Starting tree')
-    parser.add_argument('--use_time_stamp',   '-u', action='count', help='Use a time stamp in file names')
-    parser.add_argument('--verbose',          '-v', action='count', help='Turn on debugging',default = 0)
-    parser.add_argument('--no_cleanup',       '-n', action='count', help='Dont cleanup intermediate files')
-    parser.add_argument('--tree_builder',     '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
-    parser.add_argument('--iterations',       '-i', help='Maximum No. of iterations, default is 5', type=int,  default = 5)
-    parser.add_argument('--min_snps',         '-m', help='Min SNPs to identify a recombination block, default is 3', type=int,  default = 3)
-    parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int,  default = 15)
-    parser.add_argument('--prefix',           '-p', help='Add a prefix to the final output filenames')
-    parser.add_argument('--threads',          '-c', help='Number of threads to run with RAXML, but only if a PTHREADS version is available', type=int,  default = 1)
-    parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
-    parser.add_argument('--version',                action='version', version=str(pkg_resources.get_distribution("gubbins").version))
-    parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
-    parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
+    parser = self.default_arg_parse()
     
     gubbins_runner  = common.GubbinsCommon(parser.parse_args(["--prefix", "different_prefix",'gubbins/tests/data/multiple_recombinations.aln']))
     gubbins_runner.parse_and_run()
@@ -97,96 +59,32 @@ class TestExternalDependancies(unittest.TestCase):
     assert os.path.exists('different_prefix.branch_base_reconstruction.embl')
     assert os.path.exists('different_prefix.final_tree.tre')
 
-    os.remove('different_prefix.summary_of_snp_distribution.vcf')
-    os.remove('different_prefix.node_labelled.tre')
-    os.remove('different_prefix.recombination_predictions.embl')
-    os.remove('different_prefix.per_branch_statistics.csv')
-    os.remove('different_prefix.filtered_polymorphic_sites.fasta')
-    os.remove('different_prefix.filtered_polymorphic_sites.phylip')
-    os.remove('different_prefix.recombination_predictions.gff')
-    os.remove('different_prefix.branch_base_reconstruction.embl')
-    os.remove('different_prefix.final_tree.tre')
+    self.cleanup()
     
   def test_recombination_convergence(self):
-    parser = argparse.ArgumentParser(description='Iteratively detect recombinations')
-    parser.add_argument('alignment_filename',       help='Multifasta alignment file')
-    parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting')
-    parser.add_argument('--starting_tree',    '-s', help='Starting tree')
-    parser.add_argument('--use_time_stamp',   '-u', action='count', help='Use a time stamp in file names')
-    parser.add_argument('--verbose',          '-v', action='count', help='Turn on debugging',default = 0)
-    parser.add_argument('--no_cleanup',       '-n', action='count', help='Dont cleanup intermediate files')
-    parser.add_argument('--tree_builder',     '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
-    parser.add_argument('--iterations',       '-i', help='Maximum No. of iterations, default is 5', type=int,  default = 5)
-    parser.add_argument('--min_snps',         '-m', help='Min SNPs to identify a recombination block, default is 3', type=int,  default = 3)
-    parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int,  default = 15)
-    parser.add_argument('--prefix',           '-p', help='Add a prefix to the final output filenames')
-    parser.add_argument('--threads',          '-c', help='Number of threads to run with RAXML, but only if a PTHREADS version is available', type=int,  default = 1)
-    parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
-    parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
-    parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
+    parser = self.default_arg_parse()
     
     gubbins_runner  = common.GubbinsCommon(parser.parse_args(["--converge_method", "recombination", "--iterations", '15','--no_cleanup', 'gubbins/tests/data/multiple_recombinations.aln']))
     gubbins_runner.parse_and_run()
     
     assert common.GubbinsCommon.have_recombinations_been_seen_before('multiple_recombinations.recombination_predictions.embl', ['RAxML_result.multiple_recombinations.iteration_1.tab','RAxML_result.multiple_recombinations.iteration_2.tab','RAxML_result.multiple_recombinations.iteration_3.tab','RAxML_result.multiple_recombinations.iteration_4.tab','RAxML_result.multiple_recombinations.iteration_5.tab','RAxML_result.multiple_recombinations.iteration_6.tab','RAxML_result.multiple_recombinati [...]
     
-    os.remove('log.txt')
-    r = glob.glob("RAxML_*")
-    for i in r:
-       os.remove(i)
-    r = glob.glob("multiple_recombinations.aln.*")
-    for i in r:
-       os.remove(i)
+    self.cleanup()
        
        
   def test_robinson_foulds_convergence(self):
-    parser = argparse.ArgumentParser(description='Iteratively detect recombinations')
-    parser.add_argument('alignment_filename',       help='Multifasta alignment file')
-    parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting')
-    parser.add_argument('--starting_tree',    '-s', help='Starting tree')
-    parser.add_argument('--use_time_stamp',   '-u', action='count', help='Use a time stamp in file names')
-    parser.add_argument('--verbose',          '-v', action='count', help='Turn on debugging',default = 0)
-    parser.add_argument('--no_cleanup',       '-n', action='count', help='Dont cleanup intermediate files')
-    parser.add_argument('--tree_builder',     '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
-    parser.add_argument('--iterations',       '-i', help='Maximum No. of iterations, default is 5', type=int,  default = 5)
-    parser.add_argument('--min_snps',         '-m', help='Min SNPs to identify a recombination block, default is 3', type=int,  default = 3)
-    parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int,  default = 15)
-    parser.add_argument('--prefix',           '-p', help='Add a prefix to the final output filenames')
-    parser.add_argument('--threads',          '-c', help='Number of threads to run with RAXML, but only if a PTHREADS version is available', type=int,  default = 1)
-    parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
-    parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
-    parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
+    self.cleanup()
+    parser = self.default_arg_parse()
     
     gubbins_runner  = common.GubbinsCommon(parser.parse_args(["--converge_method", "robinson_foulds", "--iterations", '15','--no_cleanup', 'gubbins/tests/data/multiple_recombinations.aln']))
     gubbins_runner.parse_and_run()
 
     assert common.GubbinsCommon.has_tree_been_seen_before(['RAxML_result.multiple_recombinations.iteration_1','RAxML_result.multiple_recombinations.iteration_2','RAxML_result.multiple_recombinations.iteration_3','RAxML_result.multiple_recombinations.iteration_4','RAxML_result.multiple_recombinations.iteration_5','RAxML_result.multiple_recombinations.iteration_6','RAxML_result.multiple_recombinations.iteration_7','RAxML_result.multiple_recombinations.iteration_8','RAxML_result.multiple_re [...]
 
-    os.remove('log.txt')
-    r = glob.glob("RAxML_*")
-    for i in r:
-       os.remove(i)
-    r = glob.glob("multiple_recombinations.aln.*")
-    for i in r:
-       os.remove(i)
+    self.cleanup()
 
   def test_rename_final_output_fasttree(self):
-    parser = argparse.ArgumentParser(description='Iteratively detect recombinations')
-    parser.add_argument('alignment_filename',       help='Multifasta alignment file')
-    parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting')
-    parser.add_argument('--starting_tree',    '-s', help='Starting tree')
-    parser.add_argument('--use_time_stamp',   '-u', action='count', help='Use a time stamp in file names')
-    parser.add_argument('--verbose',          '-v', action='count', help='Turn on debugging',default = 0)
-    parser.add_argument('--no_cleanup',       '-n', action='count', help='Dont cleanup intermediate files')
-    parser.add_argument('--tree_builder',     '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
-    parser.add_argument('--iterations',       '-i', help='Maximum No. of iterations, default is 5', type=int,  default = 5)
-    parser.add_argument('--min_snps',         '-m', help='Min SNPs to identify a recombination block, default is 3', type=int,  default = 3)
-    parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int,  default = 15)
-    parser.add_argument('--prefix',           '-p', help='Add a prefix to the final output filenames')
-    parser.add_argument('--threads',          '-c', help='Number of threads to run with RAXML, but only if a PTHREADS version is available', type=int,  default = 1)
-    parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
-    parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
-    parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
+    parser = self.default_arg_parse()
     
     gubbins_runner  = common.GubbinsCommon(parser.parse_args(["--prefix", "ft_prefix","--tree_builder", "fasttree",'gubbins/tests/data/multiple_recombinations.aln']))
     gubbins_runner.parse_and_run()
@@ -200,33 +98,10 @@ class TestExternalDependancies(unittest.TestCase):
     assert os.path.exists('ft_prefix.branch_base_reconstruction.embl')
     assert os.path.exists('ft_prefix.final_tree.tre')
 
-    os.remove('ft_prefix.summary_of_snp_distribution.vcf')
-    os.remove('ft_prefix.recombination_predictions.embl')
-    os.remove('ft_prefix.per_branch_statistics.csv')
-    os.remove('ft_prefix.filtered_polymorphic_sites.fasta')
-    os.remove('ft_prefix.filtered_polymorphic_sites.phylip')
-    os.remove('ft_prefix.recombination_predictions.gff')
-    os.remove('ft_prefix.branch_base_reconstruction.embl')
-    os.remove('ft_prefix.final_tree.tre')
-    os.remove('ft_prefix.node_labelled.tre')
+    self.cleanup()
     
   def test_rename_final_output_hybrid(self):
-    parser = argparse.ArgumentParser(description='Iteratively detect recombinations')
-    parser.add_argument('alignment_filename',       help='Multifasta alignment file')
-    parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting')
-    parser.add_argument('--starting_tree',    '-s', help='Starting tree')
-    parser.add_argument('--use_time_stamp',   '-u', action='count', help='Use a time stamp in file names')
-    parser.add_argument('--verbose',          '-v', action='count', help='Turn on debugging',default = 0)
-    parser.add_argument('--no_cleanup',       '-n', action='count', help='Dont cleanup intermediate files')
-    parser.add_argument('--tree_builder',     '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
-    parser.add_argument('--iterations',       '-i', help='Maximum No. of iterations, default is 5', type=int,  default = 5)
-    parser.add_argument('--min_snps',         '-m', help='Min SNPs to identify a recombination block, default is 3', type=int,  default = 3)
-    parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int,  default = 15)
-    parser.add_argument('--prefix',           '-p', help='Add a prefix to the final output filenames')
-    parser.add_argument('--threads',          '-c', help='Number of threads to run with RAXML, but only if a PTHREADS version is available', type=int,  default = 1)
-    parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
-    parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
-    parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
+    parser = self.default_arg_parse()
     
     gubbins_runner  = common.GubbinsCommon(parser.parse_args(["--prefix", "hybrid_prefix","--tree_builder", "hybrid",'gubbins/tests/data/multiple_recombinations.aln']))
     gubbins_runner.parse_and_run()
@@ -240,34 +115,11 @@ class TestExternalDependancies(unittest.TestCase):
     assert os.path.exists('hybrid_prefix.branch_base_reconstruction.embl')
     assert os.path.exists('hybrid_prefix.final_tree.tre')
  
-    os.remove('hybrid_prefix.summary_of_snp_distribution.vcf')
-    os.remove('hybrid_prefix.node_labelled.tre')
-    os.remove('hybrid_prefix.recombination_predictions.embl')
-    os.remove('hybrid_prefix.per_branch_statistics.csv')
-    os.remove('hybrid_prefix.filtered_polymorphic_sites.fasta')
-    os.remove('hybrid_prefix.filtered_polymorphic_sites.phylip')
-    os.remove('hybrid_prefix.recombination_predictions.gff')
-    os.remove('hybrid_prefix.branch_base_reconstruction.embl')
-    os.remove('hybrid_prefix.final_tree.tre')
+    self.cleanup()
     
   def test_fasttree_default_output_names(self):
-    parser = argparse.ArgumentParser(description='Iteratively detect recombinations')
-    parser.add_argument('alignment_filename',       help='Multifasta alignment file')
-    parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting')
-    parser.add_argument('--starting_tree',    '-s', help='Starting tree')
-    parser.add_argument('--use_time_stamp',   '-u', action='count', help='Use a time stamp in file names')
-    parser.add_argument('--verbose',          '-v', action='count', help='Turn on debugging',default = 0)
-    parser.add_argument('--no_cleanup',       '-n', action='count', help='Dont cleanup intermediate files')
-    parser.add_argument('--tree_builder',     '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
-    parser.add_argument('--iterations',       '-i', help='Maximum No. of iterations, default is 5', type=int,  default = 5)
-    parser.add_argument('--min_snps',         '-m', help='Min SNPs to identify a recombination block, default is 3', type=int,  default = 3)
-    parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int,  default = 15)
-    parser.add_argument('--prefix',           '-p', help='Add a prefix to the final output filenames')
-    parser.add_argument('--threads',          '-c', help='Number of threads to run with RAXML, but only if a PTHREADS version is available', type=int,  default = 1)
-    parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
-    parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
-    parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
-    
+    parser = self.default_arg_parse()
+
     gubbins_runner  = common.GubbinsCommon(parser.parse_args(["--tree_builder", "fasttree",'gubbins/tests/data/multiple_recombinations.aln']))
     gubbins_runner.parse_and_run()
 
@@ -281,33 +133,10 @@ class TestExternalDependancies(unittest.TestCase):
     assert os.path.exists('multiple_recombinations.final_tree.tre')
     assert os.path.exists('multiple_recombinations.node_labelled.tre')
 
-    os.remove('multiple_recombinations.node_labelled.tre')
-    os.remove('multiple_recombinations.summary_of_snp_distribution.vcf')
-    os.remove('multiple_recombinations.recombination_predictions.embl')
-    os.remove('multiple_recombinations.per_branch_statistics.csv')
-    os.remove('multiple_recombinations.filtered_polymorphic_sites.fasta')
-    os.remove('multiple_recombinations.filtered_polymorphic_sites.phylip')
-    os.remove('multiple_recombinations.recombination_predictions.gff')
-    os.remove('multiple_recombinations.branch_base_reconstruction.embl')
-    os.remove('multiple_recombinations.final_tree.tre')
+    self.cleanup()
 
   def test_hybrid_default_output_names(self):
-    parser = argparse.ArgumentParser(description='Iteratively detect recombinations')
-    parser.add_argument('alignment_filename',       help='Multifasta alignment file')
-    parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting')
-    parser.add_argument('--starting_tree',    '-s', help='Starting tree')
-    parser.add_argument('--use_time_stamp',   '-u', action='count', help='Use a time stamp in file names')
-    parser.add_argument('--verbose',          '-v', action='count', help='Turn on debugging',default = 0)
-    parser.add_argument('--no_cleanup',       '-n', action='count', help='Dont cleanup intermediate files')
-    parser.add_argument('--tree_builder',     '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
-    parser.add_argument('--iterations',       '-i', help='Maximum No. of iterations, default is 5', type=int,  default = 5)
-    parser.add_argument('--min_snps',         '-m', help='Min SNPs to identify a recombination block, default is 3', type=int,  default = 3)
-    parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int,  default = 15)
-    parser.add_argument('--prefix',           '-p', help='Add a prefix to the final output filenames')
-    parser.add_argument('--threads',          '-c', help='Number of threads to run with RAXML, but only if a PTHREADS version is available', type=int,  default = 1)
-    parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
-    parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
-    parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
+    parser = self.default_arg_parse()
     
     gubbins_runner  = common.GubbinsCommon(parser.parse_args(["--tree_builder", "hybrid",'gubbins/tests/data/multiple_recombinations.aln']))
     gubbins_runner.parse_and_run()
@@ -321,34 +150,10 @@ class TestExternalDependancies(unittest.TestCase):
     assert os.path.exists('multiple_recombinations.branch_base_reconstruction.embl')
     assert os.path.exists('multiple_recombinations.final_tree.tre')
 
-    os.remove('multiple_recombinations.summary_of_snp_distribution.vcf')
-    os.remove('multiple_recombinations.recombination_predictions.embl')
-    os.remove('multiple_recombinations.per_branch_statistics.csv')
-    os.remove('multiple_recombinations.filtered_polymorphic_sites.fasta')
-    os.remove('multiple_recombinations.filtered_polymorphic_sites.phylip')
-    os.remove('multiple_recombinations.recombination_predictions.gff')
-    os.remove('multiple_recombinations.branch_base_reconstruction.embl')
-    os.remove('multiple_recombinations.final_tree.tre')
-    os.remove('multiple_recombinations.node_labelled.tre')
+    self.cleanup()
 
   def test_parse_and_run(self):
-
-    parser = argparse.ArgumentParser(description='Iteratively detect recombinations')
-    parser.add_argument('alignment_filename',       help='Multifasta alignment file')
-    parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting')
-    parser.add_argument('--starting_tree',    '-s', help='Starting tree')
-    parser.add_argument('--use_time_stamp',   '-u', action='count', help='Use a time stamp in file names')
-    parser.add_argument('--verbose',          '-v', action='count', help='Turn on debugging',default = 0)
-    parser.add_argument('--no_cleanup',       '-n', action='count', help='Dont cleanup intermediate files')
-    parser.add_argument('--tree_builder',     '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
-    parser.add_argument('--iterations',       '-i', help='Maximum No. of iterations, default is 5', type=int,  default = 5)
-    parser.add_argument('--min_snps',         '-m', help='Min SNPs to identify a recombination block, default is 3', type=int,  default = 3)
-    parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int,  default = 15)
-    parser.add_argument('--prefix',           '-p', help='Add a prefix to the final output filenames')
-    parser.add_argument('--threads',          '-c', help='Number of threads to run with RAXML, but only if a PTHREADS version is available', type=int,  default = 1)
-    parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
-    parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
-    parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
+    parser = self.default_arg_parse()
     
     #  multiple recombinations
     gubbins_runner  = common.GubbinsCommon(parser.parse_args(['gubbins/tests/data/multiple_recombinations.aln']))
@@ -361,17 +166,7 @@ class TestExternalDependancies(unittest.TestCase):
     assert not os.path.exists('log.txt')
     assert not os.path.exists('latest_tree.multiple_recombinations.tre')
   
-    os.remove('multiple_recombinations.summary_of_snp_distribution.vcf')
-    os.remove('multiple_recombinations.recombination_predictions.embl')
-    os.remove('multiple_recombinations.per_branch_statistics.csv')
-    os.remove('multiple_recombinations.filtered_polymorphic_sites.fasta')
-    os.remove('multiple_recombinations.filtered_polymorphic_sites.phylip')
-    os.remove('multiple_recombinations.recombination_predictions.gff')
-    os.remove('multiple_recombinations.branch_base_reconstruction.embl')
-    os.remove('multiple_recombinations.final_tree.tre')
-    os.remove('multiple_recombinations.node_labelled.tre')
-  
-  
+    self.cleanup()
   
   def test_pairwise_comparison(self):
     shutil.copyfile('gubbins/tests/data/input_pairwise.aln.vcf','gubbins/tests/data/pairwise.aln.vcf' )
@@ -386,11 +181,7 @@ class TestExternalDependancies(unittest.TestCase):
     # Check the reconstruction of internal nodes
     assert filecmp.cmp('pairwise.filtered_polymorphic_sites.fasta','gubbins/tests/data/pairwise.aln.snp_sites.aln_expected');
     
-    
-    os.remove('pairwise.summary_of_snp_distribution.vcf')
-    os.remove('pairwise.filtered_polymorphic_sites.fasta')
-    os.remove('pairwise.filtered_polymorphic_sites.phylip')
-    os.remove('pairwise.final_tree.tre')
+    self.cleanup()
     
   
   def test_delete_files_based_on_list_of_regexes(self):
@@ -413,6 +204,39 @@ class TestExternalDependancies(unittest.TestCase):
     assert re.search('fastml -mg -qf -b ',common.GubbinsCommon.use_bundled_exec('fastml -mg -qf -b ', 'fastml')) != None
     assert re.search('../src/gubbins',common.GubbinsCommon.use_bundled_exec('gubbins', '../src/gubbins')) != None
 
+  def cleanup(self):
+      if os.path.exists('log.txt'):
+          os.remove('log.txt')
+          
+      for regex_to_remove in ["RAxML_*", "multiple_recombinations.*","pairwise.*","hybrid_prefix.*","ft_prefix.*","different_prefix.*"]:
+          r = glob.glob(regex_to_remove)
+          for i in r:
+             os.remove(i)
+             
+  def base_arg_parse(self):
+      parser = argparse.ArgumentParser(description='Iteratively detect recombinations')
+      parser.add_argument('alignment_filename',       help='Multifasta alignment file')
+      parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting')
+      parser.add_argument('--starting_tree',    '-s', help='Starting tree')
+      parser.add_argument('--use_time_stamp',   '-u', action='count', help='Use a time stamp in file names',default = 0)
+      parser.add_argument('--verbose',          '-v', action='count', help='Turn on debugging',default = 0)
+      parser.add_argument('--no_cleanup',       '-n', action='count', help='Dont cleanup intermediate files',default = 0)
+      parser.add_argument('--tree_builder',     '-t', help='Application to use for tree building (raxml, fasttree, hybrid), default RAxML', default = "raxml")
+      parser.add_argument('--iterations',       '-i', help='Maximum No. of iterations, default is 5', type=int,  default = 5)
+      parser.add_argument('--min_snps',         '-m', help='Min SNPs to identify a recombination block, default is 3', type=int,  default = 3)
+      parser.add_argument('--filter_percentage','-f', help='Filter out taxa with more than this percentage of gaps, default is 25', type=int,  default = 15)
+      parser.add_argument('--prefix',           '-p', help='Add a prefix to the final output filenames')
+      parser.add_argument('--threads',          '-c', help='Number of threads to run with RAXML, but only if a PTHREADS version is available', type=int,  default = 1)
+      parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
+      parser.add_argument('--version',                action='version', version=str(pkg_resources.get_distribution("gubbins").version))
+      return parser
+      
+  def default_arg_parse(self):
+      parser = self.base_arg_parse()
+      parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
+      parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
+      return parser
+         
 if __name__ == "__main__":
   unittest.main()
 
diff --git a/python/gubbins/tests/test_fastml.py b/python/gubbins/tests/test_fastml.py
old mode 100755
new mode 100644
diff --git a/python/gubbins/tests/test_pre_process_fasta.py b/python/gubbins/tests/test_pre_process_fasta.py
new file mode 100644
index 0000000..b96c2d9
--- /dev/null
+++ b/python/gubbins/tests/test_pre_process_fasta.py
@@ -0,0 +1,85 @@
+# encoding: utf-8
+
+"""
+Tests for preprocessing the input FASTA file
+"""
+
+import unittest
+import re
+import os
+import subprocess
+import filecmp
+import pprint
+from gubbins.PreProcessFasta import PreProcessFasta
+
+class TestPreProcessFasta(unittest.TestCase):
+      
+  def test_input_file_with_no_duplicate_sequences(self):   
+      preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/no_duplicates.aln')
+      self.assertEqual(preprocessfasta.hash_sequences(), 
+       {b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1'],
+        b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample4'],
+        b'\xc3n`\xf5t\x00\x1e\xf3\xde\nU\x1f\x95\x0b\xdb9': ['sample2'],
+        b'\xdf\xedM\xdf1\xf2PPO\xc1\xf54T?\xdb\xa2': ['sample3']})
+      self.assertEqual(preprocessfasta.calculate_sequences_missing_data_percentage(), {'sample1': 0.0,
+ 'sample2': 0.0,
+ 'sample3': 0.0,
+ 'sample4': 0.0})
+        
+      self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),[])
+
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+      self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/no_duplicates.aln'))
+      self.cleanup()
+
+  def test_input_file_with_one_duplicate_sequences(self):   
+      preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/one_duplicate.aln')
+      self.assertEqual(preprocessfasta.hash_sequences(), 
+       {b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
+        b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample4'],
+        b'\xc3n`\xf5t\x00\x1e\xf3\xde\nU\x1f\x95\x0b\xdb9': ['sample2']})
+        
+      self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1'])
+      
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+      self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/expected_one_duplicate.aln'))
+      self.cleanup()
+ 
+  def test_input_file_with_multiple_duplicate_sequences(self):   
+      preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/multiple_duplicates.aln')
+      self.assertEqual(preprocessfasta.hash_sequences(), 
+       {b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
+        b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample2', 'sample4']})
+        
+      self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1','sample2'])
+      
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+      self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/expected_multiple_duplicates.aln'))
+      self.cleanup()
+ 
+  def test_input_file_with_all_duplicate_sequences(self):   
+      preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/all_same_sequence.aln')
+      self.assertEqual(preprocessfasta.hash_sequences(), 
+       {b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1',
+                                                      'sample2',
+                                                      'sample3',
+                                                      'sample4']})
+                                                      
+      self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1',
+                                                      'sample2',
+                                                      'sample3'])
+      self.cleanup()
+                                                      
+  def test_filter_out_alignments_with_too_much_missing_data(self):
+    preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/missing_data.aln', False, 5)
+    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+    self.assertTrue(filecmp.cmp('output.aln','gubbins/tests/data/preprocessfasta/expected_missing_data.aln'))
+    self.cleanup()           
+      
+  def cleanup(self):
+      for file_to_delete in ['output.aln']:
+          if os.path.exists(file_to_delete):
+              os.remove(file_to_delete) 
+
+if __name__ == "__main__":
+  unittest.main()
diff --git a/python/gubbins/tests/test_validate_fasta_input.py b/python/gubbins/tests/test_validate_fasta_input.py
index a440207..4cf7eac 100644
--- a/python/gubbins/tests/test_validate_fasta_input.py
+++ b/python/gubbins/tests/test_validate_fasta_input.py
@@ -10,34 +10,48 @@ import re
 import os
 from nose.tools import *
 from gubbins import common
+from gubbins.ValidateFastaAlignment import ValidateFastaAlignment
 
 class TestValidateInputFastaFile(unittest.TestCase):
 
   def test_input_file_exists(self):
-    assert common.GubbinsCommon.does_file_exist('non_existant_file', 'Alignment File') == 0
+      assert common.GubbinsCommon.does_file_exist('non_existant_file', 'Alignment File') == 0
     
+  def test_valid_fasta_file(self):
+      validate_fasta = ValidateFastaAlignment('gubbins/tests/data/multiple_recombinations.aln')
+      self.assertTrue(validate_fasta.is_input_fasta_file_valid())
+      
   def test_does_each_sequence_have_a_name(self):
-    assert common.GubbinsCommon.does_each_sequence_have_a_name_and_genomic_data('gubbins/tests/data/sequence_without_a_name.fa') == 0
-
+      validate_fasta = ValidateFastaAlignment('gubbins/tests/data/sequence_without_a_name.fa')
+      self.assertFalse(validate_fasta.does_each_sequence_have_a_name_and_genomic_data())
+      self.assertFalse(validate_fasta.is_input_fasta_file_valid())
+  
   def test_does_the_sequence_have_sensible_characters(self):
-    assert common.GubbinsCommon.does_each_sequence_have_a_name_and_genomic_data('gubbins/tests/data/multiple_recombinations.aln') == 1
-    assert common.GubbinsCommon.does_each_sequence_have_a_name_and_genomic_data('gubbins/tests/data/sequence_with_odd_chars.fa') == 0
+      validate_fasta = ValidateFastaAlignment('gubbins/tests/data/sequence_with_odd_chars.fa')
+      self.assertFalse(validate_fasta.does_each_sequence_have_a_name_and_genomic_data())
+      self.assertFalse(validate_fasta.is_input_fasta_file_valid())
      
   def test_are_all_sequences_the_same_length(self):
-    assert common.GubbinsCommon.does_each_sequence_have_the_same_length('gubbins/tests/data/valid_alignment.aln') == 1
-    assert common.GubbinsCommon.does_each_sequence_have_the_same_length('gubbins/tests/data/sequences_of_different_lengths.fa') == 0
-  
+      validate_fasta = ValidateFastaAlignment('gubbins/tests/data/valid_alignment.aln')
+      self.assertTrue(validate_fasta.does_each_sequence_have_the_same_length())
+      self.assertTrue(validate_fasta.is_input_fasta_file_valid())
+      
+      validate_fasta = ValidateFastaAlignment('gubbins/tests/data/sequences_of_different_lengths.fa')
+      self.assertFalse(validate_fasta.does_each_sequence_have_the_same_length())
+      self.assertFalse(validate_fasta.is_input_fasta_file_valid())
+      
   def test_are_all_sequence_names_unique(self):
-    assert common.GubbinsCommon.are_sequence_names_unique('gubbins/tests/data/all_unique_sequence_names.fa') == 1
-    assert common.GubbinsCommon.are_sequence_names_unique('gubbins/tests/data/non_unique_sequence_names.fa') == 0
+      validate_fasta = ValidateFastaAlignment('gubbins/tests/data/all_unique_sequence_names.fa')
+      self.assertTrue(validate_fasta.are_sequence_names_unique())
+      self.assertTrue(validate_fasta.is_input_fasta_file_valid())
+      
+      validate_fasta = ValidateFastaAlignment('gubbins/tests/data/non_unique_sequence_names.fa')
+      self.assertFalse(validate_fasta.are_sequence_names_unique())
+      self.assertFalse(validate_fasta.is_input_fasta_file_valid())
     
   def test_are_there_enough_sequences_to_build_a_tree(self):
-    assert common.GubbinsCommon.does_each_sequence_have_a_name_and_genomic_data('gubbins/tests/data/alignment_with_3_sequences.aln') == 0
-  
-  def test_there_is_variation_in_the_fasta(self):
-    assert common.GubbinsCommon.does_fasta_contain_variation('gubbins/tests/data/alignment_with_no_variation.aln') == 0
-    assert common.GubbinsCommon.does_fasta_contain_variation('gubbins/tests/data/multiple_recombinations.aln') == 1
+      validate_fasta = ValidateFastaAlignment('gubbins/tests/data/alignment_with_3_sequences.aln')
+      self.assertFalse(validate_fasta.is_input_fasta_file_valid())
   
 if __name__ == "__main__":
   unittest.main()
-
diff --git a/python/scripts/run_gubbins.py b/python/scripts/run_gubbins.py
index 6e454b1..592a64d 100755
--- a/python/scripts/run_gubbins.py
+++ b/python/scripts/run_gubbins.py
@@ -27,7 +27,7 @@ from gubbins import common
 
 
 
-parser = argparse.ArgumentParser(description='Croucher N. J., Page A. J., Connor T. R., Delaney A. J., Keane J. A., Bentley S. D., Parkhill J., Harris S.R. "Rapid phylogenetic analysis of large samples of recombinant bacterial whole genome sequences using Gubbins". Nucleic Acids Res. 2015 Feb 18;43(3):e15. doi: 10.1093/nar/gku1196.')
+parser = argparse.ArgumentParser(description='Croucher N. J., Page A. J., Connor T. R., Delaney A. J., Keane J. A., Bentley S. D., Parkhill J., Harris S.R. "Rapid phylogenetic analysis of large samples of recombinant bacterial whole genome sequences using Gubbins". Nucleic Acids Res. 2015 Feb 18;43(3):e15. doi: 10.1093/nar/gku1196 .')
 parser.add_argument('alignment_filename',       help='Multifasta alignment file')
 parser.add_argument('--outgroup',         '-o', help='Outgroup name for rerooting. A list of comma separated names can be used if they form a clade')
 parser.add_argument('--starting_tree',    '-s', help='Starting tree')

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gubbins.git



More information about the debian-med-commit mailing list