[med-svn] [Git][med-team/gubbins][upstream] New upstream version 3.1.0
Andreas Tille (@tille)
gitlab at salsa.debian.org
Mon Oct 11 10:03:41 BST 2021
Andreas Tille pushed to branch upstream at Debian Med / gubbins
Commits:
099d3934 by Andreas Tille at 2021-10-11T10:59:32+02:00
New upstream version 3.1.0
- - - - -
10 changed files:
- VERSION
- python/gubbins/ValidateFastaAlignment.py
- python/gubbins/common.py
- python/gubbins/pyjar.py
- python/gubbins/run_gubbins.py
- python/gubbins/treebuilders.py
- python/gubbins/utils.py
- + python/scripts/extract_clade.py
- + python/scripts/generate_ska_alignment.py
- + python/scripts/mask_gubbins_aln.py
Changes:
=====================================
VERSION
=====================================
@@ -1 +1 @@
-3.0.0
+3.1.0
=====================================
python/gubbins/ValidateFastaAlignment.py
=====================================
@@ -6,6 +6,7 @@ from collections import Counter
from Bio.Align import MultipleSeqAlignment
class ValidateFastaAlignment(object):
+
def __init__(self, input_filename):
self.input_filename = input_filename
@@ -22,9 +23,8 @@ class ValidateFastaAlignment(object):
return False
except:
return False
-
return True
-
+
def does_each_sequence_have_a_name_and_genomic_data(self):
with open(self.input_filename, "r") as input_handle:
alignments = AlignIO.parse(input_handle, "fasta")
@@ -33,21 +33,19 @@ class ValidateFastaAlignment(object):
for record in alignment:
number_of_sequences +=1
if record.name is None or record.name == "":
- print("Error with the input FASTA file: One of the sequence names is blank")
+ sys.stderr.write("Error with the input FASTA file: " + record.name + " is blank\n")
return False
if record.seq is None or record.seq == "":
- print("Error with the input FASTA file: One of the sequences is empty")
+ sys.stderr.write("Error with the input FASTA file: " + record.name + " is empty\n")
return False
if re.search('[^ACGTNacgtn-]', str(record.seq)) != None:
- print("Error with the input FASTA file: One of the sequences contains odd characters, only ACGTNacgtn- are permitted")
+ sys.stderr.write("Error with the input FASTA file: " + record.name + " contains disallowed characters, only ACGTNacgtn- are permitted\n")
return False
- input_handle.close()
return True
-
+
def does_each_sequence_have_the_same_length(self):
try:
with open(self.input_filename) as input_handle:
-
alignments = AlignIO.parse(input_handle, "fasta")
sequence_length = -1
for alignment in alignments:
@@ -63,17 +61,25 @@ class ValidateFastaAlignment(object):
print("Error with the input FASTA file: It is in the wrong format so check its an alignment")
return False
return True
-
+
def are_sequence_names_unique(self):
- with open(self.input_filename) as input_handle:
- alignments = AlignIO.parse(input_handle, "fasta")
- sequence_names = []
- for alignment in alignments:
+ any_modified_names = False
+ with open(self.input_filename) as input_handle:
+ alignment = AlignIO.read(input_handle, "fasta")
+ sequence_names = []
for record in alignment:
+ # Remove disallowed characters
+                if '#' in record.name or ':' in record.name:
+ record.name = record.name.replace("#","_").replace(":","_")
+ record.id = record.id.replace("#", "_").replace(":", "_")
+ record.description = record.description.replace("#", "_").replace(":", "_")
+ any_modified_names = True
+ # Store modified names
sequence_names.append(record.name)
-
if [k for k,v in list(Counter(sequence_names).items()) if v>1] != []:
return False
- input_handle.close()
- return True
+        # Update alignment if any names changed
+        if any_modified_names:
+            with open(self.input_filename, "w") as output_handle:
+                AlignIO.write(alignment, output_handle, "fasta")
+        return True
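The reworked are_sequence_names_unique() sanitises '#' and ':' out of sequence
names (both characters are reserved in Newick tree syntax) before checking for
duplicates, rewriting the alignment in place when any name changes. A minimal
standalone sketch of the same logic, assuming Biopython and a hypothetical
input.aln:

    from collections import Counter
    from Bio import AlignIO

    def sanitise_and_check_names(filename):
        with open(filename) as handle:
            alignment = AlignIO.read(handle, "fasta")
        modified = False
        names = []
        for record in alignment:
            if '#' in record.name or ':' in record.name:
                # '#' and ':' clash with Newick syntax downstream
                record.name = record.name.replace('#', '_').replace(':', '_')
                record.id = record.id.replace('#', '_').replace(':', '_')
                modified = True
            names.append(record.name)
        if [k for k, v in Counter(names).items() if v > 1]:
            return False  # duplicate names remain fatal
        if modified:
            with open(filename, "w") as out:
                AlignIO.write(alignment, out, "fasta")
        return True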
=====================================
python/gubbins/common.py
=====================================
@@ -256,8 +256,9 @@ def parse_and_run(input_args, program_description=""):
# 3.5a. Joint ancestral reconstruction with new tree and info file in each iteration
info_filename = model_fitter.get_info_filename(temp_working_dir,current_basename)
recontree_filename = model_fitter.get_recontree_filename(temp_working_dir,current_basename)
- # Arbitrary rooting of tree
- reroot_tree_at_midpoint(recontree_filename)
+ # Set root of reconstruction tree to match that of the current tree
+ # Cannot just midpoint root both, because the branch lengths differ between them
+ harmonise_roots(recontree_filename, temp_rooted_tree)
printer.print(["\nRunning joint ancestral reconstruction with pyjar"])
jar(alignment = polymorphism_alignment, # complete polymorphism alignment
@@ -271,8 +272,12 @@ def parse_and_run(input_args, program_description=""):
verbose = input_args.verbose)
gaps_alignment_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".joint.aln"
raw_internal_rooted_tree_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".joint.tre"
- transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_rooted_tree,
- current_tree_name_with_internal_nodes, "pyjar")
+ printer.print(["\nTransferring pyjar results onto original recombination-corrected tree"])
+ transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename,
+ temp_rooted_tree,
+ current_tree_name_with_internal_nodes,
+ "pyjar")
+ printer.print(["\nDone transfer"])
else:
@@ -309,10 +314,12 @@ def parse_and_run(input_args, program_description=""):
current_tree_name_with_internal_nodes, sequence_reconstructor)
elif input_args.seq_recon == "iqtree" or input_args.seq_recon == "raxmlng":
# IQtree returns an unrooted tree
- temp_unrooted_tree = temp_working_dir + "/" + current_tree_name + ".unrooted"
- unroot_tree(temp_rooted_tree, temp_unrooted_tree)
- transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_unrooted_tree,
- current_tree_name_with_internal_nodes, sequence_reconstructor)
+ harmonise_roots(raw_internal_rooted_tree_filename, temp_rooted_tree)
+ transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename,
+ temp_rooted_tree,
+ current_tree_name_with_internal_nodes,
+ sequence_reconstructor,
+ use_root = False)
else:
sys.stderr.write("Unrecognised sequence reconstruction command: " + input_args.seq_recon + '\n')
sys.exit()
@@ -699,7 +706,6 @@ def reroot_tree_with_outgroup(tree_name, outgroups):
with open(tree_name, 'w+') as output_file:
output_file.write(output_tree_string.replace('\'', ''))
-
def reroot_tree_at_midpoint(tree_name):
tree = dendropy.Tree.get_from_path(tree_name, 'newick', preserve_underscores=True)
split_all_non_bi_nodes(tree.seed_node)
@@ -718,6 +724,50 @@ def unroot_tree(input_filename, output_filename):
with open(output_filename, 'w+') as output_file:
output_file.write(output_tree_string.replace('\'', ''))
+def harmonise_roots(new_tree_fn, tree_for_root_fn):
+ # Read in tree and get nodes adjacent to root
+ taxa = dendropy.TaxonNamespace()
+ tree_for_root = dendropy.Tree.get_from_path(tree_for_root_fn,
+ 'newick',
+ preserve_underscores=True,
+ taxon_namespace=taxa,
+ rooting="force-rooted")
+ tree_for_root.encode_bipartitions()
+ root = tree_for_root.seed_node
+ root_adjacent_bipartitions = []
+ for root_adjacent_node in root.child_nodes():
+ root_adjacent_bipartitions.append(root_adjacent_node.edge.bipartition)
+ # Now search the new tree for these nodes
+ new_tree = dendropy.Tree.get_from_path(new_tree_fn,
+ 'newick',
+ preserve_underscores=True,
+ taxon_namespace=taxa,
+ rooting="force-rooted")
+ new_tree.encode_bipartitions()
+ for node in new_tree.preorder_node_iter():
+ if node.edge.bipartition in root_adjacent_bipartitions:
+ half_branch_length = node.edge_length/2
+ new_tree.reroot_at_edge(node.edge,
+ length1 = half_branch_length,
+ length2 = half_branch_length)
+ break
+ new_tree_string = tree_as_string(new_tree,
+ suppress_internal=False,
+ suppress_rooting=False)
+
+ # Check both trees are topologically identical
+ missing_bipartitions = dendropy.calculate.treecompare.find_missing_bipartitions(tree_for_root,
+ new_tree,
+ is_bipartitions_updated=False)
+
+ if len(missing_bipartitions) > 0:
+        sys.stderr.write('Bipartitions missing during node label transfer: ' + str([str(x) for x in missing_bipartitions]) + '\n')
+ sys.exit(1)
+
+ # Write output
+ with open(new_tree_fn, 'w+') as output_file:
+ output_file.write(new_tree_string.replace('\'', ''))
+
def filter_out_removed_taxa_from_tree(input_filename, output_filename, taxa_removed):
tree = dendropy.Tree.get_from_path(input_filename, 'newick', preserve_underscores=True)
tree.prune_taxa_with_labels(taxa_removed, update_bipartitions=True)
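The motivation for harmonise_roots() above: the model-fitting step returns a
tree whose branch lengths differ from the current tree, so midpoint-rooting
each independently can place the two roots on different edges. Instead, the
new tree is rerooted on the edge whose bipartition matches a child of the
reference root. A minimal sketch of that step with dendropy, assuming two
hypothetical Newick files sharing a taxon set:

    import dendropy

    taxa = dendropy.TaxonNamespace()
    # ref.tre supplies the root position, new.tre the branch lengths
    ref = dendropy.Tree.get(path="ref.tre", schema="newick",
                            taxon_namespace=taxa, rooting="force-rooted")
    new = dendropy.Tree.get(path="new.tre", schema="newick",
                            taxon_namespace=taxa, rooting="force-rooted")
    ref.encode_bipartitions()
    new.encode_bipartitions()
    # bipartitions of the edges immediately below the reference root
    root_biparts = {c.edge.bipartition for c in ref.seed_node.child_nodes()}
    for node in new.preorder_node_iter():
        if node.edge.bipartition in root_biparts:
            half = node.edge_length / 2
            # place the new root halfway along the matching edge
            new.reroot_at_edge(node.edge, length1=half, length2=half)
            break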
@@ -791,32 +841,70 @@ def get_monophyletic_outgroup(tree_name, outgroups):
def transfer_internal_node_labels_to_tree(source_tree_filename, destination_tree_filename, output_tree_filename,
- sequence_reconstructor):
-
+ sequence_reconstructor, use_root = True):
# read source tree and extract node labels, to match with the ancestral sequence reconstruction
- source_tree = dendropy.Tree.get_from_path(source_tree_filename, 'newick', preserve_underscores=True)
- source_internal_node_labels = []
+ taxa = dendropy.TaxonNamespace()
+ source_tree = dendropy.Tree.get_from_path(source_tree_filename,
+ 'newick',
+ preserve_underscores=True,
+ taxon_namespace=taxa,
+ rooting='force-rooted')
+ source_tree.encode_bipartitions()
+ source_internal_node_dict = {}
for source_internal_node in source_tree.internal_nodes():
+ node_bipartition = source_internal_node.edge.bipartition
if source_internal_node.label:
- source_internal_node_labels.append(source_internal_node.label)
+ source_internal_node_dict[node_bipartition] = source_internal_node.label
else:
- source_internal_node_labels.append('')
-
+ source_internal_node_dict[node_bipartition] = ''
# read original tree and add in the labels from the ancestral sequence reconstruction
- destination_tree = dendropy.Tree.get_from_path(destination_tree_filename, 'newick', preserve_underscores=True)
- for index, destination_internal_node in enumerate(destination_tree.internal_nodes()):
- if sequence_reconstructor == 'pyjar':
- new_label = str(source_internal_node_labels[index])
- else:
- new_label = sequence_reconstructor.replace_internal_node_label(str(source_internal_node_labels[index]))
- destination_internal_node.label = None
- destination_internal_node.taxon = dendropy.Taxon(new_label)
+ destination_tree = dendropy.Tree.get_from_path(destination_tree_filename,
+ 'newick',
+ preserve_underscores=True,
+ taxon_namespace=taxa,
+ rooting='force-rooted')
+ destination_tree.encode_bipartitions()
+
+ # Check both trees are topologically identical
+ missing_bipartitions = dendropy.calculate.treecompare.find_missing_bipartitions(source_tree,
+ destination_tree,
+ is_bipartitions_updated=False)
+ if len(missing_bipartitions) > 0:
+        sys.stderr.write('Bipartitions missing during node label transfer: ' + str([str(x) for x in missing_bipartitions]) + '\n')
+ sys.exit(1)
+
+ root_alternative = ''
+ for destination_internal_node in destination_tree.internal_nodes():
+ if destination_internal_node != destination_tree.seed_node or use_root:
+ node_bipartition = destination_internal_node.edge.bipartition
+ if sequence_reconstructor == 'pyjar':
+ try:
+ new_label = source_internal_node_dict[node_bipartition]
+                except KeyError:
+ sys.stderr.write('Unable to find bipartition ' + str(node_bipartition) + '\n')
+ sys.exit(1)
+ else:
+ new_label = sequence_reconstructor.replace_internal_node_label(str(source_internal_node_dict[node_bipartition]))
+ destination_internal_node.label = None
+ destination_internal_node.taxon = dendropy.Taxon(new_label)
# output final tree
- output_tree_string = tree_as_string(destination_tree, suppress_internal=False, suppress_rooting=False)
- with open(output_tree_filename, 'w+') as output_file:
- output_file.write(output_tree_string.replace('\'', ''))
+ if not use_root:
+ alt_root_name = ''
+ for root_child_node in destination_tree.seed_node.child_nodes():
+ alt_root_name = root_child_node.taxon.label
+ break
+ destination_tree.seed_node.label = None
+ destination_tree.seed_node.taxon = dendropy.Taxon(alt_root_name)
+ destination_tree_string = destination_tree.as_string(schema="newick",
+ suppress_rooting=True,
+ unquoted_underscores=True,
+ suppress_internal_node_labels=True).replace("'","").replace('\'', '')
+ with open(output_tree_filename, 'w+') as output_file:
+ print(destination_tree_string,
+ file=output_file,
+ end='')
def remove_internal_node_labels_from_tree(input_filename, output_filename):
tree = dendropy.Tree.get_from_path(input_filename, 'newick', preserve_underscores=True)
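The key change to transfer_internal_node_labels_to_tree() is that labels are
now matched by bipartition rather than by the enumeration order of
internal_nodes(), which is not guaranteed to agree between two independently
parsed trees. Sketched in isolation, with hypothetical labelled.tre and
unlabelled.tre sharing a taxon namespace:

    import dendropy

    taxa = dendropy.TaxonNamespace()
    source = dendropy.Tree.get(path="labelled.tre", schema="newick",
                               taxon_namespace=taxa, rooting="force-rooted")
    dest = dendropy.Tree.get(path="unlabelled.tre", schema="newick",
                             taxon_namespace=taxa, rooting="force-rooted")
    source.encode_bipartitions()
    dest.encode_bipartitions()
    # map each internal edge's bipartition to its ancestral label
    labels = {n.edge.bipartition: (n.label or '')
              for n in source.internal_nodes()}
    for node in dest.internal_nodes():
        # a KeyError here means the trees are not topologically identical
        node.label = labels[node.edge.bipartition]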
=====================================
python/gubbins/pyjar.py
=====================================
@@ -20,7 +20,7 @@ try:
NumpyShared = collections.namedtuple('NumpyShared', ('name', 'shape', 'dtype'))
except ImportError as e:
sys.stderr.write("This version of Gubbins requires the multiprocessing library and python v3.8 or higher for memory management\n")
- sys.exit(0)
+ sys.exit(201)
from gubbins.utils import generate_shared_mem_array
@@ -31,31 +31,38 @@ from gubbins.utils import generate_shared_mem_array
def read_alignment(filename, file_type, verbose=False):
if not os.path.isfile(filename):
print("Error: alignment file " + filename + " does not exist")
- sys.exit()
+ sys.exit(202)
if verbose:
print("Trying to open file " + filename + " as " + file_type)
try:
- alignmentObject = AlignIO.read(open(filename), file_type)
+ with open(filename,'r') as aln_in:
+ alignmentObject = AlignIO.read(aln_in, file_type)
if verbose:
print("Alignment read successfully")
except:
print("Cannot open alignment file " + filename + " as " + file_type)
- sys.exit()
+ sys.exit(203)
return alignmentObject
#Calculate Pij from Q matrix and branch length
def calculate_pij(branch_length,rate_matrix):
if branch_length==0:
- return numpy.array([[1, 0, 0, 0,], [0, 1, 0, 0,], [0, 0, 1, 0,], [0, 0, 0, 1,]])
+ return numpy.array([[0.0, float("-inf"), float("-inf"), float("-inf"),],
+ [float("-inf"), 0.0, float("-inf"), float("-inf"),],
+ [float("-inf"), float("-inf"), 0.0, float("-inf"),],
+ [float("-inf"), float("-inf"), float("-inf"), 0.0,]])
else:
- return numpy.log(linalg.expm(numpy.multiply(branch_length,rate_matrix)))
+        return numpy.log(linalg.expm(numpy.multiply(branch_length, rate_matrix)))  # log-transformed transition probabilities
#Read the tree file and root
def read_tree(treefile):
if not os.path.isfile(treefile):
print("Error: tree file does not exist")
- sys.exit()
- t=dendropy.Tree.get(path=treefile, schema="newick", preserve_underscores=True, rooting="force-rooted")
+ sys.exit(204)
+ t=dendropy.Tree.get(path=treefile,
+ schema="newick",
+ preserve_underscores=True,
+ rooting="force-rooted")
return t
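calculate_pij() now returns log-transformed transition probabilities
throughout: log P = log(expm(t * Q)), with the zero-length branch case
returning the log of the identity matrix (log 1 = 0 on the diagonal,
log 0 = -inf elsewhere) so that downstream sums of log-likelihoods stay
valid. A small numerical check, assuming scipy and a toy symmetric rate
matrix rather than the fitted GTR model:

    import numpy
    from scipy import linalg

    # Jukes-Cantor-like rate matrix: rows sum to zero
    Q = numpy.full((4, 4), 1.0 / 3.0)
    numpy.fill_diagonal(Q, -1.0)

    def calculate_pij(branch_length, rate_matrix):
        if branch_length == 0:
            # log of the identity: 0 on the diagonal, -inf off it
            return numpy.where(numpy.eye(4, dtype=bool), 0.0, float("-inf"))
        return numpy.log(linalg.expm(branch_length * rate_matrix))

    log_p = calculate_pij(0.1, Q)
    # each row exponentiates back to a probability distribution
    assert numpy.allclose(numpy.exp(log_p).sum(axis=1), 1.0)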
# Read the RAxML info file to get rates and frequencies
@@ -63,11 +70,11 @@ def read_info(infofile, type = 'raxml'):
if not os.path.isfile(infofile):
print("Error: model information file " + infofile + " does not exist")
- sys.exit()
+ sys.exit(205)
if type not in ['raxml', 'raxmlng', 'iqtree','fasttree']:
sys.stderr.write('Only able to parse GTR-type models from raxml, iqtree or fasttree')
- sys.exit()
+ sys.exit(206)
r=[-1.0] * 6 # initialise rates
f=[-1.0] * 4 # initialise frequencies
@@ -146,7 +153,7 @@ def read_info(infofile, type = 'raxml'):
# Check frequencies and rates have been extracted correctly
if -1.0 in f or -1.0 in r:
sys.stderr.write('Problem with extracting model parameters - frequencies are ' + str(f) + ' and rates are ' + str(r))
- sys.exit()
+ sys.exit(207)
return f, r
@@ -239,7 +246,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
# Iterate over columns
for column,base_pattern_columns_padded in zip(columns,column_positions):
-
+
### TIMING
if verbose:
calc_time_start = time.process_time()
@@ -285,6 +292,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
#1b. Set for each amino acid i: Ly(i) = Pij(ty), where ty is the branch length between y and its father.
node.L={"A": pij[base_matrix["A"]][base_matrix[base[taxon]]], "C": pij[base_matrix["C"]][base_matrix[base[taxon]]], "G": pij[base_matrix["G"]][base_matrix[base[taxon]]], "T": pij[base_matrix["T"]][base_matrix[base[taxon]]]}
+
else:
node.C={"A": "A", "C": "C", "G": "G", "T": "T"}
@@ -292,7 +300,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
except KeyError:
print("Cannot find", taxon, "in base")
- sys.exit()
+ sys.exit(208)
else:
node.L={}
@@ -327,7 +335,6 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
c+=child.L[end]
for start in columnbases:
j=log(base_frequencies[base_matrix[end]])+c
-
if j>node.L[start]:
node.L[start]=j
node.C[start]=end
@@ -339,7 +346,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
max_root_base_likelihood=node.L[root_base]
max_root_base=node.C[root_base]
node.r=max_root_base
-
+
#Traverse the tree from the root in the direction of the OTUs, assigning to each node its most likely ancestral character as follows:
for node in tree.preorder_node_iter():
@@ -354,8 +361,6 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
rootlens=[]
for child in tree.seed_node.child_node_iter():
rootlens.append([child.edge_length,child,child.r])
- rootlens.sort(key = lambda x: x[0])
- tree.seed_node.r=rootlens[-1][1].r
### TIMING
if verbose:
@@ -480,7 +485,7 @@ def jar(alignment = None, base_patterns = None, base_pattern_positions = None, t
node.taxon=tree.taxon_namespace.get_taxon(nodename)
if nodename in alignment_sequence_names:
print(nodename, "already in alignment. Quitting")
- sys.exit()
+ sys.exit(209)
ancestral_node_names.append(nodename) # index for reconstruction
if node.parent_node != None:
node.pij=calculate_pij(node.edge_length, rm)
@@ -553,10 +558,17 @@ def jar(alignment = None, base_patterns = None, base_pattern_positions = None, t
continue
# Print tree
+ from gubbins.common import tree_as_string
+
if verbose:
print("Printing tree with internal nodes labelled: ", output_prefix+".joint.tre")
with open(output_prefix+".joint.tre", "w") as tree_output:
- print(tree.as_string(schema="newick", suppress_rooting=True, unquoted_underscores=True, suppress_internal_node_labels=True).replace("'",""), file=tree_output)
+
+ recon_tree = tree_as_string(tree,
+ suppress_rooting=True,
+ suppress_internal=False)
+ print(recon_tree.replace('\'', ''),
+ file = tree_output)
if verbose:
print("Done")
=====================================
python/gubbins/run_gubbins.py
=====================================
@@ -19,6 +19,7 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
+import sys
import argparse
from gubbins.__init__ import version
import gubbins.common
=====================================
python/gubbins/treebuilders.py
=====================================
@@ -629,8 +629,7 @@ class RAxMLNG:
self.citation = "https://doi.org/10.1093/bioinformatics/btz305"
# Set parallelisation
- if self.threads > 1:
- command.extend(["--threads", str(self.threads)])
+ command.extend(["--threads", str(self.threads)])
# Add model
command.extend(["-model"])
@@ -706,6 +705,7 @@ class RAxMLNG:
def convert_raw_ancestral_states_to_fasta(self, input_filename, output_filename):
"""Converts the file containing ancestral sequences into FASTA format"""
+ # Use the IUPAC codes from here: https://www.bioinformatics.org/sms/iupac.html
with open(input_filename, 'r') as infile:
with open(output_filename, 'w+') as outfile:
for sequence_line in infile:
@@ -714,7 +714,13 @@ class RAxMLNG:
'R': 'N',
'Y': 'N',
'S': 'N',
- 'W': 'N'
+ 'W': 'N',
+ 'K': 'N',
+ 'M': 'N',
+ 'B': 'N',
+ 'D': 'N',
+ 'H': 'N',
+ 'V': 'N'
}
)
)
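The expanded mapping covers all ten IUPAC nucleotide ambiguity codes other
than N itself, collapsing each to N so later stages only ever see ACGTN and
gaps. A compact equivalent using str.maketrans, sketched here rather than
taken from the codebase:

    # every IUPAC ambiguity code collapses to N
    AMBIGUITY_TO_N = str.maketrans({c: 'N' for c in 'RYSWKMBDHV'})

    def clean_ancestral_sequence(sequence_line):
        return sequence_line.translate(AMBIGUITY_TO_N)

    assert clean_ancestral_sequence("ACGT-" + "RYSWKMBDHV") == "ACGT-" + "N" * 10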
=====================================
python/gubbins/utils.py
=====================================
@@ -108,18 +108,18 @@ def choose_executable_based_on_processor(list_of_executables: list):
stdout=subprocess.PIPE,
shell=True).communicate()[0].decode("utf-8")
flags = output.lower().split()
- if cpu_info:
- # Iterate through list to match with CPU features
- for executable in list_of_executables:
- if 'AVX2' in executable and 'avx2' in flags and which(executable):
- break
- elif 'AVX' in executable and 'avx' in flags and which(executable):
- break
- elif 'SSE3' in executable and 'sse3' in flags and which(executable):
- break
- else:
- # Final executable on list is generic
- executable = list_of_executables[-1]
+
+ # Iterate through list to match with CPU features
+ for executable in list_of_executables:
+ if cpu_info and 'AVX2' in executable and 'avx2' in flags and which(executable):
+ break
+ elif cpu_info and 'AVX' in executable and 'avx' in flags and which(executable):
+ break
+ elif cpu_info and 'SSE3' in executable and 'sse3' in flags and which(executable):
+ break
+ # to enable selection of raxml-ng-mpi
+ elif '-mpi' in executable and which(executable):
+ break
# Check executable is available
if which(executable):
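The rewritten loop also matches '-mpi' builds and copes with an empty
cpu_info: when no condition fires, a Python for loop leaves its variable
bound to the final item, so the last entry in the list acts as the generic
fallback. The selection logic in isolation, with hypothetical binary names
mirroring the RAxML convention:

    from shutil import which

    def choose_executable(candidates, flags):
        # candidates ordered from most to least specialised;
        # the final entry is the generic fallback
        for executable in candidates:
            if 'AVX2' in executable and 'avx2' in flags and which(executable):
                break
            elif 'AVX' in executable and 'avx' in flags and which(executable):
                break
            elif 'SSE3' in executable and 'sse3' in flags and which(executable):
                break
            elif '-mpi' in executable and which(executable):
                break
        return executable

    chosen = choose_executable(['raxmlHPC-AVX2', 'raxmlHPC-SSE3', 'raxmlHPC'],
                               ['avx2', 'sse3'])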
=====================================
python/scripts/extract_clade.py
=====================================
@@ -0,0 +1,115 @@
+# encoding: utf-8
+# Wellcome Trust Sanger Institute and Imperial College London
+# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+
+# Generic imports
+import sys
+import argparse
+import re
+# Phylogenetic imports
+import dendropy
+# Biopython imports
+from Bio import AlignIO
+from Bio import Phylo
+from Bio import SeqIO
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+
+# command line parsing
+def get_options():
+
+    parser = argparse.ArgumentParser(description='Extract a clade from the alignment, '
+                                                 'tree and recombinant regions identified by Gubbins',
+                                     prog='extract_clade')
+
+ # input options
+ parser.add_argument('--list',
+ help = 'List of sequences to extract',
+ required = True)
+ parser.add_argument('--aln',
+ help = 'Input alignment (FASTA format)',
+ required = True)
+ parser.add_argument('--gff',
+ help = 'GFF of recombinant regions detected by Gubbins',
+ required = True)
+ parser.add_argument('--tree',
+ help = 'Final tree generated by Gubbins',
+ required = True)
+ parser.add_argument('--out',
+ help = 'Output file prefix',
+ required = True)
+ parser.add_argument('--out-fmt',
+ help = 'Format of output alignment',
+ default = 'fasta')
+ parser.add_argument('--missing-char',
+ help = 'Character used to replace recombinant sequence',
+ default = '-')
+
+ return parser.parse_args()
+
+# main code
+if __name__ == "__main__":
+
+ # Get command line options
+ args = get_options()
+
+ # Parse list of input sequences
+ subset = set()
+    # Read in list of sequence names to extract
+ with open(args.list,'r') as seq_list:
+ for line in seq_list.readlines():
+ subset.add(line.strip().split()[0])
+
+ # Extract from alignment
+ output_aln_name = args.out + '.aln'
+ names_in_alignment = set()
+ with open(output_aln_name,'w') as out_aln:
+ alignment = AlignIO.read(args.aln,'fasta')
+ for taxon in alignment:
+ names_in_alignment.add(taxon.id)
+ if taxon.id in subset:
+ SeqIO.write(taxon, out_aln, args.out_fmt)
+
+ # Check subset sequences are found in alignment
+ not_found_in_dataset = subset - names_in_alignment
+ if len(not_found_in_dataset) > 0:
+ sys.stderr.write('Sequences in subset missing from alignment: ' + \
+ str(not_found_in_dataset) + '\n')
+ sys.exit(1)
+
+ # Prune from the tree
+ output_tree_name = args.out + '.tree'
+ tree = dendropy.Tree.get(path = args.tree,
+ schema = 'newick',
+ preserve_underscores = True)
+ tree.retain_taxa_with_labels(subset)
+ tree.write_to_path(output_tree_name,
+ 'newick')
+
+ # Identify relevant recombination blocks
+ output_gff_name = args.out + '.gff'
+ taxon_pattern = re.compile('taxa="([^"]*)"')
+ with open(args.gff,'r') as in_gff, open(output_gff_name,'w') as out_gff:
+ for line in in_gff.readlines():
+ if line.startswith('##'):
+ out_gff.write(line)
+ else:
+ info = line.rstrip().split('\t')
+ taxon_set = set(taxon_pattern.search(info[8]).group(1).split())
+ if not taxon_set.isdisjoint(subset):
+ out_gff.write(line)
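Note the set logic used for the GFF: a recombination block is copied into the
clade's GFF if its taxa merely overlap the subset (isdisjoint), whereas
mask_gubbins_aln.py below only applies a block when every taxon it names is
present in the alignment (issubset). A two-line illustration with made-up
taxon sets:

    block = {'seq1', 'seq2'}
    subset = {'seq2', 'seq3'}
    assert not block.isdisjoint(subset)  # overlap: extract_clade keeps the block
    assert not block.issubset(subset)    # not fully contained: issubset would skip it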
=====================================
python/scripts/generate_ska_alignment.py
=====================================
@@ -0,0 +1,152 @@
+# encoding: utf-8
+# Wellcome Trust Sanger Institute and Imperial College London
+# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+
+# Generic imports
+import sys
+import os
+import argparse
+import subprocess
+from multiprocessing import Pool
+from functools import partial
+from shutil import which
+
+# command line parsing
+def get_options():
+
+ parser = argparse.ArgumentParser(description='Generate a ska alignment from a list '
+ 'of assemblies',
+ prog='generate_ska_alignment')
+
+ # input options
+ parser.add_argument('--reference',
+ help = 'Name of reference sequence to use for alignment',
+ required = True)
+ parser.add_argument('--fasta',
+ help = 'List of FASTA files to include in alignment',
+ default = None,
+ required = False)
+ parser.add_argument('--fastq',
+ help = 'List of FASTQ files to include in alignment',
+ default = None,
+ required = False)
+ parser.add_argument('--out',
+ help = 'Name of output alignment',
+ required = True)
+ parser.add_argument('--k',
+ help = 'Split kmer size',
+ type = int,
+ default = 15)
+ parser.add_argument('--threads',
+ help = 'Number of threads to use',
+ type = int,
+ default = 1)
+ parser.add_argument('--no-cleanup',
+ help = 'Do not remove intermediate files',
+ action = 'store_true',
+ default = False)
+
+ return parser.parse_args()
+
+def map_fasta_sequence(seq, k = None, names = None):
+ subprocess.check_output('ska fasta -o ' + names[seq] + ' -k ' + str(k) + ' ' + seq,
+ shell = True)
+
+def map_fastq_sequence(read_pair, k = None, names = None):
+ subprocess.check_output('ska fastq -o ' + names[read_pair[0]] + ' -k ' + str(k) + ' ' + \
+ read_pair[0] + ' ' + read_pair[1],
+ shell = True)
+
+def ska_map_sequences(seq, k = None, ref = None):
+ subprocess.check_output('ska map -o ' + seq + '.map -k ' + str(k) + ' -r ' + ref + \
+ ' ' + seq + '.skf',
+ shell = True)
+
+# main code
+if __name__ == "__main__":
+
+ # Get command line options
+ args = get_options()
+
+ # Check if ska is installed
+ if which('ska') is None:
+ sys.stderr.write('SKA cannot be found on PATH; install with "conda install ska"')
+ sys.exit(1)
+
+ # Dictionary for sequence names
+ seq_names = {}
+ all_names = []
+
+ # Make split kmers from assemblies
+ fasta_names = []
+ if args.fasta is not None:
+ # Read in FASTA assemblies
+ with open(args.fasta,'r') as fasta_list:
+ for line in fasta_list.readlines():
+ info = line.strip().split()
+ if os.path.isfile(info[1]):
+ fasta_names.append(info[1])
+ seq_names[info[1]] = info[0]
+ all_names.append(info[0])
+ else:
+ sys.stderr.write('Unable to find file ' + info[1] + '\n')
+ # Sketch into split kmers
+ with Pool(processes = args.threads) as pool:
+ pool.map(partial(map_fasta_sequence,
+ k = args.k,
+ names = seq_names),
+ fasta_names)
+
+ # Make split kmers from FASTQs
+ fastq_names = []
+ if args.fastq is not None:
+ # Read in FASTQ reads
+ with open(args.fastq,'r') as fastq_list:
+ for line in fastq_list.readlines():
+ info = line.strip().split()
+            if os.path.isfile(info[1]) and os.path.isfile(info[2]):
+ fastq_names.append((info[1],info[2]))
+ seq_names[info[1]] = info[0]
+ all_names.append(info[0])
+ else:
+                sys.stderr.write('Unable to find files ' + info[1] + ' and ' + info[2] + '\n')
+ # Sketch into split kmers
+ with Pool(processes = args.threads) as pool:
+ return_codes = pool.map(partial(map_fastq_sequence,
+ k = args.k,
+ names = seq_names),
+ fastq_names)
+
+ # Map sequences
+ with Pool(processes = args.threads) as pool:
+ return_codes = pool.map(partial(ska_map_sequences,
+ k = args.k,
+ ref = args.reference),
+ all_names)
+
+ # Generate alignment
+ subprocess.check_output('cat ' + ' '.join([seq + '.map.aln' for seq in all_names]) + ' > ' + args.out,
+ shell = True)
+
+ # Clean up
+ if not args.no_cleanup:
+ subprocess.check_output('rm ' + ' '.join([seq + '.map.aln' for seq in all_names]),
+ shell = True)
+ subprocess.check_output('rm ' + ' '.join([seq + '.skf' for seq in all_names]),
+ shell = True)
+
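The --fasta and --fastq lists are plain whitespace-delimited text, one sample
per line: a sample name followed by one FASTA path, or by two FASTQ paths for
a read pair. A hypothetical example of each format:

    sample1  /data/sample1.fa
    sample2  /data/sample2.fa

    sample3  /data/sample3_1.fastq.gz  /data/sample3_2.fastq.gz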
=====================================
python/scripts/mask_gubbins_aln.py
=====================================
@@ -0,0 +1,87 @@
+# encoding: utf-8
+# Wellcome Trust Sanger Institute and Imperial College London
+# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+
+# Generic imports
+import sys
+import argparse
+import re
+# Biopython imports
+from Bio import AlignIO
+from Bio import Phylo
+from Bio import SeqIO
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+
+# command line parsing
+def get_options():
+
+ parser = argparse.ArgumentParser(description='Mask recombinant regions detected by'
+ ' Gubbins from the input alignment',
+ prog='mask_gubbins_aln')
+
+ # input options
+ parser.add_argument('--aln',
+ help = 'Input alignment (FASTA format)',
+ required = True)
+ parser.add_argument('--gff',
+ help = 'GFF of recombinant regions detected by Gubbins',
+ required = True)
+ parser.add_argument('--out',
+ help = 'Output file name',
+ required = True)
+ parser.add_argument('--out-fmt',
+ help = 'Format of output alignment',
+ default = 'fasta')
+ parser.add_argument('--missing-char',
+ help = 'Character used to replace recombinant sequence',
+ default = '-')
+
+ return parser.parse_args()
+
+# main code
+if __name__ == "__main__":
+
+ # Get command line options
+ args = get_options()
+
+ # Read in alignment
+ overall_taxon_list = []
+ alignment = AlignIO.read(args.aln,'fasta')
+ for taxon in alignment:
+ overall_taxon_list.append(taxon.id)
+ taxon.seq = taxon.seq.tomutable()
+ overall_taxon_set = set(overall_taxon_list)
+
+ # Read recombinant regions from GFF
+ taxon_pattern = re.compile('taxa="([^"]*)"')
+ with open(args.gff,'r') as gff_file:
+ for line in gff_file.readlines():
+ if not line.startswith('##'):
+ info = line.rstrip().split('\t')
+ start = int(info[3])-1
+ end = int(info[4])-1
+ taxon_set = set(taxon_pattern.search(info[8]).group(1).split())
+ if taxon_set.issubset(overall_taxon_set):
+ for taxon in alignment:
+ if taxon.id in taxon_set:
+ taxon.seq[start:end+1] = args.missing_char*(end-start+1)
+
+ # Write out masked sequence
+ with open(args.out,'w') as out_aln:
+ AlignIO.write(alignment, out_aln, args.out_fmt)
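The coordinate handling converts the GFF's 1-based inclusive start and end
(columns 4 and 5) into a 0-based Python slice: start = int(info[3]) - 1,
end = int(info[4]) - 1, and seq[start:end+1] then covers exactly the
annotated bases. A quick check with a toy sequence:

    from Bio.Seq import MutableSeq

    seq = MutableSeq("ACGTACGTAC")
    start, end = 3 - 1, 6 - 1                 # GFF says bases 3..6 inclusive
    seq[start:end + 1] = "-" * (end - start + 1)
    assert str(seq) == "AC----GTAC"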
View it on GitLab: https://salsa.debian.org/med-team/gubbins/-/commit/099d39347d036549525167aafc660bc8b80a1bb3