[med-svn] [Git][med-team/gubbins][upstream] New upstream version 3.1.0
Andreas Tille (@tille)
gitlab at salsa.debian.org
Mon Oct 11 10:03:41 BST 2021
Andreas Tille pushed to branch upstream at Debian Med / gubbins
Commits:
099d3934 by Andreas Tille at 2021-10-11T10:59:32+02:00
New upstream version 3.1.0
- - - - -
10 changed files:
- VERSION
- python/gubbins/ValidateFastaAlignment.py
- python/gubbins/common.py
- python/gubbins/pyjar.py
- python/gubbins/run_gubbins.py
- python/gubbins/treebuilders.py
- python/gubbins/utils.py
- + python/scripts/extract_clade.py
- + python/scripts/generate_ska_alignment.py
- + python/scripts/mask_gubbins_aln.py
Changes:
=====================================
VERSION
=====================================
@@ -1 +1 @@
-3.0.0
+3.1.0
=====================================
python/gubbins/ValidateFastaAlignment.py
=====================================
@@ -6,6 +6,7 @@ from collections import Counter
from Bio.Align import MultipleSeqAlignment
class ValidateFastaAlignment(object):
+
def __init__(self, input_filename):
self.input_filename = input_filename
@@ -22,9 +23,8 @@ class ValidateFastaAlignment(object):
return False
except:
return False
-
return True
-
+
def does_each_sequence_have_a_name_and_genomic_data(self):
with open(self.input_filename, "r") as input_handle:
alignments = AlignIO.parse(input_handle, "fasta")
@@ -33,21 +33,19 @@ class ValidateFastaAlignment(object):
for record in alignment:
number_of_sequences +=1
if record.name is None or record.name == "":
- print("Error with the input FASTA file: One of the sequence names is blank")
+ sys.stderr.write("Error with the input FASTA file: " + record.name + " is blank\n")
return False
if record.seq is None or record.seq == "":
- print("Error with the input FASTA file: One of the sequences is empty")
+ sys.stderr.write("Error with the input FASTA file: " + record.name + " is empty\n")
return False
if re.search('[^ACGTNacgtn-]', str(record.seq)) != None:
- print("Error with the input FASTA file: One of the sequences contains odd characters, only ACGTNacgtn- are permitted")
+ sys.stderr.write("Error with the input FASTA file: " + record.name + " contains disallowed characters, only ACGTNacgtn- are permitted\n")
return False
- input_handle.close()
return True
-
+
def does_each_sequence_have_the_same_length(self):
try:
with open(self.input_filename) as input_handle:
-
alignments = AlignIO.parse(input_handle, "fasta")
sequence_length = -1
for alignment in alignments:
@@ -63,17 +61,25 @@ class ValidateFastaAlignment(object):
print("Error with the input FASTA file: It is in the wrong format so check its an alignment")
return False
return True
-
+
def are_sequence_names_unique(self):
- with open(self.input_filename) as input_handle:
- alignments = AlignIO.parse(input_handle, "fasta")
- sequence_names = []
- for alignment in alignments:
+ any_modified_names = False
+ with open(self.input_filename) as input_handle:
+ alignment = AlignIO.read(input_handle, "fasta")
+ sequence_names = []
for record in alignment:
+ # Remove disallowed characters
+                if '#' in record.name or ':' in record.name:
+ record.name = record.name.replace("#","_").replace(":","_")
+ record.id = record.id.replace("#", "_").replace(":", "_")
+ record.description = record.description.replace("#", "_").replace(":", "_")
+ any_modified_names = True
+ # Store modified names
sequence_names.append(record.name)
-
if [k for k,v in list(Counter(sequence_names).items()) if v>1] != []:
return False
- input_handle.close()
- return True
+        # Update alignment if any names changed
+        if any_modified_names:
+            with open(self.input_filename, "w") as output_handle:
+                AlignIO.write(alignment, output_handle, "fasta")
+        return True
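The reworked are_sequence_names_unique() sanitises '#' and ':' out of sequence
names (both characters are reserved in Newick tree syntax) before checking for
duplicates, rewriting the alignment in place when any name changes. A minimal
standalone sketch of the same logic, assuming Biopython and a hypothetical
input.aln:

    from collections import Counter
    from Bio import AlignIO

    def sanitise_and_check_names(filename):
        with open(filename) as handle:
            alignment = AlignIO.read(handle, "fasta")
        modified = False
        names = []
        for record in alignment:
            if '#' in record.name or ':' in record.name:
                # '#' and ':' clash with Newick syntax downstream
                record.name = record.name.replace('#', '_').replace(':', '_')
                record.id = record.id.replace('#', '_').replace(':', '_')
                modified = True
            names.append(record.name)
        if [k for k, v in Counter(names).items() if v > 1]:
            return False  # duplicate names remain fatal
        if modified:
            with open(filename, "w") as out:
                AlignIO.write(alignment, out, "fasta")
        return True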
=====================================
python/gubbins/common.py
=====================================
@@ -256,8 +256,9 @@ def parse_and_run(input_args, program_description=""):
# 3.5a. Joint ancestral reconstruction with new tree and info file in each iteration
info_filename = model_fitter.get_info_filename(temp_working_dir,current_basename)
recontree_filename = model_fitter.get_recontree_filename(temp_working_dir,current_basename)
- # Arbitrary rooting of tree
- reroot_tree_at_midpoint(recontree_filename)
+ # Set root of reconstruction tree to match that of the current tree
+ # Cannot just midpoint root both, because the branch lengths differ between them
+ harmonise_roots(recontree_filename, temp_rooted_tree)
printer.print(["\nRunning joint ancestral reconstruction with pyjar"])
jar(alignment = polymorphism_alignment, # complete polymorphism alignment
@@ -271,8 +272,12 @@ def parse_and_run(input_args, program_description=""):
verbose = input_args.verbose)
gaps_alignment_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".joint.aln"
raw_internal_rooted_tree_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".joint.tre"
- transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_rooted_tree,
- current_tree_name_with_internal_nodes, "pyjar")
+ printer.print(["\nTransferring pyjar results onto original recombination-corrected tree"])
+ transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename,
+ temp_rooted_tree,
+ current_tree_name_with_internal_nodes,
+ "pyjar")
+ printer.print(["\nDone transfer"])
else:
@@ -309,10 +314,12 @@ def parse_and_run(input_args, program_description=""):
current_tree_name_with_internal_nodes, sequence_reconstructor)
elif input_args.seq_recon == "iqtree" or input_args.seq_recon == "raxmlng":
# IQtree returns an unrooted tree
- temp_unrooted_tree = temp_working_dir + "/" + current_tree_name + ".unrooted"
- unroot_tree(temp_rooted_tree, temp_unrooted_tree)
- transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_unrooted_tree,
- current_tree_name_with_internal_nodes, sequence_reconstructor)
+ harmonise_roots(raw_internal_rooted_tree_filename, temp_rooted_tree)
+ transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename,
+ temp_rooted_tree,
+ current_tree_name_with_internal_nodes,
+ sequence_reconstructor,
+ use_root = False)
else:
sys.stderr.write("Unrecognised sequence reconstruction command: " + input_args.seq_recon + '\n')
sys.exit()
@@ -699,7 +706,6 @@ def reroot_tree_with_outgroup(tree_name, outgroups):
with open(tree_name, 'w+') as output_file:
output_file.write(output_tree_string.replace('\'', ''))
-
def reroot_tree_at_midpoint(tree_name):
tree = dendropy.Tree.get_from_path(tree_name, 'newick', preserve_underscores=True)
split_all_non_bi_nodes(tree.seed_node)
@@ -718,6 +724,50 @@ def unroot_tree(input_filename, output_filename):
with open(output_filename, 'w+') as output_file:
output_file.write(output_tree_string.replace('\'', ''))
+def harmonise_roots(new_tree_fn, tree_for_root_fn):
+ # Read in tree and get nodes adjacent to root
+ taxa = dendropy.TaxonNamespace()
+ tree_for_root = dendropy.Tree.get_from_path(tree_for_root_fn,
+ 'newick',
+ preserve_underscores=True,
+ taxon_namespace=taxa,
+ rooting="force-rooted")
+ tree_for_root.encode_bipartitions()
+ root = tree_for_root.seed_node
+ root_adjacent_bipartitions = []
+ for root_adjacent_node in root.child_nodes():
+ root_adjacent_bipartitions.append(root_adjacent_node.edge.bipartition)
+ # Now search the new tree for these nodes
+ new_tree = dendropy.Tree.get_from_path(new_tree_fn,
+ 'newick',
+ preserve_underscores=True,
+ taxon_namespace=taxa,
+ rooting="force-rooted")
+ new_tree.encode_bipartitions()
+ for node in new_tree.preorder_node_iter():
+ if node.edge.bipartition in root_adjacent_bipartitions:
+ half_branch_length = node.edge_length/2
+ new_tree.reroot_at_edge(node.edge,
+ length1 = half_branch_length,
+ length2 = half_branch_length)
+ break
+ new_tree_string = tree_as_string(new_tree,
+ suppress_internal=False,
+ suppress_rooting=False)
+
+ # Check both trees are topologically identical
+ missing_bipartitions = dendropy.calculate.treecompare.find_missing_bipartitions(tree_for_root,
+ new_tree,
+ is_bipartitions_updated=False)
+
+ if len(missing_bipartitions) > 0:
+        sys.stderr.write('Bipartitions missing during node label transfer: ' + str([str(x) for x in missing_bipartitions]) + '\n')
+ sys.exit(1)
+
+ # Write output
+ with open(new_tree_fn, 'w+') as output_file:
+ output_file.write(new_tree_string.replace('\'', ''))
+
def filter_out_removed_taxa_from_tree(input_filename, output_filename, taxa_removed):
tree = dendropy.Tree.get_from_path(input_filename, 'newick', preserve_underscores=True)
tree.prune_taxa_with_labels(taxa_removed, update_bipartitions=True)
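The motivation for harmonise_roots() above: the model-fitting step returns a
tree whose branch lengths differ from the current tree, so midpoint-rooting
each independently can place the two roots on different edges. Instead, the
new tree is rerooted on the edge whose bipartition matches a child of the
reference root. A minimal sketch of that step with dendropy, assuming two
hypothetical Newick files sharing a taxon set:

    import dendropy

    taxa = dendropy.TaxonNamespace()
    # ref.tre supplies the root position, new.tre the branch lengths
    ref = dendropy.Tree.get(path="ref.tre", schema="newick",
                            taxon_namespace=taxa, rooting="force-rooted")
    new = dendropy.Tree.get(path="new.tre", schema="newick",
                            taxon_namespace=taxa, rooting="force-rooted")
    ref.encode_bipartitions()
    new.encode_bipartitions()
    # bipartitions of the edges immediately below the reference root
    root_biparts = {c.edge.bipartition for c in ref.seed_node.child_nodes()}
    for node in new.preorder_node_iter():
        if node.edge.bipartition in root_biparts:
            half = node.edge_length / 2
            # place the new root halfway along the matching edge
            new.reroot_at_edge(node.edge, length1=half, length2=half)
            break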
@@ -791,32 +841,70 @@ def get_monophyletic_outgroup(tree_name, outgroups):
def transfer_internal_node_labels_to_tree(source_tree_filename, destination_tree_filename, output_tree_filename,
- sequence_reconstructor):
-
+ sequence_reconstructor, use_root = True):
# read source tree and extract node labels, to match with the ancestral sequence reconstruction
- source_tree = dendropy.Tree.get_from_path(source_tree_filename, 'newick', preserve_underscores=True)
- source_internal_node_labels = []
+ taxa = dendropy.TaxonNamespace()
+ source_tree = dendropy.Tree.get_from_path(source_tree_filename,
+ 'newick',
+ preserve_underscores=True,
+ taxon_namespace=taxa,
+ rooting='force-rooted')
+ source_tree.encode_bipartitions()
+ source_internal_node_dict = {}
for source_internal_node in source_tree.internal_nodes():
+ node_bipartition = source_internal_node.edge.bipartition
if source_internal_node.label:
- source_internal_node_labels.append(source_internal_node.label)
+ source_internal_node_dict[node_bipartition] = source_internal_node.label
else:
- source_internal_node_labels.append('')
-
+ source_internal_node_dict[node_bipartition] = ''
# read original tree and add in the labels from the ancestral sequence reconstruction
- destination_tree = dendropy.Tree.get_from_path(destination_tree_filename, 'newick', preserve_underscores=True)
- for index, destination_internal_node in enumerate(destination_tree.internal_nodes()):
- if sequence_reconstructor == 'pyjar':
- new_label = str(source_internal_node_labels[index])
- else:
- new_label = sequence_reconstructor.replace_internal_node_label(str(source_internal_node_labels[index]))
- destination_internal_node.label = None
- destination_internal_node.taxon = dendropy.Taxon(new_label)
+ destination_tree = dendropy.Tree.get_from_path(destination_tree_filename,
+ 'newick',
+ preserve_underscores=True,
+ taxon_namespace=taxa,
+ rooting='force-rooted')
+ destination_tree.encode_bipartitions()
+
+ # Check both trees are topologically identical
+ missing_bipartitions = dendropy.calculate.treecompare.find_missing_bipartitions(source_tree,
+ destination_tree,
+ is_bipartitions_updated=False)
+ if len(missing_bipartitions) > 0:
+        sys.stderr.write('Bipartitions missing during node label transfer: ' + str([str(x) for x in missing_bipartitions]) + '\n')
+ sys.exit(1)
+
+ root_alternative = ''
+ for destination_internal_node in destination_tree.internal_nodes():
+ if destination_internal_node != destination_tree.seed_node or use_root:
+ node_bipartition = destination_internal_node.edge.bipartition
+ if sequence_reconstructor == 'pyjar':
+ try:
+ new_label = source_internal_node_dict[node_bipartition]
+                except KeyError:
+ sys.stderr.write('Unable to find bipartition ' + str(node_bipartition) + '\n')
+ sys.exit(1)
+ else:
+ new_label = sequence_reconstructor.replace_internal_node_label(str(source_internal_node_dict[node_bipartition]))
+ destination_internal_node.label = None
+ destination_internal_node.taxon = dendropy.Taxon(new_label)
# output final tree
- output_tree_string = tree_as_string(destination_tree, suppress_internal=False, suppress_rooting=False)
- with open(output_tree_filename, 'w+') as output_file:
- output_file.write(output_tree_string.replace('\'', ''))
+ if not use_root:
+ alt_root_name = ''
+ for root_child_node in destination_tree.seed_node.child_nodes():
+ alt_root_name = root_child_node.taxon.label
+ break
+ destination_tree.seed_node.label = None
+ destination_tree.seed_node.taxon = dendropy.Taxon(alt_root_name)
+ destination_tree_string = destination_tree.as_string(schema="newick",
+ suppress_rooting=True,
+ unquoted_underscores=True,
+ suppress_internal_node_labels=True).replace("'","").replace('\'', '')
+ with open(output_tree_filename, 'w+') as output_file:
+ print(destination_tree_string,
+ file=output_file,
+ end='')
def remove_internal_node_labels_from_tree(input_filename, output_filename):
tree = dendropy.Tree.get_from_path(input_filename, 'newick', preserve_underscores=True)
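The key change to transfer_internal_node_labels_to_tree() is that labels are
now matched by bipartition rather than by the enumeration order of
internal_nodes(), which is not guaranteed to agree between two independently
parsed trees. Sketched in isolation, with hypothetical labelled.tre and
unlabelled.tre sharing a taxon namespace:

    import dendropy

    taxa = dendropy.TaxonNamespace()
    source = dendropy.Tree.get(path="labelled.tre", schema="newick",
                               taxon_namespace=taxa, rooting="force-rooted")
    dest = dendropy.Tree.get(path="unlabelled.tre", schema="newick",
                             taxon_namespace=taxa, rooting="force-rooted")
    source.encode_bipartitions()
    dest.encode_bipartitions()
    # map each internal edge's bipartition to its ancestral label
    labels = {n.edge.bipartition: (n.label or '')
              for n in source.internal_nodes()}
    for node in dest.internal_nodes():
        # a KeyError here means the trees are not topologically identical
        node.label = labels[node.edge.bipartition]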
=====================================
python/gubbins/pyjar.py
=====================================
@@ -20,7 +20,7 @@ try:
NumpyShared = collections.namedtuple('NumpyShared', ('name', 'shape', 'dtype'))
except ImportError as e:
sys.stderr.write("This version of Gubbins requires the multiprocessing library and python v3.8 or higher for memory management\n")
- sys.exit(0)
+ sys.exit(201)
from gubbins.utils import generate_shared_mem_array
@@ -31,31 +31,38 @@ from gubbins.utils import generate_shared_mem_array
def read_alignment(filename, file_type, verbose=False):
if not os.path.isfile(filename):
print("Error: alignment file " + filename + " does not exist")
- sys.exit()
+ sys.exit(202)
if verbose:
print("Trying to open file " + filename + " as " + file_type)
try:
- alignmentObject = AlignIO.read(open(filename), file_type)
+ with open(filename,'r') as aln_in:
+ alignmentObject = AlignIO.read(aln_in, file_type)
if verbose:
print("Alignment read successfully")
except:
print("Cannot open alignment file " + filename + " as " + file_type)
- sys.exit()
+ sys.exit(203)
return alignmentObject
#Calculate Pij from Q matrix and branch length
def calculate_pij(branch_length,rate_matrix):
if branch_length==0:
- return numpy.array([[1, 0, 0, 0,], [0, 1, 0, 0,], [0, 0, 1, 0,], [0, 0, 0, 1,]])
+ return numpy.array([[0.0, float("-inf"), float("-inf"), float("-inf"),],
+ [float("-inf"), 0.0, float("-inf"), float("-inf"),],
+ [float("-inf"), float("-inf"), 0.0, float("-inf"),],
+ [float("-inf"), float("-inf"), float("-inf"), 0.0,]])
else:
- return numpy.log(linalg.expm(numpy.multiply(branch_length,rate_matrix)))
+        return numpy.log(linalg.expm(numpy.multiply(branch_length, rate_matrix)))  # log-transformed transition probabilities
#Read the tree file and root
def read_tree(treefile):
if not os.path.isfile(treefile):
print("Error: tree file does not exist")
- sys.exit()
- t=dendropy.Tree.get(path=treefile, schema="newick", preserve_underscores=True, rooting="force-rooted")
+ sys.exit(204)
+ t=dendropy.Tree.get(path=treefile,
+ schema="newick",
+ preserve_underscores=True,
+ rooting="force-rooted")
return t
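calculate_pij() now returns log-transformed transition probabilities
throughout: log P = log(expm(t * Q)), with the zero-length branch case
returning the log of the identity matrix (log 1 = 0 on the diagonal,
log 0 = -inf elsewhere) so that downstream sums of log-likelihoods stay
valid. A small numerical check, assuming scipy and a toy symmetric rate
matrix rather than the fitted GTR model:

    import numpy
    from scipy import linalg

    # Jukes-Cantor-like rate matrix: rows sum to zero
    Q = numpy.full((4, 4), 1.0 / 3.0)
    numpy.fill_diagonal(Q, -1.0)

    def calculate_pij(branch_length, rate_matrix):
        if branch_length == 0:
            # log of the identity: 0 on the diagonal, -inf off it
            return numpy.where(numpy.eye(4, dtype=bool), 0.0, float("-inf"))
        return numpy.log(linalg.expm(branch_length * rate_matrix))

    log_p = calculate_pij(0.1, Q)
    # each row exponentiates back to a probability distribution
    assert numpy.allclose(numpy.exp(log_p).sum(axis=1), 1.0)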
# Read the RAxML info file to get rates and frequencies
@@ -63,11 +70,11 @@ def read_info(infofile, type = 'raxml'):
if not os.path.isfile(infofile):
print("Error: model information file " + infofile + " does not exist")
- sys.exit()
+ sys.exit(205)
if type not in ['raxml', 'raxmlng', 'iqtree','fasttree']:
sys.stderr.write('Only able to parse GTR-type models from raxml, iqtree or fasttree')
- sys.exit()
+ sys.exit(206)
r=[-1.0] * 6 # initialise rates
f=[-1.0] * 4 # initialise frequencies
@@ -146,7 +153,7 @@ def read_info(infofile, type = 'raxml'):
# Check frequencies and rates have been extracted correctly
if -1.0 in f or -1.0 in r:
sys.stderr.write('Problem with extracting model parameters - frequencies are ' + str(f) + ' and rates are ' + str(r))
- sys.exit()
+ sys.exit(207)
return f, r
@@ -239,7 +246,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
# Iterate over columns
for column,base_pattern_columns_padded in zip(columns,column_positions):
-
+
### TIMING
if verbose:
calc_time_start = time.process_time()
@@ -285,6 +292,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
#1b. Set for each amino acid i: Ly(i) = Pij(ty), where ty is the branch length between y and its father.
node.L={"A": pij[base_matrix["A"]][base_matrix[base[taxon]]], "C": pij[base_matrix["C"]][base_matrix[base[taxon]]], "G": pij[base_matrix["G"]][base_matrix[base[taxon]]], "T": pij[base_matrix["T"]][base_matrix[base[taxon]]]}
+
else:
node.C={"A": "A", "C": "C", "G": "G", "T": "T"}
@@ -292,7 +300,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
except KeyError:
print("Cannot find", taxon, "in base")
- sys.exit()
+ sys.exit(208)
else:
node.L={}
@@ -327,7 +335,6 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
c+=child.L[end]
for start in columnbases:
j=log(base_frequencies[base_matrix[end]])+c
-
if j>node.L[start]:
node.L[start]=j
node.C[start]=end
@@ -339,7 +346,7 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
max_root_base_likelihood=node.L[root_base]
max_root_base=node.C[root_base]
node.r=max_root_base
-
+
#Traverse the tree from the root in the direction of the OTUs, assigning to each node its most likely ancestral character as follows:
for node in tree.preorder_node_iter():
@@ -354,8 +361,6 @@ def reconstruct_alignment_column(column_indices, tree = None, alignment_sequence
rootlens=[]
for child in tree.seed_node.child_node_iter():
rootlens.append([child.edge_length,child,child.r])
- rootlens.sort(key = lambda x: x[0])
- tree.seed_node.r=rootlens[-1][1].r
### TIMING
if verbose:
@@ -480,7 +485,7 @@ def jar(alignment = None, base_patterns = None, base_pattern_positions = None, t
node.taxon=tree.taxon_namespace.get_taxon(nodename)
if nodename in alignment_sequence_names:
print(nodename, "already in alignment. Quitting")
- sys.exit()
+ sys.exit(209)
ancestral_node_names.append(nodename) # index for reconstruction
if node.parent_node != None:
node.pij=calculate_pij(node.edge_length, rm)
@@ -553,10 +558,17 @@ def jar(alignment = None, base_patterns = None, base_pattern_positions = None, t
continue
# Print tree
+ from gubbins.common import tree_as_string
+
if verbose:
print("Printing tree with internal nodes labelled: ", output_prefix+".joint.tre")
with open(output_prefix+".joint.tre", "w") as tree_output:
- print(tree.as_string(schema="newick", suppress_rooting=True, unquoted_underscores=True, suppress_internal_node_labels=True).replace("'",""), file=tree_output)
+
+ recon_tree = tree_as_string(tree,
+ suppress_rooting=True,
+ suppress_internal=False)
+ print(recon_tree.replace('\'', ''),
+ file = tree_output)
if verbose:
print("Done")
=====================================
python/gubbins/run_gubbins.py
=====================================
@@ -19,6 +19,7 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
+import sys
import argparse
from gubbins.__init__ import version
import gubbins.common
=====================================
python/gubbins/treebuilders.py
=====================================
@@ -629,8 +629,7 @@ class RAxMLNG:
self.citation = "https://doi.org/10.1093/bioinformatics/btz305"
# Set parallelisation
- if self.threads > 1:
- command.extend(["--threads", str(self.threads)])
+ command.extend(["--threads", str(self.threads)])
# Add model
command.extend(["-model"])
@@ -706,6 +705,7 @@ class RAxMLNG:
def convert_raw_ancestral_states_to_fasta(self, input_filename, output_filename):
"""Converts the file containing ancestral sequences into FASTA format"""
+ # Use the IUPAC codes from here: https://www.bioinformatics.org/sms/iupac.html
with open(input_filename, 'r') as infile:
with open(output_filename, 'w+') as outfile:
for sequence_line in infile:
@@ -714,7 +714,13 @@ class RAxMLNG:
'R': 'N',
'Y': 'N',
'S': 'N',
- 'W': 'N'
+ 'W': 'N',
+ 'K': 'N',
+ 'M': 'N',
+ 'B': 'N',
+ 'D': 'N',
+ 'H': 'N',
+ 'V': 'N'
}
)
)
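The expanded mapping covers all ten IUPAC nucleotide ambiguity codes other
than N itself, collapsing each to N so later stages only ever see ACGTN and
gaps. A compact equivalent using str.maketrans, sketched here rather than
taken from the codebase:

    # every IUPAC ambiguity code collapses to N
    AMBIGUITY_TO_N = str.maketrans({c: 'N' for c in 'RYSWKMBDHV'})

    def clean_ancestral_sequence(sequence_line):
        return sequence_line.translate(AMBIGUITY_TO_N)

    assert clean_ancestral_sequence("ACGT-" + "RYSWKMBDHV") == "ACGT-" + "N" * 10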
=====================================
python/gubbins/utils.py
=====================================
@@ -108,18 +108,18 @@ def choose_executable_based_on_processor(list_of_executables: list):
stdout=subprocess.PIPE,
shell=True).communicate()[0].decode("utf-8")
flags = output.lower().split()
- if cpu_info:
- # Iterate through list to match with CPU features
- for executable in list_of_executables:
- if 'AVX2' in executable and 'avx2' in flags and which(executable):
- break
- elif 'AVX' in executable and 'avx' in flags and which(executable):
- break
- elif 'SSE3' in executable and 'sse3' in flags and which(executable):
- break
- else:
- # Final executable on list is generic
- executable = list_of_executables[-1]
+
+ # Iterate through list to match with CPU features
+ for executable in list_of_executables:
+ if cpu_info and 'AVX2' in executable and 'avx2' in flags and which(executable):
+ break
+ elif cpu_info and 'AVX' in executable and 'avx' in flags and which(executable):
+ break
+ elif cpu_info and 'SSE3' in executable and 'sse3' in flags and which(executable):
+ break
+ # to enable selection of raxml-ng-mpi
+ elif '-mpi' in executable and which(executable):
+ break
# Check executable is available
if which(executable):
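The rewritten loop also matches '-mpi' builds and copes with an empty
cpu_info: when no condition fires, a Python for loop leaves its variable
bound to the final item, so the last entry in the list acts as the generic
fallback. The selection logic in isolation, with hypothetical binary names
mirroring the RAxML convention:

    from shutil import which

    def choose_executable(candidates, flags):
        # candidates ordered from most to least specialised;
        # the final entry is the generic fallback
        for executable in candidates:
            if 'AVX2' in executable and 'avx2' in flags and which(executable):
                break
            elif 'AVX' in executable and 'avx' in flags and which(executable):
                break
            elif 'SSE3' in executable and 'sse3' in flags and which(executable):
                break
            elif '-mpi' in executable and which(executable):
                break
        return executable

    chosen = choose_executable(['raxmlHPC-AVX2', 'raxmlHPC-SSE3', 'raxmlHPC'],
                               ['avx2', 'sse3'])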
=====================================
python/scripts/extract_clade.py
=====================================
@@ -0,0 +1,115 @@
+# encoding: utf-8
+# Wellcome Trust Sanger Institute and Imperial College London
+# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+
+# Generic imports
+import sys
+import argparse
+import re
+# Phylogenetic imports
+import dendropy
+# Biopython imports
+from Bio import AlignIO
+from Bio import Phylo
+from Bio import SeqIO
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+
+# command line parsing
+def get_options():
+
+    parser = argparse.ArgumentParser(description='Extract a clade from the alignment, '
+                                                 'tree and recombinant regions identified by Gubbins',
+                                     prog='extract_clade')
+
+ # input options
+ parser.add_argument('--list',
+ help = 'List of sequences to extract',
+ required = True)
+ parser.add_argument('--aln',
+ help = 'Input alignment (FASTA format)',
+ required = True)
+ parser.add_argument('--gff',
+ help = 'GFF of recombinant regions detected by Gubbins',
+ required = True)
+ parser.add_argument('--tree',
+ help = 'Final tree generated by Gubbins',
+ required = True)
+ parser.add_argument('--out',
+ help = 'Output file prefix',
+ required = True)
+ parser.add_argument('--out-fmt',
+ help = 'Format of output alignment',
+ default = 'fasta')
+ parser.add_argument('--missing-char',
+ help = 'Character used to replace recombinant sequence',
+ default = '-')
+
+ return parser.parse_args()
+
+# main code
+if __name__ == "__main__":
+
+ # Get command line options
+ args = get_options()
+
+ # Parse list of input sequences
+ subset = set()
+    # Read in list of sequence names to extract
+ with open(args.list,'r') as seq_list:
+ for line in seq_list.readlines():
+ subset.add(line.strip().split()[0])
+
+ # Extract from alignment
+ output_aln_name = args.out + '.aln'
+ names_in_alignment = set()
+ with open(output_aln_name,'w') as out_aln:
+ alignment = AlignIO.read(args.aln,'fasta')
+ for taxon in alignment:
+ names_in_alignment.add(taxon.id)
+ if taxon.id in subset:
+ SeqIO.write(taxon, out_aln, args.out_fmt)
+
+ # Check subset sequences are found in alignment
+ not_found_in_dataset = subset - names_in_alignment
+ if len(not_found_in_dataset) > 0:
+ sys.stderr.write('Sequences in subset missing from alignment: ' + \
+ str(not_found_in_dataset) + '\n')
+ sys.exit(1)
+
+ # Prune from the tree
+ output_tree_name = args.out + '.tree'
+ tree = dendropy.Tree.get(path = args.tree,
+ schema = 'newick',
+ preserve_underscores = True)
+ tree.retain_taxa_with_labels(subset)
+ tree.write_to_path(output_tree_name,
+ 'newick')
+
+ # Identify relevant recombination blocks
+ output_gff_name = args.out + '.gff'
+ taxon_pattern = re.compile('taxa="([^"]*)"')
+ with open(args.gff,'r') as in_gff, open(output_gff_name,'w') as out_gff:
+ for line in in_gff.readlines():
+ if line.startswith('##'):
+ out_gff.write(line)
+ else:
+ info = line.rstrip().split('\t')
+ taxon_set = set(taxon_pattern.search(info[8]).group(1).split())
+ if not taxon_set.isdisjoint(subset):
+ out_gff.write(line)
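Note the set logic used for the GFF: a recombination block is copied into the
clade's GFF if its taxa merely overlap the subset (isdisjoint), whereas
mask_gubbins_aln.py below only applies a block when every taxon it names is
present in the alignment (issubset). A two-line illustration with made-up
taxon sets:

    block = {'seq1', 'seq2'}
    subset = {'seq2', 'seq3'}
    assert not block.isdisjoint(subset)  # overlap: extract_clade keeps the block
    assert not block.issubset(subset)    # not fully contained: issubset would skip it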
=====================================
python/scripts/generate_ska_alignment.py
=====================================
@@ -0,0 +1,152 @@
+# encoding: utf-8
+# Wellcome Trust Sanger Institute and Imperial College London
+# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+
+# Generic imports
+import sys
+import os
+import argparse
+import subprocess
+from multiprocessing import Pool
+from functools import partial
+from shutil import which
+
+# command line parsing
+def get_options():
+
+ parser = argparse.ArgumentParser(description='Generate a ska alignment from a list '
+ 'of assemblies',
+ prog='generate_ska_alignment')
+
+ # input options
+ parser.add_argument('--reference',
+ help = 'Name of reference sequence to use for alignment',
+ required = True)
+ parser.add_argument('--fasta',
+ help = 'List of FASTA files to include in alignment',
+ default = None,
+ required = False)
+ parser.add_argument('--fastq',
+ help = 'List of FASTQ files to include in alignment',
+ default = None,
+ required = False)
+ parser.add_argument('--out',
+ help = 'Name of output alignment',
+ required = True)
+ parser.add_argument('--k',
+ help = 'Split kmer size',
+ type = int,
+ default = 15)
+ parser.add_argument('--threads',
+ help = 'Number of threads to use',
+ type = int,
+ default = 1)
+ parser.add_argument('--no-cleanup',
+ help = 'Do not remove intermediate files',
+ action = 'store_true',
+ default = False)
+
+ return parser.parse_args()
+
+def map_fasta_sequence(seq, k = None, names = None):
+ subprocess.check_output('ska fasta -o ' + names[seq] + ' -k ' + str(k) + ' ' + seq,
+ shell = True)
+
+def map_fastq_sequence(read_pair, k = None, names = None):
+ subprocess.check_output('ska fastq -o ' + names[read_pair[0]] + ' -k ' + str(k) + ' ' + \
+ read_pair[0] + ' ' + read_pair[1],
+ shell = True)
+
+def ska_map_sequences(seq, k = None, ref = None):
+ subprocess.check_output('ska map -o ' + seq + '.map -k ' + str(k) + ' -r ' + ref + \
+ ' ' + seq + '.skf',
+ shell = True)
+
+# main code
+if __name__ == "__main__":
+
+ # Get command line options
+ args = get_options()
+
+ # Check if ska is installed
+ if which('ska') is None:
+ sys.stderr.write('SKA cannot be found on PATH; install with "conda install ska"')
+ sys.exit(1)
+
+ # Dictionary for sequence names
+ seq_names = {}
+ all_names = []
+
+ # Make split kmers from assemblies
+ fasta_names = []
+ if args.fasta is not None:
+ # Read in FASTA assemblies
+ with open(args.fasta,'r') as fasta_list:
+ for line in fasta_list.readlines():
+ info = line.strip().split()
+ if os.path.isfile(info[1]):
+ fasta_names.append(info[1])
+ seq_names[info[1]] = info[0]
+ all_names.append(info[0])
+ else:
+ sys.stderr.write('Unable to find file ' + info[1] + '\n')
+ # Sketch into split kmers
+ with Pool(processes = args.threads) as pool:
+ pool.map(partial(map_fasta_sequence,
+ k = args.k,
+ names = seq_names),
+ fasta_names)
+
+ # Make split kmers from FASTQs
+ fastq_names = []
+ if args.fastq is not None:
+ # Read in FASTQ reads
+ with open(args.fastq,'r') as fastq_list:
+ for line in fastq_list.readlines():
+ info = line.strip().split()
+            if os.path.isfile(info[1]) and os.path.isfile(info[2]):
+ fastq_names.append((info[1],info[2]))
+ seq_names[info[1]] = info[0]
+ all_names.append(info[0])
+ else:
+                sys.stderr.write('Unable to find files ' + info[1] + ' and ' + info[2] + '\n')
+ # Sketch into split kmers
+ with Pool(processes = args.threads) as pool:
+ return_codes = pool.map(partial(map_fastq_sequence,
+ k = args.k,
+ names = seq_names),
+ fastq_names)
+
+ # Map sequences
+ with Pool(processes = args.threads) as pool:
+ return_codes = pool.map(partial(ska_map_sequences,
+ k = args.k,
+ ref = args.reference),
+ all_names)
+
+ # Generate alignment
+ subprocess.check_output('cat ' + ' '.join([seq + '.map.aln' for seq in all_names]) + ' > ' + args.out,
+ shell = True)
+
+ # Clean up
+ if not args.no_cleanup:
+ subprocess.check_output('rm ' + ' '.join([seq + '.map.aln' for seq in all_names]),
+ shell = True)
+ subprocess.check_output('rm ' + ' '.join([seq + '.skf' for seq in all_names]),
+ shell = True)
+
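The --fasta and --fastq lists are plain whitespace-delimited text, one sample
per line: a sample name followed by one FASTA path, or by two FASTQ paths for
a read pair. A hypothetical example of each format:

    sample1  /data/sample1.fa
    sample2  /data/sample2.fa

    sample3  /data/sample3_1.fastq.gz  /data/sample3_2.fastq.gz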
=====================================
python/scripts/mask_gubbins_aln.py
=====================================
@@ -0,0 +1,87 @@
+# encoding: utf-8
+# Wellcome Trust Sanger Institute and Imperial College London
+# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+
+# Generic imports
+import sys
+import argparse
+import re
+# Biopython imports
+from Bio import AlignIO
+from Bio import Phylo
+from Bio import SeqIO
+from Bio.Align import MultipleSeqAlignment
+from Bio.Seq import Seq
+
+# command line parsing
+def get_options():
+
+ parser = argparse.ArgumentParser(description='Mask recombinant regions detected by'
+ ' Gubbins from the input alignment',
+ prog='mask_gubbins_aln')
+
+ # input options
+ parser.add_argument('--aln',
+ help = 'Input alignment (FASTA format)',
+ required = True)
+ parser.add_argument('--gff',
+ help = 'GFF of recombinant regions detected by Gubbins',
+ required = True)
+ parser.add_argument('--out',
+ help = 'Output file name',
+ required = True)
+ parser.add_argument('--out-fmt',
+ help = 'Format of output alignment',
+ default = 'fasta')
+ parser.add_argument('--missing-char',
+ help = 'Character used to replace recombinant sequence',
+ default = '-')
+
+ return parser.parse_args()
+
+# main code
+if __name__ == "__main__":
+
+ # Get command line options
+ args = get_options()
+
+ # Read in alignment
+ overall_taxon_list = []
+ alignment = AlignIO.read(args.aln,'fasta')
+ for taxon in alignment:
+ overall_taxon_list.append(taxon.id)
+ taxon.seq = taxon.seq.tomutable()
+ overall_taxon_set = set(overall_taxon_list)
+
+ # Read recombinant regions from GFF
+ taxon_pattern = re.compile('taxa="([^"]*)"')
+ with open(args.gff,'r') as gff_file:
+ for line in gff_file.readlines():
+ if not line.startswith('##'):
+ info = line.rstrip().split('\t')
+ start = int(info[3])-1
+ end = int(info[4])-1
+ taxon_set = set(taxon_pattern.search(info[8]).group(1).split())
+ if taxon_set.issubset(overall_taxon_set):
+ for taxon in alignment:
+ if taxon.id in taxon_set:
+ taxon.seq[start:end+1] = args.missing_char*(end-start+1)
+
+ # Write out masked sequence
+ with open(args.out,'w') as out_aln:
+ AlignIO.write(alignment, out_aln, args.out_fmt)
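The coordinate handling converts the GFF's 1-based inclusive start and end
(columns 4 and 5) into a 0-based Python slice: start = int(info[3]) - 1,
end = int(info[4]) - 1, and seq[start:end+1] then covers exactly the
annotated bases. A quick check with a toy sequence:

    from Bio.Seq import MutableSeq

    seq = MutableSeq("ACGTACGTAC")
    start, end = 3 - 1, 6 - 1                 # GFF says bases 3..6 inclusive
    seq[start:end + 1] = "-" * (end - start + 1)
    assert str(seq) == "AC----GTAC"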
View it on GitLab: https://salsa.debian.org/med-team/gubbins/-/commit/099d39347d036549525167aafc660bc8b80a1bb3