[med-svn] [Git][med-team/metaphlan][master] 5 commits: New upstream version 4.0.4
Andreas Tille (@tille)
gitlab at salsa.debian.org
Wed Feb 1 14:15:43 GMT 2023
Andreas Tille pushed to branch master at Debian Med / metaphlan2
Commits:
dd4058b3 by Andreas Tille at 2023-02-01T15:08:55+01:00
New upstream version 4.0.4
- - - - -
715de899 by Andreas Tille at 2023-02-01T15:08:55+01:00
routine-update: New upstream version
- - - - -
b14eeab5 by Andreas Tille at 2023-02-01T15:09:01+01:00
Update upstream source from tag 'upstream/4.0.4'
Update to upstream version '4.0.4'
with Debian dir 7f264b9a8641702704d93d5c13fae3a636ed6ee3
- - - - -
a7ea61f8 by Andreas Tille at 2023-02-01T15:09:01+01:00
routine-update: Standards-Version: 4.6.2
- - - - -
e386ea76 by Andreas Tille at 2023-02-01T15:11:16+01:00
routine-update: Ready to upload to unstable
- - - - -
20 changed files:
- CHANGELOG.md
- bioconda_recipe/meta.yaml
- debian/changelog
- debian/control
- metaphlan/__init__.py
- metaphlan/metaphlan.py
- metaphlan/strainphlan.py
- metaphlan/utils/add_metadata_tree.py
- metaphlan/utils/consensus_markers.py
- metaphlan/utils/database_controller.py
- metaphlan/utils/external_exec.py
- metaphlan/utils/extract_markers.py
- metaphlan/utils/parallelisation.py
- metaphlan/utils/phylophlan_controller.py
- metaphlan/utils/plot_tree_graphlan.py
- metaphlan/utils/sample2markers.py
- metaphlan/utils/sgb_to_gtdb_profile.py
- metaphlan/utils/strain_transmission.py
- metaphlan/utils/util_fun.py
- setup.py
Changes:
=====================================
CHANGELOG.md
=====================================
@@ -1,3 +1,12 @@
+## Version 4.0.4 (Jan 17nd, 2023)
+### Changed features
+* [MetaPhlAn] Download of the pre-computed Bowtie2 database is now the default option during installation
+* [StrainPhlAn] Improved StrainPhlAn's sample2makers.py script performance and speed
+### Fixes
+* [StrainPhlAn] Fixes error when using --abs_n_samples_threshold in the PhyloPhlAn call
+
+<br/>
+
## Version 4.0.3 (Oct 24nd, 2022)
### Changed features
* [MetaPhlAn] Removal of the NCBI taxID from the merged profiles produced by the `merge_metaphlan_profiles.py` script
=====================================
bioconda_recipe/meta.yaml
=====================================
@@ -1,5 +1,5 @@
{% set name = "metaphlan" %}
-{% set version = "4.0.3" %}
+{% set version = "4.0.4" %}
package:
name: {{ name }}
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+metaphlan (4.0.4-1) unstable; urgency=medium
+
+ * New upstream version
+ * Standards-Version: 4.6.2 (routine-update)
+
+ -- Andreas Tille <tille at debian.org> Wed, 01 Feb 2023 15:09:53 +0100
+
metaphlan (4.0.3-1) unstable; urgency=medium
* Fix watch file
=====================================
debian/control
=====================================
@@ -10,7 +10,7 @@ Build-Depends: debhelper-compat (= 13),
bowtie2,
python3-setuptools,
python3-biopython <!nocheck>
-Standards-Version: 4.6.1
+Standards-Version: 4.6.2
Vcs-Browser: https://salsa.debian.org/med-team/metaphlan
Vcs-Git: https://salsa.debian.org/med-team/metaphlan.git
Homepage: https://github.com/biobakery/MetaPhlAn
=====================================
metaphlan/__init__.py
=====================================
@@ -83,46 +83,26 @@ def download(url, download_file, force=False):
sys.stderr.write("\nWarning: Unable to download " + url + "\n")
else:
sys.stderr.write("\nFile {} already present!\n".format(download_file))
-
-def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_zenodo):
- """
- Download the url to the file and decompress into the folder
- """
-
- # Create the folder if it does not already exist
- if not os.path.isdir(folder):
- try:
- os.makedirs(folder)
- except EnvironmentError:
- sys.exit("ERROR: Unable to create folder for database install: " + folder)
-
- # Check the directory permissions
- if not os.access(folder, os.W_OK):
- sys.exit("ERROR: The directory is not writeable: " + folder + ". "
- "Please modify the permissions.")
-
- #local path of the tarfile and md5file
+
+
+def download_and_untar(download_file_name, folder, origin):
+ #local path of the tarfile and md5file
tar_file = os.path.join(folder, download_file_name + ".tar")
md5_file = os.path.join(folder, download_file_name + ".md5")
-
#Download the list of all the files in the FPT
- url_tar_file = "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/{}.tar".format(download_file_name)
- url_md5_file = "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/{}.md5".format(download_file_name)
-
+ url_tar_file = "{}/{}.tar".format(origin, download_file_name)
+ url_md5_file = "{}/{}.md5".format(origin, download_file_name)
# download tar and MD5 checksum
download(url_tar_file, tar_file)
download(url_md5_file, md5_file)
-
md5_md5 = None
md5_tar = None
-
if os.path.isfile(md5_file):
with open(md5_file) as f:
for row in f:
md5_md5 = row.strip().split(' ')[0]
else:
sys.stderr.write('File "{}" not found!\n'.format(md5_file))
-
# compute MD5 of .tar.bz2
if os.path.isfile(tar_file):
hash_md5 = hashlib.md5()
@@ -134,23 +114,44 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze
md5_tar = hash_md5.hexdigest()[:32]
else:
sys.stderr.write('File "{}" not found!\n'.format(tar_file))
-
if (md5_tar is None) or (md5_md5 is None):
sys.exit("MD5 checksums not found, something went wrong!")
-
# compare checksums
if md5_tar != md5_md5:
sys.exit("MD5 checksums do not correspond! If this happens again, you should remove the database files and "
"rerun MetaPhlAn so they are re-downloaded")
-
# untar
try:
tarfile_handle = tarfile.open(tar_file)
tarfile_handle.extractall(path=folder)
tarfile_handle.close()
+ os.remove(tar_file)
+ os.remove(md5_file)
except EnvironmentError:
sys.stderr.write("Warning: Unable to extract {}.\n".format(tar_file))
+def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_zenodo):
+ """
+ Download the url to the file and decompress into the folder
+ """
+
+ # Create the folder if it does not already exist
+ if not os.path.isdir(folder):
+ try:
+ os.makedirs(folder)
+ except EnvironmentError:
+ sys.exit("ERROR: Unable to create folder for database install: " + folder)
+
+ # Check the directory permissions
+ if not os.access(folder, os.W_OK):
+ sys.exit("ERROR: The directory is not writeable: " + folder + ". "
+ "Please modify the permissions.")
+
+ sys.stderr.write('\n\Downloading and uncompressing indexes\n')
+ download_and_untar("{}_bt2".format(download_file_name), folder, "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/bowtie2_indexes")
+ sys.stderr.write('\nDownloading and uncompressing additional files\n')
+ download_and_untar(download_file_name, folder, "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases")
+
# uncompress sequences
for bz2_file in iglob(os.path.join(folder, download_file_name + "_*.fna.bz2")):
fna_file = bz2_file[:-4]
@@ -162,37 +163,20 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze
bz2.BZ2File(bz2_file, 'rb') as bz2_h:
for data in iter(lambda: bz2_h.read(100 * 1024), b''):
fna_h.write(data)
- os.remove(bz2_file)
-
- # join fasta files
- sys.stderr.write('\n\nJoining FASTA databases\n')
- with open(os.path.join(folder, download_file_name + ".fna"), 'w') as fna_h:
- for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")):
- with open(fna_file, 'r') as fna_r:
- for line in fna_r:
- fna_h.write(line)
- fna_file = os.path.join(folder, download_file_name + ".fna")
+ os.remove(bz2_file)
# build bowtie2 indexes
if not glob(os.path.join(folder, download_file_name + "*.bt2l")):
- bt2_base = os.path.join(folder, download_file_name)
- bt2_cmd = [bowtie2_build, '--quiet']
-
- if nproc > 1:
- bt2_build_output = subp.check_output([bowtie2_build, '--usage'], stderr=subp.STDOUT)
-
- if 'threads' in str(bt2_build_output):
- bt2_cmd += ['--threads', str(nproc)]
-
- bt2_cmd += ['-f', fna_file, bt2_base]
-
- sys.stderr.write('\nBuilding Bowtie2 indexes\n')
-
+ build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc)
+ else:
try:
- subp.check_call(bt2_cmd)
+ subp.check_call(['bowtie2-inspect', '-n', os.path.join(folder, download_file_name)], stdout=subp.DEVNULL, stderr=subp.DEVNULL)
except Exception as e:
- sys.stderr.write("Fatal error running '{}'\nError message: '{}'\n\n".format(' '.join(bt2_cmd), e))
- sys.exit(1)
+ sys.stderr.write('Downloaded indexes are not compatible with the installed verion of Bowtie2\n')
+ sys.stderr.write('Building indexes from the FASTA files\n')
+ for btw_file in iglob(os.path.join(folder, download_file_name + "*.bt2l")):
+ os.remove(btw_file)
+ build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc)
try:
for bt2 in glob(os.path.join(folder, download_file_name + "*.bt2l")):
@@ -207,6 +191,35 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze
for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")):
if not fna_file.endswith('_VSG.fna'):
os.remove(fna_file)
+
+
+def build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc):
+ sys.stderr.write('\n\nJoining FASTA databases\n')
+ with open(os.path.join(folder, download_file_name + ".fna"), 'w') as fna_h:
+ for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")):
+ with open(fna_file, 'r') as fna_r:
+ for line in fna_r:
+ fna_h.write(line)
+ fna_file = os.path.join(folder, download_file_name + ".fna")
+
+ bt2_base = os.path.join(folder, download_file_name)
+ bt2_cmd = [bowtie2_build, '--quiet']
+
+ if nproc > 1:
+ bt2_build_output = subp.check_output([bowtie2_build, '--usage'], stderr=subp.STDOUT)
+
+ if 'threads' in str(bt2_build_output):
+ bt2_cmd += ['--threads', str(nproc)]
+
+ bt2_cmd += ['-f', fna_file, bt2_base]
+
+ sys.stderr.write('\nBuilding Bowtie2 indexes\n')
+
+ try:
+ subp.check_call(bt2_cmd)
+ except Exception as e:
+ sys.stderr.write("Fatal error running '{}'\nError message: '{}'\n\n".format(' '.join(bt2_cmd), e))
+ sys.exit(1)
def download_unpack_zip(url,download_file_name,folder,software_name):
"""
=====================================
metaphlan/metaphlan.py
=====================================
@@ -4,8 +4,8 @@ __author__ = ('Aitor Blanco-Miguez (aitor.blancomiguez at unitn.it), '
'Nicola Segata (nicola.segata at unitn.it), '
'Duy Tin Truong, '
'Francesco Asnicar (f.asnicar at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import sys
try:
=====================================
metaphlan/strainphlan.py
=====================================
@@ -4,8 +4,8 @@ __author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
'Francesco Asnicar (f.asnicar at unitn.it), '
'Moreno Zolfo (moreno.zolfo at unitn.it), '
'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import argparse as ap
=====================================
metaphlan/utils/add_metadata_tree.py
=====================================
@@ -1,8 +1,8 @@
#!/usr/bin/env python
__author__ = ('Duy Tin Truong (duytin.truong at unitn.it), '
'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import argparse as ap
import pandas
=====================================
metaphlan/utils/consensus_markers.py
=====================================
@@ -1,6 +1,6 @@
__author__ = 'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it)'
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import os
=====================================
metaphlan/utils/database_controller.py
=====================================
@@ -1,6 +1,6 @@
__author__ = 'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it)'
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import os
@@ -51,10 +51,11 @@ class MetaphlanDatabaseController:
clades (list): the list of clades
Returns:
- list: the list of markers from the input clades
+ set: the list of markers from the input clades
"""
self.load_database()
- return [marker for marker in self.database_pkl['markers'] if self.database_pkl['markers'][marker]['clade'] in clades]
+ return set((marker for marker in self.database_pkl['markers']
+ if self.database_pkl['markers'][marker]['clade'] in clades))
def get_species2sgbs(self):
"""Retrieve information from the MetaPhlAn database
=====================================
metaphlan/utils/external_exec.py
=====================================
@@ -3,8 +3,8 @@ __author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
'Francesco Asnicar (f.asnicar at unitn.it), '
'Moreno Zolfo (moreno.zolfo at unitn.it), '
'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import os
=====================================
metaphlan/utils/extract_markers.py
=====================================
@@ -1,8 +1,8 @@
#!/usr/bin/env python
__author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import os
=====================================
metaphlan/utils/parallelisation.py
=====================================
@@ -3,10 +3,12 @@ __author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
'Francesco Asnicar (f.asnicar at unitn.it), '
'Moreno Zolfo (moreno.zolfo at unitn.it), '
'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
+from typing import Iterable, Callable, Any
+
try:
from .util_fun import error
except ImportError:
@@ -31,7 +33,7 @@ def parallel_execution(arguments):
"""Executes each parallelised call of a function
Args:
- arguments ((str, list)): tuple with the function and its arguments
+ arguments ((Callable, *Any)): tuple with the function and its arguments
Returns:
function: the call to the function
@@ -42,15 +44,16 @@ def parallel_execution(arguments):
return function(*args)
except Exception as e:
terminating.set()
- raise
+ raise e
else:
terminating.set()
+
def execute_pool(args, nprocs):
"""Creates a pool for a parallelised function and returns the results of each execution as a list
Args:
- args ((str, list)): tuple with the function and its arguments
+ args (Iterable[tuple]): tuple with the function and its arguments
nprocs (int): number of procs to use
Returns:
=====================================
metaphlan/utils/phylophlan_controller.py
=====================================
@@ -1,6 +1,6 @@
__author__ = 'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it'
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import os
@@ -81,7 +81,7 @@ class Phylophlan3Controller(PhylophlanController):
info("\tDone.")
self.fake_phylophlan_inputs(samples_markers_dir)
info("\tProcessing samples...")
- min_entries = self.marker_in_n_samples if self.abs_n_markers_thres else self.marker_in_n_samples * num_samples // 100
+ min_entries = self.marker_in_n_samples if self.abs_n_samples_thres else self.marker_in_n_samples * num_samples // 100
execute_phylophlan(samples_markers_dir, self.phylophlan_configuration, min_entries, int(round(min_markers, 0)),
self.tmp_dir, self.output_dir, self.clade, self.phylophlan_mode, self.mutation_rates, self.nprocs)
if self.mutation_rates:
@@ -102,7 +102,7 @@ class Phylophlan3Controller(PhylophlanController):
self.samples = args.samples
self.secondary_samples = args.secondary_samples
self.marker_in_n_samples = args.marker_in_n_samples
- self.abs_n_markers_thres = args.abs_n_markers_thres
+ self.abs_n_samples_thres = args.abs_n_samples_thres
self.phylophlan_mode = args.phylophlan_mode
self.phylophlan_configuration = args.phylophlan_configuration
self.mutation_rates = args.mutation_rates
=====================================
metaphlan/utils/plot_tree_graphlan.py
=====================================
@@ -1,8 +1,8 @@
#!/usr/bin/env python
__author__ = ('Duy Tin Truong (duytin.truong at unitn.it), '
'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import argparse as ap
import dendropy
=====================================
metaphlan/utils/sample2markers.py
=====================================
@@ -1,11 +1,13 @@
#!/usr/bin/env python
-__author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
+__author__ = ('Michal Puncochar (michal.puncochar at unitn.it), '
+ 'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
'Duy Tin Truong (duytin.truong at unitn.it), '
'Francesco Asnicar (f.asnicar at unitn.it), '
'Moreno Zolfo (moreno.zolfo at unitn.it), '
'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
+
import os
@@ -15,8 +17,12 @@ import tempfile
import bz2
import pickletools
import argparse as ap
-from cmseq import cmseq
+from collections import Counter
from shutil import rmtree
+
+import pysam
+import scipy.stats as sps
+
try:
from .external_exec import samtools_sam_to_bam, samtools_sort_bam_v1, decompress_bz2
from .util_fun import info, error
@@ -34,6 +40,15 @@ except ImportError:
class SampleToMarkers:
"""SampleToMarkers class"""
+ class DEFAULTS:
+ pileup_stepper = 'nofilter'
+ poly_error_rate = 0.001
+ min_base_coverage = 1
+ min_base_quality = 30
+ min_mapping_quality = 10
+ poly_dominant_frq_thrsh = 0.8
+ poly_pvalue_threshold = 0.05
+
def decompress_from_bz2(self):
""" Decompressed SAM.BZ2 files
@@ -41,7 +56,7 @@ class SampleToMarkers:
(list, str): tuple with the list of decompressed files and their format
"""
results = execute_pool(
- ((self.decompress_bz2_file, i, self.tmp_dir) for i in self.input), self.nprocs)
+ ((SampleToMarkers.decompress_bz2_file, i, self.tmp_dir) for i in self.input), self.nprocs)
decompressed = [r[0] for r in results]
decompressed_format = [r[1] for r in results]
if decompressed_format[1:] == decompressed_format[:-1]:
@@ -54,6 +69,7 @@ class SampleToMarkers:
else:
error("Decompressed files have different formats", exit=True)
+ @staticmethod
def decompress_bz2_file(input_file, output_dir):
"""Decompress a BZ2 file and returns the decompressed file and the decompressed file format
@@ -82,12 +98,16 @@ class SampleToMarkers:
self.input = execute_pool(
((samtools_sort_bam_v1, i, self.tmp_dir) for i in self.input), self.nprocs)
info("\tDone.")
- elif self.sorted == False:
+ elif not self.sorted:
info("\tSorting BAM samples...")
self.input = execute_pool(
((samtools_sort_bam_v1, i, self.tmp_dir) for i in self.input), self.nprocs)
info("\tDone.")
+ info('\tIndexing BAM samples...')
+ execute_pool(((pysam.index, i) for i in self.input), self.nprocs)
+ info('\tDone.')
+
def build_consensus_markers(self, filtered):
"""Gets the markers for each sample and writes the Pickle files
@@ -96,24 +116,65 @@ class SampleToMarkers:
"""
for i in self.input:
info("\tProcessing sample: {}".format(i))
- results = self.execute_cmseq(i)
+ results = self.get_consensuses_for_sample(i)
self.write_results_as_pkl(filtered, os.path.splitext(
os.path.basename(i))[0], results)
info("\tDone.")
- def execute_cmseq(self, input_bam):
- """cmseq on the specified BAM file
+
+ def get_consensuses_for_sample(self, input_bam, stepper=DEFAULTS.pileup_stepper,
+ error_rate=DEFAULTS.poly_error_rate,
+ poly_pvalue_threshold=DEFAULTS.poly_pvalue_threshold):
+ """Pileup on the specified BAM file
Args:
input_bam (str): the path to the input BAM file
+ stepper (str): stepper to use in the pileup
+ error_rate (float): the sequencing error rate
+ poly_pvalue_threshold (float): the p-value threshold to call polymorphic position
Returns:
- list: the list with the reconstructed consensus markers
+ list[tuple[str, str]]: the list with the marker names and consensus sequences
"""
- collection = cmseq.BamFile(
- input_bam, index=True, minlen=self.min_read_len, minimumReadsAligning=self.min_reads_aligning)
- return collection.parallel_reference_free_consensus(ncores=self.nprocs, mincov=self.min_base_coverage, minqual=self.min_base_quality,
- consensus_rule=cmseq.BamContig.majority_rule_polymorphicLoci, dominant_frq_thrsh=self.dominant_frq_threshold)
+
+ info('\t\tLoading the bam file and extracting information...')
+ sam_file = pysam.AlignmentFile(input_bam)
+
+ all_markers = sam_file.references
+ marker_lengths = sam_file.lengths
+ marker_to_length = dict(zip(all_markers, marker_lengths))
+ info('\t\tDone.')
+
+ ACTGactg = set(list('ACTGactg'))
+
+ info('\t\tRunning the pileup...')
+ consensuses = {m: bytearray(b'-' * marker_to_length[m]) for m in all_markers}
+ for base_pileup in sam_file.pileup(contig=None, stepper=stepper, min_base_quality=self.min_base_quality):
+ marker = base_pileup.reference_name
+ pos = base_pileup.pos
+ bases = [b.upper() for b in base_pileup.get_query_sequences() if b in ACTGactg]
+ base_coverage = len(bases)
+ if base_coverage < self.min_base_coverage:
+ continue
+
+ base_frequencies = Counter(bases)
+ max_frequency = max(base_frequencies.values())
+ max_ratio = max_frequency / base_coverage
+ base_consensus = base_frequencies.most_common(1)[0][0]
+
+ if max_ratio < self.dominant_frq_threshold:
+ p_value = sps.binom.cdf(max_frequency, base_coverage, 1 - error_rate)
+ if p_value <= poly_pvalue_threshold:
+ base_consensus = '*' # mask out likely polymorphic positions
+
+ consensuses[marker][pos] = ord(base_consensus)
+
+ info('\t\tDone.')
+
+ consensuses = [(m, c.decode()) for m, c in consensuses.items()] # convert bytearrays to strings
+ return consensuses
+
+
def write_results_as_pkl(self, filtered, name, results):
"""Writes the consensus sequences as a PKL file
@@ -134,11 +195,15 @@ class SampleToMarkers:
pickle.dumps(consensus, pickle.HIGHEST_PROTOCOL)))
def parallel_filter_sam(self, input_file, filtered_markers):
- """Filters an input SAM file with the hits against markers of specific clades
+ """
+ Filters an input SAM file
+ * filters out viral markers (VDB)
+ * filters out hits with low mapQ (argument --min_mapping_quality)
+ * filters out markers with not enough mapped reads (argument --min_reads_aligning)
Args:
input_file (str): the input SAM file
- filtered_markers (list): the list with the markers of the filtered clades
+ filtered_markers (set): the list with the markers of the filtered clades, None if to use all markers
Returns:
str: the path to the output file
@@ -147,22 +212,46 @@ class SampleToMarkers:
if self.input_format.lower() == "bz2":
ifn = bz2.open(input_file, 'rt')
output_file = output_file.replace('.bz2', '')
- elif self.input_format.lower() == "sam":
+ else:
+ assert self.input_format.lower() == "sam"
ifn = open(input_file, 'r')
+
+ def filter_mapping_line(line_fields_, markers_subset):
+ marker_ = line_fields_[2]
+ if marker_.startswith('VDB'):
+ return False
+ if markers_subset is not None and marker_ not in markers_subset:
+ return False
+ mapq_ = int(line_fields_[4])
+ if mapq_ < self.min_mapping_quality:
+ return False
+ return True
+
+ marker_to_reads = Counter()
+ for line in ifn:
+ if line.startswith('@'):
+ continue
+ line_fields = line.rstrip('\n').split('\t')
+ if filter_mapping_line(line_fields, filtered_markers):
+ marker = line_fields[2]
+ marker_to_reads[marker] += 1
+
+ selected_markers = set((m for m, c in marker_to_reads.items() if c >= self.min_reads_aligning))
+
+ ifn.seek(0) # second pass of the file
with open(output_file, 'w') as ofn:
for line in ifn:
- s_line = line.strip().split('\t')
- if line.startswith('@'):
- if line.startswith('@HD'):
- ofn.write(line)
- elif filtered_markers is not None and s_line[1].split(':')[1] in filtered_markers:
+ line_fields = line.rstrip('\n').split('\t')
+ if line.startswith('@HD'):
+ ofn.write(line)
+ elif line.startswith('@'):
+ marker = line_fields[1].split(':')[1]
+ if marker in selected_markers:
ofn.write(line)
- elif filtered_markers is None and not s_line[1].split(':')[1].startswith('VDB'):
+ else:
+ if filter_mapping_line(line_fields, selected_markers):
ofn.write(line)
- elif filtered_markers is not None and s_line[2] in filtered_markers:
- ofn.write(line)
- elif filtered_markers is None and not s_line[2].startswith('VDB'):
- ofn.write(line)
+
ifn.close()
return output_file
@@ -193,7 +282,7 @@ class SampleToMarkers:
info("Done.")
if not self.debug:
info("Removing temporary files...")
- rmtree(self.tmp_dir, ignore_errors=False, onerror=None)
+ rmtree(self.tmp_dir)
info("Done.")
def __init__(self, args):
@@ -204,9 +293,9 @@ class SampleToMarkers:
self.database_controller = MetaphlanDatabaseController(args.database)
self.breadth_threshold = args.breadth_threshold
self.min_reads_aligning = args.min_reads_aligning
- self.min_read_len = args.min_read_len
self.min_base_coverage = args.min_base_coverage
self.min_base_quality = args.min_base_quality
+ self.min_mapping_quality = args.min_mapping_quality
self.dominant_frq_threshold = args.dominant_frq_threshold
self.clades = args.clades
self.tmp_dir = args.output_dir if args.tmp is None else args.tmp
@@ -237,20 +326,21 @@ def read_params():
help="The breadth of coverage threshold for the consensus markers")
p.add_argument('--min_reads_aligning', type=int, default=8,
help="The minimum number of reads to cover a marker")
- p.add_argument('--min_read_len', type=int, default=cmseq.CMSEQ_DEFAULTS.minlen,
- help="The minimum lenght for a read to be considered")
- p.add_argument('--min_base_coverage', type=int, default=cmseq.CMSEQ_DEFAULTS.mincov,
+ p.add_argument('--min_base_coverage', type=int, default=SampleToMarkers.DEFAULTS.min_base_coverage,
help="The minimum depth of coverage for a base to be considered")
- p.add_argument('--min_base_quality', type=int, default=cmseq.CMSEQ_DEFAULTS.minqual,
+ p.add_argument('--min_base_quality', type=int, default=SampleToMarkers.DEFAULTS.min_base_quality,
help="The minimum quality for a base to be considered. This is performed BEFORE --min_base_coverage")
- p.add_argument('--dominant_frq_threshold', type=float, default=cmseq.CMSEQ_DEFAULTS.poly_dominant_frq_thrsh,
+ p.add_argument('--min_mapping_quality', type=int, default=SampleToMarkers.DEFAULTS.min_mapping_quality,
+ help="The minimum quality for a mapping of the read to be considered.")
+ p.add_argument('--dominant_frq_threshold', type=float, default=SampleToMarkers.DEFAULTS.poly_dominant_frq_thrsh,
help="The cutoff for degree of 'allele dominance' for a position to be considered polymorphic")
p.add_argument('--clades', type=str, nargs='+', default=[],
help="Restricts the reconstruction of the markers to the specified clades")
p.add_argument('--tmp', type=str, default=None,
help="If specified, the directory where to store the temporal files")
p.add_argument('--debug', action='store_true', default=False,
- help="If specified, StrainPhlAn will not remove the temporal folder. Not available with inputs in BAM format")
+ help="If specified, StrainPhlAn will not remove the temporal folder. "
+ "Not available with inputs in BAM format")
p.add_argument('-n', '--nprocs', type=int, default=1,
help="The number of threads to execute the script")
return p.parse_args()
@@ -268,7 +358,7 @@ def check_params(args):
error('-f (or --input_format) must be specified', exit=True)
elif not args.output_dir:
error('-o (or --output_dir) must be specified', exit=True)
- elif args.input_format.lower() != "bam" and args.input_format.lower() != "sam" and args.input_format.lower() != "bz2":
+ elif args.input_format.lower() not in ['bam', 'sam', 'bz2']:
error('The input format must be SAM, BAM, or compressed in BZ2 format', exit=True)
elif args.input_format.lower() == "bam" and len(args.clades) > 0:
error('The --clades option cannot be used with inputs in BAM format', exit=True)
=====================================
metaphlan/utils/sgb_to_gtdb_profile.py
=====================================
@@ -1,7 +1,7 @@
#!/usr/bin/env python
__author__ = 'Aitor Blanco (aitor.blancomiguez at unitn.it'
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import os
import time
=====================================
metaphlan/utils/strain_transmission.py
=====================================
@@ -1,8 +1,8 @@
#!/usr/bin/env python
__author__ = ('Aitor Blanco (aitor.blancomiguez at unitn.it), '
'Mireia Valles-Colomer (mireia.vallescolomer at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import os
=====================================
metaphlan/utils/util_fun.py
=====================================
@@ -3,8 +3,8 @@ __author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
'Francesco Asnicar (f.asnicar at unitn.it), '
'Moreno Zolfo (moreno.zolfo at unitn.it), '
'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
import os
=====================================
setup.py
=====================================
@@ -14,7 +14,7 @@ if sys.version_info[0] < 3:
setuptools.setup(
name='MetaPhlAn',
- version='4.0.3',
+ version='4.0.4',
author='Aitor Blanco-Miguez',
author_email='aitor.blancomiguez at unitn.it',
url='http://github.com/biobakery/MetaPhlAn/',
View it on GitLab: https://salsa.debian.org/med-team/metaphlan/-/compare/11c6ede615d4d893fa4b7873ca94d08ebe9c756a...e386ea765e93f574bf43b45044555b5d9f4a45b2
--
View it on GitLab: https://salsa.debian.org/med-team/metaphlan/-/compare/11c6ede615d4d893fa4b7873ca94d08ebe9c756a...e386ea765e93f574bf43b45044555b5d9f4a45b2
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230201/647f6d6f/attachment-0001.htm>
More information about the debian-med-commit
mailing list