[med-svn] [Git][med-team/metaphlan][master] 5 commits: New upstream version 4.0.4

Andreas Tille (@tille) gitlab at salsa.debian.org
Wed Feb 1 14:15:43 GMT 2023



Andreas Tille pushed to branch master at Debian Med / metaphlan2


Commits:
dd4058b3 by Andreas Tille at 2023-02-01T15:08:55+01:00
New upstream version 4.0.4
- - - - -
715de899 by Andreas Tille at 2023-02-01T15:08:55+01:00
routine-update: New upstream version

- - - - -
b14eeab5 by Andreas Tille at 2023-02-01T15:09:01+01:00
Update upstream source from tag 'upstream/4.0.4'

Update to upstream version '4.0.4'
with Debian dir 7f264b9a8641702704d93d5c13fae3a636ed6ee3
- - - - -
a7ea61f8 by Andreas Tille at 2023-02-01T15:09:01+01:00
routine-update: Standards-Version: 4.6.2

- - - - -
e386ea76 by Andreas Tille at 2023-02-01T15:11:16+01:00
routine-update: Ready to upload to unstable

- - - - -


20 changed files:

- CHANGELOG.md
- bioconda_recipe/meta.yaml
- debian/changelog
- debian/control
- metaphlan/__init__.py
- metaphlan/metaphlan.py
- metaphlan/strainphlan.py
- metaphlan/utils/add_metadata_tree.py
- metaphlan/utils/consensus_markers.py
- metaphlan/utils/database_controller.py
- metaphlan/utils/external_exec.py
- metaphlan/utils/extract_markers.py
- metaphlan/utils/parallelisation.py
- metaphlan/utils/phylophlan_controller.py
- metaphlan/utils/plot_tree_graphlan.py
- metaphlan/utils/sample2markers.py
- metaphlan/utils/sgb_to_gtdb_profile.py
- metaphlan/utils/strain_transmission.py
- metaphlan/utils/util_fun.py
- setup.py


Changes:

=====================================
CHANGELOG.md
=====================================
@@ -1,3 +1,12 @@
+## Version 4.0.4 (Jan 17nd, 2023)
+### Changed features
+* [MetaPhlAn] Download of the pre-computed Bowtie2 database is now the default option during installation
+* [StrainPhlAn] Improved StrainPhlAn's sample2makers.py script performance and speed
+### Fixes
+* [StrainPhlAn] Fixes error when using --abs_n_samples_threshold in the PhyloPhlAn call
+
+<br/>
+
 ## Version 4.0.3 (Oct 24nd, 2022)
 ### Changed features
 * [MetaPhlAn] Removal of the NCBI taxID from the merged profiles produced by the `merge_metaphlan_profiles.py` script


=====================================
bioconda_recipe/meta.yaml
=====================================
@@ -1,5 +1,5 @@
 {% set name = "metaphlan" %}
-{% set version = "4.0.3" %}
+{% set version = "4.0.4" %}
 
 package:
   name: {{ name }}


=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+metaphlan (4.0.4-1) unstable; urgency=medium
+
+  * New upstream version
+  * Standards-Version: 4.6.2 (routine-update)
+
+ -- Andreas Tille <tille at debian.org>  Wed, 01 Feb 2023 15:09:53 +0100
+
 metaphlan (4.0.3-1) unstable; urgency=medium
 
   * Fix watch file


=====================================
debian/control
=====================================
@@ -10,7 +10,7 @@ Build-Depends: debhelper-compat (= 13),
                bowtie2,
                python3-setuptools,
                python3-biopython <!nocheck>
-Standards-Version: 4.6.1
+Standards-Version: 4.6.2
 Vcs-Browser: https://salsa.debian.org/med-team/metaphlan
 Vcs-Git: https://salsa.debian.org/med-team/metaphlan.git
 Homepage: https://github.com/biobakery/MetaPhlAn


=====================================
metaphlan/__init__.py
=====================================
@@ -83,46 +83,26 @@ def download(url, download_file, force=False):
             sys.stderr.write("\nWarning: Unable to download " + url + "\n")
     else:
         sys.stderr.write("\nFile {} already present!\n".format(download_file))
-
-def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_zenodo):
-    """
-    Download the url to the file and decompress into the folder
-    """
-
-    # Create the folder if it does not already exist
-    if not os.path.isdir(folder):
-        try:
-            os.makedirs(folder)
-        except EnvironmentError:
-            sys.exit("ERROR: Unable to create folder for database install: " + folder)
-
-    # Check the directory permissions
-    if not os.access(folder, os.W_OK):
-        sys.exit("ERROR: The directory is not writeable: " + folder + ". "
-                 "Please modify the permissions.")
-
-    #local path of the tarfile and md5file
+        
+        
+def download_and_untar(download_file_name, folder, origin):
+        #local path of the tarfile and md5file
     tar_file = os.path.join(folder, download_file_name + ".tar")
     md5_file = os.path.join(folder, download_file_name + ".md5")
-
     #Download the list of all the files in the FPT    
-    url_tar_file = "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/{}.tar".format(download_file_name)
-    url_md5_file = "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/{}.md5".format(download_file_name)
-
+    url_tar_file = "{}/{}.tar".format(origin, download_file_name)
+    url_md5_file = "{}/{}.md5".format(origin, download_file_name)
     # download tar and MD5 checksum
     download(url_tar_file, tar_file)
     download(url_md5_file, md5_file)
-
     md5_md5 = None
     md5_tar = None
-
     if os.path.isfile(md5_file):
         with open(md5_file) as f:
             for row in f:
                 md5_md5 = row.strip().split(' ')[0]
     else:
         sys.stderr.write('File "{}" not found!\n'.format(md5_file))
-
     # compute MD5 of .tar.bz2
     if os.path.isfile(tar_file):
         hash_md5 = hashlib.md5()
@@ -134,23 +114,44 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze
         md5_tar = hash_md5.hexdigest()[:32]
     else:
         sys.stderr.write('File "{}" not found!\n'.format(tar_file))
-
     if (md5_tar is None) or (md5_md5 is None):
         sys.exit("MD5 checksums not found, something went wrong!")
-
     # compare checksums
     if md5_tar != md5_md5:
         sys.exit("MD5 checksums do not correspond! If this happens again, you should remove the database files and "
                  "rerun MetaPhlAn so they are re-downloaded")
-
     # untar
     try:
         tarfile_handle = tarfile.open(tar_file)
         tarfile_handle.extractall(path=folder)
         tarfile_handle.close()
+        os.remove(tar_file)
+        os.remove(md5_file)
     except EnvironmentError:
         sys.stderr.write("Warning: Unable to extract {}.\n".format(tar_file))
 
+def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_zenodo):
+    """
+    Download the url to the file and decompress into the folder
+    """
+
+    # Create the folder if it does not already exist
+    if not os.path.isdir(folder):
+        try:
+            os.makedirs(folder)
+        except EnvironmentError:
+            sys.exit("ERROR: Unable to create folder for database install: " + folder)
+
+    # Check the directory permissions
+    if not os.access(folder, os.W_OK):
+        sys.exit("ERROR: The directory is not writeable: " + folder + ". "
+                 "Please modify the permissions.")
+        
+    sys.stderr.write('\n\Downloading and uncompressing indexes\n')
+    download_and_untar("{}_bt2".format(download_file_name), folder, "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/bowtie2_indexes")
+    sys.stderr.write('\nDownloading and uncompressing additional files\n')
+    download_and_untar(download_file_name, folder, "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases")
+
     # uncompress sequences
     for bz2_file in iglob(os.path.join(folder, download_file_name + "_*.fna.bz2")):
         fna_file = bz2_file[:-4]
@@ -162,37 +163,20 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze
                 bz2.BZ2File(bz2_file, 'rb') as bz2_h:
                 for data in iter(lambda: bz2_h.read(100 * 1024), b''):
                     fna_h.write(data)
-        os.remove(bz2_file)
-
-    # join fasta files
-    sys.stderr.write('\n\nJoining FASTA databases\n')
-    with open(os.path.join(folder, download_file_name + ".fna"), 'w') as fna_h:
-        for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")):
-            with open(fna_file, 'r') as fna_r:
-                for line in fna_r:
-                    fna_h.write(line)
-    fna_file = os.path.join(folder, download_file_name + ".fna")
+        os.remove(bz2_file)  
 
     # build bowtie2 indexes
     if not glob(os.path.join(folder, download_file_name + "*.bt2l")):
-        bt2_base = os.path.join(folder, download_file_name)
-        bt2_cmd = [bowtie2_build, '--quiet']
-
-        if nproc > 1:
-            bt2_build_output = subp.check_output([bowtie2_build, '--usage'], stderr=subp.STDOUT)
-
-            if 'threads' in str(bt2_build_output):
-                bt2_cmd += ['--threads', str(nproc)]
-
-        bt2_cmd += ['-f', fna_file, bt2_base]
-
-        sys.stderr.write('\nBuilding Bowtie2 indexes\n')
-
+        build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc)
+    else:
         try:
-            subp.check_call(bt2_cmd)
+            subp.check_call(['bowtie2-inspect', '-n', os.path.join(folder, download_file_name)], stdout=subp.DEVNULL, stderr=subp.DEVNULL)
         except Exception as e:
-            sys.stderr.write("Fatal error running '{}'\nError message: '{}'\n\n".format(' '.join(bt2_cmd), e))
-            sys.exit(1)
+            sys.stderr.write('Downloaded indexes are not compatible with the installed verion of Bowtie2\n')
+            sys.stderr.write('Building indexes from the FASTA files\n')
+            for btw_file in iglob(os.path.join(folder, download_file_name + "*.bt2l")):
+                os.remove(btw_file)
+            build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc)
 
     try:
         for bt2 in glob(os.path.join(folder, download_file_name + "*.bt2l")):
@@ -207,6 +191,35 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze
     for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")):
         if not fna_file.endswith('_VSG.fna'):
             os.remove(fna_file)
+            
+   
+def build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc):
+    sys.stderr.write('\n\nJoining FASTA databases\n')
+    with open(os.path.join(folder, download_file_name + ".fna"), 'w') as fna_h:
+        for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")):
+            with open(fna_file, 'r') as fna_r:
+                for line in fna_r:
+                    fna_h.write(line)
+    fna_file = os.path.join(folder, download_file_name + ".fna")
+    
+    bt2_base = os.path.join(folder, download_file_name)
+    bt2_cmd = [bowtie2_build, '--quiet']
+
+    if nproc > 1:
+        bt2_build_output = subp.check_output([bowtie2_build, '--usage'], stderr=subp.STDOUT)
+
+        if 'threads' in str(bt2_build_output):
+            bt2_cmd += ['--threads', str(nproc)]
+
+    bt2_cmd += ['-f', fna_file, bt2_base]
+
+    sys.stderr.write('\nBuilding Bowtie2 indexes\n')
+
+    try:
+        subp.check_call(bt2_cmd)
+    except Exception as e:
+        sys.stderr.write("Fatal error running '{}'\nError message: '{}'\n\n".format(' '.join(bt2_cmd), e))
+        sys.exit(1)
     
 def download_unpack_zip(url,download_file_name,folder,software_name):
     """


=====================================
metaphlan/metaphlan.py
=====================================
@@ -4,8 +4,8 @@ __author__ = ('Aitor Blanco-Miguez (aitor.blancomiguez at unitn.it), '
               'Nicola Segata (nicola.segata at unitn.it), '
               'Duy Tin Truong, '
               'Francesco Asnicar (f.asnicar at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 import sys
 try:


=====================================
metaphlan/strainphlan.py
=====================================
@@ -4,8 +4,8 @@ __author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
               'Francesco Asnicar (f.asnicar at unitn.it), '
               'Moreno Zolfo (moreno.zolfo at unitn.it), '
               'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 
 import argparse as ap


=====================================
metaphlan/utils/add_metadata_tree.py
=====================================
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 __author__ = ('Duy Tin Truong (duytin.truong at unitn.it), '
               'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it)')
-__version__ = '4.0.3'
-__date__    = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__    = '17 Jan 2023'
 
 import argparse as ap
 import pandas


=====================================
metaphlan/utils/consensus_markers.py
=====================================
@@ -1,6 +1,6 @@
 __author__ = 'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it)'
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 
 import os


=====================================
metaphlan/utils/database_controller.py
=====================================
@@ -1,6 +1,6 @@
 __author__ = 'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it)'
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 
 import os
@@ -51,10 +51,11 @@ class MetaphlanDatabaseController:
             clades (list): the list of clades
 
         Returns:
-            list: the list of markers from the input clades
+            set: the list of markers from the input clades
         """
         self.load_database()
-        return [marker for marker in self.database_pkl['markers'] if self.database_pkl['markers'][marker]['clade'] in clades]
+        return set((marker for marker in self.database_pkl['markers']
+                    if self.database_pkl['markers'][marker]['clade'] in clades))
 
     def get_species2sgbs(self):
         """Retrieve information from the MetaPhlAn database


=====================================
metaphlan/utils/external_exec.py
=====================================
@@ -3,8 +3,8 @@ __author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
               'Francesco Asnicar (f.asnicar at unitn.it), '
               'Moreno Zolfo (moreno.zolfo at unitn.it), '
               'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 
 import os


=====================================
metaphlan/utils/extract_markers.py
=====================================
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 __author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
               'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 
 import os


=====================================
metaphlan/utils/parallelisation.py
=====================================
@@ -3,10 +3,12 @@ __author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
               'Francesco Asnicar (f.asnicar at unitn.it), '
               'Moreno Zolfo (moreno.zolfo at unitn.it), '
               'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 
+from typing import Iterable, Callable, Any
+
 try:
     from .util_fun import error
 except ImportError:
@@ -31,7 +33,7 @@ def parallel_execution(arguments):
     """Executes each parallelised call of a function
 
     Args:
-        arguments ((str, list)): tuple with the function and its arguments
+        arguments ((Callable, *Any)): tuple with the function and its arguments
 
     Returns:
         function: the call to the function
@@ -42,15 +44,16 @@ def parallel_execution(arguments):
             return function(*args)
         except Exception as e:
             terminating.set()
-            raise
+            raise e
     else:
         terminating.set()
 
+
 def execute_pool(args, nprocs):
     """Creates a pool for a parallelised function and returns the results of each execution as a list
 
     Args:
-        args ((str, list)): tuple with the function and its arguments
+        args (Iterable[tuple]): tuple with the function and its arguments
         nprocs (int): number of procs to use
 
     Returns:


=====================================
metaphlan/utils/phylophlan_controller.py
=====================================
@@ -1,6 +1,6 @@
 __author__ = 'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it'
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 
 import os
@@ -81,7 +81,7 @@ class Phylophlan3Controller(PhylophlanController):
             info("\tDone.")
         self.fake_phylophlan_inputs(samples_markers_dir)
         info("\tProcessing samples...")
-        min_entries = self.marker_in_n_samples if self.abs_n_markers_thres else self.marker_in_n_samples * num_samples // 100
+        min_entries = self.marker_in_n_samples if self.abs_n_samples_thres else self.marker_in_n_samples * num_samples // 100
         execute_phylophlan(samples_markers_dir, self.phylophlan_configuration, min_entries, int(round(min_markers, 0)),
                            self.tmp_dir, self.output_dir, self.clade, self.phylophlan_mode, self.mutation_rates, self.nprocs)
         if self.mutation_rates:
@@ -102,7 +102,7 @@ class Phylophlan3Controller(PhylophlanController):
         self.samples = args.samples
         self.secondary_samples = args.secondary_samples
         self.marker_in_n_samples = args.marker_in_n_samples
-        self.abs_n_markers_thres = args.abs_n_markers_thres
+        self.abs_n_samples_thres = args.abs_n_samples_thres
         self.phylophlan_mode = args.phylophlan_mode
         self.phylophlan_configuration = args.phylophlan_configuration
         self.mutation_rates = args.mutation_rates


=====================================
metaphlan/utils/plot_tree_graphlan.py
=====================================
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 __author__ = ('Duy Tin Truong (duytin.truong at unitn.it), '
               'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it)')
-__version__ = '4.0.3'
-__date__    = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__    = '17 Jan 2023'
 
 import argparse as ap
 import dendropy


=====================================
metaphlan/utils/sample2markers.py
=====================================
@@ -1,11 +1,13 @@
 #!/usr/bin/env python
-__author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
+__author__ = ('Michal Puncochar (michal.puncochar at unitn.it), '
+              'Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
               'Duy Tin Truong (duytin.truong at unitn.it), '
               'Francesco Asnicar (f.asnicar at unitn.it), '
               'Moreno Zolfo (moreno.zolfo at unitn.it), '
               'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
+
 
 
 import os
@@ -15,8 +17,12 @@ import tempfile
 import bz2
 import pickletools
 import argparse as ap
-from cmseq import cmseq
+from collections import Counter
 from shutil import rmtree
+
+import pysam
+import scipy.stats as sps
+
 try:
     from .external_exec import samtools_sam_to_bam, samtools_sort_bam_v1, decompress_bz2
     from .util_fun import info, error
@@ -34,6 +40,15 @@ except ImportError:
 class SampleToMarkers:
     """SampleToMarkers class"""
 
+    class DEFAULTS:
+        pileup_stepper = 'nofilter'
+        poly_error_rate = 0.001
+        min_base_coverage = 1
+        min_base_quality = 30
+        min_mapping_quality = 10
+        poly_dominant_frq_thrsh = 0.8
+        poly_pvalue_threshold = 0.05
+
     def decompress_from_bz2(self):
         """ Decompressed SAM.BZ2 files
 
@@ -41,7 +56,7 @@ class SampleToMarkers:
             (list, str): tuple with the list of decompressed files and their format
         """
         results = execute_pool(
-            ((self.decompress_bz2_file, i, self.tmp_dir) for i in self.input), self.nprocs)
+            ((SampleToMarkers.decompress_bz2_file, i, self.tmp_dir) for i in self.input), self.nprocs)
         decompressed = [r[0] for r in results]
         decompressed_format = [r[1] for r in results]
         if decompressed_format[1:] == decompressed_format[:-1]:
@@ -54,6 +69,7 @@ class SampleToMarkers:
         else:
             error("Decompressed files have different formats", exit=True)
 
+    @staticmethod
     def decompress_bz2_file(input_file, output_dir):
         """Decompress a BZ2 file and returns the decompressed file and the decompressed file format
 
@@ -82,12 +98,16 @@ class SampleToMarkers:
             self.input = execute_pool(
                 ((samtools_sort_bam_v1, i, self.tmp_dir) for i in self.input), self.nprocs)
             info("\tDone.")
-        elif self.sorted == False:
+        elif not self.sorted:
             info("\tSorting BAM samples...")
             self.input = execute_pool(
                 ((samtools_sort_bam_v1, i, self.tmp_dir) for i in self.input), self.nprocs)
             info("\tDone.")
 
+        info('\tIndexing BAM samples...')
+        execute_pool(((pysam.index, i) for i in self.input), self.nprocs)
+        info('\tDone.')
+
     def build_consensus_markers(self, filtered):
         """Gets the markers for each sample and writes the Pickle files
 
@@ -96,24 +116,65 @@ class SampleToMarkers:
         """
         for i in self.input:
             info("\tProcessing sample: {}".format(i))
-            results = self.execute_cmseq(i)
+            results = self.get_consensuses_for_sample(i)
             self.write_results_as_pkl(filtered, os.path.splitext(
                 os.path.basename(i))[0], results)
             info("\tDone.")
 
-    def execute_cmseq(self, input_bam):
-        """cmseq on the specified BAM file
+
+    def get_consensuses_for_sample(self, input_bam, stepper=DEFAULTS.pileup_stepper,
+                                   error_rate=DEFAULTS.poly_error_rate,
+                                   poly_pvalue_threshold=DEFAULTS.poly_pvalue_threshold):
+        """Pileup on the specified BAM file
 
         Args:
             input_bam (str): the path to the input BAM file
+            stepper (str): stepper to use in the pileup
+            error_rate (float): the sequencing error rate
+            poly_pvalue_threshold (float): the p-value threshold to call polymorphic position
 
         Returns:
-            list: the list with the reconstructed consensus markers
+            list[tuple[str, str]]: the list with the marker names and consensus sequences
         """
-        collection = cmseq.BamFile(
-            input_bam, index=True, minlen=self.min_read_len, minimumReadsAligning=self.min_reads_aligning)
-        return collection.parallel_reference_free_consensus(ncores=self.nprocs, mincov=self.min_base_coverage, minqual=self.min_base_quality,
-                                                            consensus_rule=cmseq.BamContig.majority_rule_polymorphicLoci, dominant_frq_thrsh=self.dominant_frq_threshold)
+
+        info('\t\tLoading the bam file and extracting information...')
+        sam_file = pysam.AlignmentFile(input_bam)
+
+        all_markers = sam_file.references
+        marker_lengths = sam_file.lengths
+        marker_to_length = dict(zip(all_markers, marker_lengths))
+        info('\t\tDone.')
+
+        ACTGactg = set(list('ACTGactg'))
+
+        info('\t\tRunning the pileup...')
+        consensuses = {m: bytearray(b'-' * marker_to_length[m]) for m in all_markers}
+        for base_pileup in sam_file.pileup(contig=None, stepper=stepper, min_base_quality=self.min_base_quality):
+            marker = base_pileup.reference_name
+            pos = base_pileup.pos
+            bases = [b.upper() for b in base_pileup.get_query_sequences() if b in ACTGactg]
+            base_coverage = len(bases)
+            if base_coverage < self.min_base_coverage:
+                continue
+
+            base_frequencies = Counter(bases)
+            max_frequency = max(base_frequencies.values())
+            max_ratio = max_frequency / base_coverage
+            base_consensus = base_frequencies.most_common(1)[0][0]
+
+            if max_ratio < self.dominant_frq_threshold:
+                p_value = sps.binom.cdf(max_frequency, base_coverage, 1 - error_rate)
+                if p_value <= poly_pvalue_threshold:
+                    base_consensus = '*'  # mask out likely polymorphic positions
+
+            consensuses[marker][pos] = ord(base_consensus)
+
+        info('\t\tDone.')
+
+        consensuses = [(m, c.decode()) for m, c in consensuses.items()]  # convert bytearrays to strings
+        return consensuses
+
+
 
     def write_results_as_pkl(self, filtered, name, results):
         """Writes the consensus sequences as a PKL file
@@ -134,11 +195,15 @@ class SampleToMarkers:
                 pickle.dumps(consensus, pickle.HIGHEST_PROTOCOL)))
 
     def parallel_filter_sam(self, input_file, filtered_markers):
-        """Filters an input SAM file with the hits against markers of specific clades
+        """
+        Filters an input SAM file
+            * filters out viral markers (VDB)
+            * filters out hits with low mapQ (argument --min_mapping_quality)
+            * filters out markers with not enough mapped reads (argument --min_reads_aligning)
 
         Args:
             input_file (str): the input SAM file
-            filtered_markers (list): the list with the markers of the filtered clades
+            filtered_markers (set): the list with the markers of the filtered clades, None if to use all markers
 
         Returns:
             str: the path to the output file
@@ -147,22 +212,46 @@ class SampleToMarkers:
         if self.input_format.lower() == "bz2":
             ifn = bz2.open(input_file, 'rt')
             output_file = output_file.replace('.bz2', '')
-        elif self.input_format.lower() == "sam":
+        else:
+            assert self.input_format.lower() == "sam"
             ifn = open(input_file, 'r')
+
+        def filter_mapping_line(line_fields_, markers_subset):
+            marker_ = line_fields_[2]
+            if marker_.startswith('VDB'):
+                return False
+            if markers_subset is not None and marker_ not in markers_subset:
+                return False
+            mapq_ = int(line_fields_[4])
+            if mapq_ < self.min_mapping_quality:
+                return False
+            return True
+
+        marker_to_reads = Counter()
+        for line in ifn:
+            if line.startswith('@'):
+                continue
+            line_fields = line.rstrip('\n').split('\t')
+            if filter_mapping_line(line_fields, filtered_markers):
+                marker = line_fields[2]
+                marker_to_reads[marker] += 1
+
+        selected_markers = set((m for m, c in marker_to_reads.items() if c >= self.min_reads_aligning))
+
+        ifn.seek(0)  # second pass of the file
         with open(output_file, 'w') as ofn:
             for line in ifn:
-                s_line = line.strip().split('\t')
-                if line.startswith('@'):
-                    if line.startswith('@HD'):
-                        ofn.write(line)
-                    elif filtered_markers is not None and s_line[1].split(':')[1] in filtered_markers:
+                line_fields = line.rstrip('\n').split('\t')
+                if line.startswith('@HD'):
+                    ofn.write(line)
+                elif line.startswith('@'):
+                    marker = line_fields[1].split(':')[1]
+                    if marker in selected_markers:
                         ofn.write(line)
-                    elif filtered_markers is None and not s_line[1].split(':')[1].startswith('VDB'):
+                else:
+                    if filter_mapping_line(line_fields, selected_markers):
                         ofn.write(line)
-                elif filtered_markers is not None and s_line[2] in filtered_markers:
-                    ofn.write(line)
-                elif filtered_markers is None and not s_line[2].startswith('VDB'):
-                    ofn.write(line)
+
         ifn.close()
         return output_file
 
@@ -193,7 +282,7 @@ class SampleToMarkers:
         info("Done.")
         if not self.debug:
             info("Removing temporary files...")
-            rmtree(self.tmp_dir, ignore_errors=False, onerror=None)
+            rmtree(self.tmp_dir)
             info("Done.")
 
     def __init__(self, args):
@@ -204,9 +293,9 @@ class SampleToMarkers:
         self.database_controller = MetaphlanDatabaseController(args.database)
         self.breadth_threshold = args.breadth_threshold
         self.min_reads_aligning = args.min_reads_aligning
-        self.min_read_len = args.min_read_len
         self.min_base_coverage = args.min_base_coverage
         self.min_base_quality = args.min_base_quality
+        self.min_mapping_quality = args.min_mapping_quality
         self.dominant_frq_threshold = args.dominant_frq_threshold
         self.clades = args.clades
         self.tmp_dir = args.output_dir if args.tmp is None else args.tmp
@@ -237,20 +326,21 @@ def read_params():
                    help="The breadth of coverage threshold for the consensus markers")
     p.add_argument('--min_reads_aligning', type=int, default=8,
                    help="The minimum number of reads to cover a marker")
-    p.add_argument('--min_read_len', type=int, default=cmseq.CMSEQ_DEFAULTS.minlen,
-                   help="The minimum lenght for a read to be considered")
-    p.add_argument('--min_base_coverage', type=int, default=cmseq.CMSEQ_DEFAULTS.mincov,
+    p.add_argument('--min_base_coverage', type=int, default=SampleToMarkers.DEFAULTS.min_base_coverage,
                    help="The minimum depth of coverage for a base to be considered")
-    p.add_argument('--min_base_quality', type=int, default=cmseq.CMSEQ_DEFAULTS.minqual,
+    p.add_argument('--min_base_quality', type=int, default=SampleToMarkers.DEFAULTS.min_base_quality,
                    help="The minimum quality for a base to be considered. This is performed BEFORE --min_base_coverage")
-    p.add_argument('--dominant_frq_threshold', type=float, default=cmseq.CMSEQ_DEFAULTS.poly_dominant_frq_thrsh,
+    p.add_argument('--min_mapping_quality', type=int, default=SampleToMarkers.DEFAULTS.min_mapping_quality,
+                   help="The minimum quality for a mapping of the read to be considered.")
+    p.add_argument('--dominant_frq_threshold', type=float, default=SampleToMarkers.DEFAULTS.poly_dominant_frq_thrsh,
                    help="The cutoff for degree of 'allele dominance' for a position to be considered polymorphic")
     p.add_argument('--clades', type=str, nargs='+', default=[],
                    help="Restricts the reconstruction of the markers to the specified clades")
     p.add_argument('--tmp', type=str, default=None,
                    help="If specified, the directory where to store the temporal files")
     p.add_argument('--debug', action='store_true', default=False,
-                   help="If specified, StrainPhlAn will not remove the temporal folder. Not available with inputs in BAM format")
+                   help="If specified, StrainPhlAn will not remove the temporal folder. "
+                        "Not available with inputs in BAM format")
     p.add_argument('-n', '--nprocs', type=int, default=1,
                    help="The number of threads to execute the script")
     return p.parse_args()
@@ -268,7 +358,7 @@ def check_params(args):
         error('-f (or --input_format) must be specified', exit=True)
     elif not args.output_dir:
         error('-o (or --output_dir) must be specified', exit=True)
-    elif args.input_format.lower() != "bam" and args.input_format.lower() != "sam" and args.input_format.lower() != "bz2":
+    elif args.input_format.lower() not in ['bam', 'sam', 'bz2']:
         error('The input format must be SAM, BAM, or compressed in BZ2 format', exit=True)
     elif args.input_format.lower() == "bam" and len(args.clades) > 0:
         error('The --clades option cannot be used with inputs in BAM format', exit=True)


=====================================
metaphlan/utils/sgb_to_gtdb_profile.py
=====================================
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 __author__ = 'Aitor Blanco (aitor.blancomiguez at unitn.it'
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 import os
 import time


=====================================
metaphlan/utils/strain_transmission.py
=====================================
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 __author__ = ('Aitor Blanco (aitor.blancomiguez at unitn.it), '
               'Mireia Valles-Colomer (mireia.vallescolomer at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 
 import os


=====================================
metaphlan/utils/util_fun.py
=====================================
@@ -3,8 +3,8 @@ __author__ = ('Aitor Blanco Miguez (aitor.blancomiguez at unitn.it), '
               'Francesco Asnicar (f.asnicar at unitn.it), '
               'Moreno Zolfo (moreno.zolfo at unitn.it), '
               'Francesco Beghini (francesco.beghini at unitn.it)')
-__version__ = '4.0.3'
-__date__ = '24 Oct 2022'
+__version__ = '4.0.4'
+__date__ = '17 Jan 2023'
 
 
 import os


=====================================
setup.py
=====================================
@@ -14,7 +14,7 @@ if sys.version_info[0] < 3:
 
 setuptools.setup(
     name='MetaPhlAn',
-    version='4.0.3',
+    version='4.0.4',
     author='Aitor Blanco-Miguez',
     author_email='aitor.blancomiguez at unitn.it',
     url='http://github.com/biobakery/MetaPhlAn/',



View it on GitLab: https://salsa.debian.org/med-team/metaphlan/-/compare/11c6ede615d4d893fa4b7873ca94d08ebe9c756a...e386ea765e93f574bf43b45044555b5d9f4a45b2

-- 
View it on GitLab: https://salsa.debian.org/med-team/metaphlan/-/compare/11c6ede615d4d893fa4b7873ca94d08ebe9c756a...e386ea765e93f574bf43b45044555b5d9f4a45b2
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230201/647f6d6f/attachment-0001.htm>


More information about the debian-med-commit mailing list