[med-svn] [Git][med-team/metaphlan][master] Unapply use-packaging-version.patch.

Mon May 5 21:03:13 BST 2025


Étienne Mollier pushed to branch master at Debian Med / metaphlan


Commits:
fd779d06 by Étienne Mollier at 2025-05-05T21:56:37+02:00
Unapply use-packaging-version.patch.

Gbp-Dch: ignore

- - - - -


7 changed files:

- − .pc/.quilt_patches
- − .pc/.quilt_series
- − .pc/.version
- − .pc/applied-patches
- − .pc/use-packaging-version.patch/.timestamp
- − .pc/use-packaging-version.patch/metaphlan/metaphlan.py
- metaphlan/metaphlan.py


Changes:

=====================================
.pc/.quilt_patches deleted
=====================================
@@ -1 +0,0 @@
-debian/patches


=====================================
.pc/.quilt_series deleted
=====================================
@@ -1 +0,0 @@
-series


=====================================
.pc/.version deleted
=====================================
@@ -1 +0,0 @@
-2


=====================================
.pc/applied-patches deleted
=====================================
@@ -1 +0,0 @@
-use-packaging-version.patch


=====================================
.pc/use-packaging-version.patch/.timestamp deleted
=====================================


=====================================
.pc/use-packaging-version.patch/metaphlan/metaphlan.py deleted
=====================================
@@ -1,1304 +0,0 @@
-#!/usr/bin/env python
-__author__ = ('Aitor Blanco-Miguez (aitor.blancomiguez at unitn.it), '
-              'Francesco Beghini (francesco.beghini at unitn.it), '
-              'Nicola Segata (nicola.segata at unitn.it), '
-              'Duy Tin Truong, '
-              'Francesco Asnicar (f.asnicar at unitn.it)')
-__version__ = '4.0.4'
-__date__ = '17 Jan 2023'
-
-import sys
-try:
-    from metaphlan import mybytes, plain_read_and_split, plain_read_and_split_line, read_and_split, read_and_split_line, check_and_install_database, remove_prefix
-except ImportError:
-    sys.exit("CRITICAL ERROR: Unable to find the MetaPhlAn python package. Please check your install.") 
-
-if float(sys.version_info[0]) < 3.0:
-    sys.stderr.write("MetaPhlAn requires Python 3, your current Python version is {}.{}.{}\n"
-                    .format(sys.version_info[0], sys.version_info[1], sys.version_info[2]))
-    sys.exit(1)
-import os
-import stat
-import re
-import time
-import random
-from collections import defaultdict as defdict
-from distutils.version import LooseVersion
-from glob import glob
-from subprocess import DEVNULL
-import argparse as ap
-import bz2
-import pickle
-import subprocess as subp
-import tempfile as tf
-
-try:
-    import numpy as np
-except ImportError:
-    sys.stderr.write("Error! numpy python library not detected!!\n")
-    sys.exit(1)
-
-#**********************************************************************************************
-#  Modification of Code :                                                                     *
-#  Modified the code so instead of using the current clade IDs, which are numbers, we will    *
-#      use the clade_names                                                                    *
-#      Users reported the biom output is invalid and also the IDs were changing from run to   *
-#      run.                                                                                   *
-#  George Weingart    05/22/2017   george.weingart at mail.com                                   *
-#**********************************************************************************************
-
-#*************************************************************
-#*  Imports related to biom file generation                  *
-#*************************************************************
-try:
-    import biom
-    import biom.table
-except ImportError:
-    sys.stderr.write("Warning! Biom python library not detected!"
-                     "\n Exporting to biom format will not work!\n")
-import json
-
-# get the directory that contains this script
-metaphlan_script_install_folder = os.path.dirname(os.path.abspath(__file__))
-# get the default database folder
-DEFAULT_DB_FOLDER = os.path.join(metaphlan_script_install_folder, "metaphlan_databases")
-DEFAULT_DB_FOLDER= os.environ.get('METAPHLAN_DB_DIR', DEFAULT_DB_FOLDER)
-#Wether to execute a SGB-based analysis
-SGB_ANALYSIS = True
-INDEX = 'latest'
-tax_units = "kpcofgst"
-
-def read_params(args):
-    p = ap.ArgumentParser( description=
-            "DESCRIPTION\n"
-            " MetaPhlAn version "+__version__+" ("+__date__+"): \n"
-            " METAgenomic PHyLogenetic ANalysis for metagenomic taxonomic profiling.\n\n"
-            "AUTHORS: "+__author__+"\n\n"
-            "COMMON COMMANDS\n\n"
-            " We assume here that MetaPhlAn is installed using the several options available (pip, conda, PyPi)\n"
-            " Also BowTie2 should be in the system path with execution and read permissions, and Perl should be installed)\n\n"
-
-            "\n========== MetaPhlAn clade-abundance estimation ================= \n\n"
-            "The basic usage of MetaPhlAn consists in the identification of the clades (from phyla to species ) \n"
-            "present in the metagenome obtained from a microbiome sample and their \n"
-            "relative abundance. This correspond to the default analysis type (-t rel_ab).\n\n"
-
-            "*  Profiling a metagenome from raw reads:\n"
-            "$ metaphlan metagenome.fastq --input_type fastq -o profiled_metagenome.txt\n\n"
-
-            "*  You can take advantage of multiple CPUs and save the intermediate BowTie2 output for re-running\n"
-            "   MetaPhlAn extremely quickly:\n"
-            "$ metaphlan metagenome.fastq --bowtie2out metagenome.bowtie2.bz2 --nproc 5 --input_type fastq -o profiled_metagenome.txt\n\n"
-
-            "*  If you already mapped your metagenome against the marker DB (using a previous MetaPhlAn run), you\n"
-            "   can obtain the results in few seconds by using the previously saved --bowtie2out file and \n"
-            "   specifying the input (--input_type bowtie2out):\n"
-            "$ metaphlan metagenome.bowtie2.bz2 --nproc 5 --input_type bowtie2out -o profiled_metagenome.txt\n\n"
-            
-            "*  bowtie2out files generated with MetaPhlAn versions below 3 are not compatibile.\n"
-            "   Starting from MetaPhlAn 3.0, the BowTie2 ouput now includes the size of the profiled metagenome and the average read length.\n"
-            "   If you want to re-run MetaPhlAn using these file you should provide the metagenome size via --nreads:\n"
-            "$ metaphlan metagenome.bowtie2.bz2 --nproc 5 --input_type bowtie2out --nreads 520000 -o profiled_metagenome.txt\n\n"
-
-            "*  You can also provide an externally BowTie2-mapped SAM if you specify this format with \n"
-            "   --input_type. Two steps: first apply BowTie2 and then feed MetaPhlAn with the obtained sam:\n"
-            "$ bowtie2 --sam-no-hd --sam-no-sq --no-unal --very-sensitive -S metagenome.sam -x ${mpa_dir}/metaphlan_databases/mpa_v30_CHOCOPhlAn_201901 -U metagenome.fastq\n"
-            "$ metaphlan metagenome.sam --input_type sam -o profiled_metagenome.txt\n\n"
-
-            "*  We can also natively handle paired-end metagenomes, and, more generally, metagenomes stored in \n"
-            "  multiple files (but you need to specify the --bowtie2out parameter):\n"
-            "$ metaphlan metagenome_1.fastq,metagenome_2.fastq --bowtie2out metagenome.bowtie2.bz2 --nproc 5 --input_type fastq\n\n"
-            "\n------------------------------------------------------------------- \n \n\n"
-
-            "\n========== Marker level analysis ============================ \n\n"
-            "MetaPhlAn introduces the capability of characterizing organisms at the strain level using non\n"
-            "aggregated marker information. Such capability comes with several slightly different flavours and \n"
-            "are a way to perform strain tracking and comparison across multiple samples.\n"
-            "Usually, MetaPhlAn is first ran with the default -t to profile the species present in\n"
-            "the community, and then a strain-level profiling can be performed to zoom-in into specific species\n"
-            "of interest. This operation can be performed quickly as it exploits the --bowtie2out intermediate \n"
-            "file saved during the execution of the default analysis type.\n\n"
-
-            "*  The following command will output the abundance of each marker with a RPK (reads per kilo-base) \n"
-            "   higher 0.0. (we are assuming that metagenome_outfmt.bz2 has been generated before as \n"
-            "   shown above).\n"
-            "$ metaphlan -t marker_ab_table metagenome_outfmt.bz2 --input_type bowtie2out -o marker_abundance_table.txt\n"
-            "   The obtained RPK can be optionally normalized by the total number of reads in the metagenome \n"
-            "   to guarantee fair comparisons of abundances across samples. The number of reads in the metagenome\n"
-            "   needs to be passed with the '--nreads' argument\n\n"
-
-            "*  The list of markers present in the sample can be obtained with '-t marker_pres_table'\n"
-            "$ metaphlan -t marker_pres_table metagenome_outfmt.bz2 --input_type bowtie2out -o marker_abundance_table.txt\n"
-            "   The --pres_th argument (default 1.0) set the minimum RPK value to consider a marker present\n\n"
-
-            "*  The list '-t clade_profiles' analysis type reports the same information of '-t marker_ab_table'\n"
-            "   but the markers are reported on a clade-by-clade basis.\n"
-            "$ metaphlan -t clade_profiles metagenome_outfmt.bz2 --input_type bowtie2out -o marker_abundance_table.txt\n\n"
-
-            "*  Finally, to obtain all markers present for a specific clade and all its subclades, the \n"
-            "   '-t clade_specific_strain_tracker' should be used. For example, the following command\n"
-            "   is reporting the presence/absence of the markers for the B. fragilis species and its strains\n"
-            "   the optional argument --min_ab specifies the minimum clade abundance for reporting the markers\n\n"
-            "$ metaphlan -t clade_specific_strain_tracker --clade s__Bacteroides_fragilis metagenome_outfmt.bz2 --input_type bowtie2out -o marker_abundance_table.txt\n"
-
-            "\n------------------------------------------------------------------- \n\n"
-            "",
-            formatter_class=ap.RawTextHelpFormatter,
-            add_help=False )
-    arg = p.add_argument
-
-    arg( 'inp', metavar='INPUT_FILE', type=str, nargs='?', default=None, help=
-         "the input file can be:\n"
-         "* a fastq file containing metagenomic reads\n"
-         "OR\n"
-         "* a BowTie2 produced SAM file. \n"
-         "OR\n"
-         "* an intermediary mapping file of the metagenome generated by a previous MetaPhlAn run \n"
-         "If the input file is missing, the script assumes that the input is provided using the standard \n"
-         "input, or named pipes.\n"
-         "IMPORTANT: the type of input needs to be specified with --input_type" )
-
-    arg( 'output', metavar='OUTPUT_FILE', type=str, nargs='?', default=None,
-         help= "the tab-separated output file of the predicted taxon relative abundances \n"
-               "[stdout if not present]")
-
-
-    g = p.add_argument_group('Required arguments')
-    arg = g.add_argument
-    input_type_choices = ['fastq','fasta','bowtie2out','sam']
-    arg( '--input_type', choices=input_type_choices, required = '--install' not in args, help =
-         "set whether the input is the FASTA file of metagenomic reads or \n"
-         "the SAM file of the mapping of the reads against the MetaPhlAn db.\n"
-        )
-
-    g = p.add_argument_group('Mapping arguments')
-    arg = g.add_argument
-    arg('--force', action='store_true', help="Force profiling of the input file by removing the bowtie2out file")
-    arg('--bowtie2db', metavar="METAPHLAN_BOWTIE2_DB", type=str, default=DEFAULT_DB_FOLDER,
-        help=("Folder containing the MetaPhlAn database. You can specify the location by exporting the DEFAULT_DB_FOLDER variable in the shell."
-              "[default "+DEFAULT_DB_FOLDER+"]\n"))
-
-    arg('-x', '--index', type=str, default=INDEX,
-        help=("Specify the id of the database version to use. "
-              "If \"latest\", MetaPhlAn will get the latest version.\n"
-              "If an index name is provided, MetaPhlAn will try to use it, if available, and skip the online check.\n"
-              "If the database files are not found on the local MetaPhlAn installation they\n"
-              "will be automatically downloaded [default "+INDEX+"]\n"))
-
-    bt2ps = ['sensitive', 'very-sensitive', 'sensitive-local', 'very-sensitive-local']
-    arg('--bt2_ps', metavar="BowTie2 presets", default='very-sensitive',
-        choices=bt2ps, help="Presets options for BowTie2 (applied only when a "
-                            "FASTA file is provided)\n"
-                            "The choices enabled in MetaPhlAn are:\n"
-                            " * sensitive\n"
-                            " * very-sensitive\n"
-                            " * sensitive-local\n"
-                            " * very-sensitive-local\n"
-                            "[default very-sensitive]\n")
-    arg('--bowtie2_exe', type=str, default=None,
-        help='Full path and name of the BowTie2 executable. This option allows'
-             'MetaPhlAn to reach the executable even when it is not in the '
-             'system PATH or the system PATH is unreachable')
-    arg('--bowtie2_build', type=str, default='bowtie2-build',
-        help="Full path to the bowtie2-build command to use, deafult assumes "
-             "that 'bowtie2-build is present in the system path")
-    arg('--bowtie2out', metavar="FILE_NAME", type=str, default=None,
-        help="The file for saving the output of BowTie2")
-    arg('--min_mapq_val', type=int, default=5,
-        help="Minimum mapping quality value (MAPQ) [default 5]")
-    arg('--no_map', action='store_true',
-        help="Avoid storing the --bowtie2out map file")
-    arg('--tmp_dir', metavar="", default=None, type=str,
-        help="The folder used to store temporary files [default is the OS "
-             "dependent tmp dir]")
-
-    g = p.add_argument_group('Post-mapping arguments')
-    arg = g.add_argument
-    stat_choices = ['avg_g','avg_l','tavg_g','tavg_l','wavg_g','wavg_l','med']
-    arg( '--tax_lev', metavar='TAXONOMIC_LEVEL', type=str,
-         choices='a'+tax_units, default='a', help =
-         "The taxonomic level for the relative abundance output:\n"
-         "'a' : all taxonomic levels\n"
-         "'k' : kingdoms\n"
-         "'p' : phyla only\n"
-         "'c' : classes only\n"
-         "'o' : orders only\n"
-         "'f' : families only\n"
-         "'g' : genera only\n"
-         "'s' : species only\n"
-         "'t' : SGBs only\n"
-         "[default 'a']" )
-    arg( '--min_cu_len', metavar="", default="2000", type=int, help =
-         "minimum total nucleotide length for the markers in a clade for\n"
-         "estimating the abundance without considering sub-clade abundances\n"
-         "[default 2000]\n"   )
-    arg( '--min_alignment_len', metavar="", default=None, type=int, help =
-         "The sam records for aligned reads with the longest subalignment\n"
-         "length smaller than this threshold will be discarded.\n"
-         "[default None]\n"   )
-    arg( '--add_viruses', action='store_true', help=
-         "Together with --mpa3, allow the profiling of viral organisms" )
-    arg( '--ignore_eukaryotes', action='store_true', help=
-         "Do not profile eukaryotic organisms" )
-    arg( '--ignore_bacteria', action='store_true', help=
-         "Do not profile bacterial organisms" )
-    arg( '--ignore_archaea', action='store_true', help=
-         "Do not profile archeal organisms" )
-    arg( '--ignore_ksgbs', action='store_true', help=
-         "Do not profile known SGBs (together with --sgb option)" )
-    arg( '--ignore_usgbs', action='store_true', help=
-         "Do not profile unknown SGBs (together with --sgb option)" )
-    arg( '--stat_q', metavar="", type = float, default=0.2, help =
-         "Quantile value for the robust average\n"
-         "[default 0.2]" )
-    arg( '--perc_nonzero', metavar="", type = float, default=0.33, help =
-         "Percentage of markers with a non zero relative abundance for misidentify a species\n"
-         "[default 0.33]"   )
-    arg( '--ignore_markers', type=str, default = None, help =
-         "File containing a list of markers to ignore. \n")
-    arg( '--avoid_disqm', action="store_true", help =
-         "Deactivate the procedure of disambiguating the quasi-markers based on the \n"
-         "marker abundance pattern found in the sample. It is generally recommended \n"
-         "to keep the disambiguation procedure in order to minimize false positives\n")
-    arg( '--stat', metavar="", choices=stat_choices, default="tavg_g", type=str, help =
-         "Statistical approach for converting marker abundances into clade abundances\n"
-         "'avg_g'  : clade global (i.e. normalizing all markers together) average\n"
-         "'avg_l'  : average of length-normalized marker counts\n"
-         "'tavg_g' : truncated clade global average at --stat_q quantile\n"
-         "'tavg_l' : truncated average of length-normalized marker counts (at --stat_q)\n"
-         "'wavg_g' : winsorized clade global average (at --stat_q)\n"
-         "'wavg_l' : winsorized average of length-normalized marker counts (at --stat_q)\n"
-         "'med'    : median of length-normalized marker counts\n"
-         "[default tavg_g]"   )
-
-    arg = p.add_argument
-
-    g = p.add_argument_group('Additional analysis types and arguments')
-    arg = g.add_argument
-    analysis_types = ['rel_ab', 'rel_ab_w_read_stats', 'reads_map', 'clade_profiles', 'marker_ab_table', 'marker_counts', 'marker_pres_table', 'clade_specific_strain_tracker']
-    arg( '-t', metavar='ANALYSIS TYPE', type=str, choices = analysis_types,
-         default='rel_ab', help =
-         "Type of analysis to perform: \n"
-         " * rel_ab: profiling a metagenomes in terms of relative abundances\n"
-         " * rel_ab_w_read_stats: profiling a metagenomes in terms of relative abundances and estimate the number of reads coming from each clade.\n"
-         " * reads_map: mapping from reads to clades (only reads hitting a marker)\n"
-         " * clade_profiles: normalized marker counts for clades with at least a non-null marker\n"
-         " * marker_ab_table: normalized marker counts (only when > 0.0 and normalized by metagenome size if --nreads is specified)\n"
-         " * marker_counts: non-normalized marker counts [use with extreme caution]\n"
-         " * marker_pres_table: list of markers present in the sample (threshold at 1.0 if not differently specified with --pres_th\n"
-         " * clade_specific_strain_tracker: list of markers present for a specific clade, specified with --clade, and all its subclades\n"
-         "[default 'rel_ab']" )
-    arg( '--nreads', metavar="NUMBER_OF_READS", type=int, default = None, help =
-         "The total number of reads in the original metagenome. It is used only when \n"
-         "-t marker_table is specified for normalizing the length-normalized counts \n"
-         "with the metagenome size as well. No normalization applied if --nreads is not \n"
-         "specified" )
-    arg( '--pres_th', metavar="PRESENCE_THRESHOLD", type=int, default = 1.0, help =
-         'Threshold for calling a marker present by the -t marker_pres_table option' )
-    arg( '--clade', metavar="", default=None, type=str, help =
-         "The clade for clade_specific_strain_tracker analysis\n"  )
-    arg( '--min_ab', metavar="", default=0.1, type=float, help =
-         "The minimum percentage abundance for the clade in the clade_specific_strain_tracker analysis\n"  )
-
-    g = p.add_argument_group('Output arguments')
-    arg = g.add_argument
-    arg( '-o', '--output_file',  metavar="output file", type=str, default=None, help =
-         "The output file (if not specified as positional argument)\n")
-    arg('--sample_id_key',  metavar="name", type=str, default="SampleID",
-        help =("Specify the sample ID key for this analysis."
-               " Defaults to 'SampleID'."))
-    arg('--use_group_representative', action='store_true',  help =("Use a species as representative for species groups."))
-    arg('--sample_id',  metavar="value", type=str,
-        default="Metaphlan_Analysis",
-        help =("Specify the sample ID for this analysis."
-               " Defaults to 'Metaphlan_Analysis'."))
-    arg( '-s', '--samout', metavar="sam_output_file",
-        type=str, default=None, help="The sam output file\n")
-
-    arg( '--legacy-output', action='store_true', help="Old MetaPhlAn2 two columns output\n")
-    arg( '--CAMI_format_output', action='store_true', help="Report the profiling using the CAMI output format\n")
-    arg( '--unclassified_estimation', action='store_true', help="Scale relative abundances to the number of reads mapping to identified clades in order to estimate unclassified taxa\n")
-    arg( '--mpa3', action='store_true', help="Perform the analysis using the MetaPhlAn 3 algorithm\n")
-
-    #*************************************************************
-    #* Parameters related to biom file generation                *
-    #*************************************************************
-    arg( '--biom', '--biom_output_file',  metavar="biom_output", type=str, default=None, help =
-         "If requesting biom file output: The name of the output file in biom format \n")
-
-    arg( '--mdelim', '--metadata_delimiter_char',  metavar="mdelim", type=str, default="|", help =
-         "Delimiter for bug metadata: - defaults to pipe. e.g. the pipe in k__Bacteria|p__Proteobacteria \n")
-    #*************************************************************
-    #* End parameters related to biom file generation            *
-    #*************************************************************
-
-    g = p.add_argument_group('Other arguments')
-    arg = g.add_argument
-    arg('--nproc', metavar="N", type=int, default=4,
-        help="The number of CPUs to use for parallelizing the mapping [default 4]")
-    arg('--subsampling', type=int, default=None,
-        help="Specify the number of reads to be considered from the input metagenomes [default None]")
-    arg('--subsampling_seed', type=str, default='1992',
-        help="Random seed to use in the selection of the subsampled reads. Choose \"random\r for a random behaviour")
-    arg('--install', action='store_true',
-        help="Only checks if the MetaPhlAn DB is installed and installs it if not. All other parameters are ignored.")
-    arg('--offline', action='store_true',
-        help="If used, MetaPhlAn will not check for new database updates.")
-    arg('--force_download', action='store_true',
-        help="Force the re-download of the latest MetaPhlAn database.")
-    arg('--read_min_len', type=int, default=70,
-        help="Specify the minimum length of the reads to be considered when parsing the input file with "
-             "'read_fastx.py' script, default value is 70")
-    arg('-v', '--version', action='version',
-        version="MetaPhlAn version {} ({})".format(__version__, __date__),
-        help="Prints the current MetaPhlAn version and exit")
-    arg("-h", "--help", action="help", help="show this help message and exit")
-
-    return vars(p.parse_args())
-
-def set_mapping_arguments(index, bowtie2_db):
-    mpa_pkl = 'mpa_pkl'
-    bowtie2db = 'bowtie2db'
-    bt2_ext = 'bt2l' if SGB_ANALYSIS else 'bt2'
-
-    if os.path.isfile(os.path.join(bowtie2_db, "{}.pkl".format(index))):
-        mpa_pkl = os.path.join(bowtie2_db, "{}.pkl".format(index))
-
-    if glob(os.path.join(bowtie2_db, "{}*.{}".format(index, bt2_ext))):
-        bowtie2db = os.path.join(bowtie2_db, "{}".format(index))
-
-    return (mpa_pkl, bowtie2db)
-
-def run_bowtie2(fna_in, outfmt6_out, bowtie2_db, preset, nproc, min_mapq_val, file_format="fasta",
-                exe=None, samout=None, min_alignment_len=None, read_min_len=0):
-    # checking read_fastx.py
-    read_fastx = "read_fastx.py"
-
-    try:
-        subp.check_call([read_fastx, "-h"], stdout=DEVNULL, stderr=DEVNULL)
-    except Exception as e:
-        try:
-            read_fastx = os.path.join(os.path.join(os.path.dirname(__file__), "utils"), read_fastx)
-            subp.check_call([read_fastx, "-h"], stdout=DEVNULL, stderr=DEVNULL)
-        except Exception as e:
-            sys.stderr.write("OSError: fatal error running '{}'. Is it in the system path?\n".format(read_fastx))
-            sys.exit(1)
-
-    # checking bowtie2
-    try:
-        subp.check_call([exe if exe else 'bowtie2', "-h"], stdout=DEVNULL)
-    except Exception as e:
-        sys.stderr.write('OSError: "{}"\nFatal error running BowTie2. Is BowTie2 in the system path?\n'.format(e))
-        sys.exit(1)
-
-    try:    
-        if fna_in:
-            readin = subp.Popen([read_fastx, '-l', str(read_min_len), fna_in], stdout=subp.PIPE, stderr=subp.PIPE)
-
-        else:
-            readin = subp.Popen([read_fastx, '-l', str(read_min_len)], stdin=sys.stdin, stdout=subp.PIPE, stderr=subp.PIPE)
-
-        bowtie2_cmd = [exe if exe else 'bowtie2', "--seed", "1992", "--quiet", "--no-unal", "--{}".format(preset),
-                       "-S", "-", "-x", bowtie2_db]
-
-        if int(nproc) > 1:
-            bowtie2_cmd += ["-p", str(nproc)]
-
-        bowtie2_cmd += ["-U", "-"]  # if not stat.S_ISFIFO(os.stat(fna_in).st_mode) else []
-
-        if file_format == "fasta":
-            bowtie2_cmd += ["-f"]
-
-        p = subp.Popen(bowtie2_cmd, stdout=subp.PIPE, stdin=readin.stdout)
-        readin.stdout.close()
-        lmybytes, outf = (mybytes, bz2.BZ2File(outfmt6_out, "w")) if outfmt6_out.endswith(".bz2") else (str, open(outfmt6_out, "w"))
-        try:
-            if samout:
-                if samout[-4:] == '.bz2':
-                    sam_file = bz2.BZ2File(samout, 'w')
-                else:
-                    sam_file = open(samout, 'wb')
-        except IOError as e:
-            sys.stderr.write('IOError: "{}"\nUnable to open sam output file.\n'.format(e))
-            sys.exit(1)
-        for line in p.stdout:
-            if samout:
-                sam_file.write(line)
-
-            o = read_and_split_line(line)
-            if not o[0].startswith('@'):
-                if not o[2].endswith('*'):
-                    if (hex(int(o[1]) & 0x100) == '0x0'): #no secondary
-                        if mapq_filter(o[2], int(o[4]), min_mapq_val) :  # filter low mapq reads
-                            if ((min_alignment_len is None) or
-                                    (max([int(x.strip('M')) for x in re.findall(r'(\d*M)', o[5]) if x]) >= min_alignment_len)):
-                                outf.write(lmybytes("\t".join([ o[0], o[2].split('/')[0] ]) + "\n"))
-
-        if samout:  
-            sam_file.close()
-
-        p.communicate()
-        read_fastx_stderr = readin.stderr.readlines()
-        nreads = None
-        avg_read_length = None
-        try:
-            nreads, avg_read_length = list(map(float, read_fastx_stderr[0].decode().split()))
-            if not nreads:
-                sys.stderr.write('Fatal error running MetaPhlAn. Total metagenome size was not estimated.\nPlease check your input files.\n')
-                sys.exit(1)
-            if not avg_read_length:
-                sys.stderr.write('Fatal error running MetaPhlAn. The average read length was not estimated.\nPlease check your input files.\n')
-                sys.exit(1)
-            outf.write(lmybytes('#nreads\t{}\n'.format(int(nreads))))
-            outf.write(lmybytes('#avg_read_length\t{}'.format(avg_read_length)))
-            outf.close()
-        except ValueError:
-            sys.stderr.write(b''.join(read_fastx_stderr).decode())
-            outf.close()
-            os.unlink(outfmt6_out)
-            sys.exit(1)
-
-    except OSError as e:
-        sys.stderr.write('OSError: "{}"\nFatal error running BowTie2.\n'.format(e))
-        sys.exit(1)
-    except IOError as e:
-        sys.stderr.write('IOError: "{}"\nFatal error running BowTie2.\n'.format(e))
-        sys.exit(1)
-
-    if p.returncode == 13:
-        sys.stderr.write("Permission Denied Error: fatal error running BowTie2."
-                         "Is the BowTie2 file in the path with execution and read permissions?\n")
-        sys.exit(1)
-    elif p.returncode != 0:
-        sys.stderr.write("Error while running bowtie2.\n")
-        sys.exit(1)
-
-class TaxClade:
-    min_cu_len = -1
-    markers2lens = None
-    stat = None
-    perc_nonzero = None
-    quantile = None
-    avoid_disqm = False
-    avg_read_length = 1
-
-    def __init__( self, name, tax_id, uncl = False):
-        self.children, self.markers2nreads = {}, {}
-        self.name, self.father = name, None
-        self.uncl, self.subcl_uncl = uncl, False
-        self.abundance, self.uncl_abundance = None, 0
-        self.nreads, self.uncl_nreads = 0, 0
-        self.tax_id = tax_id
-
-    def add_child( self, name, tax_id ):
-        new_clade = TaxClade( name, tax_id )
-        self.children[name] = new_clade
-        new_clade.father = self
-        return new_clade
-
-
-    def get_terminals( self ):
-        terms = []
-        if not self.children:
-            return [self]
-        for c in self.children.values():
-            terms += c.get_terminals()
-        return terms
-
-    def get_full_taxids( self ):
-        fullname = ['']
-        if self.tax_id:
-            fullname = [self.tax_id]
-        cl = self.father
-        while cl:
-            fullname = [cl.tax_id] + fullname
-            cl = cl.father
-        return "|".join(fullname[1:])
-
-    def get_full_name( self ):
-        fullname = [self.name]
-        cl = self.father
-        while cl:
-            fullname = [cl.name] + fullname
-            cl = cl.father
-        return "|".join(fullname[1:])
-
-    def get_normalized_counts( self ):
-        return [(m,float(n)*1000.0/(np.absolute(self.markers2lens[m] - self.avg_read_length) +1) )
-                    for m,n in self.markers2nreads.items()]
-
-    def compute_mapped_reads( self ):    
-        tax_level = 't__' if SGB_ANALYSIS else 's__'
-        if self.nreads != 0 or self.name.startswith(tax_level):
-            return self.nreads
-        for c in self.children.values():
-            self.nreads += c.compute_mapped_reads()
-        return self.nreads
-        
-    def compute_abundance( self ):
-        if self.abundance is not None: return self.abundance
-
-        sum_ab = sum([c.compute_abundance() for c in self.children.values()])
-
-        # rat_nreads = sorted([(self.markers2lens[marker], n_reads)
-        #                             for marker,n_reads in self.markers2nreads.items()],
-        #                                     key = lambda x: x[1])
-
-        rat_nreads, removed = [], []
-        for marker, n_reads in sorted(self.markers2nreads.items(),key=lambda x:x[0]):
-            misidentified = False
-
-            if not self.avoid_disqm:
-                for ext in self.markers2exts[marker]:
-                    ext_clade = self.taxa2clades[ext]
-                    m2nr = ext_clade.markers2nreads
-                    
-                    tocladetmp = ext_clade
-                    while len(tocladetmp.children) == 1:
-                        tocladetmp = list(tocladetmp.children.values())[0]
-                        m2nr = tocladetmp.markers2nreads
-
-                    nonzeros = sum([v>0 for v in m2nr.values()])
-                    if len(m2nr):
-                        if float(nonzeros) / len(m2nr) > self.perc_nonzero:
-                            misidentified = True
-                            removed.append( (self.markers2lens[marker],n_reads) )
-                            break
-            if not misidentified:
-                rat_nreads.append( (self.markers2lens[marker],n_reads) )
-
-        if not self.avoid_disqm and len(removed):
-            n_rat_nreads = float(len(rat_nreads))
-            n_removed = float(len(removed))
-            n_tot = n_rat_nreads + n_removed
-            n_ripr = 10
-
-            if len(self.get_terminals()) < 2:
-                n_ripr = 0
-
-            if "k__Viruses" in self.get_full_name():
-                n_ripr = 0
-
-            if n_rat_nreads < n_ripr and n_tot > n_rat_nreads:
-                rat_nreads += removed[:n_ripr-int(n_rat_nreads)]
-
-
-        rat_nreads = sorted(rat_nreads, key = lambda x: x[1])
-
-        rat_v,nreads_v = zip(*rat_nreads) if rat_nreads else ([],[])
-        rat, nrawreads, loc_ab = float(sum(rat_v)) or -1.0, sum(nreads_v), 0.0
-        quant = int(self.quantile*len(rat_nreads))
-        ql,qr,qn = (quant,-quant,quant) if quant else (None,None,0)
-
-        if not SGB_ANALYSIS and self.name[0] == 't' and (len(self.father.children) > 1 or "_sp" in self.father.name or "k__Viruses" in self.get_full_name()):
-            non_zeros = float(len([n for r,n in rat_nreads if n > 0]))
-            nreads = float(len(rat_nreads))
-            if nreads == 0.0 or non_zeros / nreads < 0.7:
-                self.abundance = 0.0
-                return 0.0
-
-        if rat < 0.0:
-            pass
-        elif self.stat == 'avg_g' or (not qn and self.stat in ['wavg_g','tavg_g']):
-            loc_ab = nrawreads / rat if rat >= 0 else 0.0
-        elif self.stat == 'avg_l' or (not qn and self.stat in ['wavg_l','tavg_l']):
-            loc_ab = np.mean([float(n)/(np.absolute(r - self.avg_read_length) + 1) for r,n in rat_nreads])
-        elif self.stat == 'tavg_g':
-            wnreads = sorted([(float(n)/(np.absolute(r-self.avg_read_length)+1),(np.absolute(r - self.avg_read_length)+1) ,n) for r,n in rat_nreads], key=lambda x:x[0])
-            den,num = zip(*[v[1:] for v in wnreads[ql:qr]])
-            loc_ab = float(sum(num))/float(sum(den)) if any(den) else 0.0
-        elif self.stat == 'tavg_l':
-            loc_ab = np.mean(sorted([float(n)/(np.absolute(r - self.avg_read_length) + 1) for r,n in rat_nreads])[ql:qr])
-        elif self.stat == 'wavg_g':
-            vmin, vmax = nreads_v[ql], nreads_v[qr]
-            wnreads = [vmin]*qn+list(nreads_v[ql:qr])+[vmax]*qn
-            loc_ab = float(sum(wnreads)) / rat
-        elif self.stat == 'wavg_l':
-            wnreads = sorted([float(n)/(np.absolute(r - self.avg_read_length) + 1) for r,n in rat_nreads])
-            vmin, vmax = wnreads[ql], wnreads[qr]
-            wnreads = [vmin]*qn+list(wnreads[ql:qr])+[vmax]*qn
-            loc_ab = np.mean(wnreads)
-        elif self.stat == 'med':
-            loc_ab = np.median(sorted([float(n)/(np.absolute(r - self.avg_read_length) +1) for r,n in rat_nreads])[ql:qr])
-
-        self.abundance = loc_ab
-        if rat < self.min_cu_len and self.children:
-            self.abundance = sum_ab
-        elif loc_ab < sum_ab:
-            self.abundance = sum_ab
-
-        if self.abundance > sum_ab and self.children: # *1.1??
-            self.uncl_abundance = self.abundance - sum_ab
-        self.subcl_uncl = not self.children and self.name[0] not in tax_units[-2:]
-
-        return self.abundance
-
-    def get_all_abundances( self ):
-        ret = [(self.name, self.tax_id, self.abundance)]
-        if self.uncl_abundance > 0.0:
-            lchild = list(self.children.values())[0].name[:3]
-            ret += [(lchild+self.name[3:]+"_unclassified", "", self.uncl_abundance)]
-        if self.subcl_uncl and self.name[0] != tax_units[-2]:
-            cind = tax_units.index( self.name[0] )
-            ret += [(   tax_units[cind+1]+self.name[1:]+"_unclassified","",
-                        self.abundance)]
-        for c in self.children.values():
-            ret += c.get_all_abundances()
-        return ret
-
-class TaxTree:
-    def __init__( self, mpa, markers_to_ignore = None ): #, min_cu_len ):
-        self.root = TaxClade( "root", 0)
-        self.all_clades, self.markers2lens, self.markers2clades, self.taxa2clades, self.markers2exts = {}, {}, {}, {}, {}
-        TaxClade.markers2lens = self.markers2lens
-        TaxClade.markers2exts = self.markers2exts
-        TaxClade.taxa2clades = self.taxa2clades
-        self.avg_read_length = 1
-
-        for clade, value in mpa['taxonomy'].items():
-            clade = clade.strip().split("|")
-            if isinstance(value,tuple):
-                taxids, lenc = value
-                taxids = taxids.strip().split("|")
-            if isinstance(value,int):
-                lenc = value
-                taxids = None
-
-            father = self.root
-            for i in range(len(clade)):
-                clade_lev = clade[i]
-                if SGB_ANALYSIS:
-                    clade_taxid = taxids[i] if i < 8 and taxids is not None else None
-                else:
-                    clade_taxid = taxids[i] if i < 7 and taxids is not None else None
-                if not clade_lev in father.children:
-                    father.add_child(clade_lev, tax_id=clade_taxid)
-                    self.all_clades[clade_lev] = father.children[clade_lev]
-                if SGB_ANALYSIS: father = father.children[clade_lev]
-                if clade_lev[0] == "t":
-                    self.taxa2clades[clade_lev[3:]] = father
-                if not SGB_ANALYSIS: father = father.children[clade_lev]
-                if clade_lev[0] == "t":
-                    father.glen = lenc
-
-        def add_lens( node ):
-            if not node.children:
-                return node.glen
-            lens = []
-            for c in node.children.values():
-                lens.append( add_lens( c ) )
-            node.glen = min(np.mean(lens), np.median(lens))
-            return node.glen
-        
-        add_lens(self.root)
-
-        # for k,p in mpa_pkl['markers'].items():
-        for k, p in mpa['markers'].items():
-            if k in markers_to_ignore:
-                continue
-            self.markers2lens[k] = p['len']
-            self.markers2clades[k] = p['clade']
-            self.add_reads(k, 0)
-            self.markers2exts[k] = p['ext']
-
-    def set_min_cu_len( self, min_cu_len ):
-        TaxClade.min_cu_len = min_cu_len
-
-    def set_stat( self, stat, quantile, perc_nonzero, avg_read_length, avoid_disqm = False):
-        TaxClade.stat = stat
-        TaxClade.perc_nonzero = perc_nonzero
-        TaxClade.quantile = quantile
-        TaxClade.avoid_disqm = avoid_disqm
-        TaxClade.avg_read_length = avg_read_length
-
-    def add_reads(  self, marker, n,
-                    add_viruses = False,
-                    ignore_eukaryotes = False,
-                    ignore_bacteria = False, ignore_archaea = False, 
-                    ignore_ksgbs = False, ignore_usgbs = False  ):
-        clade = self.markers2clades[marker]
-        cl = self.all_clades[clade]
-        if ignore_bacteria or ignore_archaea or ignore_eukaryotes:
-            cn = cl.get_full_name()
-            if ignore_archaea and cn.startswith("k__Archaea"):
-                return (None, None)
-            if ignore_bacteria and cn.startswith("k__Bacteria"):
-                return (None, None)
-            if ignore_eukaryotes and cn.startswith("k__Eukaryota"):
-                return (None, None)
-        if not SGB_ANALYSIS and not add_viruses:
-            cn = cl.get_full_name()
-            if not add_viruses and cn.startswith("k__Vir"):
-                return (None, None)
-        if SGB_ANALYSIS and (ignore_ksgbs or ignore_usgbs):
-            cn = cl.get_full_name()
-            if ignore_ksgbs and not '_SGB' in cn.split('|')[-2]:
-                return (None, None)
-            if ignore_usgbs and '_SGB' in cn.split('|')[-2]:
-                return (None, None)
-        # while len(cl.children) == 1:
-            # cl = list(cl.children.values())[0]
-        cl.markers2nreads[marker] = n
-        return (cl.get_full_name(), cl.get_full_taxids(), )
-
-
-    def markers2counts( self ):
-        m2c = {}
-        for _ ,v in self.all_clades.items():
-            for m,c in v.markers2nreads.items():
-                m2c[m] = c
-        return m2c
-
-    def clade_profiles( self, tax_lev, get_all = False  ):
-        cl2pr = {}
-        for k,v in self.all_clades.items():
-            if tax_lev and not k.startswith(tax_lev):
-                continue
-            prof = v.get_normalized_counts()
-            if not get_all and ( len(prof) < 1 or not sum([p[1] for p in prof]) > 0.0 ):
-                continue
-            cl2pr[v.get_full_name()] = prof
-        return cl2pr
-
-    def relative_abundances( self, tax_lev  ):
-        clade2abundance_n = dict([(tax_label, clade) for tax_label, clade in self.all_clades.items()
-                    if tax_label.startswith("k__") and not clade.uncl])
-
-        clade2abundance, clade2est_nreads, tot_ab, tot_reads = {}, {}, 0.0, 0
-
-        for tax_label, clade in clade2abundance_n.items():
-            tot_ab += clade.compute_abundance()
-
-        for tax_label, clade in clade2abundance_n.items():
-            for clade_label, tax_id, abundance in sorted(clade.get_all_abundances(), key=lambda pars:pars[0]):
-                if SGB_ANALYSIS or clade_label[:3] != 't__':
-                    if not tax_lev:
-                        if clade_label not in self.all_clades:
-                            to = tax_units.index(clade_label[0])
-                            t = tax_units[to-1]
-                            clade_label = t + clade_label.split("_unclassified")[0][1:]
-                            tax_id = self.all_clades[clade_label].get_full_taxids()
-                            clade_label = self.all_clades[clade_label].get_full_name()
-                            spl = clade_label.split("|")
-                            clade_label = "|".join(spl+[tax_units[to]+spl[-1][1:]+"_unclassified"])
-                            glen = self.all_clades[spl[-1]].glen
-                        else:
-                            glen = self.all_clades[clade_label].glen
-                            tax_id = self.all_clades[clade_label].get_full_taxids()
-                            tax_level = 't__' if SGB_ANALYSIS else 's__' 
-                            if tax_level in clade_label and abundance > 0:
-                                self.all_clades[clade_label].nreads = int(np.floor(abundance*glen))
-
-                            clade_label = self.all_clades[clade_label].get_full_name()
-                    elif not clade_label.startswith(tax_lev):
-                        if clade_label in self.all_clades:
-                            glen = self.all_clades[clade_label].glen
-                        else:
-                            glen = 1.0
-                        continue
-                    clade2abundance[(clade_label, tax_id)] = abundance
-        
-        for tax_label, clade in clade2abundance_n.items():
-            tot_reads += clade.compute_mapped_reads()
-
-        for clade_label, clade in self.all_clades.items():
-            if SGB_ANALYSIS or clade.name[:3] != 't__':
-                nreads = clade.nreads
-                clade_label = clade.get_full_name()
-                tax_id = clade.get_full_taxids()
-                clade2est_nreads[(clade_label, tax_id)] = nreads
-
-        ret_d = dict([( tax, float(abundance) / tot_ab if tot_ab else 0.0) for tax, abundance in clade2abundance.items()])
-
-        ret_r = dict([( tax, (abundance, clade2est_nreads[tax] )) for tax, abundance in clade2abundance.items() if tax in clade2est_nreads])
-
-        if tax_lev:
-            ret_d[("UNCLASSIFIED", '-1')] = 1.0 - sum(ret_d.values())
-        return ret_d, ret_r, tot_reads
-
-def mapq_filter(marker_name, mapq_value, min_mapq_val):
-    if 'GeneID:' in marker_name:
-        return True
-    else:
-        if mapq_value > min_mapq_val:
-            return True
-    return False
-
-
-def separate_reads2markers(reads2markers):
-    if not SGB_ANALYSIS:
-        return reads2markers, {}
-    else:
-        return {r: m for r, m in reads2markers.items() if ('SGB' in m or 'EUK' in m) and not 'VDB' in m}, {r: m for r, m in reads2markers.items() if 'VDB' in m and not ('SGB' in m or 'EUK' in m)}
-
-def map2bbh(mapping_f, min_mapq_val, input_type='bowtie2out', min_alignment_len=None, nreads=None, subsampling=None, subsampling_seed='1992'):
-    if not mapping_f:
-        ras, ras_line, inpf = plain_read_and_split, plain_read_and_split_line, sys.stdin
-    else:
-        if mapping_f.endswith(".bz2"):
-            ras, ras_line, inpf = read_and_split, read_and_split_line, bz2.BZ2File(mapping_f, "r")
-        else:
-            ras, ras_line, inpf = plain_read_and_split, plain_read_and_split_line, open(mapping_f)
-
-    reads2markers = {}
-    n_metagenome_reads = None
-    avg_read_length = 1 #Set to 1 if it is not calculated from read_fastx
-
-    if input_type == 'bowtie2out':
-        for r, c in ras(inpf):
-            if r.startswith('#') and 'nreads' in r:
-                n_metagenome_reads = int(c)
-            if r.startswith('#') and 'avg_read_length' in r:
-                avg_read_length = float(c)
-            else:
-                reads2markers[r] = c
-    elif input_type == 'sam':
-        n_metagenome_reads = nreads 
-        for line in inpf:
-            o = ras_line(line)
-            if ((o[0][0] != '@') and #no header
-                (o[2][-1] != '*') and # no unmapped reads
-                (hex(int(o[1]) & 0x100) == '0x0') and #no secondary
-                mapq_filter(o[2], int(o[4]), min_mapq_val) and # filter low mapq reads
-                ( (min_alignment_len is None) or ( max(int(x.strip('M')) for x in re.findall(r'(\d*M)', o[5]) if x) >= min_alignment_len ) )
-            ):
-                    reads2markers[o[0]] = o[2].split('/')[0]
-    inpf.close()
-    
-    if subsampling != None:
-        if subsampling >= n_metagenome_reads:
-            sys.stderr.write("WARNING: The specified subsampling ({}) is higher than the original number of reads ({}).".format(subsampling, n_metagenome_reads))
-        elif subsampling < 10000:
-            sys.stderr.write("WARNING: The specified subsampling ({}) is below the recommended minimum of 10,000 reads.".format(subsampling))
-        else:
-            reads2markers =  dict(sorted(reads2markers.items()))
-            if subsampling_seed.lower() != 'random':
-                random.seed(int(subsampling_seed))
-            reads2filtmarkers = {}
-            sgb_reads2markers, viral_reads2markers = separate_reads2markers(reads2markers)            
-            n_sgb_mapped_reads = int((len(sgb_reads2markers) * subsampling) / n_metagenome_reads)
-            reads2filtmarkers = { r:sgb_reads2markers[r] for r in random.sample(list(sgb_reads2markers.keys()), n_sgb_mapped_reads) }     
-            if SGB_ANALYSIS:       
-                n_viral_mapped_reads = int((len(viral_reads2markers) * subsampling) / n_metagenome_reads)
-                reads2filtmarkers.update({ r:viral_reads2markers[r] for r in random.sample(list(viral_reads2markers.keys()), n_viral_mapped_reads) })            
-            reads2markers = reads2filtmarkers
-            sgb_reads2markers.clear()
-            viral_reads2markers.clear()
-            n_metagenome_reads = subsampling
-    elif n_metagenome_reads < 10000:
-        sys.stderr.write("WARNING: The number of reads in the sample ({}) is below the recommended minimum of 10,000 reads.".format(subsampling))
-        
-    markers2reads = defdict(set)   
-    for r, m in reads2markers.items():
-        markers2reads[m].add(r)
-
-    return (markers2reads, n_metagenome_reads, avg_read_length)
-
-def maybe_generate_biom_file(tree, pars, abundance_predictions):
-    json_key = "MetaPhlAn"
-
-    if not pars['biom']:
-        return None
-    if not abundance_predictions:
-        biom_table = biom.Table([], [], [])  # create empty BIOM table
-
-        with open(pars['biom'], 'w') as outfile:
-            biom_table.to_json(json_key, direct_io=outfile)
-
-        return True
-
-    delimiter = "|" if len(pars['mdelim']) > 1 else pars['mdelim']
-
-    def istip(clade_name):
-        end_name = clade_name.split(delimiter)[-1]
-        return end_name.startswith("s__") or end_name.endswith("_unclassified")
-
-    def findclade(clade_name):
-        if clade_name.endswith('_unclassified'):
-            name = clade_name.split(delimiter)[-2]
-        else:
-            name = clade_name.split(delimiter)[-1]
-        return tree.all_clades[name]
-
-    def to_biomformat(clade_name):
-        return {'taxonomy': clade_name.split(delimiter)}
-
-    clades = iter((abundance, findclade(name))
-                  for (name, taxid, abundance) in abundance_predictions if istip(name))
-    packed = iter(([abundance], clade.get_full_name(), clade.tax_id)
-                  for (abundance, clade) in clades)
-
-    # unpack that tuple here to stay under 80 chars on a line
-    data, clade_names, _ = zip(*packed)
-    # biom likes column vectors, so we give it an array like this:
-    # np.array([a],[b],[c])
-    data = np.array(data)
-    sample_ids = [pars['sample_id']]
-    table_id = 'MetaPhlAn_Analysis'
-
-
-
-
-    #**********************************************************************************************
-    #  Modification of Code :                                                                     *
-    #  Modified the code so instead of using the current clade IDs, which are numbers, we will    *
-    #      use the clade_names                                                                    *
-    #      Users reported the biom output is invalid and also the IDs were changing from run to   *
-    #      run.                                                                                   *
-    #  George Weingart    05/22/2017   george.weingart at mail.com                                   *
-    #**********************************************************************************************
-    if LooseVersion(biom.__version__) < LooseVersion("2.0.0"):
-        biom_table = biom.table.table_factory(
-            data,
-        sample_ids,
-            ######## clade_ids,     #Modified by George Weingart 5/22/2017 - We will use instead the clade_names
-            clade_names,            #Modified by George Weingart 5/22/2017 - We will use instead the clade_names
-            sample_metadata      = None,
-            observation_metadata = list(map(to_biomformat, clade_names)),
-            table_id             = table_id,
-            constructor          = biom.table.DenseOTUTable
-        )
-        with open(pars['biom'], 'w') as outfile:
-            json.dump( biom_table.getBiomFormatObject(json_key),
-                           outfile )
-    else:  # Below is the biom2 compatible code
-        biom_table = biom.table.Table(
-            data,
-            #clade_ids,           #Modified by George Weingart 5/22/2017 - We will use instead the clade_names
-            clade_names,          #Modified by George Weingart 5/22/2017 - We will use instead the clade_names
-            sample_ids,
-            sample_metadata      = None,
-            observation_metadata = list(map(to_biomformat, clade_names)),
-            table_id             = table_id,
-            input_is_dense       = True
-        )
-
-        with open(pars['biom'], 'w') as outfile:
-            biom_table.to_json( json_key,
-                                direct_io = outfile )
-
-    return True
-
-def main():
-    ranks2code = { 'k' : 'superkingdom', 'p' : 'phylum', 'c':'class',
-                   'o' : 'order', 'f' : 'family', 'g' : 'genus', 's' : 'species'}
-    pars = read_params(sys.argv)
-
-    #Set SGB- / species- analysis
-    global SGB_ANALYSIS
-    SGB_ANALYSIS = not pars['mpa3']
-
-    ESTIMATE_UNK = pars['unclassified_estimation']
-    
-    if not (pars['subsampling_seed'].lower() == 'random' or pars['subsampling_seed'].isdigit()):
-        sys.stderr.write("Error: The --subsampling_seed parameter is not accepted. It should contain an integer number or \"random\". Exiting...\n\n")
-        sys.exit(1)
-    
-
-    # check if the database is installed, if not then install
-    pars['index'] = check_and_install_database(pars['index'], pars['bowtie2db'], pars['bowtie2_build'], pars['nproc'], pars['force_download'], pars['offline'])
-
-    if pars['install']:
-        sys.stderr.write('The database is installed\n')
-        return
-
-    # set correct map_pkl and bowtie2db variables
-    pars['mpa_pkl'], pars['bowtie2db'] = set_mapping_arguments(pars['index'], pars['bowtie2db'])
-
-    if (pars['bt2_ps'] in ["sensitive-local", "very-sensitive-local"]) and (pars['min_alignment_len'] is None):
-            pars['min_alignment_len'] = 100
-            sys.stderr.write('Warning! bt2_ps is set to local mode, and min_alignment_len is None, I automatically '
-                             'set min_alignment_len to 100! If you do not like, rerun the command and set '
-                             'min_alignment_len to a specific value.\n')
-
-    # check for the mpa_pkl file
-    if not os.path.isfile(pars['mpa_pkl']):
-        sys.stderr.write("Error: Unable to find the mpa_pkl file at: " + pars['mpa_pkl'] +
-                         "Exiting...\n\n")
-        sys.exit(1)
-
-    if pars['ignore_markers']:
-        with open(pars['ignore_markers']) as ignv:
-            ignore_markers = set([l.strip() for l in ignv])
-    else:
-        ignore_markers = set()
-
-    no_map = False
-    if pars['input_type'] == 'fasta' or pars['input_type'] == 'fastq':
-        bow = pars['bowtie2db'] is not None
-
-        if not bow:
-            sys.stderr.write( "No MetaPhlAn BowTie2 database provided\n "
-                              "[--bowtie2db and --index options]!\n"
-                              "Exiting...\n\n" )
-            sys.exit(1)
-
-        if pars['no_map']:
-            pars['bowtie2out'] = tf.NamedTemporaryFile(dir=pars['tmp_dir']).name
-            no_map = True
-        else:
-            if bow and not pars['bowtie2out']:
-                if pars['inp'] and "," in  pars['inp']:
-                    sys.stderr.write("Error! --bowtie2out needs to be specified when multiple "
-                                     "FASTQ or FASTA files (comma separated) are provided\n")
-                    sys.exit(1)
-                fname = pars['inp']
-                if fname is None:
-                    fname = "stdin_map"
-                elif stat.S_ISFIFO(os.stat(fname).st_mode):
-                    fname = "fifo_map"
-                pars['bowtie2out'] = fname + ".bowtie2out.txt"
-
-            if os.path.exists( pars['bowtie2out'] ) and not pars['force']:
-                sys.stderr.write(
-                    "BowTie2 output file detected: " + pars['bowtie2out'] + "\n"
-                    "Please use it as input or remove it if you want to "
-                    "re-perform the BowTie2 run.\n"
-                    "Exiting...\n\n" )
-                sys.exit(1)
-            if pars['force']:
-                if os.path.exists(pars['bowtie2out']):
-                    os.remove( pars['bowtie2out'] )
-
-        bt2_ext = 'bt2l' if SGB_ANALYSIS else 'bt2'            
-        if bow and not all([os.path.exists(".".join([str(pars['bowtie2db']), p]))
-                            for p in ["1." + bt2_ext, "2." + bt2_ext, "3." + bt2_ext, "4." + bt2_ext, "rev.1." + bt2_ext, "rev.2." + bt2_ext]]):
-            sys.stderr.write("No MetaPhlAn BowTie2 database found (--index "
-                             "option)!\nExpecting location {}\nExiting..."
-                             .format(pars['bowtie2db']))
-            sys.exit(1)
-        if bow and not (abs(os.path.getsize(".".join([str(pars['bowtie2db']), "1." + bt2_ext])) - os.path.getsize(".".join([str(pars['bowtie2db']), "rev.1." + bt2_ext]))) <= 1000):
-            sys.stderr.write("Partial MetaPhlAn BowTie2 database found at {}. "
-                             "Please remove and rebuild the database.\nExiting..."
-                             .format(pars['bowtie2db']))
-            sys.exit(1)
-
-        if bow:
-            run_bowtie2(pars['inp'], pars['bowtie2out'], pars['bowtie2db'],
-                                pars['bt2_ps'], pars['nproc'], file_format=pars['input_type'],
-                                exe=pars['bowtie2_exe'], samout=pars['samout'],
-                                min_alignment_len=pars['min_alignment_len'], read_min_len=pars['read_min_len'], min_mapq_val=pars['min_mapq_val'])
-            pars['input_type'] = 'bowtie2out'
-        pars['inp'] = pars['bowtie2out'] # !!!
-    with bz2.BZ2File( pars['mpa_pkl'], 'r' ) as a:
-        mpa_pkl = pickle.load( a )
-
-    REPORT_MERGED = mpa_pkl.get('merged_taxon',False)
-    tree = TaxTree( mpa_pkl, ignore_markers )
-    tree.set_min_cu_len( pars['min_cu_len'] )
-
-    if pars['input_type'] == 'sam' and not pars['nreads']:
-        sys.stderr.write(
-                "Please provide the size of the metagenome using the "
-                "--nreads parameter when running MetaPhlAn using SAM files as input"
-                "\nExiting...\n\n" )
-        sys.exit(1)
-
-    markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'], pars['nreads'], pars['subsampling'], pars['subsampling_seed'])
-
-    tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'])
-
-    if no_map:
-        os.remove( pars['inp'] )
-
-    map_out = []
-    for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]):
-        if marker not in tree.markers2lens:
-            continue
-        tax_seq, ids_seq = tree.add_reads( marker, len(reads),
-                                  add_viruses = pars['add_viruses'],
-                                  ignore_eukaryotes = pars['ignore_eukaryotes'],
-                                  ignore_bacteria = pars['ignore_bacteria'],
-                                  ignore_archaea = pars['ignore_archaea'],
-                                  ignore_ksgbs = pars['ignore_ksgbs'],
-                                  ignore_usgbs = pars['ignore_usgbs']
-                                  )
-        if tax_seq:
-            map_out +=["\t".join([r,tax_seq, ids_seq]) for r in sorted(reads)]
-
-    if pars['output'] is None and pars['output_file'] is not None:
-        pars['output'] = pars['output_file']
-
-    out_stream = open(pars['output'],"w") if pars['output'] else sys.stdout
-    MPA2_OUTPUT = pars['legacy_output']
-    CAMI_OUTPUT = pars['CAMI_format_output']
-
-    with out_stream as outf:
-        if not MPA2_OUTPUT:
-            outf.write('#{}\n'.format(pars['index']))
-            outf.write('#{}\n'.format(' '.join(sys.argv)))
-            outf.write('#{} reads processed\n'.format(n_metagenome_reads))
-
-        if not CAMI_OUTPUT:
-            outf.write('#' + '\t'.join((pars["sample_id_key"], pars["sample_id"])) + '\n')
-
-        if ESTIMATE_UNK:
-            mapped_reads = 0
-            cl2pr = tree.clade_profiles( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None  )
-            cl2ab, _, _ = tree.relative_abundances( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None )
-            confident_taxa = [taxstr for (taxstr, _),relab in cl2ab.items() if relab > 0.0]
-            for c, m in cl2pr.items():
-                if c in confident_taxa:
-                    markers_cov = [a  / 1000 for _, a in m if a > 0]
-                    mapped_reads += np.mean(markers_cov) * tree.all_clades[c.split('|')[-1]].glen
-            # If the mapped reads are over-estimated, set the ratio at 1
-            fraction_mapped_reads = min(mapped_reads/float(n_metagenome_reads), 1.0)
-        else:
-            fraction_mapped_reads = 1.0
-      
-        if pars['t'] == 'reads_map':
-            if not MPA2_OUTPUT:
-               outf.write('#read_id\tNCBI_taxlineage_str\tNCBI_taxlineage_ids\n')
-            outf.write( "\n".join( map_out ) + "\n" )
-
-        elif pars['t'] == 'rel_ab':
-            if CAMI_OUTPUT:
-                outf.write('''@SampleID:{}\n at Version:0.10.0\n at Ranks:superkingdom|phylum|class|order|family|genus|species|strain\n@@TAXID\tRANK\tTAXPATH\tTAXPATHSN\tPERCENTAGE\n'''.format(pars["sample_id"]))
-            if not MPA2_OUTPUT and not CAMI_OUTPUT:
-                if not pars['use_group_representative']:
-                    outf.write('#clade_name\tNCBI_tax_id\trelative_abundance\tadditional_species\n')
-                else:
-                    outf.write('#clade_name\tNCBI_tax_id\trelative_abundance\n')
-
-            cl2ab, _, tot_nreads = tree.relative_abundances(
-                        pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None )
-            
-            outpred = [(taxstr, taxid,round(relab*100.0,5)) for (taxstr, taxid), relab in cl2ab.items() if relab > 0.0]
-            has_repr = False
-            
-            if outpred:
-                if CAMI_OUTPUT:
-                    for clade, taxid, relab in sorted(  outpred, reverse=True,
-                                        key=lambda x:x[2]+(100.0*(8-(x[0].count("|"))))):
-                        if taxid and clade.split('|')[-1][0] != 't':
-                            rank = ranks2code[clade.split('|')[-1][0]]
-                            leaf_taxid = taxid.split('|')[-1]
-                            taxpathsh = '|'.join([remove_prefix(name) if '_unclassified' not in name else '' for name in clade.split('|')])
-                            outf.write( '\t'.join( [ leaf_taxid, rank, taxid, taxpathsh, str(relab*fraction_mapped_reads) ] ) + '\n' )
-                else:
-                    if ESTIMATE_UNK:
-                        outf.write( "\t".join( [    "UNCLASSIFIED",
-                                                    "-1",
-                                                    str(round((1-fraction_mapped_reads)*100,5)),""]) + "\n" )
-                                                    
-                    for clade, taxid, relab in sorted(  outpred, reverse=True,
-                                        key=lambda x:x[2]+(100.0*(8-(x[0].count("|"))))):
-                        add_repr = ''
-                        if REPORT_MERGED and (clade, taxid) in mpa_pkl['merged_taxon']:
-                            if pars['use_group_representative'] and not SGB_ANALYSIS:
-                                if '_group' in clade:
-                                    clade, taxid, _ = sorted(mpa_pkl['merged_taxon'][(clade, taxid)], key=lambda x:x[2], reverse=True)[0]
-                            elif not pars['use_group_representative']:
-                                add_repr = '{}'.format(','.join( [ n[0] for n in mpa_pkl['merged_taxon'][(clade, taxid)]] ))
-                                has_repr = True
-                        if not MPA2_OUTPUT:
-                            outf.write( "\t".join( [clade, 
-                                                    taxid, 
-                                                    str(relab*fraction_mapped_reads), 
-                                                    add_repr
-                                                ] ) + "\n" )
-                        else:
-                            outf.write( "\t".join( [clade, 
-                                                    str(relab*fraction_mapped_reads)] ) + "\n" )
-                if REPORT_MERGED and has_repr:
-                    sys.stderr.write("WARNING: The metagenome profile contains clades that represent multiple species merged into a single representant.\n"
-                                     "An additional column listing the merged species is added to the MetaPhlAn output.\n"
-                                    )
-            else:
-                if not MPA2_OUTPUT:
-                    outf.write( "UNCLASSIFIED\t-1\t100.0\t\n" )
-                else:
-                    outf.write( "UNCLASSIFIED\t100.0\n" )
-                sys.stderr.write("WARNING: MetaPhlAn did not detect any microbial taxa in the sample.\n")
-            maybe_generate_biom_file(tree, pars, outpred)
-
-        elif pars['t'] == 'rel_ab_w_read_stats':
-            cl2ab, rr, tot_nreads = tree.relative_abundances(
-                        pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None )
-
-            unmapped_reads = max(n_metagenome_reads - tot_nreads, 0)
-
-            outpred = [(taxstr, taxid,round(relab*100.0*fraction_mapped_reads,5)) for (taxstr, taxid),relab in cl2ab.items() if relab > 0.0]
-
-            if outpred:
-                outf.write( "#estimated_reads_mapped_to_known_clades:{}\n".format(round(tot_nreads)) )
-                outf.write( "\t".join( [    "#clade_name",
-                                            "clade_taxid",
-                                            "relative_abundance",
-                                            "coverage",
-                                            "estimated_number_of_reads_from_the_clade" ]) +"\n" )
-                if ESTIMATE_UNK:
-                    outf.write( "\t".join( [    "UNCLASSIFIED",
-                                                "-1",
-                                                str(round((1-fraction_mapped_reads)*100,5)),
-                                                "-",
-                                                str(round(unmapped_reads)) ]) + "\n" )
-                                                
-                for taxstr, taxid, relab in sorted(  outpred, reverse=True,
-                                    key=lambda x:x[2]+(100.0*(8-(x[0].count("|"))))):
-                    outf.write( "\t".join( [    taxstr,
-                                                taxid,
-                                                str(relab),
-                                                str(round(rr[(taxstr, taxid)][0],5)) if (taxstr, taxid) in rr else '-',          #coverage
-                                                str( int( round( rr[(taxstr, taxid)][1], 0) )  if (taxstr, taxid) in rr else '-')       #estimated_number_of_reads_from_the_clade
-                                                ] ) + "\n" )
-            else:
-                if not MPA2_OUTPUT:
-                    outf.write( "#estimated_reads_mapped_to_known_clades:0\n")
-                    outf.write( "\t".join( [    "#clade_name",
-                                                "clade_taxid",
-                                                "relative_abundance",
-                                                "coverage",
-                                                "estimated_number_of_reads_from_the_clade" ]) +"\n" )
-                    outf.write( "unclassified\t-1\t100.0\t0\t0\n" )
-                else:
-                    outf.write( "unclassified\t100.0\n" )
-            maybe_generate_biom_file(tree, pars, outpred)
-
-        elif pars['t'] == 'clade_profiles':
-            cl2pr = tree.clade_profiles( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None  )
-            for c,p in cl2pr.items():
-                mn,n = zip(*p)
-                outf.write( "\t".join( [""]+[str(s) for s in mn] ) + "\n" )
-                outf.write( "\t".join( [c]+[str(s) for s in n] ) + "\n" )
-
-        elif pars['t'] == 'marker_ab_table':
-            cl2pr = tree.clade_profiles( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None  )
-            for v in cl2pr.values():
-                outf.write( "\n".join(["\t".join([str(a),str(b/float(pars['nreads'])) if pars['nreads'] else str(b)])
-                                for a,b in v if b > 0.0]) + "\n" )
-
-        elif pars['t'] == 'marker_pres_table':
-            cl2pr = tree.clade_profiles( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None  )
-            for v in cl2pr.values():
-                strout = ["\t".join([str(a),"1"]) for a,b in v if b > pars['pres_th']]
-                if strout:
-                    outf.write( "\n".join(strout) + "\n" )
-
-        elif pars['t'] == 'marker_counts':
-            outf.write( "\n".join( ["\t".join([m,str(c)]) for m,c in tree.markers2counts().items() ]) +"\n" )
-
-        elif pars['t'] == 'clade_specific_strain_tracker':
-            cl2pr = tree.clade_profiles( None, get_all = True  )
-            cl2ab, _, _ = tree.relative_abundances( None )
-            strout = []
-            for (taxstr, taxid), relab in cl2ab.items():
-                clade = taxstr
-                if clade.endswith(pars['clade']) and relab*100.0 < pars['min_ab']:
-                    strout = []
-                    break
-                if pars['clade'] in clade:
-                    strout += ["\t".join([str(a),str(int(b > pars['pres_th']))]) for a,b in cl2pr[clade]]
-            if strout:
-                strout = sorted(strout,key=lambda x:x[0])
-                outf.write( "\n".join(strout) + "\n" )
-            else:
-                sys.stderr.write("Clade "+pars['clade']+" not present at an abundance >"+str(round(pars['min_ab'],2))+"%, "
-                                 "so no clade specific markers are reported\n")
-
-
-if __name__ == '__main__':
-    t0 = time.time()
-    main()
-    sys.stderr.write('Elapsed time to run MetaPhlAn: {} s\n'.format( (time.time()-t0) ) )
-


=====================================
metaphlan/metaphlan.py
=====================================
@@ -23,7 +23,7 @@ import re
 import time
 import random
 from collections import defaultdict as defdict
-from packaging.version import parse
+from distutils.version import LooseVersion
 from glob import glob
 from subprocess import DEVNULL
 import argparse as ap
@@ -947,7 +947,7 @@ def maybe_generate_biom_file(tree, pars, abundance_predictions):
     #      run.                                                                                   *
     #  George Weingart    05/22/2017   george.weingart at mail.com                                   *
     #**********************************************************************************************
-    if parse(biom.__version__) < parse("2.0.0"):
+    if LooseVersion(biom.__version__) < LooseVersion("2.0.0"):
         biom_table = biom.table.table_factory(
             data,
         sample_ids,



View it on GitLab: https://salsa.debian.org/med-team/metaphlan/-/commit/fd779d0688af6d4559d9db67f5a100e7bb481823

-- 
View it on GitLab: https://salsa.debian.org/med-team/metaphlan/-/commit/fd779d0688af6d4559d9db67f5a100e7bb481823
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20250505/1427b9e6/attachment-0001.htm>