[med-svn] [Git][med-team/flye][master] 6 commits: routine-update: New upstream version

Andreas Tille (@tille) gitlab at salsa.debian.org
Tue Jul 11 16:49:56 BST 2023



Andreas Tille pushed to branch master at Debian Med / flye


Commits:
08be7934 by Andreas Tille at 2023-07-11T17:36:08+02:00
routine-update: New upstream version

- - - - -
114451b2 by Andreas Tille at 2023-07-11T17:36:09+02:00
New upstream version 2.9.2+dfsg
- - - - -
333bbb88 by Andreas Tille at 2023-07-11T17:36:28+02:00
Update upstream source from tag 'upstream/2.9.2+dfsg'

Update to upstream version '2.9.2+dfsg'
with Debian dir 92d248706649cd3daf2faa1fe12d72adafacdf13
- - - - -
a61543fe by Andreas Tille at 2023-07-11T17:36:28+02:00
routine-update: Standards-Version: 4.6.2

- - - - -
a7e4fb04 by Andreas Tille at 2023-07-11T17:46:28+02:00
Adapt patch

- - - - -
4b87e567 by Andreas Tille at 2023-07-11T17:48:56+02:00
Upload to unstable

- - - - -


15 changed files:

- Makefile
- README.md
- debian/changelog
- debian/control
- debian/patches/use_debian_packaged_libs.patch
- docs/FAQ.md
- docs/NEWS.md
- flye/__build__.py
- flye/__version__.py
- flye/assembly/assemble.py
- flye/main.py
- flye/polishing/alignment.py
- flye/polishing/bubbles.py
- flye/polishing/polish.py
- flye/utils/sam_parser.py


Changes:

=====================================
Makefile
=====================================
@@ -11,6 +11,11 @@ export SAMTOOLS_DIR = ${ROOT_DIR}/lib/samtools-1.9
 export CXXFLAGS += ${LIBCUCKOO} ${INTERVAL_TREE} ${LEMON} -I${MINIMAP2_DIR}
 export LDFLAGS += -lz -L${MINIMAP2_DIR} -lminimap2
 
+ifeq ($(shell uname -m),arm64)
+	export arm_neon=1
+	export aarch64=1
+endif
+
 .PHONY: clean all profile debug minimap2 samtools
 
 .DEFAULT_GOAL := all


=====================================
README.md
=====================================
@@ -3,7 +3,7 @@ Flye assembler
 
 [![BioConda Install](https://img.shields.io/conda/dn/bioconda/flye.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/flye)
 
-### Version: 2.9.1
+### Version: 2.9.2
 
 Flye is a de novo assembler for single-molecule sequencing reads,
 such as those produced by PacBio and Oxford Nanopore Technologies.
@@ -26,6 +26,11 @@ Manuals
 Latest updates
 --------------
 
+### Flye 2.9.2 release (18 March 2023)
+* Update to minimap 2.24 + using HiFi and Kit14 parameters for faster alignment
+* Fixed a few small bugs and corner cases
+* Polisher now outputs read depth for each base of consensus (can be used as confidence measure)
+
 ### Flye 2.9.1 release (07 Aug 2022)
 * New option --no-alt-contigs to remove all non-primary contigs from the assembly
 * Fixed crash on MacOS with Python 3.8+


=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+flye (2.9.2+dfsg-1) unstable; urgency=medium
+
+  * New upstream version
+  * Standards-Version: 4.6.2 (routine-update)
+
+ -- Andreas Tille <tille at debian.org>  Tue, 11 Jul 2023 17:46:42 +0200
+
 flye (2.9.1+dfsg-1) unstable; urgency=medium
 
   * New upstream version


=====================================
debian/control
=====================================
@@ -11,7 +11,7 @@ Build-Depends: debhelper-compat (= 13),
                libminimap2-dev,
                samtools,
                zlib1g-dev
-Standards-Version: 4.6.1
+Standards-Version: 4.6.2
 Vcs-Browser: https://salsa.debian.org/med-team/flye
 Vcs-Git: https://salsa.debian.org/med-team/flye.git
 Homepage: https://github.com/fenderglass/Flye


=====================================
debian/patches/use_debian_packaged_libs.patch
=====================================
@@ -4,7 +4,7 @@ Description: use Debian packaged libminimap2 and liblemon
 
 --- a/Makefile
 +++ b/Makefile
-@@ -8,10 +8,10 @@ export BIN_DIR = ${ROOT_DIR}/bin
+@@ -8,15 +8,15 @@ export BIN_DIR = ${ROOT_DIR}/bin
  export MINIMAP2_DIR = ${ROOT_DIR}/lib/minimap2
  export SAMTOOLS_DIR = ${ROOT_DIR}/lib/samtools-1.9
  
@@ -13,12 +13,17 @@ Description: use Debian packaged libminimap2 and liblemon
 +export CXXFLAGS += ${LIBCUCKOO} ${INTERVAL_TREE} ${LEMON}
 +export LDFLAGS += -lz -lminimap2
  
+ ifeq ($(shell uname -m),arm64)
+ 	export arm_neon=1
+ 	export aarch64=1
+ endif
+ 
 -.PHONY: clean all profile debug minimap2 samtools
 +.PHONY: clean all profile debug
  
  .DEFAULT_GOAL := all
  
-@@ -29,15 +29,13 @@ ${BIN_DIR}/flye-samtools:
+@@ -34,15 +34,13 @@ ${BIN_DIR}/flye-samtools:
  	make samtools -C ${SAMTOOLS_DIR} -j ${THREADS}
  	cp ${SAMTOOLS_DIR}/samtools ${BIN_DIR}/flye-samtools
  


=====================================
docs/FAQ.md
=====================================
@@ -205,15 +205,16 @@ Do I need to preprocess my reads in any way?
 --------------------------------------------
 
 No, usually it is not necessary. Flye automatically filters out
-chimeric reads or reads with bad ends. If the read coverage is very high,
-you can use the built-in `--asm-coverage` option for subsampling the longest ones.
+chimeric reads or reads with bad ends. Adapter trimming and quality
+filtering is not needed either.
 
-Note that in PacBio mode, Flye assumes that the input files represent PacBio subreads,
+If the read coverage is very high, you can use the built-in `--asm-coverage` option for subsampling the longest ones.
+
+Note that in PacBio CLR mode, Flye assumes that the input files represent PacBio subreads,
 e.g. adaptors and scraps are removed and multiple passes of the same insertion
 sequence are separated. This is typically handled by PacBio instruments/toolchains,
-however we saw examples of problemmatic raw -> fastq conversions, 
-which resulted into incorrect subreads. In this case, 
-consider using [pbclip](https://github.com/fenderglass/pbclip) to fix your Fasta/q reads.
+however we saw examples of problemmatic raw -> fastq conversions with old CLR data. 
+In this case, consider using [pbclip](https://github.com/fenderglass/pbclip) to fix your Fasta/q reads.
 
 Are cluster environments (SGE / Slurm etc.) supported?
 ------------------------------------------------------


=====================================
docs/NEWS.md
=====================================
@@ -1,3 +1,9 @@
+Flye 2.9.2 release (18 March 2023)
+=================================
+* Update to minimap 2.24 + using HiFi and Kit14 parameters for faster alignment
+* Fixed a few small bugs and corner cases
+* Polisher now outputs read depth for each base of consensus (can be used as confidence measure)
+
 Flye 2.9.1 release (07 Aug 2022)
 ===============================
 * New option --no-alt-contigs to remove all non-primary contigs from the assembly


=====================================
flye/__build__.py
=====================================
@@ -1 +1 @@
-__build__ = 1780
+__build__ = 1786


=====================================
flye/__version__.py
=====================================
@@ -1 +1 @@
-__version__ = "2.9.1"
+__version__ = "2.9.2"


=====================================
flye/assembly/assemble.py
=====================================
@@ -41,7 +41,7 @@ def assemble(args, run_params, out_file, log_file, config_path):
     logger.info("Assembling disjointigs")
     logger.debug("-----Begin assembly log------")
     cmdline = [ASSEMBLE_BIN, "assemble", "--reads", ",".join(args.reads), "--out-asm", out_file,
-               "--config", config_path, "--log", log_file, "--threads", str(args.threads)]
+               "--config", config_path, "--log", log_file, "--threads", str(1 if args.deterministic else args.threads)]
     if args.debug:
         cmdline.append("--debug")
     if args.meta:


=====================================
flye/main.py
=====================================
@@ -30,8 +30,8 @@ from flye.utils.bytes2human import human2bytes, bytes2human
 from flye.utils.sam_parser import AlignmentException
 import flye.utils.fasta_parser as fp
 #import flye.short_plasmids.plasmids as plas
-import flye.trestle.trestle as tres
-import flye.trestle.graph_resolver as tres_graph
+#import flye.trestle.trestle as tres
+#import flye.trestle.graph_resolver as tres_graph
 from flye.repeat_graph.repeat_graph import RepeatGraph
 from flye.six.moves import range
 
@@ -124,43 +124,6 @@ class JobAssembly(Job):
         logger.debug("Disjointigs length: %d, N50: %d", asm_len, asm_n50)
 
 
-#class JobShortPlasmidsAssembly(Job):
-#    def __init__(self, args, work_dir, contigs_file, repeat_graph,
-#                 graph_edges):
-#        super(JobShortPlasmidsAssembly, self).__init__()
-#
-#        self.args = args
-#        self.work_dir = work_dir
-#        self.work_dir = os.path.join(work_dir, "22-plasmids")
-#        self.contigs_path = contigs_file
-#        self.repeat_graph = repeat_graph
-#        self.graph_edges = graph_edges
-#
-#        self.name = "plasmids"
-#        self.out_files["repeat_graph"] = os.path.join(self.work_dir,
-#                                                      "repeat_graph_dump")
-#        self.out_files["repeat_graph_edges"] = \
-#            os.path.join(self.work_dir, "repeat_graph_edges.fasta")
-#
-#
-#    def run(self):
-#        super(JobShortPlasmidsAssembly, self).run()
-#        logger.info("Recovering short unassembled sequences")
-#        if not os.path.isdir(self.work_dir):
-#            os.mkdir(self.work_dir)
-#        plasmids = plas.assemble_short_plasmids(self.args, self.work_dir,
-#                                                self.contigs_path)
-#
-#        #updating repeat graph
-#        repeat_graph = RepeatGraph(fp.read_sequence_dict(self.graph_edges))
-#        repeat_graph.load_from_file(self.repeat_graph)
-#        plas.update_graph(repeat_graph, plasmids)
-#        repeat_graph.dump_to_file(self.out_files["repeat_graph"])
-#        fp.write_fasta_dict(repeat_graph.edges_fasta,
-#                            self.out_files["repeat_graph_edges"])
-
-
-
 class JobRepeat(Job):
     def __init__(self, args, work_dir, log_file, disjointigs):
         super(JobRepeat, self).__init__()
@@ -314,8 +277,7 @@ class JobConsensus(Job):
         logger.info("Running Minimap2")
         out_alignment = os.path.join(self.consensus_dir, "minimap.bam")
         aln.make_alignment(self.in_contigs, self.args.reads, self.args.threads,
-                           self.consensus_dir, self.args.platform, out_alignment,
-                           reference_mode=True, sam_output=True)
+                           self.args.platform, self.args.read_type, out_alignment)
 
         contigs_info = aln.get_contigs_info(self.in_contigs)
         logger.info("Computing consensus")
@@ -363,7 +325,7 @@ class JobPolishing(Job):
                                self.out_files["stats"], self.out_files["contigs"])
         pol.generate_polished_edges(self.in_graph_edges, self.in_graph_gfa,
                                     self.out_files["contigs"],
-                                    self.polishing_dir, self.args.platform,
+                                    self.polishing_dir, self.args.platform, self.args.read_type,
                                     stats, self.args.threads)
         os.remove(contigs)
         if os.path.getsize(self.out_files["contigs"]) == 0:
@@ -371,60 +333,6 @@ class JobPolishing(Job):
                                         "pipeline stopped")
 
 
-class JobTrestle(Job):
-    def __init__(self, args, work_dir, log_file, repeat_graph,
-                 graph_edges, reads_alignment_file):
-        super(JobTrestle, self).__init__()
-
-        self.args = args
-        self.work_dir = os.path.join(work_dir, "21-trestle")
-        self.log_file = log_file
-        #self.repeats_dump = repeats_dump
-        self.graph_edges = graph_edges
-        self.repeat_graph = repeat_graph
-        self.reads_alignment_file = reads_alignment_file
-
-        self.name = "trestle"
-        self.out_files["repeat_graph"] = os.path.join(self.work_dir,
-                                                      "repeat_graph_dump")
-        self.out_files["repeat_graph_edges"] = \
-            os.path.join(self.work_dir, "repeat_graph_edges.fasta")
-
-    def run(self):
-        super(JobTrestle, self).run()
-
-        if not os.path.isdir(self.work_dir):
-            os.mkdir(self.work_dir)
-
-        summary_file = os.path.join(self.work_dir, "trestle_summary.txt")
-        resolved_repeats_seqs = os.path.join(self.work_dir,
-                                             "resolved_copies.fasta")
-        repeat_graph = RepeatGraph(fp.read_sequence_dict(self.graph_edges))
-        repeat_graph.load_from_file(self.repeat_graph)
-
-        try:
-            repeats_info = tres_graph \
-                .get_simple_repeats(repeat_graph, self.reads_alignment_file,
-                                    fp.read_sequence_dict(self.graph_edges))
-            tres_graph.dump_repeats(repeats_info,
-                                    os.path.join(self.work_dir, "repeats_dump"))
-
-            tres.resolve_repeats(self.args, self.work_dir, repeats_info,
-                                 summary_file, resolved_repeats_seqs)
-            tres_graph.apply_changes(repeat_graph, summary_file,
-                                     fp.read_sequence_dict(resolved_repeats_seqs))
-        except KeyboardInterrupt as e:
-            raise
-        #except Exception as e:
-        #    logger.warning("Caught unhandled exception: " + str(e))
-        #    logger.warning("Continuing to the next pipeline stage. "
-        #                   "Please submit a bug report along with the full log file")
-
-        repeat_graph.dump_to_file(self.out_files["repeat_graph"])
-        fp.write_fasta_dict(repeat_graph.edges_fasta,
-                            self.out_files["repeat_graph_edges"])
-
-
 def _create_job_list(args, work_dir, log_file):
     """
     Build pipeline as a list of consecutive jobs
@@ -452,13 +360,12 @@ def _create_job_list(args, work_dir, log_file):
 
     #Trestle: Resolve Unbridged Repeats
     #if not args.no_trestle and not args.meta and args.read_type == "raw":
-    if args.trestle:
-        jobs.append(JobTrestle(args, work_dir, log_file,
-                    repeat_graph, repeat_graph_edges,
-                    reads_alignment))
-        repeat_graph_edges = jobs[-1].out_files["repeat_graph_edges"]
-        repeat_graph = jobs[-1].out_files["repeat_graph"]
-
+    #if args.trestle:
+    #    jobs.append(JobTrestle(args, work_dir, log_file,
+    #                repeat_graph, repeat_graph_edges,
+    #                reads_alignment))
+    #    repeat_graph_edges = jobs[-1].out_files["repeat_graph_edges"]
+    #    repeat_graph = jobs[-1].out_files["repeat_graph"]
 
     #Contigger
     jobs.append(JobContigger(args, work_dir, log_file, repeat_graph_edges,
@@ -619,7 +526,8 @@ def _usage():
             "\t     [--meta] [--polish-target] [--min-overlap SIZE]\n"
             "\t     [--keep-haplotypes] [--debug] [--version] [--help] \n"
             "\t     [--scaffold] [--resume] [--resume-from] [--stop-after] \n"
-            "\t     [--read-error float] [--extra-params]")
+            "\t     [--read-error float] [--extra-params] \n"
+            "\t     [--deterministic]")
 
 
 def _epilog():
@@ -752,6 +660,9 @@ def main():
                         dest="debug", default=False,
                         help="enable debug output")
     parser.add_argument("-v", "--version", action="version", version=_version())
+    parser.add_argument("--deterministic", action="store_true",
+                        dest="deterministic", default=False,
+                        help="perform disjointig assembly single-threaded")
     args = parser.parse_args()
 
     if args.asm_coverage and (args.genome_size is None):


=====================================
flye/polishing/alignment.py
=====================================
@@ -39,20 +39,9 @@ def check_binaries():
 
 
 def make_alignment(reference_file, reads_file, num_proc,
-                   work_dir, platform, out_alignment, reference_mode,
-                   sam_output):
-    """
-    Runs minimap2 and sorts its output
-    """
-    minimap_ref_mode = {False: "ava", True: "map"}
-    minimap_reads_mode = {"nano": "ont", "pacbio": "pb"}
-    mode = minimap_ref_mode[reference_mode] + "-" + minimap_reads_mode[platform]
-
-    _run_minimap(reference_file, reads_file, num_proc, mode,
-                 out_alignment, sam_output)
-
-    #if sam_output:
-    #    preprocess_sam(out_alignment, work_dir)
+                   platform, read_type, out_alignment):
+    mode = platform + "-" + read_type
+    _run_minimap(reference_file, reads_file, num_proc, mode, out_alignment)
 
 
 def get_contigs_info(contigs_file):
@@ -107,7 +96,8 @@ def get_uniform_alignments(alignments):
     MIN_QV = 20
 
     def is_reliable(aln):
-        return not aln.is_secondary and not aln.is_supplementary and aln.map_qv >= MIN_QV
+        return not aln.is_secondary and aln.map_qv >= MIN_QV
+        #return not aln.is_secondary and not aln.is_supplementary and aln.map_qv >= MIN_QV
 
     seq_len = alignments[0].trg_len
     ctg_id = alignments[0].trg_id
@@ -234,8 +224,7 @@ def merge_chunks(fasta_in, fold_function=lambda l: "".join(l)):
     return out_dict
 
 
-def _run_minimap(reference_file, reads_files, num_proc, mode, out_file,
-                 sam_output):
+def _run_minimap(reference_file, reads_files, num_proc, reads_type, out_file):
     #SAM_HEADER = "\'@PG|@HD|@SQ|@RG|@CO\'"
     work_dir = os.path.dirname(out_file)
     stderr_file = os.path.join(work_dir, "minimap.stderr")
@@ -243,9 +232,22 @@ def _run_minimap(reference_file, reads_files, num_proc, mode, out_file,
     SORT_MEM = "4G" if os.path.getsize(reference_file) > 100 * 1024 * 1024 else "1G"
     BATCH = "5G" if os.path.getsize(reference_file) > 100 * 1024 * 1024 else "1G"
 
+    mode = None
+    extra_args = []
+    if reads_type in ["pacbio-raw", "pacbio-corrected", "pacbio-subasm"]:
+        mode = "map-pb"
+    if reads_type == "pacbio-hifi":
+        mode = "map-hifi"
+    elif reads_type in ["nano-raw", "nano-corrected"]:
+        mode = "map-ont"
+    elif reads_type == "nano-nano_hq":
+        mode = "map-ont"
+        extra_args = ["-k", "17"]
+
     cmdline = [MINIMAP_BIN, "'" + reference_file + "'"]
     cmdline.extend(["'" + read_file + "'" for read_file in reads_files])
     cmdline.extend(["-x", mode, "-t", str(num_proc)])
+    cmdline.extend(extra_args)
 
     #Produces gzipped SAM sorted by reference name. Since it's not sorted by
     #read name anymore, it's important that all reads have SEQ.
@@ -256,22 +258,14 @@ def _run_minimap(reference_file, reads_files, num_proc, mode, out_file,
     #--secondary-seq = custom option to output SEQ for seqcondary alignment with hard clipping
     #-L: move CIGAR strings for ultra-long reads to the separate tag
     #-Q don't output fastq quality
-    if sam_output:
-        tmp_prefix = os.path.join(os.path.dirname(out_file),
-                                  "sort_" + datetime.datetime.now().strftime("%y%m%d_%H%M%S"))
-        cmdline.extend(["-a", "-p", "0.5", "-N", "10", "--sam-hit-only", "-L", "-K", BATCH,
-                        "-z", "1000", "-Q", "--secondary-seq", "-I", "64G"])
-        cmdline.extend(["|", SAMTOOLS_BIN, "view", "-T", "'" + reference_file + "'", "-u", "-"])
-        cmdline.extend(["|", SAMTOOLS_BIN, "sort", "-T", "'" + tmp_prefix + "'", "-O", "bam",
-                        "-@", SORT_THREADS, "-l", "1", "-m", SORT_MEM])
-        cmdline.extend(["-o", "'" + out_file + "'"])
-    else:
-        pass    #paf output enabled by default
-
-        #cmdline.extend(["|", "grep", "-Ev", SAM_HEADER])    #removes headers
-        #cmdline.extend(["|", "sort", "-k", "3,3", "-T", work_dir,
-        #                "--parallel=8", "-S", "4G"])
-        #cmdline.extend(["|", "gzip", "-1"])
+    tmp_prefix = os.path.join(os.path.dirname(out_file),
+                              "sort_" + datetime.datetime.now().strftime("%y%m%d_%H%M%S"))
+    cmdline.extend(["-a", "-p", "0.5", "-N", "10", "--sam-hit-only", "-L", "-K", BATCH,
+                    "-z", "1000", "-Q", "--secondary-seq", "-I", "64G"])
+    cmdline.extend(["|", SAMTOOLS_BIN, "view", "-T", "'" + reference_file + "'", "-u", "-"])
+    cmdline.extend(["|", SAMTOOLS_BIN, "sort", "-T", "'" + tmp_prefix + "'", "-O", "bam",
+                    "-@", SORT_THREADS, "-l", "1", "-m", SORT_MEM])
+    cmdline.extend(["-o", "'" + out_file + "'"])
 
     #logger.debug("Running: " + " ".join(cmdline))
     try:
@@ -280,11 +274,11 @@ def _run_minimap(reference_file, reads_files, num_proc, mode, out_file,
                               "set -eo pipefail; " + " ".join(cmdline)],
                               stderr=open(stderr_file, "w"),
                               stdout=open(os.devnull, "w"))
-        if sam_output:
-            subprocess.check_call(SAMTOOLS_BIN + " index -@ 4 " + "'" + out_file + "'", shell=True)
+        subprocess.check_call(SAMTOOLS_BIN + " index -@ 4 " + "'" + out_file + "'", shell=True)
         #os.remove(stderr_file)
 
     except (subprocess.CalledProcessError, OSError) as e:
         logger.error("Error running minimap2, terminating. See the alignment error log "
                      " for details: " + stderr_file)
+        logger.error("Cmd: " + " ".join(cmdline))
         raise AlignmentException(str(e))


=====================================
flye/polishing/bubbles.py
=====================================
@@ -328,7 +328,7 @@ def _compute_profile(alignment, ref_sequence):
     genome_len = alignment[0].trg_len
 
     #max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
-    min_aln_len = cfg.vals["min_polish_aln_len"]
+    min_aln_len = min(cfg.vals["min_polish_aln_len"], genome_len // 2)
     aln_errors = []
     #filtered = 0
     profile = [ProfileInfo() for _ in range(genome_len)]


=====================================
flye/polishing/polish.py
=====================================
@@ -12,6 +12,7 @@ import logging
 import subprocess, multiprocessing
 import os
 from collections import defaultdict
+import gzip
 
 from flye.polishing.alignment import (make_alignment, get_contigs_info,
                                       merge_chunks, split_into_chunks)
@@ -64,6 +65,7 @@ def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, read_platfo
     use_hopo = cfg.vals["err_modes"][read_platform]["hopo_enabled"]
     use_hopo = use_hopo and (read_type == "raw")
     stats_file = os.path.join(work_dir, "contigs_stats.txt")
+    bed_coverage = os.path.join(work_dir, "base_coverage.bed.gz")
 
     bam_input = read_seqs[0].endswith("bam")
 
@@ -78,8 +80,7 @@ def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, read_platfo
             logger.info("Running minimap2")
             alignment_file = os.path.join(work_dir, "minimap_{0}.bam".format(i + 1))
             make_alignment(prev_assembly, read_seqs, num_threads,
-                           work_dir, read_platform, alignment_file,
-                           reference_mode=True, sam_output=True)
+                           read_platform, read_type, alignment_file)
         else:
             logger.info("Polishing with provided bam")
             alignment_file = read_seqs[0]
@@ -109,7 +110,7 @@ def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, read_platfo
         logger.info("Correcting bubbles")
         _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix,
                         consensus_out, num_threads, output_progress, use_hopo)
-        polished_fasta, polished_lengths = _compose_sequence(consensus_out)
+        polished_fasta, polished_lengths, bubble_coverages = _compose_sequence(consensus_out)
         fp.write_fasta_dict(polished_fasta, polished_file)
 
         #Cleanup
@@ -127,6 +128,14 @@ def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, read_platfo
             f.write("{0}\t{1}\t{2}\n".format(ctg_id,
                     contig_lengths[ctg_id], coverage_stats[ctg_id]))
 
+    with gzip.open(bed_coverage, "wt") as f:
+        f.write("#seq_name\tstart\tend\tcoverage\n")
+        for ctg_id in bubble_coverages:
+            cur_pos = 0
+            for (bub_len, bub_cov) in bubble_coverages[ctg_id]:
+                f.write("{0}\t{1}\t{2}\t{3}\n".format(ctg_id, cur_pos, cur_pos + bub_len, bub_cov))
+                cur_pos += bub_len
+
     if not output_progress:
         logger.disabled = logger_state
 
@@ -134,7 +143,7 @@ def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, read_platfo
 
 
 def generate_polished_edges(edges_file, gfa_file, polished_contigs, work_dir,
-                            error_mode, polished_stats, num_threads):
+                            platform, read_type, polished_stats, num_threads):
     """
     Generate polished graph edges sequences by extracting them from
     polished contigs
@@ -153,8 +162,7 @@ def generate_polished_edges(edges_file, gfa_file, polished_contigs, work_dir,
     alignment_file = os.path.join(work_dir, "edges_aln.bam")
     polished_dict = fp.read_sequence_dict(polished_contigs)
     make_alignment(polished_contigs, [edges_file], num_threads,
-                   work_dir, error_mode, alignment_file,
-                   reference_mode=True, sam_output=True)
+                   platform, read_type, alignment_file)
     aln_reader = SynchronizedSamReader(alignment_file,
                                        polished_dict, multiprocessing.Manager(),
                                        cfg.vals["max_read_coverage"])
@@ -309,19 +317,23 @@ def _compose_sequence(consensus_file):
 
                 ctg_id = tokens[0][1:]
                 ctg_pos = int(tokens[1])
-                #coverage[ctg_id].append(int(tokens[2]))
+                coverage = int(tokens[2])
                 ctg_sub_pos = int(tokens[3])
             else:
-                consensuses[ctg_id].append((ctg_pos, ctg_sub_pos, line.strip()))
+                consensuses[ctg_id].append((ctg_pos, ctg_sub_pos, coverage, line.strip()))
             header = not header
 
     polished_fasta = {}
     polished_stats = {}
+    polished_coverages = {}
     for ctg_id, seqs in iteritems(consensuses):
-        sorted_seqs = [p[2] for p in sorted(seqs, key=lambda p: (p[0], p[1]))]
+        seqs.sort(key=lambda p: (p[0], p[1]))
+        sorted_seqs = [p[3] for p in seqs]
+        bubble_coverages = [(len(p[3]), p[2]) for p in seqs]
         concat_seq = "".join(sorted_seqs)
         #mean_coverage = sum(coverage[ctg_id]) / len(coverage[ctg_id])
         polished_fasta[ctg_id] = concat_seq
         polished_stats[ctg_id] = len(concat_seq)
+        polished_coverages[ctg_id] = bubble_coverages
 
-    return polished_fasta, polished_stats
+    return polished_fasta, polished_stats, polished_coverages


=====================================
flye/utils/sam_parser.py
=====================================
@@ -240,7 +240,7 @@ class SynchronizedSamReader(object):
                     soft_clipped_left += size
                 else:
                     soft_clipped_right += size
-            elif op == b"M":
+            elif op in b"MX=":
                 qry_seq.append(read_str[qry_pos : qry_pos + size].upper())
                 trg_seq.append(ctg_str[trg_pos : trg_pos + size].upper())
                 qry_pos += size



View it on GitLab: https://salsa.debian.org/med-team/flye/-/compare/2fbaaf43498c5d30a843f6ed97e5d0a581061b3b...4b87e567c262fab28205968e496f6fdbb5551219

-- 
View it on GitLab: https://salsa.debian.org/med-team/flye/-/compare/2fbaaf43498c5d30a843f6ed97e5d0a581061b3b...4b87e567c262fab28205968e496f6fdbb5551219
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230711/f749c8e7/attachment-0001.htm>


More information about the debian-med-commit mailing list