[med-svn] [circlator] 01/04: New upstream version 1.5.1

Sascha Steinbiss satta at debian.org
Fri Jun 23 10:15:16 UTC 2017


This is an automated email from the git hooks/post-receive script.

satta pushed a commit to branch master
in repository circlator.

commit e6369ca76bcbacfbedc915b65ef4c5e1aaea709e
Author: Sascha Steinbiss <satta at debian.org>
Date:   Fri Jun 23 11:58:53 2017 +0200

    New upstream version 1.5.1
---
 AUTHORS                                            |  2 +
 circlator/assemble.py                              | 69 +++++++++++++++--
 circlator/assembly.py                              | 90 +++++++++++++++-------
 circlator/bamfilter.py                             | 32 +++++++-
 circlator/common.py                                |  3 +
 circlator/external_progs.py                        | 17 +++-
 circlator/merge.py                                 | 25 ++++--
 circlator/tasks/all.py                             | 17 +++-
 circlator/tasks/assemble.py                        |  6 +-
 circlator/tasks/bam2reads.py                       |  2 +
 circlator/tasks/merge.py                           |  8 ++
 circlator/tests/assemble_test.py                   | 33 ++++++++
 circlator/tests/assembly_test.py                   | 30 +++++---
 .../assemble_test_rename_canu_contigs.expect.fa    |  8 ++
 .../data/assemble_test_rename_canu_contigs.in.fa   |  8 ++
 .../assembly_test_circular_contigs_from_canu.gfa   | 14 ++++
 circlator/versions.py                              |  4 +-
 install_dependencies.sh                            | 10 +++
 setup.py                                           |  4 +-
 19 files changed, 314 insertions(+), 68 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 711ae85..31dc516 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1 +1,3 @@
 Martin Hunt (mh12 at sanger.ac.uk)
+
+This small modification to allow replacing Spades with Canu, and different handling of small contigs, has been done by Nicola De Maio (nicola.de.maio.85 at gmail.com)
diff --git a/circlator/assemble.py b/circlator/assemble.py
index c6b91ad..7396f43 100644
--- a/circlator/assemble.py
+++ b/circlator/assemble.py
@@ -17,6 +17,9 @@ class Assembler:
       only_assembler=True,
       verbose=False,
       spades_use_first_success=False,
+      assembler='spades',
+      genomeSize=100000, # only matters for Canu if correcting reads (which we're not)
+      data_type='pacbio-corrected',
     ):
         self.outdir = os.path.abspath(outdir)
         self.reads = os.path.abspath(reads)
@@ -24,14 +27,23 @@ class Assembler:
             raise Error('Reads file not found:' + self.reads)
 
         self.verbose = verbose
-        self.threads = threads
-        self.careful = careful
-        self.only_assembler = only_assembler
-        self.spades = external_progs.make_and_check_prog('spades', verbose=self.verbose)
-        self.spades_kmers = self._build_spades_kmers(spades_kmers)
-        self.spades_use_first_success = spades_use_first_success
         self.samtools = external_progs.make_and_check_prog('samtools', verbose=self.verbose)
-        self.assembler = 'spades'
+        self.threads = threads
+        self.assembler = assembler
+
+        if self.assembler == 'spades':
+            self.spades = external_progs.make_and_check_prog('spades', verbose=self.verbose, required=True)
+            self.spades_kmers = self._build_spades_kmers(spades_kmers)
+            self.spades_use_first_success = spades_use_first_success
+            self.careful = careful
+            self.only_assembler = only_assembler
+        elif self.assembler == 'canu':
+            self.canu = external_progs.make_and_check_prog('canu', verbose=self.verbose, required=True)
+            self.genomeSize=genomeSize
+            self.data_type = data_type
+        else:
+            raise Error('Unknown assembler: "' + self.assembler + '". cannot continue')
+
 
 
     def _build_spades_kmers(self, kmers):
@@ -67,6 +79,21 @@ class Assembler:
         return ' '.join(cmd)
 
 
+    def _make_canu_command(self, outdir, out_name):
+        cmd = [
+            self.canu.exe(),
+            '-useGrid=false',
+            'gnuplotTested=true',
+            '-assemble',
+            'genomeSize='+str(float(self.genomeSize)/1000000)+'m',
+            '-d', outdir,
+            '-p', out_name,
+            '-'+self.data_type,
+            self.reads,
+        ]
+        return ' '.join(cmd)
+
+
     def run_spades_once(self, kmer, outdir):
         cmd = self._make_spades_command(kmer, outdir)
         return common.syscall(cmd, verbose=self.verbose, allow_fail=True)
@@ -118,8 +145,36 @@ class Assembler:
             raise Error('Error running SPAdes. Output directories are:\n  ' + '\n  '.join(kmer_to_dir.values()) + '\nThe reason why should be in the spades.log file in each directory.')
 
 
+    @classmethod
+    def _rename_canu_contigs(cls, infile, outfile):
+        with open (infile) as f_in:
+            with open(outfile, 'w') as f_out:
+                for line in f_in:
+                    if line.startswith('>'):
+                        print(line.split()[0], file=f_out)
+                    else:
+                        print(line, end='', file=f_out)
+
+
+    def run_canu(self):
+        '''Runs canu instead of spades'''
+        cmd = self._make_canu_command(self.outdir,'canu')
+        ok, errs = common.syscall(cmd, verbose=self.verbose, allow_fail=False)
+        if not ok:
+            raise Error('Error running Canu.')
+
+        original_contigs = os.path.join(self.outdir, 'canu.contigs.fasta')
+        renamed_contigs = os.path.join(self.outdir, 'contigs.fasta')
+        Assembler._rename_canu_contigs(original_contigs, renamed_contigs)
+        original_gfa = os.path.join(self.outdir, 'canu.contigs.gfa')
+        renamed_gfa = os.path.join(self.outdir, 'contigs.gfa')
+        os.rename(original_gfa, renamed_gfa)
+
+
     def run(self):
         if self.assembler == 'spades':
             self.run_spades(stop_at_first_success=self.spades_use_first_success)
+        elif self.assembler == 'canu':
+            self.run_canu()
         else:
             raise Error('Unknown assembler: "' + self.assembler + '". cannot continue')
diff --git a/circlator/assembly.py b/circlator/assembly.py
index daa55c4..70a3bf6 100644
--- a/circlator/assembly.py
+++ b/circlator/assembly.py
@@ -4,16 +4,19 @@ import pyfastaq
 class Error (Exception): pass
 
 class Assembly:
-    def __init__(self, path):
+    def __init__(self, path, assembler):
         '''path can be a directory or a filename. If directory, assumes the name of a SPAdes
+           or Canu (with files contigs.fasta and contigs.gfa files)
            output directory. If a file, assumes it is a fasta file of contigs'''
+        self.assembler = assembler
+
         if not os.path.exists(path):
             raise Error('Input path to Assembly.__init__ not found: ' + path)
         elif os.path.isdir(path):
-            self.spades_dir = os.path.abspath(path)
+            self.assembler_dir = os.path.abspath(path)
         else:
             self.contigs_fasta = os.path.abspath(path)
-            self.spades_dir = None
+            self.assembler_dir = None
 
         self._set_filenames()
 
@@ -27,34 +30,42 @@ class Assembly:
 
 
     def _set_filenames(self):
+        self.contigs_gfa = None
         self.contigs_fastg = None
         self.contigs_paths = None
         self.assembly_graph_fastg = None
 
-        if self.spades_dir is None:
+        if self.assembler_dir is None:
             return
 
-        contigs_fasta = os.path.join(self.spades_dir, 'contigs.fasta')
-        self.contigs_fasta = self._file_exists(os.path.join(contigs_fasta))
+        contigs_fasta = os.path.join(self.assembler_dir, 'contigs.fasta')
+        self.contigs_fasta = self._file_exists(contigs_fasta)
 
         if self.contigs_fasta is None:
-            raise Error('Error finding SPAdes contigs file: ' + contigs_fasta)
-
-        self.contigs_fastg = self._file_exists(os.path.join(self.spades_dir, 'contigs.fastg'))
-        self.contigs_paths = self._file_exists(os.path.join(self.spades_dir, 'contigs.paths'))
-        self.assembly_graph_fastg = self._file_exists(os.path.join(self.spades_dir, 'assembly_graph.fastg'))
-
-        if None == self.contigs_fastg == self.contigs_paths == self.assembly_graph_fastg or \
-           ( self.contigs_fastg is None and (None in {self.contigs_paths, self.assembly_graph_fastg}) ) or \
-           ( self.contigs_fastg is not None and (self.contigs_paths is not None or self.assembly_graph_fastg is not None) ):
-            error_message = '\n'.join([
-                 'Error finding SPAdes graph files in the directory ' + self.spades_dir,
-                 'Expected either:',
-                 '    contigs.fastg (SPAdes <3.6.1)',
-                 'or:',
-                 '    contigs.paths and assembly_graph.fastg (SPAdes >3.6.1)'
-            ])
-            raise Error(error_message)
+            raise Error('Error finding contigs file: ' + contigs_fasta)
+
+        self.contigs_gfa = self._file_exists(os.path.join(self.assembler_dir, 'contigs.gfa'))
+        self.contigs_fastg = self._file_exists(os.path.join(self.assembler_dir, 'contigs.fastg'))
+        self.contigs_paths = self._file_exists(os.path.join(self.assembler_dir, 'contigs.paths'))
+        self.assembly_graph_fastg = self._file_exists(os.path.join(self.assembler_dir, 'assembly_graph.fastg'))
+
+        if self.assembler == 'spades':
+            if None == self.contigs_fastg == self.contigs_paths == self.assembly_graph_fastg or \
+               ( self.contigs_fastg is None and (None in {self.contigs_paths, self.assembly_graph_fastg}) ) or \
+               ( self.contigs_fastg is not None and (self.contigs_paths is not None or self.assembly_graph_fastg is not None) ):
+                error_message = '\n'.join([
+                     'Error finding SPAdes graph files in the directory ' + self.assembler_dir,
+                     'Expected either:',
+                     '    contigs.fastg (SPAdes <3.6.1)',
+                     'or:',
+                     '    contigs.paths and assembly_graph.fastg (SPAdes >3.6.1)'
+                ])
+                raise Error(error_message)
+        elif self.assembler == 'canu':
+            if self.contigs_fasta is None or self.contigs_gfa is None:
+                raise Error('Error finding canu contigs fasta and/or gfa file')
+        else:
+            raise Error('Assembler "' + self.assembler + '" not recognised. Cannot continue')
 
 
     def get_contigs(self):
@@ -143,11 +154,36 @@ class Assembly:
         return circular_nodes
 
 
+    @staticmethod
+    def _circular_contigs_from_canu_gfa(gfa_file):
+        self_matches = {}
+        other_matches = set()
+
+        with open(gfa_file) as f:
+            for line in f:
+                if line.startswith('L\t'):
+                    L, node1, dir1, node2, dir2, *the_rest = line.rstrip().split('\t')
+                    if node1 == node2:
+                        if dir1 == dir2:
+                            if node1 not in self_matches:
+                                self_matches[node1] = set()
+                            self_matches[node1].add(dir1)
+                    else:
+                        other_matches.update({node1, node2})
+
+        return {x for x in self_matches if self_matches[x] == {'+', '-'} and x not in other_matches}
+
+
     def circular_contigs(self):
         '''Returns a set of the contig names that are circular'''
-        if self.contigs_fastg is not None:
-            return self._circular_contigs_from_spades_before_3_6_1(self.contigs_fastg)
-        elif None not in [self.contigs_paths, self.assembly_graph_fastg]:
-            return self._circular_contigs_from_spades_after_3_6_1(self.assembly_graph_fastg, self.contigs_paths)
+        if self.assembler == 'spades':
+            if self.contigs_fastg is not None:
+                return self._circular_contigs_from_spades_before_3_6_1(self.contigs_fastg)
+            elif None not in [self.contigs_paths, self.assembly_graph_fastg]:
+                return self._circular_contigs_from_spades_after_3_6_1(self.assembly_graph_fastg, self.contigs_paths)
+            else:
+                return set()
+        elif self.assembler == 'canu':
+            return self._circular_contigs_from_canu_gfa(self.contigs_gfa)
         else:
             return set()
diff --git a/circlator/bamfilter.py b/circlator/bamfilter.py
index 68e5565..b523cbf 100644
--- a/circlator/bamfilter.py
+++ b/circlator/bamfilter.py
@@ -18,6 +18,7 @@ class BamFilter:
              discard_unmapped=False,
              log_prefix='[bamfilter]',
              verbose=False,
+             split_all_reads=False,
     ):
         self.bam = os.path.abspath(bam)
         if not os.path.exists(self.bam):
@@ -32,6 +33,8 @@ class BamFilter:
         self.discard_unmapped = discard_unmapped
         self.min_read_length = min_read_length
         self.verbose = verbose
+        self.split_all_reads = split_all_reads
+
 
 
     def _get_ref_lengths(self):
@@ -158,10 +161,31 @@ class BamFilter:
                 continue
 
             if ref_lengths[contig] <= self.length_cutoff:
-                self._all_reads_from_contig(contig, f_fa)
-                print(self.log_prefix, contig, ref_lengths[contig], 'all', sep='\t', file=f_log)
-                if self.verbose:
-                    print('Getting all reads that map to contig', contig, flush=True)
+                #NEW version: even for small contigs, split them in two halves, and split the reads mapping through the middle point of the contig in two, so that each piece maps to only one half.
+                if self.split_all_reads:
+                    end_bases_keep = int(0.5 * ref_lengths[contig])
+                    start = end_bases_keep - 1
+                    end = end_bases_keep #max(end_bases_keep - 1, ref_lengths[contig] - end_bases_keep)
+                    self._get_region(contig, 0, start, f_fa, min_length=self.min_read_length)
+                    self._get_region(contig, end, ref_lengths[contig], f_fa, min_length=self.min_read_length)
+                    coords_string = '1-' + str(start + 1) +  ';' + str(end + 1) + '-' + str(ref_lengths[contig])
+                    print(
+                        self.log_prefix,
+                        contig,
+                        ref_lengths[contig],
+                        coords_string,
+                        sep='\t',
+                        file=f_log
+                    )
+                    if self.verbose:
+                        print('Getting all reads that map to ends of contig ', contig, '. Coords: ', coords_string, sep='', flush=True)
+                else:
+                    #OLD version, do not split reads so that SPAdes attempts to circulrize.
+                    self._all_reads_from_contig(contig, f_fa)
+                    print(self.log_prefix, contig, ref_lengths[contig], 'all', sep='\t', file=f_log)
+                    if self.verbose:
+                        print('Getting all reads that map to contig', contig, flush=True)
+                    
             else:
                 end_bases_keep = int(0.5 * self.length_cutoff)
                 start = end_bases_keep - 1
diff --git a/circlator/common.py b/circlator/common.py
index 114c980..afce3e3 100644
--- a/circlator/common.py
+++ b/circlator/common.py
@@ -4,6 +4,9 @@ import subprocess
 
 class Error (Exception): pass
 
+allowed_assemblers = ['canu', 'spades'] 
+allowed_data_types = ['pacbio-raw', 'pacbio-corrected', 'nanopore-raw', 'nanopore-corrected']
+
 def syscall(cmd, allow_fail=False, verbose=False):
     if verbose:
         print('syscall:', cmd, flush=True)
diff --git a/circlator/external_progs.py b/circlator/external_progs.py
index c3b8527..30a5c59 100644
--- a/circlator/external_progs.py
+++ b/circlator/external_progs.py
@@ -11,6 +11,7 @@ class Error (Exception): pass
 prog_to_env_var = {
     'samtools': 'CIRCLATOR_SAMTOOLS',
     'spades': 'CIRCLATOR_SPADES',
+    'canu': 'CIRCLATOR_CANU',
 }
 
 
@@ -20,6 +21,7 @@ prog_to_version_cmd = {
     'prodigal': ('-v', re.compile('^Prodigal V([0-9\.]+):')),
     'samtools': ('', re.compile('^Version: ([0-9\.]+)')),
     'spades': ('', re.compile('^SPAdes genome assembler v.?([0-9][0-9\.]+)')),
+    'canu': ('-version', re.compile('^Canu.*v.?([0-9][0-9\.]+)')),
 }
 
 
@@ -29,6 +31,7 @@ min_versions = {
     'prodigal': '2.6',
     'samtools': '0.1.19',
     'spades': '3.6.2', # this is the first version to support python3
+    'canu': '0.0',
 }
 
 
@@ -43,8 +46,10 @@ prog_name_to_default = {
     'prodigal': 'prodigal',
     'spades': 'spades.py',
     'samtools': 'samtools',
+    'canu': 'canu',
 }
 
+not_required = {'spades', 'canu'}
 
 def handle_error(message, raise_error=True):
     if raise_error:
@@ -53,7 +58,7 @@ def handle_error(message, raise_error=True):
         print(message)
 
 
-def make_and_check_prog(name, verbose=False, raise_error=True, filehandle=None, debug=False):
+def make_and_check_prog(name, verbose=False, raise_error=True, filehandle=None, debug=False, required=False):
     p = program.Program(
         prog_name_to_default[name],
         prog_to_version_cmd[name][0],
@@ -63,7 +68,11 @@ def make_and_check_prog(name, verbose=False, raise_error=True, filehandle=None,
     )
 
     if not p.in_path():
-        handle_error("Didn't find " + name + " in path. Looked for:" + p.path, raise_error=raise_error)
+        if required:
+            die = True
+        else:
+            die = raise_error and (name not in not_required)
+        handle_error("WARNING: Didn't find " + name + " in path. Looked for:" + p.path, raise_error=die)
         return p
 
     version = p.version
@@ -92,8 +101,8 @@ def make_and_check_prog(name, verbose=False, raise_error=True, filehandle=None,
     return p
 
 
-def check_all_progs(verbose=False, raise_error=False, filehandle=None, debug=False):
+def check_all_progs(verbose=False, raise_error=False, filehandle=None, debug=False, assembler=None):
     for prog in sorted(prog_name_to_default):
         if debug:
             print('__________ checking', prog, '____________', flush=True)
-        make_and_check_prog(prog, verbose=verbose, raise_error=raise_error, filehandle=filehandle, debug=debug)
+        make_and_check_prog(prog, verbose=verbose, raise_error=raise_error, filehandle=filehandle, debug=debug, required=prog==assembler)
diff --git a/circlator/merge.py b/circlator/merge.py
index 6dfea1f..3958996 100644
--- a/circlator/merge.py
+++ b/circlator/merge.py
@@ -26,6 +26,10 @@ class Merger:
           spades_use_first_success=False,
           spades_careful=True,
           spades_only_assembler=True,
+          assembler='spades',
+          length_cutoff=100000,
+          split_all_reads=False,
+          data_type='pacbio-corrected',
           ref_end_tolerance=15000,
           qry_end_tolerance=1000,
           verbose=False,
@@ -35,8 +39,9 @@ class Merger:
         if not os.path.exists(original_assembly):
             raise Error('File not found:' + original_assembly)
 
+        self.assembler = assembler
         self.original_fasta = original_assembly
-        self.reassembly = circlator.assembly.Assembly(reassembly)
+        self.reassembly = circlator.assembly.Assembly(reassembly, assembler=self.assembler)
         self.reads = reads
         self.outprefix = outprefix
         self.nucmer_diagdiff = nucmer_diagdiff
@@ -49,6 +54,9 @@ class Merger:
         self.spades_use_first_success = spades_use_first_success
         self.spades_careful = spades_careful
         self.spades_only_assembler = spades_only_assembler
+        self.length_cutoff=length_cutoff
+        self.split_all_reads=split_all_reads
+        self.data_type=data_type
         self.ref_end_tolerance = ref_end_tolerance
         self.qry_end_tolerance = qry_end_tolerance
         self.verbose = verbose
@@ -347,7 +355,7 @@ class Merger:
         else:
             circular_string = 'None'
         print(log_outprefix, file=log_fh)
-        print(log_outprefix, '\tSPAdes reassembly contigs that are circular: ', circular_string, sep='', file=log_fh)
+        print(log_outprefix, '\t', self.assembler, ' reassembly contigs that are circular: ', circular_string, sep='', file=log_fh)
 
         fate_of_contigs = {
             x: set() for x in [
@@ -365,12 +373,12 @@ class Merger:
                 new_contig, spades_contig = self._make_new_contig_from_nucmer_and_spades(ref_name, nucmer_hits[ref_name], called_as_circular_by_spades, log_fh=log_fh, log_outprefix=log_outprefix)
 
                 if new_contig is None:
-                    print(log_outprefix, ref_name, 'Could not circularize using matches to SPAdes circular contigs', sep='\t', file=log_fh)
+                    print(log_outprefix, ref_name, 'Could not circularize using matches to ' + self.assembler + ' circular contigs', sep='\t', file=log_fh)
 
                     if ref_name in to_circularise_with_nucmer:
                         start_hit, end_hit = to_circularise_with_nucmer[ref_name]
                         assert start_hit.ref_name == end_hit.ref_name == ref_name
-                        print(log_outprefix, ref_name, 'Circularizing using this pair of nucmer matches to SPAdes contig:', sep='\t', file=log_fh)
+                        print(log_outprefix, ref_name, 'Circularizing using this pair of nucmer matches to ' + self.assembler + ' contig:', sep='\t', file=log_fh)
                         print(log_outprefix, '\t', ref_name, '\t\t', start_hit, sep='', file=log_fh)
                         print(log_outprefix, '\t', ref_name, '\t\t', end_hit, sep='', file=log_fh)
                         new_contig = self._make_circularised_contig(start_hit, end_hit)
@@ -386,7 +394,7 @@ class Merger:
                     else:
                         self.original_contigs[new_contig.id] = new_contig
                         fate_of_contigs['circl_using_spades'].add(ref_name)
-                        print(log_outprefix, ref_name, 'Circularized using matches to SPAdes circular contigs', sep='\t', file=log_fh)
+                        print(log_outprefix, ref_name, 'Circularized using matches to ' + self.assembler + ' circular contigs', sep='\t', file=log_fh)
                         used_spades_contigs.add(spades_contig)
             else:
                 print(log_outprefix, ref_name, 'Cannot circularize: no nucmer hits', sep='\t', file=log_fh)
@@ -695,7 +703,7 @@ class Merger:
 
                 reads_prefix = outprefix + '.iter.' + str(iteration) + '.reads'
                 reads_to_map =  reads_prefix + ('.fasta' if self.spades_only_assembler else '.fastq')
-                bam_filter = circlator.bamfilter.BamFilter(bam, reads_prefix, fastq_out=not self.spades_only_assembler)
+                bam_filter = circlator.bamfilter.BamFilter(bam, reads_prefix, fastq_out=not self.spades_only_assembler, split_all_reads=self.split_all_reads)
                 bam_filter.run()
                 assembler_dir = outprefix + '.iter.' + str(iteration) + '.assembly'
                 a = circlator.assemble.Assembler(
@@ -707,9 +715,12 @@ class Merger:
                     verbose=self.verbose,
                     spades_kmers=self.spades_kmers,
                     spades_use_first_success=self.spades_use_first_success,
+                    assembler=self.assembler,
+                    genomeSize=self.length_cutoff,
+                    data_type=self.data_type
                 )
                 a.run()
-                self.reassembly = circlator.assembly.Assembly(assembler_dir)
+                self.reassembly = circlator.assembly.Assembly(assembler_dir, assembler=self.assembler)
                 self.reassembly_contigs = self.reassembly.get_contigs()
             elif iteration <= 2:
                 print(this_log_prefix, '\tNo contig merges were made',sep='', file=log_fh)
diff --git a/circlator/tasks/all.py b/circlator/tasks/all.py
index 75a3c1d..9845723 100644
--- a/circlator/tasks/all.py
+++ b/circlator/tasks/all.py
@@ -19,6 +19,9 @@ def run():
     parser.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
     parser.add_argument('--verbose', action='store_true', help='Be verbose')
     parser.add_argument('--unchanged_code', type=int, help='Code to return when the input assembly is not changed [%(default)s]', default=0, metavar='INT')
+    parser.add_argument('--assembler', choices=circlator.common.allowed_assemblers, help='Assembler to use for reassemblies [%(default)s]', default='spades')
+    parser.add_argument('--split_all_reads', action='store_true', help='By default, reads mapped to shorter contigs are left unchanged. This option splits them into two, broken at the middle of the contig to try to force circularization. May help if the assembler does not detect circular contigs (eg canu)')
+    parser.add_argument('--data_type', choices=circlator.common.allowed_data_types, help='String representing one of the 4 type of data analysed (only used for Canu) [%(default)s]', default='pacbio-corrected')
     parser.add_argument('assembly', help='Name of original assembly', metavar='assembly.fasta')
     parser.add_argument('reads', help='Name of corrected reads FASTA or FASTQ file', metavar='reads.fasta/q')
     parser.add_argument('outdir', help='Name of output directory (must not already exist)', metavar='output directory')
@@ -66,9 +69,9 @@ def run():
 
     print_message('{:_^79}'.format(' Checking external programs '), options)
     if options.verbose:
-        circlator.versions.get_all_versions(sys.stdout, raise_error=True)
+        circlator.versions.get_all_versions(sys.stdout, raise_error=True, assembler=options.assembler)
     else:
-        circlator.versions.get_all_versions(None, raise_error=True)
+        circlator.versions.get_all_versions(None, raise_error=True, assembler=options.assembler)
 
 
     files_to_check = [options.assembly, options.reads]
@@ -143,6 +146,7 @@ def run():
         contigs_to_use=options.b2r_only_contigs,
         discard_unmapped=options.b2r_discard_unmapped,
         verbose=options.verbose,
+        split_all_reads=options.split_all_reads,
     )
     bam_filter.run()
 
@@ -157,6 +161,9 @@ def run():
         only_assembler=not options.assemble_not_only_assembler,
         spades_kmers=options.assemble_spades_k,
         spades_use_first_success=options.assemble_spades_use_first,
+        assembler=options.assembler,
+        genomeSize=options.b2r_length_cutoff,
+        data_type=options.data_type,
         verbose=options.verbose
     )
     a.run()
@@ -180,7 +187,7 @@ def run():
 
     m = circlator.merge.Merger(
         assembly_to_use,
-        reassembly,
+        assembly_dir,
         merge_prefix,
         nucmer_diagdiff=options.merge_diagdiff,
         nucmer_min_id=options.merge_min_id,
@@ -191,6 +198,10 @@ def run():
         spades_use_first_success=options.assemble_spades_use_first,
         spades_careful=not options.assemble_not_careful,
         spades_only_assembler=not options.assemble_not_only_assembler,
+        assembler=options.assembler,
+        length_cutoff=options.b2r_length_cutoff,
+        split_all_reads=options.split_all_reads,
+        data_type=options.data_type,
         nucmer_breaklen=options.merge_breaklen,
         ref_end_tolerance=options.merge_ref_end,
         qry_end_tolerance=options.merge_reassemble_end,
diff --git a/circlator/tasks/assemble.py b/circlator/tasks/assemble.py
index 2081464..e9f9a2c 100644
--- a/circlator/tasks/assemble.py
+++ b/circlator/tasks/assemble.py
@@ -5,7 +5,7 @@ import circlator
 
 def run():
     parser = argparse.ArgumentParser(
-        description = 'Assemble reads using SPAdes',
+        description = 'Assemble reads using SPAdes/Canu',
         usage = 'circlator assemble [options] <in.reads.fasta> <out_dir>')
     parser.add_argument('--not_careful', action='store_true', help='Do not use the --careful option with SPAdes (used by default)')
     parser.add_argument('--not_only_assembler', action='store_true', help='Do not use the --assemble-only option with SPAdes (used by default)')
@@ -13,6 +13,8 @@ def run():
     parser.add_argument('--verbose', action='store_true', help='Be verbose')
     parser.add_argument('--spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,117,107,97,87,77', metavar='k1,k2,k3,...')
     parser.add_argument('--spades_use_first', action='store_true', help='Use the first successful SPAdes assembly. Default is to try all kmers and use the assembly with the largest N50')
+    parser.add_argument('--assembler', choices=circlator.common.allowed_assemblers, help='Assembler to use for reassemblies [%(default)s]', default='spades')
+    parser.add_argument('--data_type', choices=circlator.common.allowed_data_types, help='String representing one of the 4 type of data analysed (only used for Canu) [%(default)s]', default='pacbio-corrected')
     parser.add_argument('reads', help='Name of input reads FASTA file', metavar='in.reads.fasta')
     parser.add_argument('out_dir', help='Output directory (must not already exist)')
     options = parser.parse_args()
@@ -25,6 +27,8 @@ def run():
         only_assembler=not options.not_only_assembler,
         spades_kmers=options.spades_k,
         spades_use_first_success=options.spades_use_first,
+        assembler=options.assembler,
+        data_type=options.data_type,
         verbose=options.verbose
     )
     a.run()
diff --git a/circlator/tasks/bam2reads.py b/circlator/tasks/bam2reads.py
index e86b107..8127f04 100644
--- a/circlator/tasks/bam2reads.py
+++ b/circlator/tasks/bam2reads.py
@@ -12,6 +12,7 @@ def run():
     parser.add_argument('--only_contigs', help='File of contig names (one per line). Only reads that map to these contigs are kept (and unmapped reads, unless --discard_unmapped is used).', metavar='FILENAME')
     parser.add_argument('--length_cutoff', type=int, help='All reads mapped to contigs shorter than this will be kept [%(default)s]', default=100000, metavar='INT')
     parser.add_argument('--min_read_length', type=int, help='Minimum length of read to output [%(default)s]', default=250, metavar='INT')
+    parser.add_argument('--split_all_reads', action='store_true', help='By default, reads mapped to shorter contigs are left unchanged. This option splits them into two, broken at the middle of the contig to try to force circularization. May help if the assembler does not detect circular contigs (eg canu)')
     parser.add_argument('--verbose', action='store_true', help='Be verbose')
     parser.add_argument('bam', help='Name of input bam file', metavar='in.bam')
     parser.add_argument('outprefix', help='Prefix of output filenames')
@@ -25,6 +26,7 @@ def run():
         min_read_length=options.min_read_length,
         contigs_to_use=options.only_contigs,
         discard_unmapped=options.discard_unmapped,
+        split_all_reads=options.split_all_reads,
         verbose=options.verbose,
     )
     bam_filter.run()
diff --git a/circlator/tasks/merge.py b/circlator/tasks/merge.py
index c98d9d3..94ccd95 100644
--- a/circlator/tasks/merge.py
+++ b/circlator/tasks/merge.py
@@ -17,6 +17,10 @@ def run():
     parser.add_argument('--assemble_not_only_assembler', action='store_true', help='Do not use the --assemble-only option with SPAdes (used by default)')
     parser.add_argument('--spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,117,107,97,87,77', metavar='k1,k2,k3,...')
     parser.add_argument('--spades_use_first', action='store_true', help='Use the first successful SPAdes assembly. Default is to try all kmers and use the assembly with the largest N50')
+    parser.add_argument('--assembler', choices=circlator.common.allowed_assemblers, help='Assembler to use for reassemblies [%(default)s]', default='spades')
+    parser.add_argument('--data_type', choices=circlator.common.allowed_data_types, help='String representing one of the 4 type of data analysed (only used for Canu) [%(default)s]', default='pacbio-corrected')
+    parser.add_argument('--b2r_length_cutoff', type=int, help='All reads mapped to contigs shorter than this will be kept [%(default)s]', default=100000, metavar='INT')
+    parser.add_argument('--b2r_split_all_reads', action='store_true', help='By default, reads mapped to shorter contigs are left unchanged. This option splits them into two, broken at the middle of the contig to try to force circularization. May help if the assembler does not detect circular contigs (eg canu)')
     parser.add_argument('--ref_end', type=int, help='max distance allowed between nucmer hit and end of input assembly contig [%(default)s]', metavar='INT', default=15000)
     parser.add_argument('--reassemble_end', type=int, help='max distance allowed between nucmer hit and end of reassembly contig [%(default)s]', metavar='INT', default=1000)
     parser.add_argument('--threads', type=int, help='Number of threads for remapping/assembly (only applies if --reads is used) [%(default)s]', default=1, metavar='INT')
@@ -41,6 +45,10 @@ def run():
         spades_only_assembler=not options.assemble_not_only_assembler,
         spades_kmers=options.spades_k,
         spades_use_first_success=options.spades_use_first,
+        assembler=options.assembler,
+        length_cutoff=options.b2r_length_cutoff,
+        split_all_reads=options.b2r_split_all_reads,
+        data_type=options.data_type,
         ref_end_tolerance=options.ref_end,
         qry_end_tolerance=options.reassemble_end,
         threads=options.threads,
diff --git a/circlator/tests/assemble_test.py b/circlator/tests/assemble_test.py
index 56d1fac..d94139a 100644
--- a/circlator/tests/assemble_test.py
+++ b/circlator/tests/assemble_test.py
@@ -61,3 +61,36 @@ class TestAssemble(unittest.TestCase):
         self.assembler.threads = 2
         self.assertEqual(cmd_start + ' -o out -t 2 -k 41 --careful --only-assembler', self.assembler._make_spades_command(41, 'out'))
 
+
+    def test_make_canu_command(self):
+        '''test _make_canu_command'''
+        tmp_assemble_dir = 'tmp.assemble_test'
+        assembler = assemble.Assembler(
+            os.path.join(data_dir, 'assemble_test.dummy_reads.fa'),
+            tmp_assemble_dir,
+            assembler='canu'
+        )
+
+        cmd_start = ' '.join([
+            assembler.canu.exe(),
+            '-useGrid=false',
+            'gnuplotTested=true',
+            '-assemble',
+            'genomeSize=0.1m',
+        ])
+
+        reads = os.path.join(data_dir, 'assemble_test.dummy_reads.fa')
+        self.assertEqual(cmd_start + ' -d out -p outname -pacbio-corrected ' + reads, assembler._make_canu_command('out', 'outname'))
+        self.assertEqual(cmd_start + ' -d out2 -p outname2 -pacbio-corrected ' + reads, assembler._make_canu_command('out2', 'outname2'))
+        assembler.data_type = 'pacbio-raw'
+        self.assertEqual(cmd_start + ' -d out -p outname -pacbio-raw ' + reads, assembler._make_canu_command('out', 'outname'))
+
+
+    def test_rename_canu_contigs(self):
+        '''test _rename_canu_contigs'''
+        infile = os.path.join(data_dir, 'assemble_test_rename_canu_contigs.in.fa')
+        tmpfile = 'tmp.assemble_test_rename_canu_contigs.out.fa'
+        expected = os.path.join(data_dir, 'assemble_test_rename_canu_contigs.expect.fa')
+        assemble.Assembler._rename_canu_contigs(infile, tmpfile)
+        self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
+        os.unlink(tmpfile)
diff --git a/circlator/tests/assembly_test.py b/circlator/tests/assembly_test.py
index 63f2497..f55fb07 100644
--- a/circlator/tests/assembly_test.py
+++ b/circlator/tests/assembly_test.py
@@ -14,20 +14,20 @@ class TestAssembly(unittest.TestCase):
     def test_init_file_not_dir(self):
         '''Test _init_file_not_dir'''
         contigs_fasta = os.path.join(data_dir, 'assembly_test_init_contigs.fasta')
-        a = assembly.Assembly(contigs_fasta)
+        a = assembly.Assembly(contigs_fasta, 'spades')
         self.assertEqual(a.contigs_fasta, contigs_fasta)
         self.assertIsNone(a.contigs_fastg)
         self.assertIsNone(a.assembly_graph_fastg)
-        self.assertIsNone(a.spades_dir)
+        self.assertIsNone(a.assembler_dir)
 
 
     def test_init_pre_3_6_1_ok(self):
         '''Test _init_pre_3_6_1_ok'''
         test_dir = os.path.join(data_dir, 'assembly_test_init_spades_pre_3_6_1_ok')
-        a = assembly.Assembly(test_dir)
+        a = assembly.Assembly(test_dir, 'spades')
         self.assertEqual(a.contigs_fasta, os.path.join(test_dir, 'contigs.fasta'))
         self.assertEqual(a.contigs_fastg, os.path.join(test_dir, 'contigs.fastg'))
-        self.assertEqual(a.spades_dir, test_dir)
+        self.assertEqual(a.assembler_dir, test_dir)
         self.assertIsNone(a.assembly_graph_fastg)
         self.assertIsNone(a.contigs_paths)
 
@@ -35,8 +35,8 @@ class TestAssembly(unittest.TestCase):
     def test_init_post_3_6_1_ok(self):
         '''Test _init_post_3_6_1_ok'''
         test_dir = os.path.join(data_dir, 'assembly_test_init_spades_post_3_6_1_ok')
-        a = assembly.Assembly(test_dir)
-        self.assertEqual(a.spades_dir, test_dir)
+        a = assembly.Assembly(test_dir, 'spades')
+        self.assertEqual(a.assembler_dir, test_dir)
         self.assertEqual(a.contigs_fasta, os.path.join(test_dir, 'contigs.fasta'))
         self.assertEqual(a.assembly_graph_fastg, os.path.join(test_dir, 'assembly_graph.fastg'))
         self.assertEqual(a.contigs_paths, os.path.join(test_dir, 'contigs.paths'))
@@ -55,12 +55,12 @@ class TestAssembly(unittest.TestCase):
 
         for test_dir in test_dirs:
             with self.assertRaises(assembly.Error):
-                a = assembly.Assembly(os.path.join(data_dir, test_dir))
+                a = assembly.Assembly(os.path.join(data_dir, test_dir), 'spades')
 
 
     def test_get_contigs(self):
         '''Test get_contigs'''
-        a = assembly.Assembly(os.path.join(data_dir, 'assembly_test_get_contigs.fasta'))
+        a = assembly.Assembly(os.path.join(data_dir, 'assembly_test_get_contigs.fasta'), 'spades')
         expected = {
             'contig1': pyfastaq.sequences.Fasta('contig1', 'ACGT'),
             'contig2': pyfastaq.sequences.Fasta('contig2', 'AAAA'),
@@ -128,9 +128,17 @@ class TestAssembly(unittest.TestCase):
         self.assertEqual(expected, got)
 
 
+    def test_circular_contigs_from_canu_gfa(self):
+        '''Test _circular_contigs_from_canu_gfa'''
+        infile = os.path.join(data_dir, 'assembly_test_circular_contigs_from_canu.gfa')
+        got = assembly.Assembly._circular_contigs_from_canu_gfa(infile)
+        expected = {'tig00000042'}
+        self.assertEqual(expected, got)
+
+
     def test_circular_contigs_spades_pre_3_6_1(self):
         '''Test circular_contigs with spades pre 3.6.1'''
-        a = assembly.Assembly(os.path.join(data_dir, 'assembly_test_circular_contigs_spades_pre_3_6_1'))
+        a = assembly.Assembly(os.path.join(data_dir, 'assembly_test_circular_contigs_spades_pre_3_6_1'), 'spades')
         got = a.circular_contigs()
         expected = {'NODE_1_length_5_cov_42.42_ID_1'}
         self.assertEqual(expected, got)
@@ -138,7 +146,7 @@ class TestAssembly(unittest.TestCase):
 
     def test_circular_contigs_spades_post_3_6_1(self):
         '''Test circular_contigs with spades post 3.6.1'''
-        a = assembly.Assembly(os.path.join(data_dir, 'assembly_test_circular_contigs_spades_post_3_6_1'))
+        a = assembly.Assembly(os.path.join(data_dir, 'assembly_test_circular_contigs_spades_post_3_6_1'), 'spades')
         got = a.circular_contigs()
         expected = {
             'NODE_5_length_8134_cov_20.6513_ID_2269',
@@ -151,7 +159,7 @@ class TestAssembly(unittest.TestCase):
 
     def test_circular_contigs_just_fasta(self):
         '''Test circular_contigs when input is just fasta'''
-        a = assembly.Assembly(os.path.join(data_dir, 'assembly_test_circular_contigs_only_contigs.fasta'))
+        a = assembly.Assembly(os.path.join(data_dir, 'assembly_test_circular_contigs_only_contigs.fasta'), 'spades')
         got = a.circular_contigs()
         expected = set()
         self.assertEqual(expected, got)
diff --git a/circlator/tests/data/assemble_test_rename_canu_contigs.expect.fa b/circlator/tests/data/assemble_test_rename_canu_contigs.expect.fa
new file mode 100644
index 0000000..3f7c209
--- /dev/null
+++ b/circlator/tests/data/assemble_test_rename_canu_contigs.expect.fa
@@ -0,0 +1,8 @@
+>tig00000001
+ACGTA
+>tig00000042
+TGAGTGATGA
+TGAGTGATGA
+TGAGTGATGA
+TGAGTGATGA
+TT
diff --git a/circlator/tests/data/assemble_test_rename_canu_contigs.in.fa b/circlator/tests/data/assemble_test_rename_canu_contigs.in.fa
new file mode 100644
index 0000000..453a747
--- /dev/null
+++ b/circlator/tests/data/assemble_test_rename_canu_contigs.in.fa
@@ -0,0 +1,8 @@
+>tig00000001 len=5 reads=42 covStat=42.42 gappedBases=no class=contig suggestRepeat=no suggestCircular=no
+ACGTA
+>tig00000042 len=42 reads=42 covStat=42.424242 gappedBases=no class=contig suggestRepeat=no suggestCircular=no
+TGAGTGATGA
+TGAGTGATGA
+TGAGTGATGA
+TGAGTGATGA
+TT
diff --git a/circlator/tests/data/assembly_test_circular_contigs_from_canu.gfa b/circlator/tests/data/assembly_test_circular_contigs_from_canu.gfa
new file mode 100644
index 0000000..8e66e9e
--- /dev/null
+++ b/circlator/tests/data/assembly_test_circular_contigs_from_canu.gfa
@@ -0,0 +1,14 @@
+H	VN:Z:bogart/edges
+S	tig00000001	*	LN:i:92589
+S	tig00000002	*	LN:i:24034
+S	tig00000003	*	LN:i:24348
+S	tig00000004	*	LN:i:17645
+S	tig00000005	*	LN:i:19559
+S	tig00000042	*	LN:i:424242
+L	tig00000001	+	tig00000002	+	9648M
+L	tig00000003	+	tig00000003	+	9639M
+L	tig00000003	-	tig00000003	-	9639M	cv:A:F
+L	tig00000003	-	tig00000004	+	5329M	cv:A:F
+L	tig00000005	+	tig00000005	+	5332M	cv:A:F
+L	tig00000042	+	tig00000042	+	6035M	cv:A:F
+L	tig00000042	-	tig00000042	-	6036M	cv:A:F
diff --git a/circlator/versions.py b/circlator/versions.py
index fd6e517..c5d6155 100644
--- a/circlator/versions.py
+++ b/circlator/versions.py
@@ -7,12 +7,12 @@ from circlator import external_progs, __version__
 from circlator import __version__ as circlator_version
 
 
-def get_all_versions(filehandle, raise_error=True, debug=False):
+def get_all_versions(filehandle, raise_error=True, debug=False, assembler=None):
     if filehandle is not None:
         print('Circlator version:', circlator_version, file=filehandle)
         print('\nExternal dependencies:', file=filehandle)
 
-    external_progs.check_all_progs(verbose=False, raise_error=raise_error, filehandle=filehandle, debug=debug)
+    external_progs.check_all_progs(verbose=False, raise_error=raise_error, filehandle=filehandle, debug=debug, assembler=assembler)
 
     if filehandle is not None:
         print('\nPython version:', file=filehandle)
diff --git a/install_dependencies.sh b/install_dependencies.sh
index 3fa9560..020871c 100644
--- a/install_dependencies.sh
+++ b/install_dependencies.sh
@@ -5,12 +5,14 @@ set -x
 start_dir=$(pwd)
 
 BWA_VERSION=0.7.12
+CANU_VERSION=1.4
 PRODIGAL_VERSION=2.6.2
 SAMTOOLS_VERSION=1.3
 MUMMER_VERSION=3.23
 SPADES_VERSION=3.7.1
 
 BWA_DOWNLOAD_URL="http://downloads.sourceforge.net/project/bio-bwa/bwa-${BWA_VERSION}.tar.bz2"
+CANU_DOWNLOAD_URL="https://github.com/marbl/canu/releases/download/v${CANU_VERSION}/canu-${CANU_VERSION}.Linux-amd64.tar.xz"
 PRODIGAL_DOWNLOAD_URL="https://github.com/hyattpd/Prodigal/releases/download/v${PRODIGAL_VERSION}/prodigal.linux"
 SAMTOOLS_DOWNLOAD_URL="https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2"
 MUMMER_DOWNLOAD_URL="http://downloads.sourceforge.net/project/mummer/mummer/${MUMMER_VERSION}/MUMmer${MUMMER_VERSION}.tar.gz"
@@ -46,6 +48,13 @@ cd $bwa_dir
 make
 
 
+# ------------- canu -------------------
+cd $build_dir
+download $CANU_DOWNLOAD_URL "canu-${CANU_VERSION}.tar.xz"
+tar -xf canu-${CANU_VERSION}.tar.xz
+canu_dir=$build_dir/canu-${CANU_VERSION}/Linux-amd64/bin/
+
+
 # --------------- prodigal -----------------
 cd $build_dir
 prodigal_dir="$build_dir/prodigal-${PRODIGAL_VERSION}"
@@ -92,6 +101,7 @@ update_path () {
 }
 
 update_path ${bwa_dir}
+update_path ${canu_dir}
 update_path ${prodigal_dir}
 update_path ${mummer_dir}
 update_path ${samtools_dir}
diff --git a/setup.py b/setup.py
index d95e87c..66feaa3 100644
--- a/setup.py
+++ b/setup.py
@@ -7,11 +7,11 @@ from setuptools import setup, find_packages
 
 setup(
     name='circlator',
-    version='1.4.1',
+    version='1.5.1',
     description='circlator: a tool to circularise genome assemblies',
     packages = find_packages(),
     package_data={'circlator': ['data/*']},
-    author='Martin Hunt, Nishadi De Silva',
+    author='Martin Hunt, Nishadi De Silva, (this small edit is from Nicola De Maio) ',
     author_email='path-help at sanger.ac.uk',
     url='https://github.com/sanger-pathogens/circlator',
     scripts=glob.glob('scripts/*'),

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/circlator.git



More information about the debian-med-commit mailing list