[med-svn] [circlator] 01/03: New upstream version 1.4.1
Sascha Steinbiss
satta at debian.org
Tue Jan 24 16:36:26 UTC 2017
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository circlator.
commit 2224d663ae9c13ca0ed5074c368ff4b960caa4bd
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date: Tue Jan 24 17:26:54 2017 +0100
New upstream version 1.4.1
---
circlator/bamfilter.py | 28 +++++-----
circlator/merge.py | 6 +--
circlator/tasks/all.py | 9 ++--
circlator/tasks/bam2reads.py | 2 +
circlator/tests/bamfilter_test.py | 58 ++++++++++++++++++---
.../bamfilter_test_run_keep_unmapped.out.reads.fq | 32 ++++++++++++
...test_run.bam => bamfilter_test_run_no_qual.bam} | Bin
....bam.bai => bamfilter_test_run_no_qual.bam.bai} | Bin
.../tests/data/bamfilter_test_run_with_qual.bam | Bin 0 -> 908 bytes
...am.bai => bamfilter_test_run_with_qual.bam.bai} | Bin
setup.py | 2 +-
11 files changed, 111 insertions(+), 26 deletions(-)
diff --git a/circlator/bamfilter.py b/circlator/bamfilter.py
index b480c46..68e5565 100644
--- a/circlator/bamfilter.py
+++ b/circlator/bamfilter.py
@@ -11,6 +11,7 @@ class BamFilter:
self,
bam,
outprefix,
+ fastq_out=False,
length_cutoff=100000,
min_read_length=250,
contigs_to_use=None,
@@ -22,8 +23,9 @@ class BamFilter:
if not os.path.exists(self.bam):
raise Error('File not found:' + self.bam)
+ self.fastq_out = fastq_out
self.length_cutoff = length_cutoff
- self.reads_fa = os.path.abspath(outprefix + '.fasta')
+ self.reads_outfile = os.path.abspath(outprefix + ('.fastq' if self.fastq_out else '.fasta'))
self.log = os.path.abspath(outprefix + '.log')
self.log_prefix = log_prefix
self.contigs_to_use = self._get_contigs_to_use(contigs_to_use)
@@ -69,7 +71,7 @@ class BamFilter:
'''Gets all reads from contig called "contig" and writes to fout'''
sam_reader = pysam.Samfile(self.bam, "rb")
for read in sam_reader.fetch(contig):
- print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)
+ print(mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out), file=fout)
def _get_all_unmapped_reads(self, fout):
@@ -77,7 +79,7 @@ class BamFilter:
sam_reader = pysam.Samfile(self.bam, "rb")
for read in sam_reader.fetch(until_eof=True):
if read.is_unmapped:
- print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)
+ print(mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out), file=fout)
def _break_reads(self, contig, position, fout, min_read_length=250):
@@ -88,15 +90,15 @@ class BamFilter:
if read.pos < position < read.reference_end - 1:
split_point = position - read.pos
if split_point - 1 >= min_read_length:
- sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(0, split_point)
+ sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=not self.fastq_out).subseq(0, split_point)
sequence.id += '.left'
seqs.append(sequence)
if read.query_length - split_point >= min_read_length:
- sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True).subseq(split_point, read.query_length)
+ sequence = mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=not self.fastq_out).subseq(split_point, read.query_length)
sequence.id += '.right'
seqs.append(sequence)
else:
- seqs.append(mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=True))
+ seqs.append(mapping.aligned_read_to_read(read, revcomp=False, ignore_quality=not self.fastq_out))
for seq in seqs:
if read.is_reverse:
@@ -111,7 +113,7 @@ class BamFilter:
for read in sam_reader.fetch(contig):
read_interval = pyfastaq.intervals.Interval(read.pos, read.reference_end - 1)
if not read_interval.intersects(exclude_interval):
- print(mapping.aligned_read_to_read(read, ignore_quality=True), file=fout)
+ print(mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out), file=fout)
def _get_region(self, contig, start, end, fout, min_length=250):
@@ -120,15 +122,17 @@ class BamFilter:
trimming_end = (start == 0)
for read in sam_reader.fetch(contig, start, end):
read_interval = pyfastaq.intervals.Interval(read.pos, read.reference_end - 1)
- seq = mapping.aligned_read_to_read(read, ignore_quality=True, revcomp=False)
+ seq = mapping.aligned_read_to_read(read, ignore_quality=not self.fastq_out, revcomp=False)
if trimming_end:
bases_off_start = 0
bases_off_end = max(0, read.reference_end - 1 - end)
- seq.seq = seq.seq[:read.query_alignment_end - bases_off_end]
+ #seq.seq = seq.seq[:read.query_alignment_end - bases_off_end]
+ seq = seq.subseq(0, read.query_alignment_end - bases_off_end)
else:
bases_off_start = max(0, start - read.pos + 1)
- seq.seq = seq.seq[bases_off_start + read.query_alignment_start:]
+ #seq.seq = seq.seq[bases_off_start + read.query_alignment_start:]
+ seq = seq.subseq(bases_off_start + read.query_alignment_start, len(seq))
if read.is_reverse:
seq.revcomp()
@@ -141,7 +145,7 @@ class BamFilter:
ref_lengths = self._get_ref_lengths()
assert len(ref_lengths) > 0
f_log = pyfastaq.utils.open_file_write(self.log)
- f_fa = pyfastaq.utils.open_file_write(self.reads_fa)
+ f_fa = pyfastaq.utils.open_file_write(self.reads_outfile)
print(self.log_prefix, '#contig', 'length', 'reads_kept', sep='\t', file=f_log)
if self.verbose:
print('Getting reads from BAM file', self.bam, flush=True)
@@ -187,4 +191,4 @@ class BamFilter:
if self.verbose:
print('Finished getting reads.')
print('Log file:', self.log)
- print('Reads file:', self.reads_fa, flush=True)
+ print('Reads file:', self.reads_outfile, flush=True)
diff --git a/circlator/merge.py b/circlator/merge.py
index 9cd81e2..6dfea1f 100644
--- a/circlator/merge.py
+++ b/circlator/merge.py
@@ -694,12 +694,12 @@ class Merger:
)
reads_prefix = outprefix + '.iter.' + str(iteration) + '.reads'
- reads_to_map = reads_prefix + '.fasta'
- bam_filter = circlator.bamfilter.BamFilter(bam, reads_prefix)
+ reads_to_map = reads_prefix + ('.fasta' if self.spades_only_assembler else '.fastq')
+ bam_filter = circlator.bamfilter.BamFilter(bam, reads_prefix, fastq_out=not self.spades_only_assembler)
bam_filter.run()
assembler_dir = outprefix + '.iter.' + str(iteration) + '.assembly'
a = circlator.assemble.Assembler(
- reads_prefix + '.fasta',
+ reads_prefix + ('.fasta' if self.spades_only_assembler else '.fastq'),
assembler_dir,
threads=self.threads,
careful=self.spades_careful,
diff --git a/circlator/tasks/all.py b/circlator/tasks/all.py
index 2009bf7..75a3c1d 100644
--- a/circlator/tasks/all.py
+++ b/circlator/tasks/all.py
@@ -15,12 +15,12 @@ def print_message(m, opts):
def run():
parser = argparse.ArgumentParser(
description = 'Run mapreads, bam2reads, assemble, merge, clean, fixstart',
- usage = 'circlator all [options] <assembly.fasta> <reads.fasta> <output directory>')
+ usage = 'circlator all [options] <assembly.fasta> <reads.fasta/q> <output directory>')
parser.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
parser.add_argument('--verbose', action='store_true', help='Be verbose')
parser.add_argument('--unchanged_code', type=int, help='Code to return when the input assembly is not changed [%(default)s]', default=0, metavar='INT')
parser.add_argument('assembly', help='Name of original assembly', metavar='assembly.fasta')
- parser.add_argument('reads', help='Name of corrected reads FASTA file', metavar='reads.fasta')
+ parser.add_argument('reads', help='Name of corrected reads FASTA or FASTQ file', metavar='reads.fasta/q')
parser.add_argument('outdir', help='Name of output directory (must not already exist)', metavar='output directory')
mapreads_group = parser.add_argument_group('mapreads options')
@@ -36,7 +36,7 @@ def run():
parser.add_argument('--assemble_spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,117,107,97,87,77', metavar='k1,k2,k3,...')
parser.add_argument('--assemble_spades_use_first', action='store_true', help='Use the first successful SPAdes assembly. Default is to try all kmers and use the assembly with the largest N50')
parser.add_argument('--assemble_not_careful', action='store_true', help='Do not use the --careful option with SPAdes (used by default)')
- parser.add_argument('--assemble_not_only_assembler', action='store_true', help='Do not use the --assemble-only option with SPAdes (used by default)')
+ parser.add_argument('--assemble_not_only_assembler', action='store_true', help='Do not use the --assemble-only option with SPAdes (used by default). Important: with this option, the input reads must be in FASTQ format, otherwise SPAdes will crash because it needs quality scores to correct the reads.')
merge_group = parser.add_argument_group('merge options')
merge_group.add_argument('--merge_diagdiff', type=int, help='Nucmer diagdiff option [%(default)s]', metavar='INT', default=25)
@@ -103,7 +103,7 @@ def run():
original_assembly_renamed = '00.input_assembly.fasta'
bam = '01.mapreads.bam'
filtered_reads_prefix = '02.bam2reads'
- filtered_reads = filtered_reads_prefix + '.fasta'
+ filtered_reads = filtered_reads_prefix + ('.fastq' if options.assemble_not_only_assembler else '.fasta')
assembly_dir = '03.assemble'
reassembly = os.path.join(assembly_dir, 'contigs.fasta')
merge_prefix = '04.merge'
@@ -137,6 +137,7 @@ def run():
bam_filter = circlator.bamfilter.BamFilter(
bam,
filtered_reads_prefix,
+ fastq_out=options.assemble_not_only_assembler,
length_cutoff=options.b2r_length_cutoff,
min_read_length=options.b2r_min_read_length,
contigs_to_use=options.b2r_only_contigs,
diff --git a/circlator/tasks/bam2reads.py b/circlator/tasks/bam2reads.py
index 4fd2cc2..e86b107 100644
--- a/circlator/tasks/bam2reads.py
+++ b/circlator/tasks/bam2reads.py
@@ -8,6 +8,7 @@ def run():
description = 'Make reads from mapping to be reassembled',
usage = 'circlator bam2reads [options] <in.bam> <outprefix>')
parser.add_argument('--discard_unmapped', action='store_true', help='Use this to not keep unmapped reads')
+ parser.add_argument('--fastq', action='store_true', help='Write fastq output (if quality scores are present in input BAM file)')
parser.add_argument('--only_contigs', help='File of contig names (one per line). Only reads that map to these contigs are kept (and unmapped reads, unless --discard_unmapped is used).', metavar='FILENAME')
parser.add_argument('--length_cutoff', type=int, help='All reads mapped to contigs shorter than this will be kept [%(default)s]', default=100000, metavar='INT')
parser.add_argument('--min_read_length', type=int, help='Minimum length of read to output [%(default)s]', default=250, metavar='INT')
@@ -19,6 +20,7 @@ def run():
bam_filter = circlator.bamfilter.BamFilter(
options.bam,
options.outprefix,
+ fastq_out=options.fastq,
length_cutoff=options.length_cutoff,
min_read_length=options.min_read_length,
contigs_to_use=options.only_contigs,
diff --git a/circlator/tests/bamfilter_test.py b/circlator/tests/bamfilter_test.py
index b7336d2..c88b880 100644
--- a/circlator/tests/bamfilter_test.py
+++ b/circlator/tests/bamfilter_test.py
@@ -119,11 +119,11 @@ class TestBamfilter(unittest.TestCase):
os.unlink(tmp)
- def test_run_keep_unmapped(self):
- '''test run keep unmapped'''
+ def test_run_keep_unmapped_no_quals(self):
+ '''test run keep unmapped bam has no quality scores'''
outprefix = 'tmp.bamfilter_run'
b = bamfilter.BamFilter(
- os.path.join(data_dir, 'bamfilter_test_run.bam'),
+ os.path.join(data_dir, 'bamfilter_test_run_no_qual.bam'),
outprefix,
length_cutoff=600,
min_read_length=100,
@@ -135,12 +135,58 @@ class TestBamfilter(unittest.TestCase):
os.unlink(outprefix + '.fasta')
os.unlink(outprefix + '.log')
+ b = bamfilter.BamFilter(
+ os.path.join(data_dir, 'bamfilter_test_run_no_qual.bam'),
+ outprefix,
+ fastq_out=True,
+ length_cutoff=600,
+ min_read_length=100,
+ contigs_to_use={'contig1', 'contig3', 'contig4'}
+ )
+ b.run()
+ expected = os.path.join(data_dir, 'bamfilter_test_run_keep_unmapped.out.reads.fa')
+ self.assertTrue(filecmp.cmp(expected, outprefix + '.fastq', shallow=False))
+ os.unlink(outprefix + '.fastq')
+ os.unlink(outprefix + '.log')
+
+
+ def test_run_keep_unmapped_with_quals(self):
+ '''test run keep unmapped bam has quality scores'''
+ outprefix = 'tmp.bamfilter_run'
+ b = bamfilter.BamFilter(
+ os.path.join(data_dir, 'bamfilter_test_run_with_qual.bam'),
+ outprefix,
+ fastq_out=False,
+ length_cutoff=600,
+ min_read_length=100,
+ contigs_to_use={'contig1', 'contig3', 'contig4'}
+ )
+ b.run()
+ expected = os.path.join(data_dir, 'bamfilter_test_run_keep_unmapped.out.reads.fa')
+ self.assertTrue(filecmp.cmp(expected, outprefix + '.fasta', shallow=False))
+ os.unlink(outprefix + '.fasta')
+ os.unlink(outprefix + '.log')
+
+ b = bamfilter.BamFilter(
+ os.path.join(data_dir, 'bamfilter_test_run_with_qual.bam'),
+ outprefix,
+ fastq_out=True,
+ length_cutoff=600,
+ min_read_length=100,
+ contigs_to_use={'contig1', 'contig3', 'contig4'}
+ )
+ b.run()
+ expected = os.path.join(data_dir, 'bamfilter_test_run_keep_unmapped.out.reads.fq')
+ self.assertTrue(filecmp.cmp(expected, outprefix + '.fastq', shallow=False))
+ os.unlink(outprefix + '.fastq')
+ os.unlink(outprefix + '.log')
+
- def test_run_discard_unmapped(self):
- '''test run keep unmapped'''
+ def test_run_discard_unmapped_no_quals(self):
+ '''test run keep unmapped bam has no quality scores'''
outprefix = 'tmp.bamfilter_run'
b = bamfilter.BamFilter(
- os.path.join(data_dir, 'bamfilter_test_run.bam'),
+ os.path.join(data_dir, 'bamfilter_test_run_no_qual.bam'),
outprefix,
length_cutoff=600,
min_read_length=100,
diff --git a/circlator/tests/data/bamfilter_test_run_keep_unmapped.out.reads.fq b/circlator/tests/data/bamfilter_test_run_keep_unmapped.out.reads.fq
new file mode 100644
index 0000000..450ab15
--- /dev/null
+++ b/circlator/tests/data/bamfilter_test_run_keep_unmapped.out.reads.fq
@@ -0,0 +1,32 @@
+ at contig1:1-100
+TTTAATTTGTCCGTAAATTGGGAGGTCTTCAACCGGGGGCGAATGTCGATCTCGTCGAGGCGTTTGTAAAGTGGTAACAGGGGTCATTGATCACGGTGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
+ at contig1:301-400
+CGTTGATGATACGAATTACGTAGGGCTCTGGGAGATGCTCGGAACCCCACAGCGTCTATTTTAGTTGCGACATTACGCGGTATGCGCTTCTGCAAGATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
+ at contig4:1-100
+TCGAAAGTACACTTTGAACTCTAAAAGCGGTTACGACCTCTTCCGTTCGATCGATGCGTGAGTACGTACTCTGGATCCAGCCGTGGCAAACCGGGTAACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
+ at contig4:201-350
+CCTCCGCCTGCCTTTGACACACCGGACCTCGGGGGTGTCTAAAAGCCGTCCGTGTGTTGGATAGACATTTGTGACCGTATAGCGGGATGACGTTTCTCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ at contig4:651-801
+TGAAGGAGGGGGAGTCGTGCCCGTATCTGGGCCCAGTATATACATTGGGCAGGAGGGTTTGTCAAGAATTCTATCCTTACTAGTCTATTTTCGATACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGFDBAA
+ at contig4:851-950
+GAGTACGAGGGACAGATGTCTACACTTGAGCGTACACAAGAATGTGGTACCAAAGGTATCCTCATCGCAACTGGCATTCAAGCCGCTGTTCGACAGTGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
+ at contig4:901-1000
+CAAAGGTATCCTCATCGCAACTGGCATTCAAGCCGCTGTTCGACAGTGGGTCTGTTGTACCCCTCTGCCCAACTGCTGAGTAGTTGGGTAAGGACCGAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
+ at unmapped_read
+TTATGGTACTTCGTTGCTCCCAAGGCTGAACTGATACATAGAGTGGGCTTTGTGATAGAACCAAACGACAACGAAGCGAATTTCGTCACCATCTCCATAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHGFDC
diff --git a/circlator/tests/data/bamfilter_test_run.bam b/circlator/tests/data/bamfilter_test_run_no_qual.bam
similarity index 100%
rename from circlator/tests/data/bamfilter_test_run.bam
rename to circlator/tests/data/bamfilter_test_run_no_qual.bam
diff --git a/circlator/tests/data/bamfilter_test_run.bam.bai b/circlator/tests/data/bamfilter_test_run_no_qual.bam.bai
similarity index 100%
copy from circlator/tests/data/bamfilter_test_run.bam.bai
copy to circlator/tests/data/bamfilter_test_run_no_qual.bam.bai
diff --git a/circlator/tests/data/bamfilter_test_run_with_qual.bam b/circlator/tests/data/bamfilter_test_run_with_qual.bam
new file mode 100644
index 0000000..d8322f5
Binary files /dev/null and b/circlator/tests/data/bamfilter_test_run_with_qual.bam differ
diff --git a/circlator/tests/data/bamfilter_test_run.bam.bai b/circlator/tests/data/bamfilter_test_run_with_qual.bam.bai
similarity index 100%
rename from circlator/tests/data/bamfilter_test_run.bam.bai
rename to circlator/tests/data/bamfilter_test_run_with_qual.bam.bai
diff --git a/setup.py b/setup.py
index be4bb45..d95e87c 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@ from setuptools import setup, find_packages
setup(
name='circlator',
- version='1.4.0',
+ version='1.4.1',
description='circlator: a tool to circularise genome assemblies',
packages = find_packages(),
package_data={'circlator': ['data/*']},
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/circlator.git
More information about the debian-med-commit
mailing list