[med-svn] [mapping-pipeline] 04/10: New upstream version 2.0
Andreas Tille
tille at debian.org
Tue Feb 21 12:56:44 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository mapping-pipeline.
commit ccdaa057c87e03dfdcb5026d278d97924e5059c2
Author: Andreas Tille <tille at debian.org>
Date: Tue Feb 21 09:06:29 2017 +0100
New upstream version 2.0
---
extract_genbank.py | 94 ++---
mapper.py | 8 +-
mapping.py | 83 ++++-
metrics.py | 27 +-
reffinder.py | 1056 ++++++++++++++++++++++++++++++++++++----------------
report.tex | 13 +-
run.config | 5 +
7 files changed, 899 insertions(+), 387 deletions(-)
diff --git a/extract_genbank.py b/extract_genbank.py
index 0cccc4e..741f02b 100755
--- a/extract_genbank.py
+++ b/extract_genbank.py
@@ -1,52 +1,58 @@
#!/usr/bin/python3
+
def extract_name(infile):
- definition="UNKNOWN"
- accession="UNKNOWN"
- lines = 0
- for line in infile:
- if line.startswith("DEFINITION"):
- definition = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
- if line.startswith("ACCESSION"):
- accession = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
- lines += 1
- if accession != "UNKNOWN" and definition != "UNKNOWN":
- break
- if lines > 4:
- break
- return accession + " " + definition
+ definition="UNKNOWN"
+ accession="UNKNOWN"
+ lines = 0
+ for line in infile:
+ if line.startswith("DEFINITION"):
+ definition = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
+ if line.startswith("ACCESSION"):
+ accession = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
+ lines += 1
+ if accession != "UNKNOWN" and definition != "UNKNOWN":
+ break
+ if lines > 4:
+ break
+ return accession + " " + definition
def extract_sequence(infile):
- res = []
- numline = 0
- for line in infile:
- if line.startswith("ORIGIN"):
- break
- for line in infile:
- if line[0] == "/":
- seq = "".join(res)
- return "".join(seq)
- res += line.strip().split(" ")[1:]
- numline += 1
+ res = []
+ numline = 0
+ for line in infile:
+ if line.startswith("ORIGIN"):
+ break
+ for line in infile:
+ if line[0] == "/":
+ seq = "".join(res)
+ return "".join(seq)
+ res += line.strip().split(" ")[1:]
+ numline += 1
def write_sequence(outfile, seq, name, splitlen):
- f = open(outfile, "w")
- f.write(">" + name + "\n")
- pos = 0
- numline = 0
- while pos < len(seq):
- f.write(seq[pos:pos+splitlen])
- pos += splitlen
- f.write("\n")
- numline += 1
+ f = open(outfile, "w")
+ f.write(">" + name + "\n")
+ pos = 0
+ numline = 0
+ while pos < len(seq):
+ f.write(seq[pos:pos+splitlen])
+ pos += splitlen
+ f.write("\n")
+ numline += 1
def main(infile, fastafile):
- f = open(infile, "r")
- linelength = 80
- name = extract_name(f)
- seq = extract_sequence(f)
- f.close()
- write_sequence(fastafile, seq, name, linelength)
- return fastafile
-
-
-
+ #f = open(infile, "r")
+ #linelength = 80
+ #name = extract_name(f)
+ #seq = extract_sequence(f)
+ #f.close()
+ #write_sequence(fastafile, seq, name, linelength)
+ #return fastafile
+ from Bio import SeqIO
+ with open(infile, 'r') as f, open(fastafile, 'w') as g:
+ sequences = SeqIO.parse(f, "genbank")
+ for record in sequences:
+ #x = record.id
+ #record.id = x.split('.')[0]
+ SeqIO.write(record, g, "fasta")
+ return fastafile
diff --git a/mapper.py b/mapper.py
index a34d68c..d7f6920 100755
--- a/mapper.py
+++ b/mapper.py
@@ -221,7 +221,9 @@ class Bowtie2(Mapper):
x_pos = match.group(6)
y_pos = match.group(7)
except:
- log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set.')
+ log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set. Using default flowcell and lane.')
+ flowcell="unset"
+ lane="1000"
if self.query_paired:
try:
@@ -415,7 +417,9 @@ class BWAMem:
x_pos = match.group(6)
y_pos = match.group(7)
except:
- log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set.')
+ log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set. Using default flowcell and lane.')
+ flowcell="unset"
+ lane="1000"
if self.query_paired:
try:
diff --git a/mapping.py b/mapping.py
index ef0448a..c470760 100755
--- a/mapping.py
+++ b/mapping.py
@@ -28,7 +28,7 @@ import metrics
__author__ = "Enrico Seiler"
__credits__ = ["Wojtek Dabrowski"]
__license__ = "GPL"
-__version__ = "1.0.0"
+__version__ = "1.2.1"
__maintainer__ = "Enrico Seiler"
__email__ = "seilere at rki.de"
__status__ = "Development"
@@ -39,6 +39,7 @@ class MAPPING:
self.version = __version__
self.logger = logger
self.python_version = sys.version.split(' ')[0]
+ self.reffinder_version = reffinder.__version__
self._argument_parser()
self.args.output = os.path.realpath(self.args.output)
self.user = pwd.getpwuid(os.getuid()).pw_name
@@ -65,6 +66,8 @@ class MAPPING:
self.mapper = None
self.sam_files = list()
self.delete = list()
+ self.reads = list() # save read names for report
+ self.read_dict = dict() # save reads per sample for report
self.tmp = os.path.join(os.path.expanduser('~/tmp/'), datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S_%f'))
try:
os.mkdir(os.path.expanduser('~/tmp/'))
@@ -74,19 +77,49 @@ class MAPPING:
os.mkdir(self.tmp)
except OSError:
pass
+ self.samtools_version = self._samtools_version()
+ self.picard_version = self._picard_version()
self.run()
if self.args.join:
+ self._join_sample()
+ self.run_metrics()
self._join()
- if self.args.join_sample:
+ elif self.args.join_sample:
self._join_sample()
- self.run_metrics()
+ self.run_metrics()
+ else:
+ self.run_metrics()
+ self.convert_to_bam()
self.cleanup()
+ def _samtools_version(self):
+ cmd = '{samtools}/samtools --help'.format(samtools=self.args.samtools)
+ out = str(sp.check_output(cmd, shell=True), 'utf-8')
+ pattern = re.compile('.*Version: ([\d\.]+) .*', re.DOTALL)
+ return re.match(pattern, out).group(1)
+
+ def _picard_version(self):
+ try:
+ cmd = '{java} -jar {picard}/picard.jar ViewSam --version'.format(java=self.args.java, picard=self.args.picard)
+ out = ''
+ try:
+ out = str(sp.check_output(cmd, shell=True, stderr=sp.STDOUT), 'utf-8')
+ except sp.CalledProcessError as e:
+ out = str(e.output, 'utf-8')
+ pattern = re.compile('([\d\.]+)\(.*', re.DOTALL)
+ return re.match(pattern, out).group(1)
+ except:
+ return 'Not used'
+
def run_metrics(self):
if not self.args.noreport:
for sam in self.sam_files:
+ out = os.path.join(self.args.output, 'data', os.path.basename(sam))
+ os.makedirs(out, exist_ok=True)
+ if self.args.join_sample or self.args.join:
+ self.reads = self.read_dict[os.path.basename(sam)+'.sam']
self.sam = sam
- self.metric = metrics.METRICS(sam, output=self.tmp, logger=self.logger, samtools=os.path.realpath(os.path.join(self.args.samtools, 'samtools')))
+ self.metric = metrics.METRICS(sam + '.sam', output=out, logger=self.logger, samtools=os.path.realpath(os.path.join(self.args.samtools, 'samtools')))
self.current_project = os.path.basename(self._sam_name(os.path.basename(sam))) #read1
self.metrics_results[self.current_project] = dict()
self.metric.read_stats()
@@ -108,7 +141,13 @@ class MAPPING:
loader = jinja2.FileSystemLoader(template_dir)
env = jinja2.Environment(loader=loader)
template = env.get_template(template_file)
- self.args.reference = self.args.reference[0]
+ try:
+ if type(self.args.reference) == tuple or type(self.args.reference) == list:
+ self.args.reference = self.args.reference[0]
+ elif self.args.reference is None or len(self.args.reference) == 0:
+ self.args.reference = 'User defined index'
+ except:
+ self.args.reference = 'User defined index'
pdf_latex = template.render(pipeline=self, metric=self.metric)
with open(self.report_file, 'w') as self.latex:
@@ -143,7 +182,14 @@ class MAPPING:
except sp.CalledProcessError:
log.print_write(self.logger, 'exception', 'ERROR: CPE in MAPPING.cleanup()')
pass
-
+ def convert_to_bam(self):
+ log.print_write(self.logger, 'info', '=== Converting to BAM file ===')
+ for sam in self.sam_files:
+ sam = sam + '.sam'
+ self.command = '{} view -hb {} -o {}'.format(os.path.realpath(os.path.join(self.args.samtools, 'samtools')), sam, sam[:-3] + 'bam')
+ log.call(self.command, self.logger, shell=True)
+ self.command = 'rm {}'.format(sam)
+ log.call(self.command, self.logger, shell=True)
def run_reffinder(self, mode, walk=None):
log.print_write(self.logger, 'info', '=== Running Reffinder ===')
if self.args.reffinder_options is None:
@@ -164,6 +210,8 @@ class MAPPING:
self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} {} --ref {} {} --tmp {}".format(os.path.join(walk, self.r), os.path.join(walk, self.r2), ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
else:
self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} --ref {} --mode single {} --tmp {}".format(os.path.join(walk, self.r), ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
+ if type(self.args.reference) == tuple:
+ self.args.reference = self.args.reference[0]
def convert_to_accession(self):
pass
#pattern = re.compile('>gi\|\d*\|[^|]*\|([^|]*)\|(.*)')
@@ -238,6 +286,9 @@ class MAPPING:
self.run_reffinder(2, walk[0])
else:
self.args.reference = self.args.reference[0]
+ self.reads.append(self.r)
+ self.reads.append(self.r2)
+ print('RUNNING {} and {}'.format(self.r, self.r2))
self._run_once(os.path.join(walk[0], self.r), read2=os.path.join(walk[0], self.r2))
if self.args.reference is not None:
self.args.reference = self.cache_ref
@@ -251,6 +302,8 @@ class MAPPING:
self.run_reffinder(3, walk[0])
else:
self.args.reference = self.args.reference[0]
+ self.reads.append(self.r)
+ print('RUNNING {}'.format(self.r))
self._run_once(os.path.join(walk[0], self.r))
if self.args.reference is not None:
self.args.reference = self.cache_ref
@@ -265,7 +318,10 @@ class MAPPING:
try:
self.sam_name = re.match('(.*)(\.fastq\.gz|\.fastq)$', st).group(1)
except:
+ st = os.path.basename(st)
self.sam_name = '.'.join(st.split('.')[:-1])
+ if self.sam_name == '':
+ self.sam_name = os.path.basename(s)
return os.path.join(self.args.output, self.sam_name)
def _join(self):
log.print_write(self.logger, 'info', '=== Joining SAM files ===')
@@ -274,8 +330,9 @@ class MAPPING:
command += 'I={}.sam '.format(sam)
self.delete.append(sam + '.sam')
log.call(command, self.logger, shell=True)
- self.sam_files = [os.path.join(self.args.output, 'joined.sam')]
+ self.sam_files = [os.path.join(self.args.output, 'joined')]
def _join_sample(self):
+ log.print_write(self.logger, 'info', '=== Joining SAM files per sample ===')
groups = defaultdict(set)
pattern = re.compile('^U{0,1}P_(.*)_L\d\d\d_R[1,2]_\d\d\d')
try:
@@ -296,7 +353,8 @@ class MAPPING:
command += 'I={}.sam '.format(os.path.join(self.args.output, sam))
self.delete.append(os.path.join(self.args.output, sam + '.sam'))
log.call(command, self.logger, shell=True)
- self.sam_files.append(os.path.join(self.args.output, key+'.sam'))
+ self.sam_files.append(os.path.join(self.args.output, key))
+ self.read_dict[key+'.sam'] = [i for i in self.reads if key in i]
def _run_once(self, read1, read2=None):
log.print_write(self.logger, 'info', '=== Processing read #{} ==='.format(self.read_counter))
log.write(self.logger, 'info', 'Read1 = {}'.format(read1))
@@ -314,7 +372,6 @@ class MAPPING:
self.mapper_version = self.args.mapper + ' ' + self.mapper.get_version()
self.sam_files.append(self._sam_name(self.sam_file))
def split_multifasta(self):
- i = 0
f = None
files = list()
for ref in self.args.reference:
@@ -322,9 +379,9 @@ class MAPPING:
if '>' in line:
if f is not None:
f.close()
- f = open(os.path.join(self.tmp, '{}.fasta'.format(i)), 'w')
- files.append(os.path.join(self.tmp, '{}.fasta'.format(i)))
- i += 1
+ name = os.path.join(self.tmp, '{}.fasta'.format(line.split(' ')[0][1:].strip())) if len(line.split(' ')) >= 2 else os.path.join(self.tmp, '{}.fasta'.format(line[1:].strip()))
+ f = open(name, 'w')
+ files.append(name)
f.write('{}'.format(line))
else:
f.write('{}'.format(line))
@@ -427,6 +484,8 @@ class MAPPING:
self.optional.add_argument('--picard', dest='picard', nargs='?', default='/usr/bin/', type=str, help='Path to folder containing picard.jar')
self.optional.add_argument('--config', help='Define config file to read parameters from. If run.config exists in the source directory, it is if --config is not defined.')
self.args = self.parser.parse_args(remaining_argv)
+ if self.args.join_sample and self.args.join:
+ raise argparse.ArgumentTypeError('Only one of --join or --join_sample is allowed')
if self.args.join_sample or self.args.join:
self.args.picard = self._valid_picard(os.path.expanduser(self.args.picard))
if self.args.options is not None:
diff --git a/metrics.py b/metrics.py
index a9a9c8a..5fe66e9 100755
--- a/metrics.py
+++ b/metrics.py
@@ -10,6 +10,7 @@ import os
import pysam
import pylab
import itertools
+import json
import string
from scipy import stats
from collections import Counter
@@ -340,10 +341,27 @@ class METRICS:
conf_int = mean_coverage + 2 * sd_coverage
lower_conf_int = (reference_coverage < conf_int).sum()
greater_conf_int = (reference_coverage > conf_int).sum()
- cov_stats = {'mean': mean_coverage, 'max': maximum_coverage, 'min': minimum_coverage, 'sd': sd_coverage, 'zero': bases_zero_coverage, 'lower': lower_conf_int, 'greater': greater_conf_int}
+ cov_stats = dict()
+ cov_stats['mean'] = int(mean_coverage)
+ cov_stats['max'] = int(maximum_coverage)
+ cov_stats['min'] = int(minimum_coverage)
+ cov_stats['sd'] = int(sd_coverage)
+ cov_stats['zero'] = int(bases_zero_coverage)
+ cov_stats['lower'] = int(lower_conf_int)
+ cov_stats['greater'] = int(greater_conf_int)
self.cov_stats_result.append(cov_stats)
if len(self.cov_stats_result) == 0:
- self.cov_stats_result.append({'mean': -1, 'max': -1, 'min': -1, 'sd': -1, 'zero': -1, 'lower': -1, 'greater': -1})
+ cov_stats = dict()
+ cov_stats['mean'] = -1
+ cov_stats['max'] = -1
+ cov_stats['min'] = -1
+ cov_stats['sd'] = -1
+ cov_stats['zero'] = -1
+ cov_stats['lower'] = -1
+ cov_stats['greater'] = -1
+ self.cov_stats_result.append(cov_stats)
+ with open(os.path.join(self.output, 'coverage_stats_{}.json'.format(reference)), 'w') as f:
+ json.dump(cov_stats, f)
#return self.cov_stats
def read_stats(self):
@@ -374,13 +392,16 @@ class METRICS:
self.read_stats['mapped'] = mapped
self.read_stats['unmapped'] = unmapped
self.read_stats['total'] = total
+ with open(os.path.join(self.output, 'read_stats.json'), 'w') as f:
+ json.dump(self.read_stats, f)
#self.read_stats_result.append(self.stats)
#return self.stats
def cleanup(self):
log.write(self.logger, 'info', '=== Cleaning up metrics ===')
try:
- for mfile in self.reads_per_quality_result+self.bases_per_coverage_result+self.mapq_graph_result+self.coverage_graph_result+[(self.sam.sam_file_path, 'bam'), (self.sam.sam_file_path[:-1] + 'i', 'bai')]:
+ #for mfile in self.reads_per_quality_result+self.bases_per_coverage_result+self.mapq_graph_result+self.coverage_graph_result+[(self.sam.sam_file_path, 'bam'), (self.sam.sam_file_path[:-1] + 'i', 'bai')]:
+ for mfile in [(self.sam.sam_file_path, 'bam'), (self.sam.sam_file_path[:-1] + 'i', 'bai')]:
if mfile is not None:
self.command = 'rm {}'.format(mfile[0])
log.call(self.command, self.logger, shell=True)
diff --git a/reffinder.py b/reffinder.py
index 4d02b01..32053f6 100755
--- a/reffinder.py
+++ b/reffinder.py
@@ -1,42 +1,44 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#created by Stephan Fuchs (FG13, RKI), 2015
-version = '0.0.9'
+__version__ = '2.1.2'
+version = '2.1.2'
import argparse
import time
-import re
import sys
-if sys.version_info < (3, 2):
- import subprocess32 as sp
-else:
- import subprocess as sp
import os
import random
import math
-import custom_logger as log
+import subprocess
+import tempfile
+import shutil
+import mimetypes
+import gzip
+import random
+import re
-#sys.path.insert(0, "/home/RKID1/fuchss/data/scripts/libs")
+#sys.path.insert(0, "/opt/common/pipelines/tools/refFinder/libs")
import fux_filing as filing
-# options is a string containing parameters, e.g. "--fastq sth. --ref sth. -p 0.2"
def parse_arguments(options=None, logger=None):
#processing command line arguments
parser = argparse.ArgumentParser(prog="REFFINDER", description='')
- parser.add_argument('--fastq', metavar="FILE(S)", help="input FASTQ file(s)", type=argparse.FileType('r'), required=True, nargs='+')
- parser.add_argument('--ref', metavar="FILE(S)", help="input reference FASTA file(s)", type=str, required=True, nargs='+')
+ parser.add_argument('--fastq', metavar="FILE", help="input FASTQ file(s)", type=argparse.FileType('r'), required=True, nargs='+')
+ parser.add_argument('--ref', metavar="FILE", help="input reference FASTA file(s)", type=argparse.FileType('r'), required=True, nargs='+')
parser.add_argument('--tmp', type=str, help='Temporary directory to use', default=os.path.expanduser('~/tmp/'))
parser.add_argument('-p', metavar="DECIMAL", help="proportion of reads used for mapping (default: 0.1)", type=float, action="store", default=0.1)
parser.add_argument('-t', metavar='INT', help="number of threads/CPUs used for FastQC (default: 20)", type=int, action="store", default=20)
- parser.add_argument('-s', help="if set, subset fastq files are stored in a sub folder. Please consider, that high amount of data may be generated.", action="store_true")
- parser.add_argument('-b', help="if set, subset BAM files are stored in a sub folder. Please consider, that high amount of data may be generated.", action="store_true")
- parser.add_argument('-i', help="if set, reference index files are not deleted.", action="store_true")
+ #parser.add_argument('-s', help="if set, subset fastq files are stored in a sub folder. Please consider, that high amount of data may be generated.", action="store_true")
+ parser.add_argument('-b', help="if set, BAM files are kept. Please consider that this option can accumulate a large amount of data.", action="store_true")
+ parser.add_argument('-q', help="base quality threshold (default: 20)", type=int, default=20)
+ parser.add_argument('-Q', help="mapping quality threshold (default: 20)", type=int, default=20)
parser.add_argument('--mode', help="select paired (default) or unpaired mode", choices=['paired', 'single'], action="store", default='paired')
- parser.add_argument('--mapping', help="select mapping algorithm (default: bwasw).", choices=['bwasw', 'bwamem'], action="store", default='bwasw')
+ parser.add_argument('--mapping', help="select mapping algorithm (default: bwamem).", choices=['bwasw', 'bwamem'], action="store", default='bwamem')
parser.add_argument('--version', action='version', version='%(prog)s ' + version)
if options is None:
- args = parser.parse_args()
+ args = parser.parse_args()
else:
args = parser.parse_args(options.split(' '))
args.logger = logger
@@ -48,340 +50,746 @@ def parse_arguments(options=None, logger=None):
#FUNCTIONS
-def indexRef(ref, args):
- if args.mapping in ['bwasw', 'bwamem']:
- cmd = ['bwa', 'index', '-p', os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), '-a', 'is', ref]
- elif args.mapping == smalt:
- cmd = ['smalt', 'index', '-s', '6', ref, ref]
- log.call(cmd, args.logger)
- cmd = ['samtools', 'faidx', ref]
- log.call(cmd, args.logger)
-
-def mapping(ref, fastq, revfastq, samfile, args):
- #for single mapping set revfastq to False
- if args.mapping == 'bwasw':
- cmd = ['bwa', 'bwasw', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), fastq, revfastq, '-f', samfile]
- if revfastq == False:
- del(cmd[6])
- log.call(cmd, args.logger)
- elif args.mapping == 'bwamem':
- with open(samfile,"wb") as outfile:
- cmd = ['bwa', 'mem', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), fastq, revfastq]
- if revfastq == False:
- del(cmd[6])
- sp.check_call(cmd, stdout=outfile)
- elif args.mapping == 'smalt':
- cmd = ['smalt', 'map', '-n', str(args.t), '-f', 'sam', '-o', samfile, ref, fastq, revfastq]
- if revfastq == False:
- del(cmd[-1])
- log.call(cmd, args.logger)
-
-def sam2bam(samfile, bamfile, delSam = True):
- with open(bamfile, 'w') as bf:
- cmd = ['samtools', 'view', '-F', '4', '-Sb', samfile]
- sp.check_call(cmd, stdout=bf)
- if delSam:
- os.remove(samfile)
-
-def getMappedReads(bamfile):
- cmd = ['samtools', 'flagstat', bamfile]
- out = sp.check_output(cmd).decode("utf-8")
- mapped_reads = re.compile('.*?(\d*) \+ \d* mapped.*', re.DOTALL)
- m = mapped_reads.match(out)
- #return int(out[2].split('+')[0])
- return int(m.group(1))
-
-def getBwaVers():
- cmd = ['bwa']
- process = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
- lines = process.communicate()[1].decode("utf-8").split('\n')
- return lines[2].split(': ')[1]
-
-def getSamtoolsVers():
- cmd = ['samtools']
- process = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
- lines = process.communicate()[1].decode("utf-8").split('\n')
- return lines[2].split(': ')[1]
-
-def clean(args):
- exts = ['amb', 'ann', 'bwt', 'fai', 'pac', 'sa']
- for ref in args.ref:
- for ext in exts:
- if os.path.isfile(ref + '.' + ext):
- os.remove(ref + '.' + ext)
+def indexRef(ref, logfile, args):
+ with open(logfile, 'a') as lf:
+ if args.mapping in ['bwasw', 'bwamem']:
+ cmd = ['bwa', 'index', '-p', os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), '-a', 'is', ref]
+ elif args.mapping == smalt:
+ cmd = ['smalt', 'index', '-s', '6', ref, ref]
+ subprocess.check_call(cmd, shell=False, stdout=lf, stderr=lf)
+
+ cmd = ['samtools', 'faidx', ref]
+ subprocess.check_call(cmd, shell=False, stdout=lf, stderr=lf)
+
+def mapping(ref, fastq, revfastq, samfile, mapper, logfile, args):
+ #for single mapping set revfastq to False
+ with open(logfile, 'a') as lf:
+ if mapper == 'bwasw':
+ cmd = ['bwa', 'bwasw', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), fastq, revfastq, '-f', samfile]
+ if revfastq == False:
+ del(cmd[6])
+ subprocess.check_call(cmd, stderr=lf)
+
+ elif mapper == 'bwamem':
+ with open(samfile,"wb") as outfile:
+ cmd = ['bwa', 'mem', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), fastq, revfastq]
+ if revfastq == False:
+ del(cmd[6])
+ subprocess.check_call(cmd, stdout=outfile, stderr=lf)
+
+ elif mapper == 'smalt':
+ cmd = ['smalt', 'map', '-n', str(args.t), '-f', 'sam', '-o', samfile, ref, fastq, revfastq]
+ if revfastq == False:
+ del(cmd[-1])
+ subprocess.check_call(cmd, stderr=lf)
+
+def sam2sbam(samfile, bamfile, logfile, delSam = True):
+ bamfile = re.sub(r'\.bam$', '', bamfile) #.bam automatically attached by samtools sort
+
+ #convert
+ with open(bamfile + ".tmp", 'w') as bf, open(logfile, 'a') as lf:
+ cmd = ['samtools', 'view', '-F', '4', '-Sb', samfile]
+ subprocess.check_call(cmd, stdout=bf, stderr=lf)
+
+ #sort
+ with open(bamfile+'.bam', 'w') as bf, open(logfile, 'a') as lf:
+ cmd = ['samtools', 'sort', bamfile + ".tmp"]
+ subprocess.check_call(cmd, stdout=bf, stderr=lf)
+
+ os.remove(bamfile + ".tmp")
+ if delSam:
+ os.remove(samfile)
+
+def getMappedReads(bamfile):
+ cmd = ['samtools', 'flagstat', bamfile]
+ out = subprocess.check_output(cmd).decode("utf-8").split('\n')
+ return int(out[2].split('+')[0])
+
+
+def getCoverage(bamfile, logfile, q, Q, reflen = False):
+ with open(logfile, 'a') as lf:
+ cmd = ['samtools', 'depth', '-q', str(q), '-Q', str(Q), bamfile]
+ out = subprocess.check_output(cmd, stderr=lf).decode("utf-8").split('\n')
+ cov = 0
+ for line in out:
+ line = line.strip()
+ if line != "":
+ cov += int(line.split("\t")[-1])
+ if reflen:
+ cov = cov/reflen
+ return cov
+
+def getBwaVers():
+ cmd = ['bwa']
+ process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ lines = process.communicate()[1].decode("utf-8").split('\n')
+ return lines[2].split(': ')[1]
+
+def getSamtoolsVers():
+ cmd = ['samtools']
+ process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ lines = process.communicate()[1].decode("utf-8").split('\n')
+ return lines[2].split(': ')[1]
+
+def readFasta(filename):
+ entries = [] # [[header, lin_seq]]
+ with open(filename, 'r') as handle:
+ for line in handle:
+ line = line.strip()
+ if line == "":
+ continue
+ elif line[0] == ">":
+ entries.append([line, []])
+ elif len(entries) > 0:
+ entries[-1][1].append(line)
+ out = []
+ for entry in entries:
+ out.append([entry[0], ''.join(entry[1])])
+ return out
+
+def mkOutDir(i = 0):
+ creationtime = time.strftime("%Y-%m-%d_%H-%M-%S")
+ dir = 'REFFINDER_' + creationtime + '/'
+ if os.path.isdir(dir):
+ i += 1
+ if i >= 30:
+ print("ERROR: Cannot generate an output dir")
+ exit(1)
+ time.sleep(1)
+ mkOutDir(i)
+ else:
+ os.mkdir(dir)
+
+ return dir
+
+
+def getReadNumber(fastqfilename):
+ '''returns number of reads stored in a fastq file'''
+ mime = mimetypes.guess_type(fastqfilename)[1]
+ if mime == 'gzip':
+ handle = gzip.GzipFile(fastqfilename, 'rb')
+ else:
+ handle = open(fastqfilename, 'r')
+
+ err = []
+ readcount = 0
+ e = 0
+ l = 0
+ for line in handle: #don't use readline() since in case of gzip it is very slow
+ l += 1 #counts line number
+ e += 1 #counts line until 4 then starts at 1 to check entry lines
+
+ if mime == 'gzip':
+ line = line.decode('utf-8')
+
+ #identifier
+ if e == 1:
+ if line.strip() == "":
+ e -= 1
+ continue
+ elif line[0] != "@":
+ err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): identifier line expected")
+ break
+
+ #sequence
+ elif e == 2:
+ if line.strip() == "":
+ err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): missing sequence")
+ break
+
+ #description
+ elif e == 3:
+ if line[0] != "+":
+ err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): description expected")
+ break
+
+ #base qualities
+ else:
+ if line.strip() == "":
+ err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): missing base quality scores")
+ break
+ else:
+ e = 0
+ readcount += 1
+
+ handle.close()
+
+ if readcount == 0:
+ err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): no reads")
+ if l < 4 and l > 0:
+ err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): incomplete read entry")
+
+ if len(err) == 0:
+ err = False
+
+ return readcount, err
+
+def createFastqSubset(srcfile, dstfile, readKeys2select):
+ readKeys = set(readKeys2select)
+ total = len(readKeys)
+ stored = 0
+ readcount = 0
+ mime = mimetypes.guess_type(srcfile)[1]
+ if mime == 'gzip':
+ handle = gzip.GzipFile(srcfile, 'rb')
+ else:
+ handle = open(srcfile, 'r')
+
+ with open(dstfile, 'w') as dsthandle:
+ readcount = 0
+ err = []
+ e = 0
+ l = 0
+ entry = []
+ for line in handle: #don't use readline() since in case of gzip it is very slow
+ l += 1 #counts line number
+ e += 1 #counts line until 4 then starts at 1 to check entry lines
+
+ if mime == 'gzip':
+ line = line.decode('utf-8')
+
+ #identifier
+ if e == 1:
+ if line.strip() == "":
+ e = 0
+ continue
+ elif line[0] != "@":
+ err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): identifier line expected")
+ break
+ else:
+ entry.append(line)
+
+ #sequence
+ elif e == 2:
+ if line.strip() == "":
+ err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): missing sequence")
+ break
+ else:
+ entry.append(line)
+
+ #description
+ elif e == 3:
+ if line[0] != "+":
+ err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): description expected")
+ break
+ else:
+ entry.append(line)
+
+ #base qualities
+ else:
+ if line.strip() == "":
+ err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): missing base quality scores")
+ break
+ else:
+ e = 0
+ entry.append(line)
+ readcount += 1
+
+ #write selected reads to dstfile
+ if readcount in readKeys:
+ dsthandle.write(''.join(entry))
+ stored += 1
+ if stored == total:
+ break
+
+ entry = []
+
+ handle.close()
+
+ if stored != total:
+ err.append("ERROR: subset generation failed for " + srcfile + " (" + str(stored) + " instead of " + str(total) + " reads collected)")
+
+ if readcount == 0:
+ err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): no reads")
+ if l < 4 and l > 0:
+ err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): incomplete read entry")
+
+ if len(err) == 0:
+ err = False
+
+
+def writeAsCols (handle, input, sep = ' ' * 5):
+ #allowed signs in numbers
+ pattern = re.compile(r'([+-])?([0-9]+)([.,]?)([0-9]+)(%?)')
+
+ #str conversion
+ lines = [[str(x) for x in line] for line in input]
+
+ #fill up rows to same number of columns (important for transposing)
+ maxlen = max([len(x) for x in lines])
+ [x.extend(['']*(maxlen-len(x))) for x in lines]
+
+ #find format parameters
+ width = [] #colum width or length
+ align = [] #alignment type (number = r; strings = l)
+ prec = [] #number precision
+ for column in zip(*lines):
+ width.append(len(column[0]))
+ prec.append(0)
+ align.append('r')
+ for field in column[1:]:
+ if align[-1] == 'l':
+ width[-1] = max(width[-1], len(field))
+ else:
+ match = pattern.match(field)
+ if match and match.group(0) == field:
+ if match.group(3) and match.group(4):
+ prec[-1] = max(prec[-1], len(match.group(4)))
+
+ if prec[-1] > 0:
+ k = 1
+ else:
+ k = 0
+ width[-1] = max(width[-1], len(match.group(2)) + prec[-1] + k + len(match.group(4)))
+ else:
+ align[-1] = 'l'
+ prec[-1] = False
+ width[-1] = max(width[-1], len(field))
+
+ #formatting
+ output = []
+ for line in lines:
+ f = 0
+ output.append([])
+ for field in line:
+ if align[f] == 'l':
+ output[-1].append(field + ' ' * (width[f] - len(field)))
+ elif align[f] == 'r':
+ match = pattern.match(field)
+ if not match or match.group(0) != field:
+ output[-1].append(' ' * (width[f] - len(field)) + field)
+ else:
+ if match.group(5):
+ percent = match.group(5)
+ else:
+ percent = ''
+ length = len(percent)
+
+ intpart = match.group(2)
+ if match.group(1):
+ intpart = match.group(1) + intpart
+ if match.group(3):
+ decpart = match.group(4)
+ else:
+ intpart += match.group(4)
+ decpart = ''
+ length += len(intpart)
+
+ #print('match:', match.group(0))
+ #print('intpart:', intpart)
+ #print('decpart:', decpart)
+ #print('prec:', prec[f])
+
+ if prec[f] > 0:
+ decpart += '0' * (prec[f] - len(match.group(4)))
+ length += len(decpart) + 1
+ formatted_number = ' ' * (width[f] - length) + intpart + '.' + decpart + percent
+ else:
+ formatted_number = ' ' * (width[f] - length) + intpart + percent
+ output[-1].append(formatted_number)
+ f += 1
+ lb = os.linesep
+ handle.write(lb.join([sep.join(x) for x in output]))
+
+def logging(line, logfilename):
+ with open(logfilename, 'a') as handle:
+ handle.write(line + os.linesep)
+
+def print(*arg, end=None):
+ pass
def find_reference(args):
    """Find the best-matching reference for one or more fastq datasets.

    Workflow (all paths rooted in ``args.tmp``): validate the CLI
    arguments, read and index every candidate reference FASTA, count the
    reads of every fastq file, then for each dataset draw a random read
    subset (``args.p`` proportion) and map it against every reference
    (external ``mapping`` / ``sam2sbam`` / ``getCoverage`` helpers).
    The resulting average per-base coverages drive the report files
    ``ref_charts.txt``, ``ref_ranking.txt``, ``ref_matrix.txt`` and
    ``refFinder.log``.

    ``args`` is the parsed argparse namespace; fields read here:
    ``mode``, ``fastq``, ``ref``, ``p``, ``tmp``, ``b``, ``q``, ``Q``,
    ``mapping``.

    Returns the first ``(tmpname, origname)`` reference tuple, or None
    if there are no references.

    NOTE(review): ``print`` is shadowed by a module-level no-op in this
    file, so none of the progress/error messages below reach the console.
    """

    #START
    creationtime = time.strftime("%Y-%m-%d_%H-%M-%S")
    starttime = time.time()
    #parameter consistency checks
    if args.mode == 'paired' and len(args.fastq) % 2 != 0:
        print("EXCEPTION: in paired mode the number of fastq files must be even")
        exit(1)
    if args.p <= 0 or args.p > 1:
        print("EXCEPTION: read proportion must be a float between 0 (exclusive) and 1 (inclusive).")
        exit(1)

    #generate tmpdir
    tmpdir = args.tmp

    #generate storedir and logfile (both point at the same tmp directory)
    storedir = args.tmp

    #log file definition
    process_logfile = os.path.join(storedir, 'process.log')
    logging('# START LOG #', process_logfile)

    #read every reference FASTA; each must hold exactly one sequence
    print ("processing reference files...")
    reflengths = {} #dict key: origname; value: length of reference
    refheader = {} #dict key: origname; value: FASTA header
    err = []
    for ref in args.ref:
        entries = readFasta(ref.name)
        if len(entries) == 0:
            err.append("ERROR in " + ref.name + ": No valid FASTA format")
        elif len(entries) > 1:
            err.append("ERROR in " + ref.name + ": multiple sequences not supported")
        else:
            reflengths[ref.name] = len(entries[0][1])
            refheader[ref.name] = entries[0][0]
    if len(err) > 0:
        for e in err:
            print(e)
        exit(1)

    #generating reference index files (references are first copied into tmpdir)
    logging('# REFERENCE FILE INDEXING #', process_logfile)
    print ("indexing reference files...", end = " ")
    reffiles = [] #list of tuples (tmpname, origname)
    i = 0
    for ref in args.ref:
        logging('## INDEXING ' + ref.name + ' ##', process_logfile)
        i += 1
        print ("\rindexing reference files... [" + str(i) + "/" + str(len(args.ref))+ "] ", end = " ")
        newname = os.path.join(tmpdir, os.path.basename(ref.name))
        try:
            shutil.copyfile(ref.name, newname)
        except shutil.SameFileError:
            #reference already lives in tmpdir; index it in place
            pass
        ref.close()
        reffiles.append((newname, ref.name))
        indexRef(newname, process_logfile, args)
    print()

    #count the reads of every fastq file
    print ('processing fastq files...', end = " ")
    readcount = {}
    err = []
    i = 0
    for fastq in args.fastq:
        i += 1
        print ("\rprocessing fastq files... [" + str(i) + "/" + str(len(args.fastq))+ "] ", end = " ")
        count, e = getReadNumber(fastq.name)
        if e:
            err.extend(e)
        else:
            readcount[fastq.name] = count


    print()

    if len(err) > 0:
        for e in err:
            print(e)
        exit(1)


    #task actions: one task = one dataset mapped against one reference
    print("working...")
    coverages = {} # 1.dim_key: fastq_origname; 2.dim_key: ref_origname; value: avg per-base coverage
    if args.mode == 'paired':
        total_tasks = int(len(args.fastq)/2) * len(args.ref)
    else:
        total_tasks = len(args.fastq) * len(args.ref)

    if args.b:
        bamloghandle = open(os.path.join(storedir, "bamfiles.log"), "w")
        bamloghandle.write("task\tBAM file\treference file\tread file(s)" + os.linesep)

    f = 0  # NOTE(review): appears unused in this function -- verify before removing
    current_task = 0
    i = 0
    while current_task < total_tasks:
        print (" preparing new dataset...")

        #generate fastq subsets in paired mode
        if args.mode == 'paired':
            file1 = args.fastq[i].name
            file2 = args.fastq[i+1].name
            # NOTE(review): a trailing .gz is stripped -- subset is presumably
            # written uncompressed; confirm against createFastqSubset
            subset_file1 = os.path.join(tmpdir, re.sub(r'\.gz$', '', "subset_" + os.path.basename(file1)))
            subset_file2 = os.path.join(tmpdir, re.sub(r'\.gz$', '', "subset_" + os.path.basename(file2)))
            i += 2
            print (" paired data")
            print (" " + file1 + ":", readcount[file1], "reads")
            print (" " + file2 + ":", readcount[file2], "reads")
            if readcount[file1] != readcount[file2]:
                print("ERROR: inconsistent data set.")
                exit(1)
            subamount = int(readcount[file1] * args.p)
            print (" subset:", subamount, "reads each")
            print (" generating subset files...")
            #the same random read indices are used for both mates, keeping pairs intact
            selected = random.sample(range(1, readcount[file1]+1), subamount)
            createFastqSubset(file1, subset_file1, selected)
            createFastqSubset(file2, subset_file2, selected)

        #single mode
        else:
            file1 = args.fastq[i].name
            subset_file1 = os.path.join(tmpdir, re.sub(r'\.gz$', '', "subset_" + os.path.basename(file1)))
            subset_file2 = False
            i += 1
            print (" unpaired data")
            print (" " + file1 + ":", readcount[file1], "reads")
            subamount = int(readcount[file1] * args.p)
            print (" subset:", subamount, "reads")
            print (" generating subset file...")
            selected = random.sample(range(1, readcount[file1]+1), subamount)
            createFastqSubset(file1, subset_file1, selected)
        if not file1 in coverages:
            coverages[file1] = {}
        for reffile in reffiles:
            ref_origname = reffile[1]
            #prepare
            current_task += 1
            print (' TASK', current_task, 'OF', total_tasks)
            ref = reffile[0]
            samfile = os.path.join(tmpdir, str(current_task) + ".sam")
            bamfile = os.path.join(tmpdir, str(current_task) + ".bam")

            #process logging
            if subset_file2:
                logging('# TASK ' + str(current_task) + ': subsets of ' + file1 + " / " + file2 + " TO " + ref_origname + ' #', process_logfile)
            else:
                logging('# TASK ' + str(current_task) + ': subset of ' + file1 + " TO " + ref_origname + ' #', process_logfile)


            #mapping
            print(" mapping...")
            logging('## MAPPING ##', process_logfile)
            mapping(ref, subset_file1, subset_file2, samfile, args.mapping, process_logfile, args)

            print(" analyzing...")

            #sam to bam
            logging('## SAM2BAM CONVERSION AND SORTING ##', process_logfile)
            sam2sbam(samfile, bamfile, process_logfile)

            #get reads aligned with minimal mapping quality
            #mapped_reads = getMappedReads(bamfile)

            #get coverages
            coverages[file1][ref_origname] = getCoverage(bamfile, process_logfile, args.q, args.Q, reflengths[ref_origname])
            print(" avg_base_cov:", coverages[file1][ref_origname])

            #save bam
            if args.b:
                dstbamname = str(current_task) + '.bam'
                dstbam = os.path.join(os.getcwd(), storedir, dstbamname)
                shutil.move(bamfile, dstbam)
                bamloghandle.write(str(current_task) + "\t" + dstbamname + "\t" + ref_origname + "\t" + file1)
                if args.mode == 'paired':
                    bamloghandle.write(" / " + file2)
                bamloghandle.write(os.linesep)
            else:
                os.remove(bamfile)

        #cleaning subset_files
        os.remove(subset_file1)
        if subset_file2:
            os.remove(subset_file2)



    if args.b:
        bamloghandle.close()

    #cleaning tmp dir
    #shutil.rmtree(tmpdir)

    #process logging end
    logging('# END LOG #', process_logfile)

    #output
    print('preparing output...')
    ##reference charts
    outfile = os.path.join(storedir, "ref_charts.txt")
    #refcharts = [['rank', 'reference file(s)', 'avg cov per base']]
    chart = {} #key = mean coverage, value: list of references
    refcov = {} #key: refname; value: list of coverages, one per dataset
    bestfits = {} # number of best fits


    ###get coverages
    for fastq in coverages.keys():
        for ref in coverages[fastq].keys():
            cov = coverages[fastq][ref]
            if not ref in refcov:
                refcov[ref] = [cov]
            else:
                refcov[ref].append(cov)

    ###get best fittings: per dataset, count which reference reached the maximum coverage
    maxcovs = []
    r = list(refcov.keys())
    r = r[0]
    for i in range(len(refcov[r])):
        maxcovs.append(0)
        for ref in refcov.keys():
            maxcovs[i] = max([refcov[ref][i], maxcovs[i]])

    for ref in refcov.keys():
        bestfits[ref] = 0
        for i in range(len(refcov[ref])):
            if refcov[ref][i] == maxcovs[i]:
                bestfits[ref] += 1

    ###calculate mean coverage per reference; references with equal means share a rank
    for ref in refcov:
        meancov = sum(refcov[ref]) / len(refcov[ref])
        if not meancov in chart:
            chart[meancov] = [ref]
        else:
            chart[meancov].append(ref)

    order = sorted(set(chart.keys()), reverse=True)

    out = [["rank", "avg cov per base", "#best fittings", "reference header", "reference file"]]
    i = 0
    for meancov in order:
        i += 1
        fits = []
        headers = []
        for ref in chart[meancov]:
            fits.append(str(bestfits[ref]))
            #drop the leading '>' of the FASTA header; ';' is the field joiner below
            header = refheader[ref][1:]
            headers.append(header.replace(';', ','))
        out.append([i, meancov, ";".join(fits), ";".join(headers), ";".join(chart[meancov])])

    with open(outfile, "w") as handle:
        writeAsCols(handle, out)


    ##reference ranking
    outfile = os.path.join(storedir, "ref_ranking.txt")
    out = [['sample']]
    count_ranks = 0  # NOTE(review): appears unused -- verify before removing
    i = 0
    best_reference = None
    # NOTE(review): best_reference is simply the FIRST reference tuple, not
    # the top-ranked one -- confirm this is intended
    for ref in reffiles:
        if i == 0:
            best_reference = ref
        i += 1
        out[0].append("rank " + str(i))


    i = 0
    while i < len(args.fastq):
        fastq = args.fastq[i].name
        out.append([fastq])
        i +=1
        if args.mode == 'paired':
            out[-1][0] += " / " + args.fastq[i].name
            i +=1

        ranking = {}
        for ref in reffiles:
            ref = ref[1]
            cov = coverages[fastq][ref]
            if not cov in ranking:
                ranking[cov] = []
            ranking[cov].append(ref)

        order = sorted(set(ranking.keys()), reverse=True)
        for cov in order:
            out[-1].append(",".join(ranking[cov]) + " (" + str(cov) + " avg cov per base)")

        #pad short rows so every line has the same column count for writeAsCols
        while len(out[-1]) < len(reffiles):
            out[-1].append("-")

    with open(outfile, "w") as handle:
        writeAsCols(handle, out)

    ##ref matrix: one row per sample, one coverage column per reference
    outfile = os.path.join(storedir, "ref_matrix.txt")
    out = [['sample']]
    count_ranks = 0
    for ref in reffiles:
        out[0].append(ref[1] + " [avg cov per base]")
    i = 0
    while i < len(args.fastq):
        fastq = args.fastq[i].name
        out.append([fastq])
        i +=1
        if args.mode == 'paired':
            out[-1][0] += " / " + args.fastq[i].name
            i +=1

        for ref in reffiles:
            ref = ref[1]
            out[-1].append(coverages[fastq][ref])


    with open(outfile, "w") as handle:
        writeAsCols(handle, out)


    ##logging: human-readable run summary
    logname = os.path.join(storedir, 'refFinder.log')
    with open(logname, 'w') as handle:
        handle.write('REFFINDER' + os.linesep)
        handle.write('=========================' + os.linesep)
        content =[]
        content.append(['reffinder version:', version])
        content.append(['bwa version:', getBwaVers()])
        content.append(['samtools version:', getSamtoolsVers()])
        content.append(['job time:', creationtime])
        content.append(['mapping algorithm:', args.mapping])
        content.append(['mode:', args.mode])
        if args.mode == 'paired':
            content.append(['number of datasets:', int(len(args.fastq)/2)])
        content.append(['number of fastq files:', len(args.fastq)])
        content.append(['number of references:', len(args.ref)])
        content.append(['proportion of reads selected for mapping:', args.p])
        writeAsCols(handle, content)

        handle.write(os.linesep+os.linesep)
        handle.write('FASTQ FILES' + os.linesep)
        handle.write('=========================' + os.linesep)
        i = 0
        content = [['#', 'file', 'number reads', 'number reads in subset', 'mate file']]
        files = sorted(readcount.keys())
        # NOTE(review): mate resolution assumes sorted filename order matches
        # the pairing of the input files -- verify
        for fastq in files:
            i += 1
            count = int(readcount[fastq] * args.p)
            content.append([i, fastq, readcount[fastq], count])
            if args.mode == "single":
                content[-1].append("-")
            elif i % 2 == 0:
                #even 1-based position: mate is the preceding file
                content[-1].append(files[i-2])
            else:
                #odd 1-based position: mate is the following file
                content[-1].append(files[i])
        writeAsCols(handle, content)

        handle.write(os.linesep+os.linesep)
        handle.write('REFERENCE FILES' + os.linesep)
        handle.write('=========================' + os.linesep)
        i = 0
        content = [['#', 'file', 'header', 'sequence length']]
        files = sorted(reflengths.keys())
        for ref in files:
            i += 1
            content.append([i, ref, refheader[ref], reflengths[ref]])
        writeAsCols(handle, content)

        handle.write(os.linesep+os.linesep)
        handle.write('--' + os.linesep)
        handle.write('execution time (hh:mm:ss): ')
        s = time.time() - starttime
        handle.write('{:02.0f}:{:02.0f}:{:02.0f}'.format(s//3600, s%3600//60, s%60))

    print('results saved to:', storedir)
    return best_reference
+
if __name__ == '__main__':
    #script entry point: parse CLI arguments and run the reference search
    find_reference(parse_arguments())
-
diff --git a/report.tex b/report.tex
index a641807..e336de6 100755
--- a/report.tex
+++ b/report.tex
@@ -50,13 +50,22 @@ Machine: & {{pipeline.machine}}\\
Tool versions & \\
Python: & {{pipeline.python_version}}\\
Mapper: & {{pipeline.mapper_version}}\\
+Samtools: & {{pipeline.samtools_version}}\\
+Picard: & {{pipeline.picard_version}}\\
+Reffinder: & {{pipeline.reffinder_version}}\\
\end{tabular}\\
%----------------- Workflow -------------------%
\line(1,0){ \textwidth } \\
\begin{tabular}{lp{0.85\textwidth}}
-Processed read(s): & \path{{"{"+pipeline.current_read1+"}"}}\\
-{%if pipeline.current_read2 is not none%} & \path{{"{"+pipeline.current_read2+"}"}}\\{%endif%}
+Processed read(s):
+{% if pipeline.args.join or pipeline.args.join_sample %}
+ {% for i in pipeline.reads %}
+ & \path{{"{"+i+"}"}}\\
+ {% endfor %}
+{% else %}
+ & \path{{"{"+pipeline.current_read1+"}"}}\\ {%if pipeline.current_read2 is not none%} & \path{{"{"+pipeline.current_read2+"}"}}\\{%endif%}
+{% endif %}
{%if pipeline.args.reference is defined and pipeline.args.reference is not none%} Reference file: & \path{{"{"+pipeline.args.reference+"}"}}\\{%endif%}
\end{tabular}\\
\line(1,0){ \textwidth } \\
diff --git a/run.config b/run.config
new file mode 100644
index 0000000..389b501
--- /dev/null
+++ b/run.config
@@ -0,0 +1,5 @@
+--samtools=/opt/common/pipelines/tools/samtools-1.3.1/
+--bwa=/opt/common/pipelines/tools/bwa-0.7.15/
+--bowtie2=/usr/bin/
+--picard=/opt/common/pipelines/tools/picard-2.7.1/
+--java=java-1.8
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/mapping-pipeline.git
More information about the debian-med-commit
mailing list