[med-svn] [mapping-pipeline] 01/10: Imported Upstream version 1.2.0
Andreas Tille
tille at debian.org
Tue Feb 21 12:56:43 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository mapping-pipeline.
commit 3ccb5c672659957aa1cc5760f668a1f51340bd59
Author: Andreas Tille <tille at debian.org>
Date: Mon Dec 12 12:55:25 2016 +0100
Imported Upstream version 1.2.0
---
.gitignore | 89 +++++
LICENSE | 165 ++++++++
README.md | 128 +++++++
__init__.py | 0
custom_logger.py | 157 ++++++++
example.config | 3 +
example.json | 8 +
extract_genbank.py | 52 +++
fux_filing.py | 1079 ++++++++++++++++++++++++++++++++++++++++++++++++++++
mapper.py | 520 +++++++++++++++++++++++++
mapping.cwl | 41 ++
mapping.py | 543 ++++++++++++++++++++++++++
metrics.py | 394 +++++++++++++++++++
reffinder.py | 387 +++++++++++++++++++
report.tex | 135 +++++++
15 files changed, 3701 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..72364f9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,89 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
diff --git a/LICENSE b/LICENSE
new file mode 100755
index 0000000..65c5ca8
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,165 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+ This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+ 0. Additional Definitions.
+
+ As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+ "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+ An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+ A "Combined Work" is a work produced by combining or linking an
+Application with the Library. The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+ The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+ The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+ 1. Exception to Section 3 of the GNU GPL.
+
+ You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+ 2. Conveying Modified Versions.
+
+ If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+ a) under this License, provided that you make a good faith effort to
+ ensure that, in the event an Application does not supply the
+ function or data, the facility still operates, and performs
+ whatever part of its purpose remains meaningful, or
+
+ b) under the GNU GPL, with none of the additional permissions of
+ this License applicable to that copy.
+
+ 3. Object Code Incorporating Material from Library Header Files.
+
+ The object code form of an Application may incorporate material from
+a header file that is part of the Library. You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+ a) Give prominent notice with each copy of the object code that the
+ Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the object code with a copy of the GNU GPL and this license
+ document.
+
+ 4. Combined Works.
+
+ You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+ a) Give prominent notice with each copy of the Combined Work that
+ the Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
+ document.
+
+ c) For a Combined Work that displays copyright notices during
+ execution, include the copyright notice for the Library among
+ these notices, as well as a reference directing the user to the
+ copies of the GNU GPL and this license document.
+
+ d) Do one of the following:
+
+ 0) Convey the Minimal Corresponding Source under the terms of this
+ License, and the Corresponding Application Code in a form
+ suitable for, and under terms that permit, the user to
+ recombine or relink the Application with a modified version of
+ the Linked Version to produce a modified Combined Work, in the
+ manner specified by section 6 of the GNU GPL for conveying
+ Corresponding Source.
+
+ 1) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (a) uses at run time
+ a copy of the Library already present on the user's computer
+ system, and (b) will operate properly with a modified version
+ of the Library that is interface-compatible with the Linked
+ Version.
+
+ e) Provide Installation Information, but only if you would otherwise
+ be required to provide such information under section 6 of the
+ GNU GPL, and only to the extent that such information is
+ necessary to install and execute a modified version of the
+ Combined Work produced by recombining or relinking the
+ Application with a modified version of the Linked Version. (If
+ you use option 4d0, the Installation Information must accompany
+ the Minimal Corresponding Source and Corresponding Application
+ Code. If you use option 4d1, you must provide the Installation
+ Information in the manner specified by section 6 of the GNU GPL
+ for conveying Corresponding Source.)
+
+ 5. Combined Libraries.
+
+ You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+ a) Accompany the combined library with a copy of the same work based
+ on the Library, uncombined with any other library facilities,
+ conveyed under the terms of this License.
+
+ b) Give prominent notice with the combined library that part of it
+ is a work based on the Library, and explaining where to find the
+ accompanying uncombined form of the same work.
+
+ 6. Revised Versions of the GNU Lesser General Public License.
+
+ The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+ If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..3248df7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,128 @@
+# Mapping Pipeline
+
+## Dependencies
+
+This tool is implemented in Python and requires Python 3.4 or higher.
+
+Required software:
+* [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)
+* [BWA](http://bio-bwa.sourceforge.net/)
+* [LaTeX*](https://www.latex-project.org/)
+* [samtools](http://www.htslib.org/)
+* [Picard*](https://broadinstitute.github.io/picard/)
+
+At least one mapper (BWA or Bowtie2) is required. <br>
+*LaTeX is only needed to generate the output report. <br>
+*Picard is only needed to merge all SAM files in a project.
+
+Required Python packages:
+* jinja2
+* matplotlib
+* scipy
+* numpy
+* pylab
+* pysam
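+
+The packages can be installed with pip, for example (pylab is provided by matplotlib and needs no separate package):
+```
+pip3 install jinja2 matplotlib scipy numpy pysam
+```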
+
+---
+
+## Usage
+
+```
+./mapping.py <mapper> <read1> <option(s)>
+```
+
+required input:
+* mapper: specify mapper to use {bowtie2, bwa-mem}
+* read1: either unpaired reads, the first mate of paired reads, a project folder, or a folder containing project folders. <br>
+ When given a folder, the read file names must contain a certain prefix to be recognized (see the example layout below). <br>
+ For paired reads: "r1\_P\_" and "r2\_P\_" for first and second mates, respectively. <br>
+ For unpaired reads: "r1\_UP\_" or "r2\_UP\_".
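+
+A project folder might, for example, look like this (sample and file names are hypothetical):
+```
+project/
+└── sample1/
+    ├── r1_P_sample1.fastq
+    ├── r2_P_sample1.fastq
+    └── r1_UP_sample1.fastq
+```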
+
+options:
+* --read2 \<read2\>: second mate of paired reads
+* --no-report: do not generate report
+* --output \<out\>: set output directory
+* --options "\<options\>": set options for mapper
+* --reffinder_options="\<options\>": set options for reffinder
+* --reference \<reference\>: location of the reference genome in either FASTA or GenBank format
+* --join: join all SAM files within project, needs --picard
+* --join_sample: join all SAM files per sample, needs --picard
+* --multi-fasta: Splits multifasta files into single fasta files to use with reffinder.
+* --no-rg: Do not set read groups.
+* --unmapped \<path/to/folder/unmapped\>: Write unmapped reads to file. '.fastq' is automatically added.
+* --samtools \<samtools_folder\>: Location of folder containing samtools.
+* --picard \<picard_folder\>: Location of folder containing picard.jar.
+* --bwa \<bwa_folder\>: Location of folder containing bwa.
+* --java \<java\>: Java 1.8+ executable
+* --bowtie2 \<bowtie2_folder\>: Location of folder containing bowtie2.
+* --config \<path/to/file\>: Load parameters from file.
+
+config file:<br>
+You can also use a config file containing parameters.<br>
+example.config:
+```
+--samtools=/usr/bin/
+--bwa=/usr/bin/
+--bowtie2=/usr/bin/
+```
+
+Example usage:
+```
+./mapping.py <...> --config /home/example.config
+```
+
+
+
+###### GenBank file
+GenBank files may be acquired from [NCBI](http://www.ncbi.nlm.nih.gov/genbank/).
+Make sure to include the sequence (*Customize view* -> *Display options* -> *Show sequence*) before downloading the file (*Send* -> *Complete record* -> *File* -> *GenBank (full)*).
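+
+To convert a downloaded GenBank file to FASTA yourself, the bundled extract_genbank module can also be used directly (a minimal sketch; file names are hypothetical):
+```
+import extract_genbank
+
+extract_genbank.main("reference.gb", "reference.fasta")
+```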
+
+---
+
+## Included Files
+
+This pipeline consists of ten files:
+* mapping.py: main script for the Mapping Pipeline
+* mapper.py: manages different mappers
+* metrics.py: creates report
+* custom_logger.py: contains functions for writing log files
+* mapping.cwl: provides interface for use with cwl-runner
+* report.tex: template for report
+* extract_genbank.py: Originally written by Wojtek Dabrowski. Extracts sequence from Genbank file.
+* reffinder.py: Written by Stephan Fuchs. Provides best reference for mapping when given multiple references.
+* fux_filing.py: Written by Stephan Fuchs. Provides functions for reffinder.
+* example.config: Example config file.
+
+---
+
+## Example
+
+```
+./mapping.py bwa-mem --reference path/reference1.fasta --reference path/reference2.gb path/read1.fastq --read2 path/read2.fastq --output path --options "-M"
+./mapping.py bwa-mem path/read1.fastq --read2 path/read2.fastq --option "-x /index/bwa"
+./mapping.py bowtie2 path/to/project --reference path/to/reference --join path/to/picard
+```
+
+---
+
+## Report
+
+The report includes general information about the run, e.g. start time, user and options.
+Furthermore, several metrics are computed:
+* total reads
+* concordant reads
+* discordant reads
+* paired reads
+* mapped reads
+* unmapped reads
+* coverage graph
+* bases per coverage graph
+* reads per mapping quality graph
+* mapping quality graph
+* average coverage
+* maximum coverage
+* minimum coverage
+* standard deviation of coverage
+* number of bases with coverage of 0
+* number of bases with coverage > mean+2*sd
+* number of bases with coverage < mean+2*sd
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/custom_logger.py b/custom_logger.py
new file mode 100755
index 0000000..7dd7acb
--- /dev/null
+++ b/custom_logger.py
@@ -0,0 +1,157 @@
+import logging
+import select
+import sys
+if sys.version_info < (3, 2):
+ import subprocess32 as sp
+else:
+ import subprocess as sp
+
+def call(popenargs, logger, stdout_log_level=logging.DEBUG,
+ stderr_log_level=logging.ERROR, **kwargs):
+ """
+ Variant of subprocess.call that accepts a logger instead of stdout/stderr,
+ and logs stdout/stderr messages via logger.debug.
+ """
+ try:
+ child = sp.Popen(popenargs, stdout=sp.PIPE, stderr=sp.PIPE, **kwargs)
+
+ log_level = {
+ child.stdout: stdout_log_level,
+ child.stderr: stderr_log_level}
+
+ def check_io():
+ ready_to_read = select.select(
+ [child.stdout, child.stderr], [], [], 1000)[0]
+ for io in ready_to_read:
+ line = str(io.readline(), 'utf-8')
+ if line and logger is not None:
+ logger.log(log_level[io], line[:-1])
+
+ # keep checking stdout/stderr until the child exits
+ while child.poll() is None:
+ check_io()
+
+ check_io() # check again to catch anything after the process exits
+ except Exception as e:
+ raise e
+
+ if child.returncode != 0:
+ raise sp.CalledProcessError(child.returncode, popenargs, None)
+ else:
+ return child.wait()
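+
+# Example usage (a minimal sketch; logger setup and command are hypothetical):
+#
+#   import logging
+#   logging.basicConfig(level=logging.DEBUG)
+#   logger = logging.getLogger("pipeline")
+#   call(["samtools", "--version"], logger)  # each output line is logged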
+
+def check_output(popenargs, logger, stdout_log_level=logging.DEBUG,
+ stderr_log_level=logging.ERROR, **kwargs):
+ """
+ Variant of subprocess.check_output that accepts a logger instead of stdout/stderr,
+ and logs stdout/stderr messages via logger.debug and returns the output in the same
+ manner as subprocess.check_output.
+ """
+
+ # remember whether a caller-supplied stderr argument was given, then drop it
+ log_stderr = 'stderr' in kwargs
+ kwargs.pop('stderr', None)
+
+ output = ''
+
+ try:
+ child = sp.Popen(popenargs, stdout=sp.PIPE, stderr=sp.PIPE, **kwargs)
+
+ log_level = {
+ child.stdout: stdout_log_level,
+ child.stderr: stderr_log_level}
+
+ def check_io():
+ ready_to_read = select.select(
+ [child.stdout, child.stderr], [], [], 1000)[0]
+ for io in ready_to_read:
+ line = str(io.readline(), 'utf-8')
+ if line and logger is not None:
+ logger.log(log_level[io], line[:-1])
+ if log_level[io] == stdout_log_level:
+ return line
+ elif log_stderr:
+ return line
+
+ # keep checking stdout/stderr until the child exits
+ while child.poll() is None:
+ s = check_io()
+ if s is not None:
+ output += s
+ s = check_io() # check again to catch anything after the process exits
+ if s is not None:
+ output += s
+
+ except Exception as e:
+ raise e
+
+ if child.returncode != 0:
+ raise sp.CalledProcessError(child.returncode, popenargs, None)
+ else:
+ child.wait()
+ return output
+
+def write(logger, mode, message):
+ """Write message to the log.
+
+ Writes message to logger using log_level mode.
+
+ Args:
+ message: Message to write in the log.
+ logger: Open logger instance.
+ mode: Log level to write message to.
+ """
+
+ if logger is not None:
+ if mode == 'error':
+ logger.error(message)
+ elif mode == 'critical':
+ logger.critical(message)
+ elif mode == 'debug':
+ logger.debug(message)
+ elif mode == 'warning':
+ logger.warning(message)
+ elif mode == 'info':
+ logger.info(message)
+ elif mode == 'exception':
+ logger.exception(message)
+ else:
+ logger.error(message)
+
+
+def print_write(logger, mode, pmessage, lmessage = None):
+ """Print a message to the screen and write another one to the log.
+
+ Prints pmessage to the screen and writes lmessage to logger using
+ log_level mode.
+
+ Args:
+ pmessage: Message to print to the screen.
+ lmessage: Message to write in the log.
+ logger: Open logger instance.
+ mode: Log level to write lmessage to.
+ """
+
+ print(pmessage)
+ if logger is not None:
+ if lmessage is None:
+ lmessage = pmessage
+ if mode == 'error':
+ logger.error(lmessage)
+ elif mode == 'critical':
+ logger.critical(lmessage)
+ elif mode == 'debug':
+ logger.debug(lmessage)
+ elif mode == 'warning':
+ logger.warning(lmessage)
+ elif mode == 'info':
+ logger.info(lmessage)
+ elif mode == 'exception':
+ logger.exception(lmessage)
+ else:
+ logger.error(lmessage)
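+
+# Example (a minimal sketch; file name and messages are hypothetical):
+#
+#   import logging
+#   logging.basicConfig(filename="run.log", level=logging.DEBUG)
+#   log = logging.getLogger(__name__)
+#   print_write(log, 'info', 'mapping started')     # printed and logged
+#   write(log, 'warning', 'low coverage detected')  # logged only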
diff --git a/example.config b/example.config
new file mode 100644
index 0000000..b3d7106
--- /dev/null
+++ b/example.config
@@ -0,0 +1,3 @@
+--samtools=/usr/bin/
+--bwa=/usr/bin/
+--bowtie2=/usr/bin/
diff --git a/example.json b/example.json
new file mode 100755
index 0000000..cbb13a0
--- /dev/null
+++ b/example.json
@@ -0,0 +1,8 @@
+{
+ "mapper": "bwa-mem",
+ "reference": ["/path/to/reference", "/path/to/another/reference"],
+ "read1": "/path/to/read1",
+ "output": "/path/to/output/folder",
+ "options": "\"-M\"",
+ "noreport": true
+}
diff --git a/extract_genbank.py b/extract_genbank.py
new file mode 100755
index 0000000..0cccc4e
--- /dev/null
+++ b/extract_genbank.py
@@ -0,0 +1,52 @@
+#!/usr/bin/python3
+def extract_name(infile):
+ definition="UNKNOWN"
+ accession="UNKNOWN"
+ lines = 0
+ for line in infile:
+ if line.startswith("DEFINITION"):
+ definition = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
+ if line.startswith("ACCESSION"):
+ accession = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
+ lines += 1
+ if accession != "UNKNOWN" and definition != "UNKNOWN":
+ break
+ if lines > 4:
+ break
+ return accession + " " + definition
+
+def extract_sequence(infile):
+ res = []
+ numline = 0
+ for line in infile:
+ if line.startswith("ORIGIN"):
+ break
+ for line in infile:
+ if line[0] == "/":
+ seq = "".join(res)
+ return "".join(seq)
+ res += line.strip().split(" ")[1:]
+ numline += 1
+
+def write_sequence(outfile, seq, name, splitlen):
+ with open(outfile, "w") as f:
+ f.write(">" + name + "\n")
+ pos = 0
+ while pos < len(seq):
+ f.write(seq[pos:pos+splitlen])
+ pos += splitlen
+ f.write("\n")
+
+def main(infile, fastafile):
+ f = open(infile, "r")
+ linelength = 80
+ name = extract_name(f)
+ seq = extract_sequence(f)
+ f.close()
+ write_sequence(fastafile, seq, name, linelength)
+ return fastafile
+
+
+
diff --git a/fux_filing.py b/fux_filing.py
new file mode 100755
index 0000000..e6338a6
--- /dev/null
+++ b/fux_filing.py
@@ -0,0 +1,1079 @@
+#!/usr/bin/env python3
+#copyright Stephan Fuchs (RKI, FG13, FuchsS at rki.de)
+import os
+import hashlib
+import uuid
+import re
+import subprocess
+import random
+import sys
+import time
+
+#############################################################################################
+
+class file():
+ '''generic file object:
+ CHILDREN
+ fastafile standard fasta file object
+ linfastafile linearized fasta file object
+ fastqfile standard fastq file object
+ pairedfastqfiles combines two standard fastq file objects containing paired reads
+ logfile log file object (as I use it)
+
+ PARAMETER
+ name: filename including relative or absolute path [False]. If false, unique
+ filename not existing is generated
+ tmp: tmp status[False]. If true, file is handled as temporary file.
+ That means, file is deleted on exit
+ mode: mode of file opening. If mode is false and file exists, mode r
+ (reading from beginning) is set by default. If mode is false and file does
+ not exists or name is false mode is set to w (create and write) by default.
+ descr: file description [False]
+
+ ATTRIBUTES:
+ descr: file description [STRING]
+ self.name: file name without path [STRING]
+ self.path: absolute path to file [STRING]
+ self.fullname: absolute path and filename [STRING]
+ self.handle: file handle [FILE RESOURCE]
+ self.mode: mode of file opening. If false, file is not open [STRING|False]
+ self._tmp: temporary file [BOOL]
+
+ NON-MAGIC METHODS:
+ setFilelocation() extracts name and path from given filename and sets self.name,
+ self.path, and self.fullname attributes
+ openFile() opens file handle with the given mode and sets self.mode attribute
+ reopenFile() reopens file handle with the given mode. Modes w or w+ are replaced
+ by a and a+, respectively, to avoid file truncation. Updates
+ self.mode attribute.
+ Dependency: self.openFile()
+ closeFile() closes file handle and sets self.mode to false
+ nextLine() returns next line from file
+ changeMode() changes file opening mode. By default cursor position is not stored
+ but set to the file start and truncation of the file avoided. Updates
+ self.mode attribute.
+ Dependency: self.openFile(), self.closeFile()
+ renameFile() renames file on disk and keeps opening mode and state. Only modes
+ w or w+ are replaced by a and a+, respectively, to avoid file
+ truncation. By default cursor position is not stored but set to the
+ file start. If path is set, the file is moved to the new path.
+ Updates self.name, self.path, and self.fullname attributes.
+ Dependency: self.reopenFile(), self.setFilelocation()
+ setTmp() file is handled as temporary file. That means, file is deleted on exit.
+ Filename is extended by .tmp extension. Opening Mode and cursor
+ position are restored. Modes w or w+ are replaced by a and a+, respectively,
+ to avoid file truncation. Attributes self.name, self.path, self.fullname,
+ self.mode, self._tmp are updated.
+ Dependency: renameFile(), setFilelocation()
+ unsetTmp() file handled as non-temporary file. If file extension is .tmp extension is
+ removed. Opening Mode and cursor position are restored. Modes w or w+
+ are replaced by a and a+, respectively, to avoid file truncation.
+ Attributes self.name, self.path, self.fullname, self.mode, self._tmp
+ are updated.
+ Dependency: renameFile(), setFilelocation()
+ checkFile() [static] checks if file exists. Only parameter is file name including path.
+ returns True or False.
+ uniqueFilename() [static] returns a non-existent 8 chars long filename based on uuid4.
+ If path parameter is not submitted, the actual path is selected.
+ fuseFiles() [class] fuses files and returns the full name of the fused file (filename + path).
+ Files are submitted as list. Name and Path of the fused file can
+ be also submitted.
+ destroy() destroys file object by closing file handle and in case of temporary
+ file deleting file from disk.
+ Dependency: self.closeFile()
+ write() write to file
+ flush() write buffer to file
+ '''
+
+ def __init__(self, name = False, tmp = False, mode = False, descr = False):
+ self._tmp = tmp
+ self.descr = descr
+
+ if name == False:
+ name = self.uniqueFilename()
+ if self._tmp:
+ name += '.tmp'
+
+ self.setFilelocation(name)
+
+ if mode:
+ self._mode = mode
+ else:
+ if self.checkFile(self.fullname):
+ self._mode='r'
+ else:
+ self._mode='w'
+ if self._mode:
+ self.openFile(self._mode)
+
+ def __enter__(self):
+ return self
+
+ def setFilelocation(self, fullname = False):
+ '''sets or updates name, path, full name attributes'''
+ if fullname:
+ self.path, self.name = os.path.split(fullname)
+ self.path = os.path.abspath(self.path)
+ self.fullname = os.path.abspath(os.path.join(self.path, self.name))
+
+ def openFile(self, mode):
+ '''opens file handle'''
+ self._mode = mode
+ self.handle = open(self.fullname, self._mode)
+
+ def reopenFile(self, mode):
+ '''reopens file handle. Modes w or w+ are replaced by a and a+, respectively, to avoid file truncation'''
+ self._mode = mode
+ if mode == 'w':
+ mode = 'a'
+ elif mode == 'w+':
+ mode = 'a+'
+ self.handle = open(self.fullname, mode)
+
+ def closeFile(self):
+ '''closes file handle'''
+ if self.handle.closed == False:
+ self._mode = False
+ self.handle.close()
+
+ def nextLine(self):
+ '''returns next line from file. Does not check if file is open for reading!'''
+ return self.handle.readline()
+
+ def changeMode(self, mode, keepCursor = False, truncate = False):
+ '''
+ changes the mode the file is opened.
+ Parameter:
+ mode new mode
+ keepCursor Pointer within the file is kept. If False (default) cursor is set to beginning of the file.
+ truncate if False (default) truncating modes (w, w+) are replaced by appending modes (a, a+)
+ '''
+ cursor = 0
+ if truncate == False:
+ if mode == 'w':
+ mode = 'a'
+ elif mode == 'w+':
+ mode = 'a+'
+ if self.handle.closed == False:
+ if keepCursor:
+ cursor = self.handle.tell()
+ self.closeFile()
+ if mode:
+ self.openFile(mode)
+ if cursor > 0:
+ self.handle.seek(cursor)
+
+ def renameFile(self, newfilename, path = False, keepCursor = False):
+ ''' Renames a file. File is opened again if it was open (mode is restored). If keepCursor is True the cursor is set to the last cursor position within the file.'''
+ if path == False:
+ path = self.path
+ cursor = 0
+ mode = self._mode
+ if self.handle.closed == False:
+ if keepCursor:
+ cursor = self.handle.tell()
+ self.closeFile()
+ newfullname = os.path.join(path, newfilename)
+ if self.checkFile(self.fullname):
+ os.rename(self.fullname, newfullname)
+ self.setFilelocation(newfullname)
+ if mode:
+ self.reopenFile(mode)
+ if cursor > 0:
+ self.handle.seek(cursor)
+
+ def setTmp(self):
+ '''Set fileObject tmp attribute True'''
+ if self._tmp == False:
+ if self.name[-4:] != '.tmp':
+ newfilename = self.name + '.tmp'
+ self.renameFile(newfilename, keepCursor = True)
+ self.name = newfilename
+ self.setFilelocation()
+ self._tmp = True
+
+ def unsetTmp(self):
+ '''Set fileObject tmp attribute False'''
+ if self._tmp == True:
+ if self.name[-4:] == '.tmp':
+ newfilename = self.name[:-4]
+ self.renameFile(newfilename, keepCursor = True)
+ self.name = newfilename
+ self.setFilelocation()
+ self._tmp = False
+
+ def write(self, str):
+ '''write to file'''
+ self.handle.write(str)
+
+ def flush(self):
+ '''write buffer to file'''
+ self.handle.flush()
+ os.fsync(self.handle.fileno())
+
+ @staticmethod
+ def checkFile(fullname):
+ '''checks if file exists. Only parameter is file name including path. Returns True or False.'''
+ return os.path.isfile(fullname)
+
+ @staticmethod
+ def uniqueFilename(path=os.getcwd(), ext = ''):
+ '''provides a filename based on uuid not in use. If no path parameter is used, the current working dir is used. Returns proposed filename including path as string.'''
+ if ext != '':
+ ext = '.' + ext
+ while True:
+ fullname = os.path.join(path, str(uuid.uuid4())[:8]) + ext
+ if file.checkFile(fullname) == False:
+ break
+ return fullname
+
+ @classmethod
+ def fuseFiles(cls, childs, name=False, path=os.getcwd()):
+ '''
+ Creates a fused file. Returns a full filename (including path) of fused file.
+ Parameter:
+ childs list of file names (including paths) whose content should be fused
+ name filename of fused file (including path). File will be overwritten! If false, a unique filename is created
+ path path for fused file (default: current working dir).
+ '''
+ if name == False:
+ fullname = cls.uniqueFilename(path)
+ else:
+ fullname = os.path.join(path, name)
+
+ with open(fullname, "wb") as outfile:
+ for child in childs:
+ with open(child, "rb") as infile:
+ while True:
+ content = infile.read(100000000)
+ if not content:
+ break
+ outfile.write(content)
+ outfile.write(b'\n')
+ return fullname
+
+
+ def destroy(self):
+ if self.handle.closed == False:
+ self.closeFile()
+ if self._tmp:
+ try:
+ os.remove(self.fullname)
+ except:
+ pass
+
+ def __exit__(self, type, value, traceback):
+ self.destroy()
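+
+# Example (a minimal sketch; the file name is hypothetical):
+#   with file(name='scratch.txt', tmp=True, mode='w') as f:
+#       f.write('hello\n')
+#       f.flush()
+#   # on exit the handle is closed and, because tmp=True, the file is deleted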
+
+#############################################################################################
+
+class fastafile(file):
+ '''standard fasta file object
+ PARENTS
+ file generic file object
+
+ PARAMETER:
+ name: filename including relative or absolute path [False]. If false, unique
+ filename not existing is generated
+ tmp: tmp status[False]. If true, file is handled as temporary file.
+ That means, file is deleted on exit
+ mode: mode of file opening. If mode is false and file exists, mode r
+ (reading from beginning) is set by default. If mode is false and file does
+ not exists or name is false mode is set to w (create and write) by default.
+ descr: file description [False]
+
+ ATTRIBUTES:
+ self._number_header Number of fasta head lines in the file*
+ self._number_singleheader Number of fasta single headers in the file*
+ Fasta head lines can contain multiple headers separated
+ by SOH (chr(1))*
+ self._number_comments Number of comment lines in the file*
+ self.number_accessions Number accessions in the file*
+ self._accessions set of accessions in the file*
+
+ * to avoid unnecessary (and sometimes time consuming) parsing, attributes are generated if
+ needed the first time
+
+ NON-MAGIC METHODS:
+ getHeaderNumber() returns number of fasta headers in the file
+ getSingleHeaderNumber() returns number of single headers in the file
+ getCommentNumber() returns number of comment lines in the file
+ screenFASTA() calculates number of headers, single headers, and comments in the file.
+ For this opening mode is changed to r and cursor set to the beginning of
+ the file. Importantly, original opening mode and cursor positions are
+ restored after actions. However, original modes w or w+ are replaced
+ by a and a+, respectively, to avoid file truncation. Updates
+ self._number_header, self._number_singleheader, and self._number_comments.
+ No return value.
+ Dependency: self.changeMode(), self.isComment(), self.isHeader()
+ nextEntry() Returns next entry of FASTA file as tuple (header, seq, comment). Cursor
+ is set immediatly before the following entry
+ Dependency: self.isComment(), self.isHeader
+ isHeader() returns true if submitted string is a valid fasta header, otherwise false.
+ isComment() returns true if submitted string is a fasta comment, otherwise false.
+ makeNR() fuses fasta entries with the same full-length sequence to one entry with
+ a multiple header. Returns a fasta file object pointing to the generated NR
+ (non-redundant) file.
+ Dependency: fastafile(), linfastafile(), self.linearize(), self.changeMode()
+ linearize() generates a linearized representation of a fasta file. Each line represents an
+ entry organized in columns: header, seq, comment, seqlen (optionally)
+ Separator of columns is \t by default and masked in headers and comments using
+ space by default. Returns a non-temporary linfastafile object.
+ Dependency: self.changeMode()
+ getSeqHash() [static] returns an UTF-8 encoded MD5 hash of a given sequence (string)
+ getAccessions() screens file for accessions of type: accession_prefix|accession_number
+ Accession prefixes have to be submitted as a list. For this the opening mode is changed
+ to r and cursor set to the beginning of the file. Importantly, original opening mode
+ and cursor positions are restored after actions. However, original modes w or w+
+ are replaced by a and a+, respectively, to avoid file truncation. Updates
+ self.number_accessions, self._accessions
+ Returns the set of accessions.
+ '''
+
+ def __init__(self, name = False, tmp = False, mode = False, descr = False):
+ file.__init__(self, name = name, tmp = tmp, mode = mode, descr = descr)
+ self._number_header = False
+ self._number_singleheader = False
+ self._number_comments = False
+ self.number_accessions = False
+ self._accessions = False
+
+ def getHeaderNumber(self):
+ if self._number_header == False:
+ self.screenFASTA()
+ return self._number_header
+
+ def getSingleHeaderNumber(self):
+ if self._number_singleheader == False:
+ self.screenFASTA()
+ return self._number_singleheader
+
+ def getCommentNumber(self):
+ if self._number_comments == False:
+ self.screenFASTA()
+ return self._number_comments
+
+ def screenFASTA(self):
+ '''provides number of entries and single header as attributes'''
+ prevmode = self._mode
+ cursor = self.handle.tell()
+ self.changeMode('r')
+ self._number_header = 0
+ self._number_singleheader = 0
+ self._number_comments = 0
+ for line in self.handle:
+ if self.isHeader(line):
+ self._number_header += 1
+ self._number_singleheader += 1 + line.count(chr(1))
+ elif self.isComment(line):
+ self._number_comments += 1
+ self.changeMode(prevmode)
+ self.handle.seek(cursor)
+
+ def nextEntry(self):
+ '''Returns next entry of FASTA file as tuple (header, seq, comment). File pointer is set immediately before the following entry'''
+ e = 0
+ seq = []
+ comment=[]
+ header = False
+ while e < 2:
+ cursor = self.handle.tell()
+ #read line
+ line = self.nextLine()
+ if not line:
+ break
+ line = line.strip()
+ #header detection
+ if self.isHeader(line):
+ e += 1
+ # entry start
+ if e == 1:
+ header = line
+ #entry end
+ else:
+ self.handle.seek(cursor)
+ break
+ elif self.isComment(line):
+ comment.append(line)
+ else:
+ seq.append(line)
+ if header == False:
+ return False
+ else:
+ return (header, ''.join(seq), ' '.join(comment))
+
+ def isHeader(self, str):
+ #str = str.lstrip()
+ if len(str) > 0 and str[0] == '>':
+ return True
+ else:
+ return False
+
+ def isComment(self, str):
+ #str = str.lstrip()
+ if len(str) > 0 and str[0] == ';':
+ return True
+ else:
+ return False
+
+ def makeNR(self, name=False, path=os.getcwd(), format = 'fasta'):
+ #linearize and add seqlen + seqhash
+ with self.linearize(addSeqLen=True) as linFile:
+ linFile.setTmp()
+ #sort by length and seq
+ with linfastafile(tmp = True, mode = False) as sortedFile:
+ cmd = ['sort', '-k', '4,2', '-t', '\t', '-o', sortedFile.fullname, linFile.fullname] #linux
+ #cmd = ['S:\\cygwin64\\bin\\sort.exe', '-k', '4,2', '-t', ' ', '-o', sortedFile.fullname, linFile.fullname] #cygwin (development leftover, do not enable in production)
+ #print (" ".join(cmd))
+ subprocess.check_call(cmd)
+
+ #compare seqs in sorted file and make NR fasta
+ if format == 'fasta':
+ nrfile = fastafile()
+ else:
+ nrfile = linfastafile()
+
+ sortedFile.changeMode('r')
+ length = -1
+ seq = ''
+ header = []
+ for line in sortedFile.handle:
+ line = line.strip()
+ if len(line) > 0:
+ fields = line.split('\t')
+ if int(fields[3]) == length and fields[1] == seq:
+ header.extend(fields[0][1:].split(chr(1)))
+ else:
+ if len(header) > 0:
+ nrfile.handle.write(">" + chr(1).join(set(header)) + '\n' + seq + '\n')
+ header = fields[0][1:].split(chr(1))
+ seq = fields[1].strip()
+ length = int(fields[3])
+ nrfile.handle.write(">" + chr(1).join(set(header)) + '\n' + seq + '\n')
+ return nrfile
+
+
+ def linearize(self, name = False, sep = '\t', sepmask = ' ', addSeqLen = False):
+ '''
+ Saves a linearized copy of a FASTA file and returns an associated linfastafile Object.
+ Options:
+ sep separator used between columns of the linearized file (by default: \t)
+ sepmask replacement of naturally occurring separators (by default: space)
+ addSeqLen Sequence length is added to file
+ Output:
+ linfastafile Object point to a text file with different columns
+ col1 header
+ col2 sequence
+ col3 comment
+ col4 sequence length (if addSeqLen = True)
+ '''
+ prevmode = self._mode
+ cursor = self.handle.tell()
+ self.changeMode('r')
+ out = []
+ i = 0
+
+ with linfastafile(name, sep, sepmask, tmp=False) as linfile:
+ while True:
+ i += 1
+ entry = self.nextEntry()
+ if entry == False:
+ break
+ entry=[x.replace(sep, sepmask) for x in list(entry)]
+ if addSeqLen:
+ entry.append(str(len(entry[1])))
+ out.append(sep.join(entry))
+ if i >= 1000000:
+ linfile.handle.write('\n'.join(out) + '\n')
+ out = []
+ i = 0
+ linfile.handle.write('\n'.join(out) + '\n')
+ self.changeMode(prevmode)
+ self.handle.seek(cursor)
+ linfile.closeFile()
+ return linfile
+
+ @staticmethod
+ def getSeqHash(seq):
+ '''returns MD5 hash of sequence '''
+ return hashlib.md5(seq.encode('utf-8')).hexdigest()
+
+
+ def getAccessions(self, actypes):
+ if self._accessions:
+ return self._accessions
+ self._accessions = set()
+ prevmode = self._mode
+ cursor = self.handle.tell()
+ self.changeMode('r')
+ self.number_accessions = 0
+ regexp = r"(?:>|" + chr(1) + "|\|)(?:" + "|".join(actypes) + ")\|[^|\r\n ]+"
+ pattern = re.compile(regexp)
+ for line in self.handle:
+ line = line.strip()
+ if self.isHeader(line):
+ matches = re.findall(pattern, line)
+ if matches:
+ self._accessions.update(matches)
+ self.number_accessions = len(self._accessions)
+ self.changeMode(prevmode)
+ self.handle.seek(cursor)
+ return self._accessions
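+
+# Example (a minimal sketch; the file name is hypothetical):
+#   with fastafile('references.fasta') as ff:
+#       print(ff.getHeaderNumber())  # number of fasta header lines
+#       entry = ff.nextEntry()       # (header, seq, comment) tuple, or False at EOF
+#       nr = ff.makeNR()             # fastafile object with non-redundant entries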
+
+#############################################################################################
+
+class linfastafile(file):
+ '''linearized fasta file object
+ these files are separator-delimited representations of FASTA files useful for parsing
+ each line represents one entry
+
+ PARENTS
+ file generic file object
+
+ PARAMETER:
+ name: filename including relative or absolute path [False]. If false, unique
+ filename not existing is generated
+ tmp: tmp status[False]. If true, file is handled as temporary file.
+ That means, file is deleted on exit
+ mode: mode of file opening. If mode is false and file exists, mode r
+ (reading from beginning) is set by default. If mode is false and file does
+ not exists or name is false mode is set to w (create and write) by default.
+ descr: file description [False]
+ sep: separator of columns [CHAR]
+ sepmask: char used to mask separator in headers or comments [CHAR]. Must be different from sep
+
+
+ ATTRIBUTES:
+ self._number_header Number of fasta head lines in the file*
+ self._number_singleheader Number of fasta single headers in the file*
+ Fasta head lines can contain multiple headers separated
+ by SOH (chr(1))*
+ self._number_comments Number of comment lines in the file*
+ self._number_accessions Number accessions in the file*
+ self._accessions set of accessions in the file*
+ self.number_seq Number of sequences*
+
+ * to avoid unnecessary (and sometimes time consuming) parsing, attributes are generated if
+ needed the first time
+
+ NON-MAGIC METHODS:
+ initialize() cursor is set to position 0
+ attributes self._number_header, self._number_seq, self._number_singleheader,
+ self._number_comments are set to False
+ getHeaderNumber() returns number of fasta headers in the file
+ Dependency: self.screenLinFASTA()
+ getSingleHeaderNumber() returns number of single headers in the file
+ Dependency: self.screenLinFASTA()
+ getCommentNumber() returns number of comments in the file
+ Dependency: self.screenLinFASTA()
+ getSeqNumber() returns number of sequences in the file
+ Dependency: self.screenLinFASTA()
+ screenLinFASTA() calculates number of headers, single headers, and comments in the file.
+ For this opening mode is changed to r and cursor set to the beginning of
+ the file. Importantly, original opening mode and cursor positions are
+ restored after actions. However, original modes w or w+ are replaced
+ by a and a+, respectively, to avoid file truncation. Updates
+ self._number_header, self._number_singleheader, and self._number_comments.
+ No return value.
+ Dependency: self.changeMode()
+ getAccessions() screens file for accessions of type: accession_prefix|accession_number
+ Accession prefixes have to be submitted as a list. For this the opening mode is changed
+ to r and cursor set to the beginning of the file. Importantly, original opening mode
+ and cursor positions are restored after actions. However, original modes w or w+
+ are replaced by a and a+, respectively, to avoid file truncation. Updates
+ self.number_accessions, self._accessions
+ Returns the set of accessions.
+ '''
+
+ def __init__(self, name = False, sep = '\t', sepmask = ' ', tmp=False, mode = False, descr = False):
+ file.__init__(self, name=name, tmp=tmp, mode=mode, descr = descr)
+ self._sep = sep
+ self._sepmask = sepmask
+ self._accessions = False
+ self.number_accessions = False
+
+ self.initialize()
+
+ def initialize(self):
+ '''sets cursor to position 0 and invalidates cached counts'''
+ self.handle.seek(0)
+ self._number_header = False
+ self._number_seq = False
+ self._number_singleheader = False
+ self._number_comments = False
+
+ def getHeaderNumber(self):
+ if self._number_header == False:
+ self.screenLinFASTA()
+ return self._number_header
+
+ def getSingleHeaderNumber(self):
+ if self._number_singleheader == False:
+ self.screenLinFASTA()
+ return self._number_singleheader
+
+ def getCommentNumber(self):
+ if self._number_comments == False:
+ self.screenLinFASTA()
+ return self._number_comments
+
+ def getSeqNumber(self):
+ if self._number_seq == False:
+ self.screenLinFASTA()
+ return self._number_seq
+
+ def screenLinFASTA(self):
+ '''provides number of entries and single header as attributes'''
+ prevmode = self._mode
+ cursor = self.handle.tell()
+ self.changeMode('r')
+ self._number_header = 0
+ self._number_seq = 0
+ self._number_singleheader = 0
+ self._number_comments = 0
+ for line in self.handle:
+ if len(line) > 0:
+ fields = [x.strip() for x in line.split(self._sep)]
+ if len(fields[0]) > 0:
+ self._number_header += 1
+ self._number_singleheader += 1 + fields[0].count(chr(1))
+ if len(fields[1]) > 0:
+ self._number_seq += 1
+ if len(fields[2]) > 0:
+ self._number_comments += 1
+ self.changeMode(prevmode)
+ self.handle.seek(cursor)
+
+
+ def getAccessions(self, actypes):
+ if self._accessions:
+ return self._accessions
+ self._accessions = set()
+ filepointer = self.handle.tell()
+ self.handle.seek(0)
+ self.number_accessions = 0
+ regexp = r"(?:>|" + chr(1) + "|\|)(?:" + "|".join(actypes) + ")\|[^|\r\n ]+"
+ pattern = re.compile(regexp)
+ for line in self.handle:
+ matches = re.findall(pattern, line.split(self._sep)[0])
+ if matches:
+ self._accessions.update(matches)
+ self.handle.seek(filepointer)
+ self.number_accessions = len(self._accessions)
+ return self._accessions
+
+#############################################################################################
+
+class fastqfile(file):
+ '''standard fastq file object
+
+ PARENTS
+ file generic file object
+
+ PARAMETER:
+ name: filename including relative or absolute path [False]. If false, unique
+ filename not existing is generated
+ tmp: tmp status[False]. If true, file is handled as temporary file.
+ That means, file is deleted on exit
+ mode: mode of file opening. If mode is false and file exists, mode r
+ (reading from beginning) is set by default. If mode is false and file does
+ not exists or name is false mode is set to w (create and write) by default.
+ descr: file description [False]
+
+
+ ATTRIBUTES:
+ self._number_reads number of reads in the file
+
+ * to avoid unnecessary (and sometimes time consuming) parsing, attributes are generated if
+ needed the first time
+
+ NON-MAGIC METHODS:
+ initialize() cursor is set to position 0
+ attribute self._number_reads is set to False
+ getReadNumber() returns number of reads in the file
+ Dependency: self.screenFastq()
+ checkIdentifierLine() returns true if given string starts with @
+ checkDescriptionLine() returns true if given string starts with +
+ screenFastq() calculates the number of reads in the file.
+ For this opening mode is changed to r and cursor set to the beginning of
+ the file. Importantly, original opening mode and cursor positions are
+ restored after actions. However, original modes w or w+ are replaced
+ by a and a+, respectively, to avoid file truncation. Updates
+ self._number_reads.
+ No return value. Can exit and print an exception message if the standard format is
+ violated.
+ Dependency: self.changeMode()
+ nextRead() Returns next read of FASTQ file as tuple (identifier, sequence, description, quality scores).
+ File pointer is set immediately before the following entry
+ Dependency: self.nextLine()
+ randomSubset() generates a fastq file with a random read subset. Proportion [0, 1] can be defined.
+ Returns a new fastq file object.
+ Dependency: self.nextRead(), self.getReadNumber()
+ '''
+
+ def __init__(self, name = False, tmp=False, mode = False, descr = False):
+ file.__init__(self, name=name, tmp=tmp, mode=mode, descr = descr)
+ self.initialize()
+
+ def initialize(self):
+ '''sets cursor to position 0 and invalidates the cached read count'''
+ self.handle.seek(0)
+ self._number_reads = False
+
+ def getReadNumber(self):
+ '''returns number of reads stored in the fastq file'''
+ if self._number_reads == False:
+ self.screenFastq()
+ return self._number_reads
+
+ def checkIdentifierLine(self, line):
+ if line[0] == "@":
+ return True
+ else:
+ return False
+
+ def checkDescriptionLine(self, line):
+ if line[0] == "+":
+ return True
+ else:
+ return False
+
+ def screenFastq(self):
+ '''provides number of reads as attributes'''
+ prevmode = self._mode
+ cursor = self.handle.tell()
+ self.changeMode('r')
+ self._number_reads = 0
+ l = 0
+ while True:
+ identifier = self.handle.readline()
+ l += 1
+ if not identifier:
+ break
+ if identifier == "":
+ continue
+ if self.checkIdentifierLine(identifier) == False:
+ print("FORMAT EXCEPTION in line", l, ": identifier lines should start with @")
+ self._number_reads = False
+ exit(1)
+ self.nextLine()
+ l += 1
+ if self.checkDescriptionLine(self.nextLine()) == False:
+ print("FORMAT EXCEPTION in line", l , ": description lines should start with +")
+ self._number_reads = False
+ exit(1)
+ l += 1
+ if self.nextLine() == "":
+ print("FORMAT EXCEPTION in line", l , ": empty score line")
+ self._number_reads = False
+ exit(1)
+ l += 1
+ self._number_reads += 1
+ self.changeMode(prevmode)
+ self.handle.seek(cursor)
+
+ def nextRead(self):
+ '''Returns next read of FASTQ file as tuple (identifier, sequence, description, quality scores). File pointer is set immediately before the following entry'''
+ line = self.nextLine()
+ if not line:
+ return False
+ if line.strip() == "":
+ return self.nextRead()
+ if self.checkIdentifierLine(line) == False:
+ print("FORMAT EXCEPTION: identifier lines should start with @")
+ exit(1)
+ identifier = line
+
+ line = self.nextLine()
+ if line.strip() == '':
+ print("FORMAT EXCEPTION: no read sequence found")
+ exit(1)
+ seq = line
+
+ line = self.nextLine()
+ if self.checkDescriptionLine(line) == False:
+ print("FORMAT EXCEPTION: description lines should start with +")
+ exit(1)
+ descr = line
+
+ line = self.nextLine()
+ if line.strip() == '':
+ print("FORMAT EXCEPTION: no quality scores found")
+ exit(1)
+ scores = line
+
+ return (identifier, seq, descr, scores)
+
+ def randomSubset(self, prop, tmp = False, selected = False):
+ '''generates a fastq file with a random read subset. Proportion [0, 1] can be defined. Returns a new fastq file object.'''
+ if selected == False:
+ prop = int(self.getReadNumber()*prop)
+ selected = random.sample(range(1, self.getReadNumber()), prop)
+ selected = set(selected)
+
+ subsetfastqObj = fastqfile(tmp = tmp)
+ e = 1
+ while e <= self.getReadNumber():
+ readdata = self.nextRead()
+ if e in selected:
+ subsetfastqObj.handle.write(''.join(readdata))
+ e += 1
+ return subsetfastqObj
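+
+# Example (a minimal sketch; the file name is hypothetical):
+#   with fastqfile('reads.fastq') as fq:
+#       print(fq.getReadNumber())                # total number of reads
+#       subset = fq.randomSubset(0.1, tmp=True)  # ~10% random read subset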
+
+#############################################################################################
+
+class pairedfastqfiles():
+ '''combines two standard fastq file objects containing paired reads
+ read numbers have to be the same in both files
+
+ PARENTS
+ file generic file object
+
+ PARAMETER:
+ fname: filename of forward read file including relative or absolute path [False]. If false, unique
+ filename not existing is generated
+ rname: filename of reverse read file including relative or absolute path [False]. If false, unique
+ filename not existing is generated
+ tmp: tmp status[False]. If true, files are handled as temporary files.
+ That means, files are deleted on exit
+ mode: mode of file opening. If mode is false and file exists, mode r
+ (reading from beginning) is set by default. If mode is false and file does
+ not exists or fname/rname is false mode is set to w (create and write) by default.
+ descr: file description [False]
+
+
+ ATTRIBUTES:
+ self.forward fastq file object of the forward read file
+ self.reverse fastq file object of the reverse read file
+ self._number_reads read numbers (same for both files)
+
+ NON-MAGIC METHODS:
+ initialize() compares read numbers of paired files. Exception raised if different.
+ Cursor is NOT set to position 0
+ Updates self._number_reads
+ Dependency: self.getReadNumber()
+ attribute self._number_reads is set to False
+ getReadNumber() returns number of reads in the files
+ Dependency: self.forward.getReadNumber(), self.reverse.getReadNumber()
+ randomSubset() generates paired fastq files with random paired read subsets. Proportion [0, 1] can be defined.
+ Returns a new paired fastq file object.
+ Dependency: self.nextRead(), self.getReadNumber()
+ '''
+
+ def __init__(self, fname = False, rname = False, tmp=False, mode = False, descr = False):
+ self.forward = fastqfile(name=fname, tmp=tmp, mode=mode, descr = descr)
+ self.reverse = fastqfile(name=rname, tmp=tmp, mode=mode, descr = descr)
+ self.initialize()
+
+ def __enter__(self):
+ return self
+
+ def initialize(self):
+ '''checks that both files contain the same number of reads'''
+ self.getReadNumber()
+
+ def getReadNumber(self):
+ if self.forward.getReadNumber() != self.reverse.getReadNumber():
+ print('forward file:', self.forward.fullname, '(reads:', self.forward.getReadNumber(),')')
+ print('reverse file:', self.reverse.fullname, '(reads:', self.reverse.getReadNumber(),')')
+ raise ValueError('paired files differ in read numbers')
+ else:
+ self._number_reads = self.forward.getReadNumber()*2
+ return self._number_reads
+
+ def randomSubset(self, prop, tmp = False):
+ '''generates paired fastq files with random read subsets. Proportion [0, 1] can be defined. Returns a new paired fastq file object.'''
+ prop = int(self.getReadNumber()*prop)
+ selected = random.sample(range(1, self.getReadNumber()), prop)
+ forward_subset = self.forward.randomSubset(prop, tmp = tmp, selected = selected)
+ forward_subset.closeFile()
+ reverse_subset = self.reverse.randomSubset(prop, tmp = tmp, selected = selected)
+ reverse_subset.closeFile()
+ subsetPairedfastqObj = pairedfastqfiles(fname = forward_subset.fullname, rname = reverse_subset.fullname, tmp = tmp)
+ return subsetPairedfastqObj
+
+ def setTmp(self):
+ '''Set fileObject tmp attribute True'''
+ self.forward.setTmp()
+ self.reverse.setTmp()
+
+ def unsetTmp(self):
+ '''Set fileObject tmp attribute False'''
+ self.forward.unsetTmp()
+ self.reverse.unsetTmp()
+
+ def __exit__(self, type, value, traceback):
+ self.forward.destroy()
+ self.reverse.destroy()
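+
+# Example (a minimal sketch; file names are hypothetical):
+#   with pairedfastqfiles('r1_P_sample.fastq', 'r2_P_sample.fastq') as pair:
+#       sub = pair.randomSubset(0.25)  # same random selection applied to both mates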
+
+
+
+#############################################################################################
+
+class logfile(file):
+ '''log file object (as I use it)
+
+ PARENTS
+ file generic file object
+
+ ATTRIBUTES
+ catlineChar Char or string used as category underline [STRING]
+ catlineLen Length of category underlining [INT]
+
+ PARAMETER:
+ name: filename including relative or absolute path [False]. If false, unique
+ filename not existing is generated
+ tmp: tmp status[False]. If true, file is handled as temporary file.
+ That means, file is deleted on exit
+ mode: mode of file opening. If mode is false and file exists, mode r
+ (reading from beginning) is set by default. If mode is false and file does
+ not exists or name is false mode is set to w (create and write) by default.
+ descr: file description [False]
+
+
+ ATTRIBUTES:
+ self.catlineLen length of category underlining
+ self.catlineChar char used for category underlining
+
+ NON-MAGIC METHODS:
+
+ writeAsCols() creates columns with individual width, text is left aligned, numbers
+ right aligned. Formatted string is written to log file. Input is a 2-level
+ list (1st level: lines, 2nd level: columns). Returns no value.
+ newCat() writes a new category with a given Name to the log file incl. underlining
+ (s. attributes)
+ lb() writes a linebreak to logfile. For compatibility reasons windows line
+ breaks (\r\n) are used
+ Dependency: self.write()
+ addExecTime() adds the execution time in hh:mm:ss to log file. Seconds have to be given.
+ Dependency: self.write(), self.lb()
+
+ '''
+
+ def __init__(self, name = False, tmp=False, mode = False, descr = False):
+ file.__init__(self, name=name, tmp=tmp, mode=mode, descr = descr)
+ self.catlineLen = 30
+ self.catlineChar = "="
+
+ def writeAsCols (self, input, sep = ' ' * 5):
+ '''create columns with individual width
+
+ PARAMETER
+ sep separator of columns [STRING]
+ '''
+ #allowed signs in numbers
+ pattern = re.compile(r'([+-])?([0-9]+)([.,]?)([0-9]+)(%?)')
+
+ #str conversion
+ lines = [[str(x) for x in line] for line in input]
+
+ #fill up rows to same number of columns (important for transposing)
+ maxlen = max([len(x) for x in lines])
+ [x.extend(['']*(maxlen-len(x))) for x in lines]
+
+ #find format parameters
+ width = [] #colum width or length
+ align = [] #alignment type (number = r; strings = l)
+ prec = [] #number precision
+ for column in zip(*lines):
+ width.append(len(column[0]))
+ prec.append(0)
+ align.append('r')
+ for field in column[1:]:
+ if align[-1] == 'l':
+ width[-1] = max(width[-1], len(field))
+ else:
+ match = pattern.match(field)
+ if match and match.group(0) == field:
+ if match.group(3) and match.group(4):
+ prec[-1] = max(prec[-1], len(match.group(4)))
+
+ if prec[-1] > 0:
+ k = 1
+ else:
+ k = 0
+ width[-1] = max(width[-1], len(match.group(2)) + prec[-1] + k + len(match.group(4)))
+ else:
+ align[-1] = 'l'
+ prec[-1] = False
+ width[-1] = max(width[-1], len(field))
+
+ #formatting
+ output = []
+ for line in lines:
+ f = 0
+ output.append([])
+ for field in line:
+ if align[f] == 'l':
+ output[-1].append(field + ' ' * (width[f] - len(field)))
+ elif align[f] == 'r':
+ match = pattern.match(field)
+ if not match or match.group(0) != field:
+ output[-1].append(' ' * (width[f] - len(field)) + field)
+ else:
+ if match.group(5):
+ percent = match.group(5)
+ else:
+ percent = ''
+ length = len(percent)
+
+ intpart = match.group(2)
+ if match.group(1):
+ intpart = match.group(1) + intpart
+ if match.group(3):
+ decpart = match.group(4)
+ else:
+ intpart += match.group(4)
+ decpart = ''
+ length += len(intpart)
+
+ #print('match:', match.group(0))
+ #print('intpart:', intpart)
+ #print('decpart:', decpart)
+ #print('prec:', prec[f])
+
+ if prec[f] > 0:
+ decpart += '0' * (prec[f] - len(match.group(4)))
+ length += len(decpart) + 1
+ formatted_number = ' ' * (width[f] - length) + intpart + '.' + decpart + percent
+ else:
+ formatted_number = ' ' * (width[f] - length) + intpart + percent
+ output[-1].append(formatted_number)
+ f += 1
+ self.write('\r\n'.join([sep.join(x) for x in output]))
+
+ def newCat(self, cat):
+ if self.catlineLen < len(cat):
+ catline = self.catlineChar * len(cat)
+ else:
+ catline = self.catlineChar * self.catlineLen
+ self.write('\r\n\r\n' + cat.upper() + '\r\n' + catline + '\r\n')
+
+ def lb(self):
+ self.write('\r\n')
+
+ def addExecTime(self, s):
+ self.lb()
+ self.lb()
+ self.write('--------------------------')
+ self.lb()
+ self.write('execution time (hh:mm:ss): ')
+ self.write('{:02.0f}:{:02.0f}:{:02.0f}'.format(s//3600, s%3600//60, s%60))
+
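+# Illustrative usage sketch (hypothetical: `logf` stands in for an instance of
+# the log-file class defined above):
+#
+# logf.newCat('mapping statistics') # underlined category header
+# logf.writeAsCols([['reads', 'mapped', 'rate'],
+# ['10000', '9500', '95.0%']]) # aligned columns
+# logf.addExecTime(3725) # writes "execution time (hh:mm:ss): 01:02:05"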
+
+
+
+
+
+
+
+
diff --git a/mapper.py b/mapper.py
new file mode 100755
index 0000000..a34d68c
--- /dev/null
+++ b/mapper.py
@@ -0,0 +1,520 @@
+from abc import ABCMeta, abstractmethod
+import argparse
+import os
+import gzip
+import custom_logger as log
+import re
+import sys
+if sys.version_info < (3, 2):
+ import subprocess32 as sp
+else:
+ import subprocess as sp
+
+
+class Mapper:
+ """
+ This is the abstract class to use when implementing a new mapper.
+ Try to minimize the output in the console, e.g. redirect the output of a call to /dev/null or save it somewhere.
+ Note that you may want to use os.path.expanduser(PATH) to expand e.g. '~' to the home directory.
+ """
+
+ __metaclass__ = ABCMeta
+
+ @abstractmethod
+ def __init__(self, query, reference, output, options=None, logger=None):
+ """
+ Input:
+ query: Path to the query file.
+ Type: str for single-end reads, list of two str for paired-end reads.
+ reference: Path to the reference file.
+ Type: str
+ output: Path to the output file without the file extension (e.g. '.sam').
+ Type: str
+ options: Contains all additional options.
+ Type: list of str, organized like sys.argv
+ logger: Optional log.
+ Type: logging.logger object
+
+ Accessed from outside:
+ statistics: String with mapper statistics.
+ Type: str
+ version: Version of the used mapper.
+ Type: str
+
+ Other:
+ self.options may contain a path where the index should be saved, e.g.:
+ index_location: Expected path to the index, i.e. where it should be saved. Default: '~/tmp/'
+ Type: str
+ index: Real path to the index.
+ Type: str
+
+ and
+ verbosity: May be given in self.options.
+ 1: Report steps.
+ Type: int
+
+ You may add more initialization, e.g. detection of input file types (e.g. .fa or .fastq in Bowtie2).
+
+ Call self.report() at the end.
+ """
+
+ self.query = query
+ self.reference = os.path.expanduser(reference)
+ self.query_paired = len(query) == 2 #isinstance(query, list)
+ self.output = os.path.expanduser(output) # For example, without file extension
+ if self.query_paired:
+ self.query[0] = os.path.expanduser(self.query[0])
+ self.query[1] = os.path.expanduser(self.query[1])
+ else:
+ self.query = os.path.expanduser(self.query[0])
+ self.statistics = None
+ self.version = None
+ self.index = None
+ self.logger = logger
+ self.verbosity = 1
+ # parse options
+ self.report()
+
+ @abstractmethod
+ def map(self):
+ """ Create index if non exists (self.index). Call mapper to map self.query to self.reference using additional options.
+ Automatically detects single- or paired-end reads.
+ Return the absolute path of the location of the generated SAM-file.
+ Return None if an error occurs.
+ If the console output contains statistics (e.g. in Bowtie2), these should be saved in self.statistics."""
+
+ @abstractmethod
+ def get_version(self):
+ """ Return the version. """
+ return self.version
+
+ @abstractmethod
+ def write_unmapped(self, un):
+ """ Write unaligned reads to un. """
+
+ @abstractmethod
+ def get_statistics(self):
+ """ Return the statistics. """
+ return self.statistics
+
+ @abstractmethod
+ def cleanup(self):
+ """ Remove the index. You may want to use 'find' instead of 'rm'."""
+
+ @abstractmethod
+ def report(self):
+ """ Write all meaningful variables (at least the one defined in __init__) and their respective values to the logger. """
+
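+# Minimal sketch of a new mapper (illustrative only; ExampleMapper and its
+# behaviour are hypothetical, not part of this pipeline):
+#
+# class ExampleMapper(Mapper):
+#     def __init__(self, query, reference, output, options=None, logger=None):
+#         Mapper.__init__(self, query, reference, output, options, logger)
+#     def map(self):
+#         # build the index if needed, run the tool via log.call(...),
+#         # return the path to the generated SAM file (None on error)
+#         return self.output
+#     def get_version(self): return self.version
+#     def write_unmapped(self, un): pass
+#     def get_statistics(self): return self.statistics
+#     def cleanup(self): pass
+#     def report(self): pass
+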
+class Bowtie2(Mapper):
+
+ def __init__(self, query, reference, output, tmp=None, options=None, logger=None, readgroups=True, unmapped=None, samtools=None, bowtie2=None):
+ self.query = query
+ self.reference = os.path.expanduser(reference)
+ self.output = os.path.expanduser('{}{}'.format(output,'.sam'))
+ self.reference_fasta = reference.endswith('.fa') or reference.endswith('.mfa')
+ self.samtools = samtools
+ self.bowtie2 = bowtie2
+ self.query_paired = len(query) == 2 #isinstance(query, list)
+ if self.query_paired:
+ self.query[0] = os.path.expanduser(self.query[0])
+ self.query[1] = os.path.expanduser(self.query[1])
+ else:
+ self.query = os.path.expanduser(self.query[0])
+ self.query_fasta = query[0].endswith('.fa') or query[0].endswith('.mfa') #if self.query_paired else query.endswith('.fa') or query.endswith('.mfa')
+ self.statistics = None
+ self.version = None
+ self.index = None
+ self.verbosity = 1
+ self.logger = logger
+ self.readgroups = readgroups
+ self.options = options
+ self.unmapped = unmapped
+ if self.unmapped is not None:
+ if self.query_paired:
+ if self.options is not None:
+ self.options += ' --un-conc {}_{}'.format(self.unmapped, os.path.basename(self.query[0]))
+ else:
+ self.options = ' --un-conc {}_{}'.format(self.unmapped, os.path.basename(self.query[0]))
+ else:
+ if self.options is not None:
+ self.options += ' --un {}_{}'.format(self.unmapped, os.path.basename(self.query))
+ else:
+ self.options = ' --un {}_{}'.format(self.unmapped, os.path.basename(self.query))
+ if tmp is None:
+ self.index_location = os.path.expanduser('~/tmp/{}'.format('.'.join(os.path.basename(reference).split('.')[:-1])))
+ else:
+ self.index_location = os.path.join(tmp, '.'.join(os.path.basename(reference).split('.')[:-1]))
+ try:
+ os.mkdir(os.path.expanduser('~/tmp/'))
+ except OSError:
+ pass
+ self.report()
+ #TODO options may change index_location or verbosity
+
+ def update(self, query, output, options):
+ self.query = query
+ self.output = os.path.expanduser('{}{}'.format(output,'.sam'))
+ self.query_paired = len(query) == 2 #isinstance(query, list)
+ if self.query_paired:
+ self.query[0] = os.path.expanduser(self.query[0])
+ self.query[1] = os.path.expanduser(self.query[1])
+ if self.unmapped is not None:
+ if options is not None:
+ self.options = options + ' --un-conc {}_{}'.format(self.unmapped, os.path.basename(self.query[0]))
+ else:
+ self.options = ' --un-conc {}_{}'.format(self.unmapped, os.path.basename(self.query[0]))
+ else:
+ self.query = os.path.expanduser(self.query[0])
+ if self.unmapped is not None:
+ if options is not None:
+ self.options = options + ' --un {}_{}'.format(self.unmapped, os.path.basename(self.query))
+ else:
+ self.options = ' --un {}_{}'.format(self.unmapped, os.path.basename(self.query))
+ self.query_fasta = query[0].endswith('.fa') or query[0].endswith('.mfa') #if self.query_paired else query.endswith('.fa') or query.endswith('.mfa')
+
+ def create_index(self):
+ if self.options is not None and '-x' in self.options:
+ self.index = None
+ else:
+ try:
+ if self.verbosity >= 1:
+ log.print_write(self.logger, 'info', '=== Building index ===')
+ try:
+ if self.reference == '':
+ raise Exception('No reference given.')
+ except:
+ log.print_write(self.logger, 'exception', 'ERROR: No reference given.')
+ sys.exit(1)
+ self.command = [x for x in [os.path.realpath(os.path.join(self.bowtie2, 'bowtie2-build')), '-f' if self.reference_fasta else None, self.reference, self.index_location] if x is not None]
+ log.call(self.command, self.logger)
+ except sp.CalledProcessError:
+ log.print_write(self.logger, 'exception', 'ERROR: CPE in Bowtie2.create_index()')
+ return False
+ self.index = self.index_location
+
+ def map(self):
+ if self.index is None:
+ self.create_index()
+ try:
+ if self.readgroups:
+ identifier = ''
+ if (self.query[0] if self.query_paired else self.query).endswith('.gz'):
+ with gzip.open(self.query[0] if self.query_paired else self.query, 'rt') as read:
+ for line in read:
+ if line[0] == '@':
+ identifier = line[1:].split(' ')[0].strip()
+ break
+ else:
+ with open(self.query[0] if self.query_paired else self.query, 'r') as read:
+ for line in read:
+ if line[0] == '@':
+ identifier = line[1:].split(' ')[0].strip()
+ break
+ try:
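+ # Casava/Illumina read names have the form
+ # instrument:run:flowcell:lane:tile:x:y; the fields parsed
+ # below are used to build the read-group (RG) tags.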
+ match = re.match("([\w-]+):(\d+):([\w-]+):(\d+):(\d+):(\d+):(\d+)", identifier)
+ instrument = match.group(1)
+ run = match.group(2)
+ flowcell = match.group(3)
+ lane = match.group(4)
+ tile = match.group(5)
+ x_pos = match.group(6)
+ y_pos = match.group(7)
+ except:
+ log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set.')
+
+ if self.query_paired:
+ try:
+ rg_sm = re.match("r\d_[UP]+_(.*)_L[\d]+_R[\d]+(_[\d]+){0,1}.*", os.path.basename(self.query[0])).group(1)
+ except AttributeError:
+ rg_sm = os.path.basename(self.query[0]).split('.')[0]
+ else:
+ try:
+ rg_sm = re.match("r\d_[UP]+_(.*)_L[\d]+_R[\d]+(_[\d]+){0,1}.*", os.path.basename(self.query)).group(1)
+ except AttributeError:
+ rg_sm = os.path.basename(self.query).split('.')[0]
+ rg_pl = 'ILLUMINA'
+ rg_id = '.'.join([flowcell, rg_sm, lane])
+ rg_pu = rg_id
+ rg_lb = 'lib1'
+ else:
+ rg_sm = None
+ rg_pl = None
+ rg_id = None
+ rg_pu = None
+ rg_lb = None
+
+ if self.verbosity >= 1:
+ log.print_write(self.logger, 'info', '=== Mapping query on reference ===')
+ self.command = [x for x in ([
+ os.path.realpath(os.path.join(self.bowtie2, 'bowtie2')),
+ '-f' if self.query_fasta else None,
+ '-x' if self.index else None,
+ self.index if self.index else None,
+ '-1' if self.query_paired else None,
+ self.query[0] if self.query_paired else None,
+ '-2' if self.query_paired else None,
+ self.query[1] if self.query_paired else None,
+ '-U' if not self.query_paired else None,
+ self.query if not self.query_paired else None,
+ '-S', self.output,
+ '--rg-id' if self.readgroups else None, rg_id,
+ '--rg' if self.readgroups else None, 'SM:{}'.format(rg_sm) if self.readgroups else None,
+ '--rg' if self.readgroups else None, 'PU:{}'.format(rg_pu) if self.readgroups else None,
+ '--rg' if self.readgroups else None, 'PL:{}'.format(rg_pl) if self.readgroups else None,
+ '--rg' if self.readgroups else None, 'LB:{}'.format(rg_lb) if self.readgroups else None]
+ + (self.options.split(' ') if self.options else list())
+ )
+ if x is not None] # TODO options
+ self.statistics = log.check_output(self.command, self.logger, stderr=sp.STDOUT)
+ return self.output
+ except sp.CalledProcessError:
+ log.print_write(self.logger, 'exception', 'ERROR: CPE in Bowtie2.map()')
+ return None
+
+ def set_version(self):
+ try:
+ self.command = [os.path.realpath(os.path.join(self.bowtie2, 'bowtie2')), '--version']
+ self.version = re.match('.*version (.*)\n', str(sp.check_output(self.command), 'utf-8')).group(1)
+ except sp.CalledProcessError:
+ log.write(self.logger, 'exception', 'ERROR: CPE in Bowtie2.set_version')
+ return False
+
+ def get_version(self):
+ if self.version is None:
+ self.set_version()
+ return self.version
+
+ def get_statistics(self):
+ return self.statistics
+
+ def get_fast_options(self):
+ return '-k 10'
+
+ def get_sensitive_options(self):
+ return '-a'
+
+ def cleanup(self):
+ if self.index is not None:
+ if self.verbosity >= 1:
+ log.write(self.logger, 'info', '=== Cleaning up mapper ===')
+ try:
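+ # The find command deletes the Bowtie2 index files next to the
+ # index prefix (<prefix>.*.bt2, <prefix>.rev.*.bt2) plus a
+ # possible <prefix>.fasta.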
+ self.command = ('find {} -type f -regextype posix-extended '
+ '-iregex \'.*{}\.(([0-9]+\.bt2)|(rev\.[0-9]+\.bt2)|fasta)$\' -delete'.format(os.path.dirname(self.index), os.path.basename(self.index)))
+ log.call(self.command, self.logger, shell=True)
+ self.index = None
+ except sp.CalledProcessError:
+ log.print_write(self.logger, 'exception', 'ERROR: CPE in Bowtie2.cleanup()')
+ return False
+
+ def report(self):
+ log.write(self.logger, 'warning', 'Bowtie2 Version: {}'.format(self.get_version()))
+ log.write(self.logger, 'warning', 'Parameters: ')
+ for arg in sorted(vars(self)):
+ if arg != 'command':
+ log.write(self.logger, 'warning', '{} = {}'.format(arg, getattr(self, arg)))
+
+class BWAMem:
+
+ def __init__(self, query, reference, output, tmp=None, options=None, logger=None, readgroups=True, unmapped=None, samtools=None, bwa=None):
+ self.query = query
+ self.reference = os.path.expanduser(reference)
+ self.query_paired = len(query) == 2 #isinstance(query, list)
+ self.output = os.path.expanduser('{}.sam'.format(output))
+ self.samtools = samtools
+ self.bwa = bwa
+ self.unmapped_raw = unmapped
+ self.unmapped = ''
+ if self.query_paired:
+ self.query[0] = os.path.expanduser(self.query[0])
+ self.query[1] = os.path.expanduser(self.query[1])
+ self.unmapped = '{}_{}'.format(self.unmapped_raw, os.path.splitext(os.path.basename(self.query[0]))[0])
+ else:
+ self.query = os.path.expanduser(self.query[0])
+ self.unmapped = '{}_{}'.format(self.unmapped_raw, os.path.splitext(os.path.basename(self.query))[0])
+ if self.unmapped_raw is None:
+ self.unmapped = None
+ self.statistics = None
+ self.version = None
+ self.index = None
+ self.options = options
+ self.index_given = '-x' in self.options if self.options is not None else False
+ self.logger = logger
+ self.verbosity = 1
+ self.readgroups = readgroups
+ if tmp is None:
+ self.index_location = os.path.expanduser('~/tmp/{}'.format('.'.join(os.path.basename(reference).split('.')[:-1])))
+ else:
+ self.index_location = os.path.join(tmp, '.'.join(os.path.basename(reference).split('.')[:-1]))
+ self.report()
+
+ def update(self, query, output, options):
+ self.query = query
+ self.output = os.path.expanduser('{}{}'.format(output,'.sam'))
+ self.query_paired = len(query) == 2 #isinstance(query, list)
+ if self.query_paired:
+ self.query[0] = os.path.expanduser(self.query[0])
+ self.query[1] = os.path.expanduser(self.query[1])
+ if self.unmapped_raw is not None:
+ self.unmapped = '{}_{}'.format(self.unmapped_raw, os.path.splitext(os.path.basename(self.query[0]))[0])
+ else:
+ self.query = os.path.expanduser(self.query[0])
+ if self.unmapped_raw is not None:
+ self.unmapped = '{}_{}'.format(self.unmapped_raw, os.path.splitext(os.path.basename(self.query))[0])
+ self.query_fasta = query[0].endswith('.fa') or query[0].endswith('.mfa') #if self.query_paired else query.endswith('.fa') or query.endswith('.mfa')
+
+ def create_index(self):
+ if self.options is not None and '-x' in self.options:
+ cache = self.options.split(' ')
+ pos = cache.index('-x')
+ self.index_location = cache[pos + 1]
+ del cache[pos:pos+2]
+ self.options = ' '.join(cache)
+ else:
+ try:
+ if self.verbosity >= 1:
+ log.print_write(self.logger, 'info', '=== Building index ===')
+ try:
+ if self.reference == '':
+ raise Exception('No reference given.')
+ except:
+ log.print_write(self.logger, 'exception', 'ERROR: No reference given.')
+ sys.exit(1)
+ self.command = [os.path.realpath(os.path.join(self.bwa, 'bwa')), 'index', '-p', self.index_location, self.reference]
+ log.call(self.command, self.logger)
+ except sp.CalledProcessError:
+ log.print_write(self.logger, 'exception', 'ERROR: CPE in BWAMem.create_index()')
+ return False
+ self.index = self.index_location
+
+ def map(self):
+ if self.index is None:
+ self.create_index()
+ try:
+ if self.readgroups:
+ identifier = ''
+ if (self.query[0] if self.query_paired else self.query).endswith('.gz'):
+ with gzip.open(self.query[0] if self.query_paired else self.query, 'rt') as read:
+ for line in read:
+ if line[0] == '@':
+ identifier = line[1:].split(' ')[0].strip()
+ break
+ else:
+ with open(self.query[0] if self.query_paired else self.query, 'r') as read:
+ for line in read:
+ if line[0] == '@':
+ identifier = line[1:].split(' ')[0].strip()
+ break
+ try:
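+ # Same Casava-style read-name parsing as in Bowtie2.map() above;
+ # the fields feed the @RG header passed to bwa mem via -R.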
+ match = re.match("([\w-]+):(\d+):([\w-]+):(\d+):(\d+):(\d+):(\d+)", identifier)
+ instrument = match.group(1)
+ run = match.group(2)
+ flowcell = match.group(3)
+ lane = match.group(4)
+ tile = match.group(5)
+ x_pos = match.group(6)
+ y_pos = match.group(7)
+ except:
+ log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set.')
+
+ if self.query_paired:
+ try:
+ rg_sm = re.match("r\d_[UP]+_(.*)_L[\d]+_R[\d]+(_[\d]+){0,1}.*", os.path.basename(self.query[0])).group(1)
+ except AttributeError:
+ rg_sm = os.path.basename(self.query[0]).split('.')[0]
+ else:
+ try:
+ rg_sm = re.match("r\d_[UP]+_(.*)_L[\d]+_R[\d]+(_[\d]+){0,1}.*", os.path.basename(self.query)).group(1)
+ except AttributeError:
+ rg_sm = os.path.basename(self.query).split('.')[0]
+ rg_pl = 'ILLUMINA'
+ rg_id = '.'.join([flowcell, rg_sm, lane])
+ rg_pu = rg_id
+ rg_lb = 'lib1'
+ else:
+ rg_sm = None
+ rg_pl = None
+ rg_id = None
+ rg_pu = None
+ rg_lb = None
+
+ if self.verbosity >= 1:
+ log.print_write(self.logger, 'info', '=== Mapping query on reference ===')
+ self.command = ' '.join([x for x in [
+ os.path.realpath(os.path.join(self.bwa, 'bwa')),
+ 'mem',
+ self.index,
+ self.query[0] if self.query_paired else None,
+ self.query[1] if self.query_paired else None,
+ self.query if not self.query_paired else None,
+ '-R' if self.readgroups else None, '\"@RG\\tID:{}\\tPU:{}\\tSM:{}\\tPL:{}\\tLB:{}\"'.format(rg_id, rg_pu, rg_sm, rg_pl, rg_lb) if self.readgroups else None,
+ self.options if self.options else None,
+ '>', self.output
+ ] if x is not None]) # TODO options
+ self.statistics = log.check_output(self.command, self.logger, shell=True, stderr=sp.STDOUT)
+ if self.unmapped is not None:
+ self.command = '{samtools} view -f4 {output} | {samtools} view -Sb - | {samtools} fastq - -1 {unmapped}.1.fastq -2 {unmapped}.2.fastq -s {unmapped}.singleton.fastq -0 {unmapped}.unpaired.fastq'.format(output=self.output, unmapped=self.unmapped, samtools=os.path.realpath(os.path.join(self.samtools, 'samtools')))
+ log.call(self.command, self.logger, shell=True)
+ try:
+ if os.path.getsize('{}.1.fastq'.format(self.unmapped)) == 0:
+ os.remove('{}.1.fastq'.format(self.unmapped))
+ if os.path.getsize('{}.2.fastq'.format(self.unmapped)) == 0:
+ os.remove('{}.2.fastq'.format(self.unmapped))
+ if os.path.getsize('{}.singleton.fastq'.format(self.unmapped)) == 0:
+ os.remove('{}.singleton.fastq'.format(self.unmapped))
+ if os.path.getsize('{}.unpaired.fastq'.format(self.unmapped)) == 0:
+ os.remove('{}.unpaired.fastq'.format(self.unmapped))
+ except:
+ pass
+ return self.output
+ except sp.CalledProcessError:
+ log.print_write(self.logger, 'exception', 'ERROR: CPE in BWAMem.map()')
+ return None
+
+ def set_version(self):
+ try:
+ self.command = [os.path.realpath(os.path.join(self.bwa, 'bwa'))]
+ p = re.compile('.*Version: (.*?)\n', re.DOTALL)
+ self.version = re.match(p, str(sp.Popen(self.command,stdout=sp.PIPE,stderr=sp.STDOUT).communicate()[0], 'utf-8')).group(1)
+ except sp.CalledProcessError as e:
+ log.write(self.logger, 'exception', 'ERROR: CPE in BWAMem.set_version')
+ return False
+
+ def get_version(self):
+ if self.version is None:
+ self.set_version()
+ return self.version
+
+ def get_statistics(self):
+ return self.statistics
+
+ def cleanup(self):
+ if self.index is not None and not self.index_given:
+ if self.verbosity >= 1:
+ log.write(self.logger, 'info', '=== Cleaning up mapper ===')
+ try:
+ self.command = ('find {} -type f -regextype posix-extended '
+ '-iregex \'.*{}\.(amb|ann|bwt|pac|sa|fasta)$\' -delete'.format(os.path.dirname(self.index), os.path.basename(self.index)))
+ log.call(self.command, self.logger, shell=True)
+ self.index = None
+ except sp.CalledProcessError:
+ log.print_write(self.logger, 'exception', 'ERROR: CPE in BWAMem.cleanup()')
+ return False
+
+ def report(self):
+ log.write(self.logger, 'warning', 'BWA Version: {}'.format(self.get_version()))
+ log.write(self.logger, 'warning', 'Parameters: ')
+ for arg in sorted(vars(self)):
+ if arg != 'command':
+ log.write(self.logger, 'warning', '{} = {}'.format(arg, getattr(self, arg)))
+
+# add new mappers here
+def list_mapper():
+ return ('bowtie2', 'bwa-mem')
+
+# add new mappers here
+def get_mapper(mapper_key, query, reference, output, tmp=None, options=None, logger=None, readgroups=True, unmapped=None, samtools=None, bwa=None, bowtie2=None):
+ if mapper_key == 'bowtie2':
+ return Bowtie2(query, reference, output, tmp=tmp, options=options, logger=logger, readgroups=readgroups, unmapped=unmapped, samtools=samtools, bowtie2=bowtie2)
+ if mapper_key == 'bwa-mem':
+ return BWAMem(query, reference, output, tmp=tmp, options=options, logger=logger, readgroups=readgroups, unmapped=unmapped, samtools=samtools, bwa=bwa)
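+
+# To add a mapper, extend both functions above: append its key to the tuple in
+# list_mapper() and add a branch to get_mapper(), e.g. (hypothetical):
+# if mapper_key == 'example':
+# return ExampleMapper(query, reference, output, tmp=tmp, options=options, logger=logger)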
diff --git a/mapping.cwl b/mapping.cwl
new file mode 100755
index 0000000..42e1dde
--- /dev/null
+++ b/mapping.cwl
@@ -0,0 +1,41 @@
+#!/usr/bin/env cwl-runner
+class: CommandLineTool
+inputs:
+ - id: "#mapper"
+ type: Any
+ inputBinding:
+ position: 1
+ - id: "#read1"
+ type: Any
+ inputBinding:
+ position: 2
+ - id: "#reference"
+ type:
+ type: array
+ items: string
+ inputBinding:
+ prefix: --reference
+ separate: true
+ - id: "#read2"
+ type: ["null",Any]
+ inputBinding:
+ prefix: --read2
+ - id: "#noreport"
+ type: ["null",boolean]
+ default: false
+ inputBinding:
+ prefix: --no-report
+ - id: "#output"
+ type: Any
+ inputBinding:
+ prefix: --output
+ - id: "#options"
+ type: ["null",string]
+ inputBinding:
+ prefix: --options
+ - id: "#reffinder_options"
+ type: ["null",Any]
+ inputBinding:
+ prefix: --reffinder_options
+outputs: []
+baseCommand: ["./mapping.py"]
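+# A hypothetical invocation with a CWL runner (paths are placeholders):
+# cwl-runner mapping.cwl --mapper bowtie2 --read1 r1.fastq \
+# --reference ref.fasta --output out/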
diff --git a/mapping.py b/mapping.py
new file mode 100755
index 0000000..ef0448a
--- /dev/null
+++ b/mapping.py
@@ -0,0 +1,543 @@
+#!/usr/bin/env python3
+import argparse
+from collections import defaultdict
+import configparser
+import datetime
+import imp
+import jinja2
+import logging
+import os
+import platform
+import pwd
+import re
+import socket
+import string
+import sys
+if sys.version_info < (3, 2):
+ import subprocess32 as sp
+else:
+ import subprocess as sp
+###
+import custom_logger as log
+import extract_genbank
+import reffinder
+import mapper
+import metrics
+
+__author__ = "Enrico Seiler"
+__credits__ = ["Wojtek Dabrowski"]
+__license__ = "GPL"
+__version__ = "1.0.0"
+__maintainer__ = "Enrico Seiler"
+__email__ = "seilere at rki.de"
+__status__ = "Development"
+
+class MAPPING:
+ def __init__(self, logger=None):
+ self.name = 'Mapping Pipeline'
+ self.version = __version__
+ self.logger = logger
+ self.python_version = sys.version.split(' ')[0]
+ self._argument_parser()
+ self.args.output = os.path.realpath(self.args.output)
+ self.user = pwd.getpwuid(os.getuid()).pw_name
+ self.server = socket.getfqdn()
+ self.system = platform.system()
+ self.machine = platform.architecture()[0]
+ self.call = ' '.join(sys.argv)
+ if self.system == 'Linux':
+ self.linux_dist = platform.linux_distribution()[0] # Ubuntu
+ self.linux_version = platform.linux_distribution()[1] # 14.04
+ self.release = ' '.join([self.linux_dist, self.linux_version])
+ elif self.system == 'Windows': # Windows
+ self.windows_version = platform.win32_ver()[0] # 7
+ self.windows_build = platform.win32_ver()[1] # 6.1.7601
+ self.windows_csd = platform.win32_ver()[2] # SP1
+ self.release = ' '.join([self.windows_version, self.windows_csd, self.windows_build])
+ elif self.system == 'Java':
+ # ...
+ pass
+ else:
+ self.mac_release = platform.mac_ver()[0]
+ self.mac_version = platform.mac_ver()[1][0]
+ self.metrics_results = dict()
+ self.mapper = None
+ self.sam_files = list()
+ self.delete = list()
+ self.tmp = os.path.join(os.path.expanduser('~/tmp/'), datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S_%f'))
+ try:
+ os.mkdir(os.path.expanduser('~/tmp/'))
+ except OSError:
+ pass
+ try:
+ os.mkdir(self.tmp)
+ except OSError:
+ pass
+ self.run()
+ if self.args.join:
+ self._join()
+ if self.args.join_sample:
+ self._join_sample()
+ self.run_metrics()
+ self.cleanup()
+
+ def run_metrics(self):
+ if not self.args.noreport:
+ for sam in self.sam_files:
+ self.sam = sam
+ self.metric = metrics.METRICS(sam, output=self.tmp, logger=self.logger, samtools=os.path.realpath(os.path.join(self.args.samtools, 'samtools')))
+ self.current_project = os.path.basename(self._sam_name(os.path.basename(sam))) #read1
+ self.metrics_results[self.current_project] = dict()
+ self.metric.read_stats()
+ self.metric.compute_metrics()
+ self.metrics_results[self.current_project] = self.metric.cov_stats.copy()
+ self.metrics_results[self.current_project].update(self.metric.read_stats)
+ self.end_time = self._timestr(d=datetime.datetime.now())
+ self.report()
+
+ def report(self):
+ self.report_file = os.path.join(self.args.output, 'mapping_{}_{}.tex'.format(os.path.basename(self._sam_name(self.sam)), datetime.datetime.strftime(datetime.datetime.now(), '%d-%m-%y_%H-%M')))
+ # self.env = jinja2.Environment()
+ # self.env.loader = jinja2.FileSystemLoader('.')
+
+ # self.template = self.env.get_template(os.path.join(os.path.dirname(__file__), 'report.tex'))
+ # self.pdf_latex = self.template.render(pipeline=self, metric=self.metric)
+ template_dir = os.path.dirname(__file__) #'/usr/share/mapping-pipeline'
+ template_file = 'report.tex'
+ loader = jinja2.FileSystemLoader(template_dir)
+ env = jinja2.Environment(loader=loader)
+ template = env.get_template(template_file)
+ self.args.reference = self.args.reference[0]
+ pdf_latex = template.render(pipeline=self, metric=self.metric)
+
+ with open(self.report_file, 'w') as self.latex:
+ self.latex.write(pdf_latex)
+
+ process = sp.Popen(['pdflatex', '-interaction=nonstopmode', '-output-directory={}'.format(os.path.dirname(self.report_file)), self.report_file], stdout = sp.PIPE, stderr = sp.PIPE)
+ for line in iter(process.stderr.readline, b''):
+ print(line)
+
+ process.communicate()
+ self.metric.cleanup()
+ def cleanup(self):
+ log.print_write(self.logger, 'info', '=== Cleaning up ===')
+ try:
+ self.command = ('find {} -type f -regextype posix-extended '
+ '-iregex \'.*\.(aux|log|tex)$\' -delete'.format(os.path.realpath(self.args.output)))
+ log.call(self.command, self.logger, shell=True)
+ self.mapper.cleanup()
+ except sp.CalledProcessError:
+ log.print_write(self.logger, 'exception', 'ERROR: CPE in MAPPING.cleanup()')
+ return False
+ try:
+ self.command = 'rm -dfr {}'.format(self.tmp)
+ log.call(self.command, self.logger, shell=True)
+ except sp.CalledProcessError:
+ log.print_write(self.logger, 'exception', 'ERROR: CPE in MAPPING.cleanup()')
+ pass
+ try:
+ for delete in self.delete:
+ self.command = 'rm -f {}'.format(delete)
+ log.call(self.command, self.logger, shell=True)
+ except sp.CalledProcessError:
+ log.print_write(self.logger, 'exception', 'ERROR: CPE in MAPPING.cleanup()')
+ pass
+
+ def run_reffinder(self, mode, walk=None):
+ log.print_write(self.logger, 'info', '=== Running Reffinder ===')
+ if self.args.reffinder_options is None:
+ if mode == 0:
+ self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} {} --ref {} --tmp {}".format(self.args.read1, self.args.read2, ' '.join(self.args.reference), self.tmp), logger=self.logger))
+ elif mode == 1:
+ self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} --ref {} --mode single --tmp {}".format(self.args.read1, ' '.join(self.args.reference), self.tmp), logger=self.logger))
+ elif mode == 2:
+ self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} {} --ref {} --tmp {}".format(os.path.join(walk, self.r), os.path.join(walk, self.r2), ' '.join(self.args.reference), self.tmp), logger=self.logger))
+ else:
+ self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} --ref {} --mode single --tmp {}".format(os.path.join(walk, self.r), ' '.join(self.args.reference), self.tmp), logger=self.logger))
+ else:
+ if mode == 0:
+ self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} {} --ref {} {} --tmp {}".format(self.args.read1, self.args.read2, ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
+ elif mode == 1:
+ self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} --ref {} --mode single {} --tmp {}".format(self.args.read1, ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
+ elif mode == 2:
+ self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} {} --ref {} {} --tmp {}".format(os.path.join(walk, self.r), os.path.join(walk, self.r2), ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
+ else:
+ self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} --ref {} --mode single {} --tmp {}".format(os.path.join(walk, self.r), ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
+ def convert_to_accession(self):
+ pass
+ #pattern = re.compile('>gi\|\d*\|[^|]*\|([^|]*)\|(.*)')
+ #with open(self.args.reference[0], 'r+') as f:
+ # d = f.readlines()
+ # f.seek(0)
+ # fl = d[0]
+ # del(d[0])
+ # if '>gi' in fl:
+ # f.write('>{}\n'.format(' '.join(re.match(pattern, fl).groups())))
+ # for l in d:
+ # f.write(l)
+ # f.write('\n')
+ # f.truncate()
+ def run(self):
+ if self.args.reference is not None:
+ # Handle genbank
+ new_ref = []
+ for ref in self.args.reference:
+ if ref.endswith('.gb') or ref.endswith('.gbk'):
+ new_ref.append(extract_genbank.main(ref, os.path.join(self.tmp, os.path.basename(''.join(ref.split('.')[:-1])+'.fasta'))))
+ else:
+ new_ref.append(ref)
+ self.args.reference = new_ref
+ # if --multi-fasta is set, split refs
+ if self.args.multifasta:
+ print('=== Splitting multifasta ===')
+ self.split_multifasta()
+ #TODO RefFinder, if set to run, change self.args.reference to the returned reference.
+ self.find_ref = len(self.args.reference) >= 2
+ # Convert gi to accession
+ self.convert_to_accession()
+ else:
+ self.find_ref = False
+ # If only reads were given, run the pipeline on the reads
+ if not os.path.isdir(self.args.read1):
+ self.read_counter = 1
+ if self.find_ref:
+ if self.args.read2 is not None:
+ self.run_reffinder(0)
+ else:
+ self.run_reffinder(1)
+ elif self.args.reference:
+ self.args.reference = self.args.reference[0]
+ self._run_once(self.args.read1, self.args.read2)
+ # Else we have a folder containing project folders with reads. Run the pipeline for each project.
+ else:
+ self.read_counter = 1
+ [self.root, self.subdir, self.files] = next(os.walk(self.args.read1))
+ for self.sub in self.subdir:
+ self._run_all(next(os.walk(os.path.join(self.root, self.sub))))
+ self._run_all([self.root, self.subdir, self.files])
+ def _run_all(self, walk):
+ # Run pipeline for each pair of reads.
+ for self.r in [x for x in walk[2] if x.endswith('.fastq') or x.endswith('fastq.gz')]:
+ if self.r.startswith('r1_P'):
+ self.r2 = self.r[0] + '2' + self.r[2:]#TODO regex?
+ if not os.path.isfile(os.path.join(walk[0], self.r2)):
+ try:
+ # QCumber output
+ pattern = '(r)([1,2])(_P_.*_L\d\d\d_)(R)([1,2])(_\d\d\d\.fastq.*)'
+ frags = re.match(pattern, self.r).groups()
+ self.r2 = '{}{}{}{}{}{}'.format(frags[0], '2', frags[2], frags[3], '2', frags[5])
+ except AttributeError:
+ # Generic
+ pattern = '(r)([1,2])(_P_.*)'
+ frags = re.match(pattern, self.r).groups()
+ self.r2 = '{}{}{}'.format(frags[0], '2', frags[2])
+ if self.args.reference is not None:
+ self.cache_ref = self.args.reference
+ if self.find_ref:
+ self.run_reffinder(2, walk[0])
+ else:
+ self.args.reference = self.args.reference[0]
+ self._run_once(os.path.join(walk[0], self.r), read2=os.path.join(walk[0], self.r2))
+ if self.args.reference is not None:
+ self.args.reference = self.cache_ref
+ self.read_counter += 1
+ elif self.r.startswith('r2_P'):
+ pass
+ else:
+ if self.args.reference is not None:
+ self.cache_ref = self.args.reference
+ if self.find_ref:
+ self.run_reffinder(3, walk[0])
+ else:
+ self.args.reference = self.args.reference[0]
+ self._run_once(os.path.join(walk[0], self.r))
+ if self.args.reference is not None:
+ self.args.reference = self.cache_ref
+ self.read_counter += 1
+ def _sam_name(self, s):
+ if s.startswith('r1_P_') or s.startswith('r2_P_'):
+ st = 'P_' + s[5:]
+ elif s.startswith('r1_UP_') or s.startswith('r2_UP_'):
+ st = 'UP_' + s[6:]
+ else:
+ st = s
+ try:
+ self.sam_name = re.match('(.*)(\.fastq\.gz|\.fastq)$', st).group(1)
+ except:
+ self.sam_name = '.'.join(st.split('.')[:-1])
+ return os.path.join(self.args.output, self.sam_name)
+ def _join(self):
+ log.print_write(self.logger, 'info', '=== Joining SAM files ===')
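+ # Picard MergeSamFiles merges all per-read SAM files into a single
+ # joined.sam; the assembled command has the shape
+ # java -jar picard.jar MergeSamFiles O=joined.sam I=a.sam I=b.sam ...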
+ command = '{java} -jar {jar} MergeSamFiles O={o} '.format(java=self.args.java, jar=os.path.join(self.args.picard, 'picard.jar'), o=os.path.join(self.args.output, 'joined.sam'))
+ for sam in self.sam_files:
+ command += 'I={}.sam '.format(sam)
+ self.delete.append(sam + '.sam')
+ log.call(command, self.logger, shell=True)
+ self.sam_files = [os.path.join(self.args.output, 'joined.sam')]
+ def _join_sample(self):
+ groups = defaultdict(set)
+ pattern = re.compile('^U{0,1}P_(.*)_L\d\d\d_R[1,2]_\d\d\d')
+ try:
+ for sam in self.sam_files:
+ sam = os.path.basename(sam)
+ sample = re.match(pattern, sam).group(1)
+ groups[sample].add(sam)
+ except:
+ pattern = re.compile('^U{0,1}P_(.*)')
+ for sam in self.sam_files:
+ sam = os.path.basename(sam)
+ sample = re.match(pattern, sam).group(1)
+ groups[sample].add(sam)
+ self.sam_files = list()
+ for (key, values) in groups.items():
+ command = '{java} -jar {jar} MergeSamFiles O={o} '.format(java=self.args.java, jar=os.path.join(self.args.picard, 'picard.jar'), o=os.path.join(self.args.output, key+'.sam'))
+ for sam in values:
+ command += 'I={}.sam '.format(os.path.join(self.args.output, sam))
+ self.delete.append(os.path.join(self.args.output, sam + '.sam'))
+ log.call(command, self.logger, shell=True)
+ self.sam_files.append(os.path.join(self.args.output, key+'.sam'))
+ def _run_once(self, read1, read2=None):
+ log.print_write(self.logger, 'info', '=== Processing read #{} ==='.format(self.read_counter))
+ log.write(self.logger, 'info', 'Read1 = {}'.format(read1))
+ log.write(self.logger, 'info', 'Read2 = {}'.format(read2))
+ self.start_time = self._timestr(d=datetime.datetime.now())
+ self.current_read1 = read1
+ self.current_read2 = read2
+ # Mapping
+ self.sam_files.append(self._sam_name(os.path.basename(read1)))
+ if self.mapper is None:
+ self.mapper = mapper.get_mapper(self.args.mapper, [x for x in [read1, read2] if x is not None], self.args.reference if self.args.reference else '', self._sam_name(os.path.basename(read1)), self.tmp, self.args.options, self.logger, not self.args.norg, os.path.realpath(self.args.unmapped) if self.args.unmapped is not None else None, self.args.samtools, self.args.bwa, self.args.bowtie2)
+ else:
+ self.mapper.update([x for x in [read1, read2] if x is not None], self._sam_name(os.path.basename(read1)), self.args.options)
+ self.sam_file = self.mapper.map()
+ self.mapper_version = self.args.mapper + ' ' + self.mapper.get_version()
+ self.sam_files.append(self._sam_name(self.sam_file))
+ def split_multifasta(self):
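+ # Write every '>' record found in the reference files to its own
+ # numbered FASTA file in the tmp directory and use those files as
+ # the new reference list.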
+ i = 0
+ f = None
+ files = list()
+ for ref in self.args.reference:
+ for line in open(ref, 'r'):
+ if '>' in line:
+ if f is not None:
+ f.close()
+ f = open(os.path.join(self.tmp, '{}.fasta'.format(i)), 'w')
+ files.append(os.path.join(self.tmp, '{}.fasta'.format(i)))
+ i += 1
+ f.write('{}'.format(line))
+ else:
+ f.write('{}'.format(line))
+ if f is not None:
+ f.close()
+ self.args.reference = files
+
+ def _valid_picard(self, f):
+ if os.path.isfile(os.path.join(os.path.realpath(f), 'picard.jar')):
+ return f
+ else:
+ raise argparse.ArgumentTypeError('Cannot find picard.jar in folder {}'.format(os.path.realpath(f)))
+
+ def _valid_unmapped(self, f):
+ if not os.path.isdir(os.path.realpath(f)):
+ return f
+ else:
+ raise argparse.ArgumentTypeError('{} is a directory. Please specify a path containing the prefix for unmapped reads.'.format(os.path.realpath(f)))
+
+ def _valid_samtools(self, f):
+ if os.path.isfile(os.path.join(os.path.realpath(f), 'samtools')):
+ return f
+ else:
+ raise argparse.ArgumentTypeError('Cannot find samtools in folder {}'.format(os.path.realpath(f)))
+
+ def _valid_java(self, f):
+ try:
+ version_string = sp.check_output(f + " -version", shell=True, stderr=sp.STDOUT)
+ except:
+ raise argparse.ArgumentTypeError('It appears that {} is not a java executable (absolute or available in the PATH environment)'.format(f))
+ try:
+ version = version_string.decode("ASCII").split("\n")[0].split('"')[1].split("_")[0]
+ vn = float(".".join(version.split(".")[:2]))
+ except:
+ raise argparse.ArgumentTypeError('It appears that {} is not a java executable (absolute or available in the PATH environment)'.format(f))
+ if vn < 1.8:
+ raise argparse.ArgumentTypeError('{} is java version {}, however, at least version 1.8 is required'.format(f, vn))
+ return f
+
+ def _valid_bwa(self, f):
+ if os.path.isfile(os.path.join(os.path.realpath(f), 'bwa')):
+ return f
+ else:
+ raise argparse.ArgumentTypeError('Cannot find bwa in folder {}'.format(os.path.realpath(f)))
+
+ def _valid_bowtie2(self, f):
+ if os.path.isfile(os.path.join(os.path.realpath(f), 'bowtie2')):
+ return f
+ else:
+ raise argparse.ArgumentTypeError('Cannot find bowtie2 in folder {}'.format(os.path.realpath(f)))
+
+ def _argument_parser(self):
+ # Parse any conf_file specification
+ # We make this parser with add_help=False so that
+ # it doesn't parse -h and print help.
+ conf_parser = argparse.ArgumentParser(
+ description=__doc__, # printed with -h/--help
+ # Don't mess with format of description
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ # Turn off help, so we print all options in response to -h
+ add_help=False
+ )
+ config = os.path.join(os.path.dirname(__file__), 'run.config')
+ if '--config' not in sys.argv and os.path.isfile(config):
+ sys.argv += ['--config', config]
+ conf_parser.add_argument("--config", help="Specify config file", metavar="FILE")
+ conf_args, remaining_argv = conf_parser.parse_known_args()
+ if conf_args.config:
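+ # Config lines are merged into argv: "option=argument" becomes two
+ # tokens, a bare line is passed through as a flag. A hypothetical
+ # run.config:
+ # --samtools=/usr/bin
+ # --join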
+ with open(conf_args.config, 'r') as f:
+ for line in f:
+ if '=' in line:
+ [option, argument] = line.strip().split('=')
+ if option not in sys.argv:
+ remaining_argv += [option, argument]
+ else:
+ if line.strip() not in sys.argv:
+ remaining_argv += [line.strip()]
+
+ self.parser = argparse.ArgumentParser(description='Mapping pipeline', add_help=False)
+ self.required = self.parser.add_argument_group('required arguments')
+ self.optional = self.parser.add_argument_group('optional arguments')
+ self.required.add_argument('mapper', type=self._is_mapper, help='Select a mapper. Supported: {}'.format(' '.join(mapper.list_mapper())))
+ self.required.add_argument('read1', type=str, help='Either unpaired reads, first mate of paired reads, project folder or folder containing project folders.')
+ self.optional.add_argument('--reference', dest='reference', type=str, help='Reference genome. FASTA or GenBank format. Parameter can be used multiple times.', action='append')
+ self.optional.add_argument('-h', '--help', action='help', help='Show this help message and exit')
+ self.optional.add_argument('--no-report', dest='noreport', action='store_true', default=False, help='Do not generate report.')
+ self.optional.add_argument('--read2', dest='read2', nargs='?', type=str, help='Second mate of paired reads.', default=None)
+ self.optional.add_argument('--output', dest='output', nargs='?', type=str, help='Output location. (Default: %(default)s)', default=os.getcwd())
+ self.optional.add_argument('--multi-fasta', dest='multifasta', action='store_true', default=False, help='If set, split multifasta files into single fasta files.')
+ self.optional.add_argument('--options', dest='options', nargs='?', type=str, help='Options for the mapper. Usage: --options "option(s)"')
+ self.optional.add_argument('--reffinder_options', dest='reffinder_options', nargs='?', type=str, help='Options for reffinder. Usage: --reffinder_options="option(s)"')
+ self.optional.add_argument('--join', dest='join', action='store_true', default=False, help='Join all created alignment files into one file.')
+ self.optional.add_argument('--join_sample', dest='join_sample', action='store_true', default=False, help='Join all created alignment files per sample into one file.')
+ self.optional.add_argument('--no-rg', dest='norg', action='store_true', default=False, help='Do not set read groups.')
+ self.optional.add_argument('--unmapped', dest='unmapped', nargs='?', default=None, type=self._valid_unmapped, help='Write unmapped reads to given file. Read name will be automatically added.')
+ self.optional.add_argument('--java', dest='java', nargs='?', default='java', type=self._valid_java, help='Java version 1.8+ executable.')
+ self.optional.add_argument('--samtools', dest='samtools', nargs='?', default='/usr/bin', type=self._valid_samtools, help='Path to folder containing samtools.')
+ self.optional.add_argument('--bwa', dest='bwa', nargs='?', default='/usr/bin', type=self._valid_bwa, help='Path to folder containing bwa.')
+ self.optional.add_argument('--bowtie2', dest='bowtie2', nargs='?', default='/usr/bin/', type=self._valid_bowtie2, help='Path to folder containing bowtie2.')
+ self.optional.add_argument('--picard', dest='picard', nargs='?', default='/usr/bin/', type=str, help='Path to folder containing picard.jar')
+ self.optional.add_argument('--config', help='Define config file to read parameters from. If run.config exists in the source directory, it is used if --config is not defined.')
+ self.args = self.parser.parse_args(remaining_argv)
+ if self.args.join_sample or self.args.join:
+ self.args.picard = self._valid_picard(os.path.expanduser(self.args.picard))
+ if self.args.options is not None:
+ if self.args.options[0] == self.args.options[-1] == '"':
+ self.args.options = self.args.options[1:-1] # workaround for cwl
+ def _is_mapper(self, m):
+ self.mapper_list = mapper.list_mapper()
+ if m in self.mapper_list:
+ return m
+ else:
+ raise argparse.ArgumentTypeError('Mapper {} is not supported. Available mappers are: {}'.format(m, ' '.join(self.mapper_list)))
+ def _timestr(self, ger=False, d=None):
+ # Delegate to the module-level timestr() below; the weekday table and
+ # formatting logic live only there.
+ return timestr(ger=ger, d=d)
+
+def timestr(ger=False,d=None):
+ def lz(i):
+ if i < 10:
+ return '0{}'.format(i)
+ else:
+ return i
+
+ weekdays = {
+ 1: 'Monday',
+ 2: 'Tuesday',
+ 3: 'Wednesday',
+ 4: 'Thursday',
+ 5: 'Friday',
+ 6: 'Saturday',
+ 7: 'Sunday',
+ 8: 'Montag',
+ 9: 'Dienstag',
+ 10: 'Mittwoch',
+ 11: 'Donnerstag',
+ 12: 'Freitag',
+ 13: 'Samstag',
+ 14: 'Sonntag'
+ }
+ tstr = '{} {}.{}.{} {}:{}:{}'
+ if d is None:
+ d = datetime.datetime.now()
+ if not ger:
+ tstr = tstr.format(weekdays[d.isoweekday()], lz(d.day), lz(d.month), d.year, lz(d.hour), lz(d.minute), lz(d.second))
+ else:
+ tstr = tstr.format(weekdays[d.isoweekday()+7], lz(d.day), lz(d.month), d.year, lz(d.hour), lz(d.minute), lz(d.second))
+ return tstr
+
+def valid_filename(s):
+ valid_chars = "-_.: %s%s" % (string.ascii_letters, string.digits)
+ filename = ''.join(c for c in s if c in valid_chars)
+ return filename.replace(' ','_').replace('.','-').replace(':','-')
+
+if __name__ == '__main__':
+ if '-h' in sys.argv or '--help' in sys.argv:
+ m = MAPPING(None)
+ else:
+ out = None
+ try:
+ out = sys.argv[sys.argv.index('--output')+1]
+ except Exception:
+ pass
+ if out is None:
+ try:
+ cache = [y for x in sys.argv for y in x.split('=')]
+ out = cache[cache.index('--output')+1]
+ except Exception:
+ pass
+ if out is None:
+ out = ''
+ t0 = datetime.datetime.now()
+ log_file = os.path.join(out, ('{}_log.txt'.format(valid_filename(timestr(d=t0)))))
+ try:
+ os.mkdir(os.path.dirname(log_file))
+ except:
+ pass
+ logging.basicConfig(format='%(levelname)s:%(message)s',filename=log_file,filemode='w',level=logging.DEBUG)
+ logger = logging.getLogger(log_file)
+ logger.info('Log is written to: {}'.format(os.path.abspath(log_file)))
+ logger.info('This pipeline was invoked by: {}'.format(pwd.getpwuid(os.getuid()).pw_name))
+ logger.info('This pipeline started at: {}'.format(timestr(d=t0)))
+ m = MAPPING(logger)
+ t1 = datetime.datetime.now()
+ logger.info('This pipeline finished at: {}'.format(timestr(d=t1)))
+ logger.info('Runtime: {}'.format(t1-t0))
+
+
diff --git a/metrics.py b/metrics.py
new file mode 100755
index 0000000..a9a9c8a
--- /dev/null
+++ b/metrics.py
@@ -0,0 +1,394 @@
+#!/usr/bin/env python
+
+import custom_logger as log
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+import os
+import pysam
+import pylab
+import itertools
+import string
+from scipy import stats
+from collections import Counter
+from collections import deque
+from itertools import islice
+import subprocess as sp
+
+def window(iterable, size=2, step=1, fillvalue=None):
+ if size < 0 or step < 1:
+ raise ValueError
+ it = iter(iterable)
+ q = deque(islice(it, size), maxlen=size)
+ if not q:
+ return # empty iterable or size == 0
+ #q.extend(fillvalue for _ in range(size - len(q))) # pad to size
+ while True:
+ #yield iter(q) # iter() to avoid accidental outside modifications
+ yield q
+ q.append(next(it))
+ #q.extend(next(it, fillvalue) for _ in range(step - 1))
+ q.extend(next(it) for _ in range(step - 1))
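+# Example (illustrative). Note the same deque object is yielded each time, so
+# snapshot it before consuming the next window:
+# [tuple(w) for w in window([1, 2, 3, 4], size=2)] -> [(1, 2), (2, 3), (3, 4)]
+# The trailing next(it) ends the generator (pre-PEP 479 behaviour).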
+
+def valid_filename(s):
+ valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
+ filename = ''.join(c for c in s if c in valid_chars)
+ filename = filename.replace(' ','_')
+ if len(filename) > 200:
+ return(filename[:200])
+ else:
+ return filename
+
+class SAM:
+ def __init__(self, sam_file_path, logger=None, samtools=None):
+ self.samtools = samtools
+ self.sam_file_path = sam_file_path
+ self.sam_file = None
+ self.logger = logger
+ #self.sam_alignments = list()
+ self._read_sam()
+ self.sam_flags = None
+ def _read_sam(self):
+ sp.check_call('{samtools} view -bS {sam_file} | {samtools} sort - -o {bam} 2> /dev/null'.format(samtools=self.samtools, sam_file=self.sam_file_path, bam=self.sam_file_path[:-3] + 'bam'), shell=True)
+ self.sam_file_path = self.sam_file_path[:-3] + 'bam'
+ sp.check_call('{samtools} index {sam_file} {index}'.format(samtools=self.samtools, sam_file=self.sam_file_path, index=self.sam_file_path[:-1] + 'i'), shell=True)
+ self.sam_file = pysam.AlignmentFile(self.sam_file_path, 'rb')
+ #for alignment in self.sam_file.fetch():
+ # self.sam_alignments.append(alignment)
+ #TODO return new iterator?
+ #self.sam_alignments = self.sam_file.fetch()
+ def _read_flags(self):
+ self.sam_flags = list()
+ for self.alignment in self.get_alignments():
+ self.sam_flags.append((self.alignment.query_name, self.alignment.flag))
+ def get_alignments(self, reference=None, until_eof=False):
+ return self.sam_file.fetch(reference = reference, multiple_iterators = True, until_eof=until_eof)
+ def get_flags(self):
+ for self.alignment in self.get_alignments(until_eof=True):
+ yield self.alignment.flag
+ def get_sam_file(self):
+ return self.sam_file
+ def get_sam_file_path(self):
+ return self.sam_file_path
+ def get_reference_count(self):
+ return self.sam_file.nreferences
+ def get_reference_name(self, ref_id):
+ return self.sam_file.getrname(ref_id)
+ def get_lengths(self):
+ return self.sam_file.lengths
+ def get_references(self):
+ return self.sam_file.references
+ def get_reference_pos(self, reference):
+ return self.sam_file.references.index(reference)
+ def get_reference_length(self, reference):
+ return self.sam_file.lengths[self.get_reference_pos(reference)]
+
+class METRICS:
+ def __init__(self, sam_file_path, output=None, logger=None, samtools=None):
+ self.sam = SAM(sam_file_path, logger, samtools)
+ self.logger = logger
+ if output is not None:
+ self.output = os.path.realpath(output)
+ else:
+ self.output = os.path.realpath('./')
+ self.reference_coverage_count = None
+ self.reference_mapq_avg = None
+ self.reads_per_quality_result = []
+ self.bases_per_coverage_result = []
+ self.mapq_graph_result = []
+ self.coverage_graph_result = []
+ self.cov_stats_result = []
+ #self.read_stats_result = []
+
+ def reads_per_quality(self, reference, i):
+ """Generate one pdf for each reference showing the number of reads (y-axis) that map with a certain mapping quality (x-axis)."""
+ log.print_write(self.logger, 'info', '=== Reads per Quality #{} ==='.format(i))
+ # The SAM convention states that 255 is the highest value for the mapping quality.
+ reads_per_quality = np.zeros(256, np.int)
+ # Go through all alignments in the SAM file and increase the counter for each alignment's mapping quality.
+ for alignment in self.sam.get_alignments(reference=reference):
+ reads_per_quality[alignment.mapping_quality] += 1
+ # Trim all zeros at the end of the array.
+ rpq = np.trim_zeros(reads_per_quality, 'b')
+ # If there is something left to plot.
+ if rpq.size > 0:
+ # Plot the reads per quality as a bar chart into a pdf file.
+ with PdfPages(os.path.join(self.output, 'reads_per_quality_{}.pdf'.format(valid_filename(reference)))) as pdf:
+ self.reads_per_quality_result.append([os.path.realpath(os.path.join(self.output, 'reads_per_quality_{}.pdf'.format(valid_filename(reference)))), reference.replace('_','\_')])
+ fig, ax = plt.subplots( nrows=1, ncols=1 ) # create figure & 1 axis
+ ax.set_yscale('symlog')
+ ax.bar(range(rpq.size), rpq, align='center')
+ pylab.xlim([-0.5,rpq.size-0.5])
+ pylab.ylim(ymin=-0.05)
+ ax.set_xlabel('Mapping quality')
+ ax.set_ylabel('Read Count')
+ ax.set_title('Reads per Mapping Quality')
+ pdf.savefig() # saves the current figure into a pdf page
+ plt.close()
+
+ def compute_metrics(self):
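+ # Rank the references by their number of alignments and generate the
+ # per-reference plots and statistics for (at most) the ten best ones.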
+ scored_references = list()
+ score_reference = np.zeros(self.sam.get_reference_count(), np.int)
+ for alignment in self.sam.get_alignments():
+ score_reference[alignment.reference_id] += 1
+ scored_references = [self.sam.get_references()[x] for x in np.argsort(score_reference)[::-1][:10]]
+ for i, reference in enumerate(scored_references):
+ if i == 10:
+ break
+ self.count_coverage(reference)
+ self.count_mapq(reference)
+ self.reads_per_quality(reference, i+1)
+ self.bases_per_coverage(reference, i+1)
+ self.coverage_graph(reference, i+1)
+ self.mapq_graph(reference, i+1)
+ self.coverage_stats(reference, i+1)
+
+ def count_coverage(self, reference):
+ """Count coverage for each position in each reference, i.e. reference_count[reference_id][position] = coverage."""
+ # We will count coverages for all references(self.sam.lengths). One row will contain at most the max(lengths references) entries.
+ self.reference_coverage_count = np.zeros(self.sam.get_reference_length(reference), np.int)
+ # Go through all alignments in the SAM file.
+ for alignment in self.sam.get_alignments(reference=reference):
+ # If the length is None, the query/read does not map anywhere.
+ if alignment.reference_length is not None:
+ # Increase the coverage by one for the positions in the reference that the read maps to.
+ self.reference_coverage_count[alignment.reference_start:alignment.reference_end] += 1
+
+ def count_mapq(self, reference):
+ """Append mapping quality for each position in each reference, i.e. reference_count[reference_id][position] = [mapqs]."""
+ # We will count coverages for all references(self.sam.lengths). One row will contain at most the max(lengths references) entries.
+ reference_mapq_sum = np.zeros(self.sam.get_reference_length(reference), np.float)
+ reference_mapq_count = np.zeros(self.sam.get_reference_length(reference), np.int)
+ # Go through all alignments in the SAM file.
+ for alignment in self.sam.get_alignments(reference=reference):
+ # If the length is None, the query/read does not map anywhere.
+ if alignment.reference_length is not None:
+ # Add the read's mapping quality to every position it covers and count the read.
+ mapq = alignment.mapping_quality
+ reference_mapq_sum[alignment.reference_start:alignment.reference_end] += mapq
+ reference_mapq_count[alignment.reference_start:alignment.reference_end] += 1
+ with np.errstate(divide='ignore', invalid='ignore'):
+ self.reference_mapq_avg = np.true_divide(reference_mapq_sum, reference_mapq_count)
+ self.reference_mapq_avg = np.nan_to_num(self.reference_mapq_avg)
+ self.reference_mapq_avg[self.reference_mapq_avg == np.inf] = 0
+
+ def _integer_mean(self, array):
+ """Calculate float mean of the values in array and return the rounded float as int."""
+ if len(array) == 0:
+ return 0
+ return int(round(np.sum(array)/len(array)))
+
+ def _set_axis(self, ax, factor):
+ """Manipulate the xlabel for a plot."""
+ # The new label should be the "compression" factor caused by the binning * the position.
+ def _factor_label(x, pos):
+ return str(int(round(x * factor)))
+ # Create a new formatter using _factor_label
+ formatter = matplotlib.ticker.FuncFormatter(_factor_label)
+ # Set the formatter
+ ax.xaxis.set_major_formatter(formatter)
+
+ def bases_per_coverage(self, reference, i):
+ """Generate one pdf for each reference showing the number of bases (y-axis) that have a certain coverage (x-axis)."""
+ log.print_write(self.logger, 'info', '=== Bases per Coverage #{} ==='.format(i))
+ # If we have not counted the coverages yet, count them now.
+ if self.reference_coverage_count is None:
+ self.count_coverage(reference)
+ # Trim coverages of 0 at the end.
+ reference_coverage = np.trim_zeros(self.reference_coverage_count, 'b')
+ # If there are some coverages left.
+ if reference_coverage.size > 0:
+ # Count how often a coverage appears. bincount[coverage]=#coverage
+ bases_per_coverage = np.bincount(reference_coverage)
+ # Bin the coverages into 1000 bins and calculate the mean for every bin.
+ binned_coverage = stats.binned_statistic(range(bases_per_coverage.size), bases_per_coverage, statistic = self._integer_mean, bins = 1000)[0]
+ # Trim again in case the maximal coverage is less than 1000.
+ binned_coverage = np.trim_zeros(binned_coverage, 'b')
+ # Get the number of binned coverages.
+ len_coverage = binned_coverage.size
+ # Plot the bases per coverage as a bar chart into a pdf file.
+ if len_coverage > 0:
+ with PdfPages(os.path.join(self.output, 'bases_per_coverage_{}.pdf'.format(valid_filename(reference)))) as pdf:
+ self.bases_per_coverage_result.append([os.path.realpath(os.path.join(self.output, 'bases_per_coverage_{}.pdf'.format(valid_filename(reference)))), reference.replace('_','\_')])
+ fig, ax = plt.subplots(nrows=1, ncols=1)
+ # Change labels if there are more unbinned coverages than binned coverages, i.e. the binning influences the labels.
+ if len_coverage <= bases_per_coverage.size:
+ factor = float(bases_per_coverage.size)/len_coverage
+ self._set_axis(ax, factor)
+ ax.set_yscale('symlog')
+ ax.bar(range(len_coverage), binned_coverage, align='center', color='b', linewidth=0, edgecolor='b')
+ pylab.xlim([-0.005*len_coverage, len_coverage+0.005*len_coverage])
+ ax.set_xlabel('Coverage')
+ ax.set_ylabel('Base Count')
+ ax.set_title('Bases per Coverage')
+ pylab.ylim(ymin=-0.05)
+ pdf.savefig()
+ plt.close()
+ if len(self.bases_per_coverage_result) == 0:
+ self.bases_per_coverage_result.append(None)
+
+ def coverage_graph(self, reference, i):
+ """Generate one pdf for each reference showing the genome position (y-axis) an the respective coverage (x-axis). The genome is reduced/binned into 1000 points."""
+ log.print_write(self.logger, 'info', '=== Coverage Graph #{} ==='.format(i))
+ if self.reference_coverage_count is None:
+ self.count_coverage(reference)
+ # Trim coverages of 0 at the end.
+ reference_coverage = np.trim_zeros(self.reference_coverage_count, 'b')
+ # If there is something left of the reference.
+ if reference_coverage.size > 0:
+ # Use a sliding window to create ~1000 datapoints out of the reference.
+ reference_coverage_mean = []
+ reference_coverage_max = []
+ reference_coverage_min = []
+ window_size = min(int(reference_coverage.size/10), 50000)
+ for w in window(reference_coverage, size=window_size, step=max(1, int((reference_coverage.size-window_size)/1000))):
+ reference_coverage_mean.append(int(np.mean(w)))
+ reference_coverage_max.append(max(w))
+ reference_coverage_min.append(min(w))
+ reference_coverage_mean = np.array(reference_coverage_mean)
+ reference_coverage_max = np.array(reference_coverage_max)
+ reference_coverage_min = np.array(reference_coverage_min)
+ # Get the number of binned reference.
+ len_reference = reference_coverage_mean.size
+ # Plot the coverage along the reference as a line diagram.
+ if len_reference > 0:
+ with PdfPages(os.path.join(self.output, 'coverage_graph_{}.pdf'.format(valid_filename(reference)))) as pdf:
+ self.coverage_graph_result.append([os.path.realpath(os.path.join(self.output, 'coverage_graph_{}.pdf'.format(valid_filename(reference)))), reference.replace('_', '\_')])
+ fig, ax = plt.subplots(nrows=1, ncols=1)
+ # Change labels if there are more unbinned entries than binned entries, i.e. the binning influences the labels.
+ if len_reference <= reference_coverage.size:
+ factor = float(reference_coverage.size)/len_reference
+ self._set_axis(ax, factor)
+ ax.set_yscale('symlog')
+ ax.plot(range(len_reference), reference_coverage_mean, 'r', label='mean')
+ ax.plot(range(len_reference), reference_coverage_max, 'b', label='max')
+ ax.plot(range(len_reference), reference_coverage_min, 'g', label='min')
+ ax.legend(bbox_to_anchor=(0., 1.05, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.)
+ ax.set_xlabel('Position')
+ ax.set_ylabel('Coverage')
+ ax.set_title('Coverage Graph')
+ pylab.xlim([0, len_reference])
+ pylab.ylim(ymin=-0.05)
+ pdf.savefig()
+ plt.close()
+ if len(self.coverage_graph_result) == 0:
+ self.coverage_graph_result.append(None)
+
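+    # Example (illustrative): window(), a helper defined elsewhere in this
+    # module, slides a window of `size` positions over the array, advancing by
+    # `step`; taking mean/max/min per window condenses a multi-megabase
+    # coverage track into roughly 1000 plot points.
+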
+ def mapq_graph(self, reference, i):
+        """Generate one pdf per reference showing the genome position (x-axis) and the respective mapping quality (y-axis). The genome is reduced/binned into ~1000 points."""
+ log.print_write(self.logger, 'info', '=== Mapping Quality Graph #{} ==='.format(i))
+ if self.reference_mapq_avg is None:
+ self.count_mapq(reference)
+        # Trim mapping qualities of 0 at the end.
+        reference_mapq = np.trim_zeros(self.reference_mapq_avg, 'b')
+        # If any mapping quality remains after trimming.
+ if reference_mapq.size > 0:
+ # Use a sliding window to create ~1000 datapoints out of the reference.
+ reference_mapq_mean = []
+ reference_mapq_max = []
+ reference_mapq_min = []
+ window_size = min(int(reference_mapq.size/10), 50000)
+ for w in window(reference_mapq, size=window_size, step=max(1, int((reference_mapq.size-window_size)/1000))):
+ reference_mapq_mean.append(int(np.mean(w)))
+ reference_mapq_max.append(max(w))
+ reference_mapq_min.append(min(w))
+ reference_mapq_mean = np.array(reference_mapq_mean)
+ reference_mapq_max = np.array(reference_mapq_max)
+ reference_mapq_min = np.array(reference_mapq_min)
+            # Get the number of binned data points.
+            len_reference = reference_mapq_mean.size
+            # Plot the mapping quality along the reference as a line diagram.
+ if len_reference > 0:
+ with PdfPages(os.path.join(self.output, 'mapq_graph_{}.pdf'.format(valid_filename(reference)))) as pdf:
+                    self.mapq_graph_result.append([os.path.realpath(os.path.join(self.output, 'mapq_graph_{}.pdf'.format(valid_filename(reference)))), reference.replace('_', '\\_')])
+ fig, ax = plt.subplots(nrows=1, ncols=1)
+ # Change labels if there are more unbinned entries than binned entries, i.e. the binning influences the labels.
+ if len_reference <= reference_mapq.size:
+ factor = float(reference_mapq.size)/len_reference
+ self._set_axis(ax, factor)
+ #self.ax.set_yscale('symlog')
+ ax.plot(range(len_reference), reference_mapq_mean, 'r', label='mean')
+ ax.plot(range(len_reference), reference_mapq_max, 'b', label='max')
+ ax.plot(range(len_reference), reference_mapq_min, 'g', label='min')
+ ax.legend(bbox_to_anchor=(0., 1.05, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.)
+ ax.set_xlabel('Position')
+ ax.set_ylabel('Mapping Quality')
+ ax.set_title('Mapping Quality Graph')
+ pylab.xlim([0, len_reference])
+ pylab.ylim(ymin=-0.1)
+ pdf.savefig()
+ plt.close()
+ if len(self.mapq_graph_result) == 0:
+ self.mapq_graph_result.append(None)
+
+ def coverage_stats(self, reference, i):
+ log.print_write(self.logger, 'info', '=== Coverage Statistics #{} ==='.format(i))
+ self.cov_stats = dict()
+ if self.reference_coverage_count is None:
+ self.count_coverage(reference)
+ # Trim coverages of 0 at the end.
+ reference_coverage = np.trim_zeros(self.reference_coverage_count, 'b')
+        # If any coverage remains after trimming.
+ if reference_coverage.size > 0:
+ mean_coverage = np.mean(reference_coverage)
+ maximum_coverage = np.max(reference_coverage)
+ minimum_coverage = np.min(reference_coverage)
+ sd_coverage = np.std(reference_coverage)
+ bases_zero_coverage = (reference_coverage == 0).sum()
+ conf_int = mean_coverage + 2 * sd_coverage
+ lower_conf_int = (reference_coverage < conf_int).sum()
+ greater_conf_int = (reference_coverage > conf_int).sum()
+ cov_stats = {'mean': mean_coverage, 'max': maximum_coverage, 'min': minimum_coverage, 'sd': sd_coverage, 'zero': bases_zero_coverage, 'lower': lower_conf_int, 'greater': greater_conf_int}
+ self.cov_stats_result.append(cov_stats)
+ if len(self.cov_stats_result) == 0:
+ self.cov_stats_result.append({'mean': -1, 'max': -1, 'min': -1, 'sd': -1, 'zero': -1, 'lower': -1, 'greater': -1})
+ #return self.cov_stats
+
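+    # Example (illustrative, toy values): for cov = np.array([0, 5, 5, 30])
+    # the threshold is t = cov.mean() + 2 * cov.std(); (cov < t).sum() and
+    # (cov > t).sum() count the bases below/above mean + 2*sd, matching the
+    # 'lower'/'greater' entries reported above.
+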
+ def read_stats(self):
+ log.print_write(self.logger, 'info', '=== Read Statistics ===')
+ concordant = 0
+ discordant = 0
+ paired = 0
+ unmapped = 0
+ total = 0
+ mapped = 0
+ #for (query, flag) in self.sam.get_flags():
+ for flag in self.sam.get_flags():
+            total += 1
+            # interpret the SAM FLAG bitmask directly
+            if flag & 0x1:  # read is paired
+                paired += 1
+                if flag & 0x2:  # mapped in a proper pair
+                    concordant += 1
+                else:
+                    discordant += 1
+            if flag & 0x4:  # read unmapped
+                unmapped += 1
+ mapped = total - unmapped
+ self.read_stats = dict()
+ self.read_stats['concordant'] = concordant
+ self.read_stats['discordant'] = discordant
+ self.read_stats['paired'] = paired
+ self.read_stats['mapped'] = mapped
+ self.read_stats['unmapped'] = unmapped
+ self.read_stats['total'] = total
+ #self.read_stats_result.append(self.stats)
+ #return self.stats
+
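+    # Example (illustrative): the SAM FLAG field is a bitmask -- 0x1 read is
+    # paired, 0x2 mapped in a proper pair (concordant), 0x4 read unmapped.
+    # E.g. flag 99 = 0x1 + 0x2 + 0x20 + 0x40: paired, proper pair, mate on
+    # the reverse strand, first in pair.
+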
+ def cleanup(self):
+ log.write(self.logger, 'info', '=== Cleaning up metrics ===')
+ try:
+ for mfile in self.reads_per_quality_result+self.bases_per_coverage_result+self.mapq_graph_result+self.coverage_graph_result+[(self.sam.sam_file_path, 'bam'), (self.sam.sam_file_path[:-1] + 'i', 'bai')]:
+ if mfile is not None:
+ self.command = 'rm {}'.format(mfile[0])
+ log.call(self.command, self.logger, shell=True)
+ except sp.CalledProcessError:
+ log.print_write(self.logger, 'exception', 'ERROR: CPE in metrics.cleanup()')
+ return False
+ self.reads_per_quality_result = []
+ self.bases_per_coverage_result = []
+ self.mapq_graph_result = []
+ self.coverage_graph_result = []
+ return True
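
As an aside, a minimal self-contained sketch of the binning technique bases_per_coverage() uses above -- np.bincount followed by scipy's binned_statistic -- with simulated coverages and a placeholder output file name (10 bins instead of 1000 for readability):

    import numpy as np
    from scipy import stats
    import matplotlib
    matplotlib.use('Agg')                              # render off-screen, as the pipeline does
    import matplotlib.pyplot as plt

    coverage = np.random.poisson(lam=30, size=10000)   # simulated per-base coverages
    bases_per_cov = np.bincount(coverage)              # bases_per_cov[c] = bases covered c times
    binned = stats.binned_statistic(range(bases_per_cov.size), bases_per_cov,
                                    statistic='mean', bins=10)[0]

    fig, ax = plt.subplots()
    ax.bar(range(binned.size), binned, align='center')
    ax.set_xlabel('Coverage (binned)')
    ax.set_ylabel('Base count')
    fig.savefig('bases_per_coverage_example.pdf')
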
diff --git a/reffinder.py b/reffinder.py
new file mode 100755
index 0000000..4d02b01
--- /dev/null
+++ b/reffinder.py
@@ -0,0 +1,387 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#created by Stephan Fuchs (FG13, RKI), 2015
+version = '0.0.9'
+
+import argparse
+import time
+import re
+import sys
+if sys.version_info < (3, 2):
+ import subprocess32 as sp
+else:
+ import subprocess as sp
+import os
+import random
+import math
+import custom_logger as log
+
+#sys.path.insert(0, "/home/RKID1/fuchss/data/scripts/libs")
+
+import fux_filing as filing
+
+# options is a string containing parameters, e.g. "--fastq sth. --ref sth. -p 0.2"
+def parse_arguments(options=None, logger=None):
+ #processing command line arguments
+ parser = argparse.ArgumentParser(prog="REFFINDER", description='')
+ parser.add_argument('--fastq', metavar="FILE(S)", help="input FASTQ file(s)", type=argparse.FileType('r'), required=True, nargs='+')
+ parser.add_argument('--ref', metavar="FILE(S)", help="input reference FASTA file(s)", type=str, required=True, nargs='+')
+ parser.add_argument('--tmp', type=str, help='Temporary directory to use', default=os.path.expanduser('~/tmp/'))
+ parser.add_argument('-p', metavar="DECIMAL", help="proportion of reads used for mapping (default: 0.1)", type=float, action="store", default=0.1)
+    parser.add_argument('-t', metavar='INT', help="number of threads/CPUs used for mapping (default: 20)", type=int, action="store", default=20)
+    parser.add_argument('-s', help="if set, subset fastq files are stored in a subfolder. Note that a large amount of data may be generated.", action="store_true")
+    parser.add_argument('-b', help="if set, subset BAM files are stored in a subfolder. Note that a large amount of data may be generated.", action="store_true")
+ parser.add_argument('-i', help="if set, reference index files are not deleted.", action="store_true")
+ parser.add_argument('--mode', help="select paired (default) or unpaired mode", choices=['paired', 'single'], action="store", default='paired')
+ parser.add_argument('--mapping', help="select mapping algorithm (default: bwasw).", choices=['bwasw', 'bwamem'], action="store", default='bwasw')
+ parser.add_argument('--version', action='version', version='%(prog)s ' + version)
+ if options is None:
+ args = parser.parse_args()
+ else:
+ args = parser.parse_args(options.split(' '))
+ args.logger = logger
+ try:
+ os.mkdir(args.tmp)
+ except OSError:
+ pass
+ return args
+
+
+#FUNCTIONS
+def indexRef(ref, args):
+ if args.mapping in ['bwasw', 'bwamem']:
+ cmd = ['bwa', 'index', '-p', os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), '-a', 'is', ref]
+    elif args.mapping == 'smalt':
+ cmd = ['smalt', 'index', '-s', '6', ref, ref]
+ log.call(cmd, args.logger)
+ cmd = ['samtools', 'faidx', ref]
+ log.call(cmd, args.logger)
+
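+# Example (illustrative): 'bwa index -p PREFIX -a is ref.fa' writes the index
+# as PREFIX.amb/.ann/.bwt/.pac/.sa (the 'is' algorithm suits genomes below
+# ~2 GB), and 'samtools faidx ref.fa' writes ref.fa.fai; clean() below removes
+# index files of these extensions that sit next to the reference.
+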
+def mapping(ref, fastq, revfastq, samfile, args):
+    #for single-end mapping, set revfastq to False
+ if args.mapping == 'bwasw':
+ cmd = ['bwa', 'bwasw', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), fastq, revfastq, '-f', samfile]
+ if revfastq == False:
+ del(cmd[6])
+ log.call(cmd, args.logger)
+ elif args.mapping == 'bwamem':
+ with open(samfile,"wb") as outfile:
+ cmd = ['bwa', 'mem', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), fastq, revfastq]
+ if revfastq == False:
+ del(cmd[6])
+ sp.check_call(cmd, stdout=outfile)
+ elif args.mapping == 'smalt':
+ cmd = ['smalt', 'map', '-n', str(args.t), '-f', 'sam', '-o', samfile, ref, fastq, revfastq]
+ if revfastq == False:
+ del(cmd[-1])
+ log.call(cmd, args.logger)
+
+def sam2bam(samfile, bamfile, delSam = True):
+ with open(bamfile, 'w') as bf:
+ cmd = ['samtools', 'view', '-F', '4', '-Sb', samfile]
+ sp.check_call(cmd, stdout=bf)
+ if delSam:
+ os.remove(samfile)
+
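+# Example (illustrative): the shell equivalent of sam2bam() is
+#   samtools view -F 4 -Sb in.sam > out.bam
+# where -F 4 excludes reads with the 'unmapped' FLAG bit and -Sb reads SAM
+# and writes BAM.
+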
+def getMappedReads(bamfile):
+ cmd = ['samtools', 'flagstat', bamfile]
+ out = sp.check_output(cmd).decode("utf-8")
+    mapped_reads = re.compile(r'.*?(\d*) \+ \d* mapped.*', re.DOTALL)
+ m = mapped_reads.match(out)
+ #return int(out[2].split('+')[0])
+ return int(m.group(1))
+
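+# Example (illustrative): 'samtools flagstat' prints lines such as
+#   4023 + 0 mapped (95.21% : N/A)
+# and the regex above captures the first number of that 'mapped' line.
+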
+def getBwaVers():
+ cmd = ['bwa']
+ process = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
+ lines = process.communicate()[1].decode("utf-8").split('\n')
+ return lines[2].split(': ')[1]
+
+def getSamtoolsVers():
+ cmd = ['samtools']
+ process = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
+ lines = process.communicate()[1].decode("utf-8").split('\n')
+ return lines[2].split(': ')[1]
+
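+# Example (illustrative): called without arguments, bwa and samtools print a
+# usage text to stderr whose third line looks like 'Version: 0.7.12-r1039';
+# the parsers above split that line on ': ' and keep the version string.
+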
+def clean(args):
+ exts = ['amb', 'ann', 'bwt', 'fai', 'pac', 'sa']
+ for ref in args.ref:
+ for ext in exts:
+ if os.path.isfile(ref + '.' + ext):
+ os.remove(ref + '.' + ext)
+
+def find_reference(args):
+ #START
+ creationtime = time.strftime("%Y-%m-%d_%H-%M-%S")
+ starttime = time.time()
+
+ #consistency checks
+ if args.mode == 'paired' and len(args.fastq) % 2 != 0:
+ log.write(args.logger, 'debug', "EXCEPTION: in paired mode the number of fastq files must be even")
+ exit(1)
+
+ if args.p <= 0 or args.p > 1:
+        log.write(args.logger, 'debug', "EXCEPTION: read proportion must be between 0 (exclusive) and 1 (inclusive).")
+ exit(1)
+
+ #generate dir
+ if args.s or args.b:
+ storedir = 'REFFINDER_' + creationtime + '/'
+ os.mkdir(storedir)
+ else:
+ storedir = ''
+
+ #files to fastq file objects
+ fastqFileObjs = []
+ for fastqfile in args.fastq:
+ fastqfile.close()
+ fastqFileObjs.append(filing.fastqfile(fastqfile.name))
+
+ #generating reference index files
+ log.write(args.logger, 'debug', '')
+ log.write(args.logger, 'debug', "generate reference index files ...")
+ for ref in args.ref:
+ indexRef(ref, args)
+
+ #generating subset fastq files and mapping
+    fastq_mapped_reads = [] #mapped read counts, in fastq file order
+    ref_mapped_reads = [] #mapped read counts, in reference file order
+    total_reads = 0 #total reads across all subsets
+
+ if args.mode == 'paired':
+ datasets = [['forward read file', 'reverse read file', 'reads', 'subset reads']]
+ elif args.mode == 'single':
+ datasets = [['read file', 'reads', 'subset reads']]
+
+ current_task = 0
+
+ #processing paired files
+ if args.mode == 'paired':
+ total_tasks = int((len(args.fastq)/2) * len(args.ref))
+ i = 0
+ while i < len(args.fastq):
+ log.write(args.logger, 'debug', '')
+ log.write(args.logger, 'debug', 'processing paired files ...')
+ log.write(args.logger, 'debug', 'forward read file: {}'.format(args.fastq[i].name))
+ log.write(args.logger, 'debug', 'reverse read file: {}'.format(args.fastq[i+1].name))
+ pairedfastqFileObj = filing.pairedfastqfiles(fname = args.fastq[i].name, rname=args.fastq[i+1].name)
+ log.write(args.logger, 'debug', 'reads in each file: {}'.format(pairedfastqFileObj.getReadNumber()))
+ datasets.append([args.fastq[i].name, args.fastq[i+1].name, pairedfastqFileObj.getReadNumber()])
+
+ #generating subsets
+ log.write(args.logger, 'debug', 'generating read subsets ...')
+ mapped_reads = []
+ with pairedfastqFileObj.randomSubset(prop=args.p) as subsetObj:
+ log.write(args.logger, 'debug', 'reads in each subset file: {}'.format(subsetObj.getReadNumber()))
+ datasets[-1].append(subsetObj.getReadNumber())
+ total_reads += subsetObj.getReadNumber()
+ j = 0
+ for ref in args.ref:
+ current_task += 1
+ log.write(args.logger, 'debug', '')
+ log.write(args.logger, 'debug', 'START MAPPING TASK {} OF {}'.format(current_task, total_tasks))
+
+ #mapping
+ log.write(args.logger, 'debug', 'mapping to {} using {} {}'.format(ref, args.mapping, '...'))
+ samfile = subsetObj.forward.uniqueFilename(ext='sam')
+ mapping(ref, subsetObj.forward.fullname, subsetObj.reverse.fullname, samfile, args)
+
+ #convert to bam
+ log.write(args.logger, 'debug', '')
+ log.write(args.logger, 'debug', 'converting sam to bam ...')
+ bamfile = subsetObj.forward.uniqueFilename(ext='bam')
+ sam2bam(samfile, bamfile)
+
+                    #count reads mapped to this reference
+ mapped_reads.append(getMappedReads(bamfile))
+ if len(ref_mapped_reads) < j + 1:
+ ref_mapped_reads.append(0)
+ ref_mapped_reads[j] += mapped_reads[-1]
+
+ if args.b:
+ fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
+ reffilepath, reffilename = os.path.split(ref)
+ os.rename(bamfile, storedir + 'subset_' + fastqfilename + '__to__' + reffilename + '.bam')
+ else:
+ os.remove(bamfile)
+
+ log.write(args.logger, 'debug', '')
+ log.write(args.logger, 'debug', 'mapped reads: {}'.format(mapped_reads[-1]))
+ j += 1
+
+ if args.s:
+ fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
+ subsetObj.forward.renameFile('subset_' + fastqfilename, path=storedir)
+ fastqfilepath, fastqfilename = os.path.split(args.fastq[i+1].name)
+ subsetObj.reverse.renameFile('subset_' + fastqfilename, path=storedir)
+ else:
+ subsetObj.setTmp()
+
+ #make fastq_mapped_reads relative
+ k = 0
+ for reads in mapped_reads:
+ mapped_reads[k] = str(round(mapped_reads[k]/subsetObj.getReadNumber() * 100, 2)) + '%'
+ k += 1
+ fastq_mapped_reads.append(mapped_reads)
+ i += 2
+
+
+ #processing single files
+ if args.mode == 'single':
+ total_tasks = int(len(args.fastq) * len(args.ref))
+ i = 0
+ while i < len(args.fastq):
+ log.write(args.logger, 'debug', '')
+ log.write(args.logger, 'debug', 'processing fastq file {}'.format(args.fastq[i].name))
+ fastqFileObj = filing.fastqfile(name = args.fastq[i].name)
+ log.write(args.logger, 'debug', 'number of reads: {}'.format(fastqFileObj.getReadNumber()) )
+ datasets.append([args.fastq[i].name, fastqFileObj.getReadNumber()])
+
+ log.write(args.logger, 'debug', 'generating read subset ...')
+ mapped_reads = []
+ with fastqFileObj.randomSubset(prop=args.p) as subsetObj:
+ log.write(args.logger, 'debug', 'reads in subset file: {}'.format(subsetObj.getReadNumber()))
+ datasets[-1].append(subsetObj.getReadNumber())
+ j = 0
+ for ref in args.ref:
+ current_task += 1
+ log.write(args.logger, 'debug', '')
+ log.write(args.logger, 'debug', 'START MAPPING TASK {} OF {}'.format(current_task, total_tasks))
+
+ #mapping
+ log.write(args.logger, 'debug', 'mapping to {} using {}'.format(ref, args.mapping))
+ samfile = subsetObj.uniqueFilename(ext='sam')
+ mapping(ref, subsetObj.fullname, False, samfile, args)
+
+ #convert to bam
+ log.write(args.logger, 'debug', '')
+ log.write(args.logger, 'debug', 'converting sam to bam ...')
+ bamfile = subsetObj.uniqueFilename(ext='bam')
+ sam2bam(samfile, bamfile)
+
+                    #count reads mapped to this reference
+ mapped_reads.append(getMappedReads(bamfile))
+ if len(ref_mapped_reads) < j + 1:
+ ref_mapped_reads.append(0)
+ ref_mapped_reads[j] += mapped_reads[-1]
+
+ if args.b:
+ fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
+ reffilepath, reffilename = os.path.split(ref)
+ os.rename(bamfile, storedir + 'subset_' + fastqfilename + '__to__' + reffilename + '.bam')
+ else:
+ os.remove(bamfile)
+
+ log.write(args.logger, 'debug', '')
+ log.write(args.logger, 'debug', 'mapped reads: {}'.format(mapped_reads[-1]))
+ j += 1
+
+ if args.s:
+ fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
+ subsetObj.renameFile('subset_' + fastqfilename, path=storedir)
+ else:
+ subsetObj.setTmp()
+
+ #make fastq_mapped_reads relative
+ k = 0
+ for reads in mapped_reads:
+ mapped_reads[k] = str(round(mapped_reads[k]/subsetObj.getReadNumber() * 100, 2)) + '%'
+ k += 1
+ fastq_mapped_reads.append(mapped_reads)
+ i += 1
+
+ #make ref_mapped_reads relative
+ i = 0
+ for reads in ref_mapped_reads:
+ if total_reads == 0:
+ ref_mapped_reads[i] = '0%'
+ else:
+ ref_mapped_reads[i] = str(round(reads/total_reads * 100, 2)) + '%'
+ i += 1
+
+
+
+ #get reference charts
+ log.write(args.logger, 'debug', '')
+ log.write(args.logger, 'debug', 'prepare output ...')
+
+ refcharts = [['rank', 'reference file(s)', 'avg of mapped reads']]
+
+    total_datasets = len(datasets) - 1 #minus 1 since the first row is a header
+    order = sorted(set(ref_mapped_reads), key=lambda v: float(v[:-1]), reverse=True)
+ n = 0
+
+ for value in order:
+ n += 1
+ refkeys = [i for i, j in enumerate(ref_mapped_reads) if j == value]
+ refs = []
+ for key in refkeys:
+ refs.append(args.ref[key])
+ refcharts.append([str(n), ', '.join(refs), value])
+
+
+ #get rankmatrix
+ if args.mode == 'paired':
+ rankmatrix = [['forward read file', 'reverse read file']]
+ elif args.mode == 'single':
+ rankmatrix = [['read file']]
+
+ for i in range(1, len(args.ref) + 1):
+ rankmatrix[0].append('rank ' + str(i) + ' (mapped reads)')
+
+ i = 0
+ for mapped_reads in fastq_mapped_reads:
+ if args.mode == 'paired':
+ rankmatrix.append([args.fastq[i].name, args.fastq[i+1].name])
+ elif args.mode == 'single':
+ rankmatrix.append([args.fastq[i].name])
+
+        order = sorted(set(mapped_reads), key=lambda v: float(v[:-1]), reverse=True)
+ for value in order:
+ refkeys = [i for i, j in enumerate(mapped_reads) if j == value]
+ refs = []
+ for key in refkeys:
+ refs.append(args.ref[key])
+
+ rankmatrix[-1].append(', '.join(refs) + ' (' + str(value) + ')')
+
+ if args.mode == 'paired':
+ i += 2
+ elif args.mode == 'single':
+ i += 1
+
+ #get refmatrix
+ if args.mode == 'paired':
+ refmatrix = [['forward read file', 'reverse read file']]
+ elif args.mode == 'single':
+ refmatrix = [['read file']]
+
+ for ref in args.ref:
+ refmatrix[0].append(ref)
+
+ i = 0
+ for mapped_reads in fastq_mapped_reads:
+ if args.mode == 'paired':
+ refmatrix.append([args.fastq[i].name, args.fastq[i+1].name])
+ i += 2
+ elif args.mode == 'single':
+ refmatrix.append([args.fastq[i].name])
+ i += 1
+ refmatrix[-1].extend(mapped_reads)
+
+    #clean up reference index files unless -i was given
+    if not args.i:
+        clean(args)
+
+    #print best reference
+    maxvalue = max(set(ref_mapped_reads), key=lambda v: float(v[:-1]))
+    refkeys = [i for i, j in enumerate(ref_mapped_reads) if j == maxvalue]
+    log.write(args.logger, 'debug', 'best matching reference:')
+    log.write(args.logger, 'debug', args.ref[refkeys[0]])
+    return args.ref[refkeys[0]]
+
+if __name__ == '__main__':
+ find_reference(parse_arguments())
+
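
As a usage sketch (hypothetical file names; the FASTQ paths must exist since argparse opens them, and the logger argument defaults to None as in parse_arguments above):

    import reffinder

    # options are passed as one space-separated string, exactly as on the shell
    args = reffinder.parse_arguments(
        '--fastq sample_R1.fastq sample_R2.fastq '
        '--ref candidate1.fasta candidate2.fasta '
        '-p 0.1 --mode paired --mapping bwamem')
    best = reffinder.find_reference(args)
    print('best matching reference:', best)
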
diff --git a/report.tex b/report.tex
new file mode 100755
index 0000000..a641807
--- /dev/null
+++ b/report.tex
@@ -0,0 +1,135 @@
+\documentclass[a4paper]{article}
+\usepackage[document]{ragged2e}
+\usepackage{python}
+\usepackage{afterpage}
+\usepackage[english]{babel}
+\usepackage{graphicx}
+\usepackage[utf8]{inputenc}
+\usepackage{xcolor}
+\usepackage[T1]{fontenc}
+\renewcommand*\familydefault{\ttdefault} %% Only if the base font of the document is to be typewriter style
+\usepackage{subfigure}
+\usepackage{geometry}
+\usepackage{tcolorbox}
+\usepackage{adjustbox}
+\usepackage{setspace}
+\usepackage{float}
+\usepackage[obeyspaces]{url}
+\usepackage{grffile}
+
+\geometry{a4paper,left=25mm,right=20mm,
+ top=12mm,
+ bottom=20mm}
+
+\setlength\parindent{0pt}
+
+\tolerance=1
+\emergencystretch=\maxdimen
+\hyphenpenalty=1000
+\hbadness=1000
+
+\begin{document}
+
+{\bf {\LARGE{ {{pipeline.name}} } Version {{pipeline.version}} } }\\
+\line(1,0){ \textwidth }
+
+\begin{tabular}{lp{0.85\textwidth}}
+
+Executor: & {{pipeline.user}} \\
+Dataset: & {\bfseries{\verb|{{pipeline.sam_name}}|} }\\
+Started: & {{pipeline.start_time}}\\
+Finished: & {{pipeline.end_time}}\\
+ & \\
+Operating System & \\
+Server: & {{pipeline.server}}\\
+System: & {{pipeline.system}}\\
+Release: & {{pipeline.release}}\\
+%Version: & {{pipeline.operating_version}}\\
+Machine: & {{pipeline.machine}}\\
+ & \\
+ Tool versions & \\
+Python: & {{pipeline.python_version}}\\
+Mapper: & {{pipeline.mapper_version}}\\
+\end{tabular}\\
+
+%----------------- Workflow -------------------%
+\line(1,0){ \textwidth } \\
+\begin{tabular}{lp{0.85\textwidth}}
+Processed read(s): & \path{{"{"+pipeline.current_read1+"}"}}\\
+{%if pipeline.current_read2 is not none%} & \path{{"{"+pipeline.current_read2+"}"}}\\{%endif%}
+{%if pipeline.args.reference is defined and pipeline.args.reference is not none%} Reference file: & \path{{"{"+pipeline.args.reference+"}"}}\\{%endif%}
+\end{tabular}\\
+\line(1,0){ \textwidth } \\
+\begin{tabular}{lp{0.85\textwidth}}
+Call: & \path{{"{"+pipeline.call+"}"}}\\
+\end{tabular}\\
+\line(1,0){ \textwidth } \\
+
+%-------------------- Summary -------------------%
+\begin{tcolorbox}
+{\large{Read statistics} } \\
+\begin{tabular}{ll}
+Total reads: & {{metric.read_stats['total']}}\\
+Concordant reads: & {{metric.read_stats['concordant']}}\\
+Discordant reads: & {{metric.read_stats['discordant']}}\\
+Paired reads: & {{metric.read_stats['paired']}}\\
+Mapped reads: & {{metric.read_stats['mapped']}}\\
+Unmapped reads: & {{metric.read_stats['unmapped']}}\\
+\end{tabular}
+\end{tcolorbox}
+
+\vspace{5mm}
+
+{% for i in range(metric.reads_per_quality_result|length) %}
+\afterpage{
+\begin{tcolorbox}
+\begin{center}
+ {\Large Reference}\\
+ \textbf{ {{metric.reads_per_quality_result[i][1][:85]}} }\\
+\end{center}
+\end{tcolorbox}
+ \begin{figure}[H]
+ \centering
+ \begin{minipage}{0.5\textwidth}%
+ \begin{subfigure}%
+ {%if metric.coverage_graph_result[i] is defined and metric.coverage_graph_result[i][0] is not none%} {\includegraphics[width=\textwidth]{/{{metric.coverage_graph_result[i][0]}}}} {%endif%}
+ \end{subfigure}%
+ \end{minipage}%
+ \begin{minipage}{0.5\textwidth}%
+ \begin{subfigure}%
+ {%if metric.bases_per_coverage_result[i] is defined and metric.bases_per_coverage_result[i][0] is not none%} {\includegraphics[width=\textwidth]{/{{metric.bases_per_coverage_result[i][0]}}}} {%endif%}
+ \end{subfigure}%
+ \end{minipage}%
+ \end{figure}
+ \begin{figure}[H]
+ \centering
+ \begin{minipage}{0.5\textwidth}%
+ \begin{subfigure}%
+ {%if metric.reads_per_quality_result[i] is defined and metric.reads_per_quality_result[i][0] is not none%} {\includegraphics[width=\textwidth]{/{{metric.reads_per_quality_result[i][0]}}}} {%endif%}
+ \end{subfigure}%
+ \end{minipage}%
+ \begin{minipage}{0.5\textwidth}%
+ \begin{subfigure}%
+ {%if metric.mapq_graph_result[i] is defined and metric.mapq_graph_result[i][0] is not none%} {\includegraphics[width=\textwidth]{/{{metric.mapq_graph_result[i][0]}}}} {%endif%}
+ \end{subfigure}%
+ \end{minipage}%
+ \end{figure}
+ \begin{tcolorbox}
+ {\large{Coverage statistics} } \\
+ \begin{tabular}{ll}
+ {%if metric.cov_stats_result[i] is defined%}
+ Average: & {{metric.cov_stats_result[i]['mean']}}\\
+ Maximum: & {{metric.cov_stats_result[i]['max']}}\\
+ Minimum: & {{metric.cov_stats_result[i]['min']}}\\
+ Standard Deviation: & {{metric.cov_stats_result[i]['sd']}}\\
+ \#Bases 0 coverage: & {{metric.cov_stats_result[i]['zero']}}\\
+ \#Bases < mean+2*sd: & {{metric.cov_stats_result[i]['lower']}}\\
+ \#Bases > mean+2*sd: & {{metric.cov_stats_result[i]['greater']}}\\
+ {%endif%}
+ \end{tabular}
+ \end{tcolorbox}
+}
+\clearpage
+{% endfor %}
+
+\end{document}
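
The {{...}} and {%...%} markers above are Jinja2's default delimiters, so the report template can be filled in along these lines (a sketch, assuming jinja2 is installed and that `pipeline` and `metric` are objects exposing the attributes the template references, as metrics.py builds them):

    import jinja2

    env = jinja2.Environment(loader=jinja2.FileSystemLoader('.'))
    # pipeline and metric are assumed to be in scope (hypothetical objects)
    rendered = env.get_template('report.tex').render(pipeline=pipeline, metric=metric)
    with open('report_rendered.tex', 'w') as fh:
        fh.write(rendered)
    # then compile, e.g.: pdflatex report_rendered.tex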