[med-svn] [mapping-pipeline] 01/10: Imported Upstream version 1.2.0

Andreas Tille tille at debian.org
Tue Feb 21 12:56:43 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository mapping-pipeline.

commit 3ccb5c672659957aa1cc5760f668a1f51340bd59
Author: Andreas Tille <tille at debian.org>
Date:   Mon Dec 12 12:55:25 2016 +0100

    Imported Upstream version 1.2.0
---
 .gitignore         |   89 +++++
 LICENSE            |  165 ++++++++
 README.md          |  128 +++++++
 __init__.py        |    0
 custom_logger.py   |  157 ++++++++
 example.config     |    3 +
 example.json       |    8 +
 extract_genbank.py |   52 +++
 fux_filing.py      | 1079 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 mapper.py          |  520 +++++++++++++++++++++++++
 mapping.cwl        |   41 ++
 mapping.py         |  543 ++++++++++++++++++++++++++
 metrics.py         |  394 +++++++++++++++++++
 reffinder.py       |  387 +++++++++++++++++++
 report.tex         |  135 +++++++
 15 files changed, 3701 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..72364f9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,89 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
diff --git a/LICENSE b/LICENSE
new file mode 100755
index 0000000..65c5ca8
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,165 @@
+                   GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..3248df7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,128 @@
+# Mapping Pipeline
+
+## Dependencies
+
+This tool is implemented in Python and requires Python 3.4 or higher.
+
+Required software:
+* [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)
+* [BWA](http://bio-bwa.sourceforge.net/)
+* [LaTeX*](https://www.latex-project.org/)
+* [samtools](http://www.htslib.org/)
+* [Picard*](https://broadinstitute.github.io/picard/)
+
+At least one mapper (BWA or Bowtie2) is needed. <br>
+*LaTeX is needed to generate the output report. <br>
+*Picard is needed to merge all SAM files in a project.
+
+Required Python packages:
+* jinja2
+* matplotlib
+* scipy
+* numpy
+* pylab
+* pysam
+
+---
+
+## Usage
+
+```
+./mapping.py <mapper> <read1> <option(s)>
+```
+
+required input:
+* mapper: specify mapper to use {bowtie2, bwa-mem}
+* read1: either unpaired reads, the first mate of paired reads, a project folder, or a folder containing project folders. <br>
+    When given a folder, read names must contain a certain prefix to be recognized (see the example layout below). <br>
+        For paired reads: "r1\_P\_" and "r2\_P\_" for first and second mates, respectively. <br>
+        For unpaired reads: "r1\_UP\_" or "r2\_UP\_".
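+
+A project folder for a single paired-end sample could, for example, look like this (illustrative file names; only the prefixes matter):
+```
+project/
+    sample1/
+        r1_P_sample1.fastq
+        r2_P_sample1.fastq
+        r1_UP_sample1.fastq
+```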
+
+options:
+* --read2 \<read2\>: second mate of paired reads
+* --no-report: do not generate a report
+* --output \<out\>: set the output directory
+* --options "\<options\>": set options for the mapper
+* --reffinder_options="\<options\>": set options for reffinder
+* --reference \<reference\>: location of the reference genome in either FASTA or GenBank format
+* --join: join all SAM files within the project, needs --picard
+* --join_sample: join all SAM files per sample, needs --picard
+* --multi-fasta: split multi-FASTA files into single FASTA files for use with reffinder
+* --no-rg: do not set read groups
+* --unmapped \<path/to/folder/unmapped\>: write unmapped reads to a file ('.fastq' is appended automatically)
+* --samtools \<samtools_folder\>: location of the folder containing samtools
+* --picard \<picard_folder\>: location of the folder containing picard.jar
+* --bwa \<bwa_folder\>: location of the folder containing bwa
+* --java \<java\>: Java 1.8+ executable
+* --bowtie2 \<bowtie2_folder\>: location of the folder containing bowtie2
+* --config \<path/to/file\>: load parameters from a file
+
+config file:<br>
+You can also use a config file containing parameters.<br>
+example.config:
+```
+--samtools=/usr/bin/
+--bwa=/usr/bin/
+--bowtie2=/usr/bin/
+```
+
+Example usage:
+```
+./mapping.py <...> --config /home/example.config
+```
+
+
+
+###### GenBank file
+GenBank files may be acquired from [NCBI](http://www.ncbi.nlm.nih.gov/genbank/).  
+Make sure to include the sequence (*Customize view* -> *Display options* -> *Show sequence*) before downloading the file (*Send* -> *Complete record* -> *File* -> *GenBank (full)*).
+
+---
+
+## Included Files
+
+This pipeline consists of ten files:
+* mapping.py: main script for the Mapping Pipeline
+* mapper.py: manages different mappers
+* metrics.py: creates report
+* custom_logger.py: contains functions for writing log files
+* mapping.cwl: provides interface for use with cwl-runner
+* report.tex: template for report
+* extract_genbank.py: Originally written by Wojtek Dabrowski. Extracts the sequence from a GenBank file.
+* reffinder.py: Written by Stephan Fuchs. Provides best reference for mapping when given multiple references.
+* fux_filing.py: Written by Stephan Fuchs. Provides functions for reffinder.
+* example.config: Example config file.
+
+---
+
+## Example
+
+```
+./mapping.py bwa-mem --reference path/reference1.fasta --reference path/reference2.gb path/read1.fastq --read2 path/read2.fastq --output path --options "-M"
+./mapping.py bwa-mem path/read1.fastq --read2 path/read2.fastq --options "-x /index/bwa"
+./mapping.py bowtie2 path/to/project --reference path/to/reference --join --picard path/to/picard
+```
+
+---
+
+## Report
+
+The report includes general information about the run, e.g. start time, user and options.
+Furthermore, several metrics are computed:
+* total reads
+* concordant reads
+* discordant reads
+* paired reads
+* mapped reads
+* unmapped reads
+* coverage graph
+* bases per coverage graph
+* reads per mapping quality graph
+* mapping quality graph
+* average coverage
+* maximum coverage
+* minimum coverage
+* standard deviation of coverage
+* number of bases with coverage of 0
+* number of bases with coverage > mean+2*sd
+* number of bases with coverage < mean-2*sd
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/custom_logger.py b/custom_logger.py
new file mode 100755
index 0000000..7dd7acb
--- /dev/null
+++ b/custom_logger.py
@@ -0,0 +1,157 @@
+import logging
+import select
+import sys
+if sys.version_info < (3, 2):
+    import subprocess32 as sp
+else:
+    import subprocess as sp
+
+def call(popenargs, logger, stdout_log_level=logging.DEBUG,
+         stderr_log_level=logging.ERROR, **kwargs):
+    """
+    Variant of subprocess.call that accepts a logger instead of stdout/stderr
+    and logs the child's stdout/stderr output at the given log levels.
+    """
+    try:
+        child = sp.Popen(popenargs, stdout=sp.PIPE, stderr=sp.PIPE, **kwargs)
+
+        log_level = {
+            child.stdout: stdout_log_level,
+            child.stderr: stderr_log_level}
+
+        def check_io():
+            ready_to_read = select.select(
+                [child.stdout, child.stderr], [], [], 1000)[0]
+            for io in ready_to_read:
+                line = str(io.readline(), 'utf-8')
+                if line and logger is not None:
+                    logger.log(log_level[io], line[:-1])
+
+        # keep checking stdout/stderr until the child exits
+        while child.poll() is None:
+            check_io()
+
+        check_io()  # check again to catch anything after the process exits
+    except Exception as e:
+        raise e
+
+    if child.returncode != 0:
+        raise sp.CalledProcessError(child.returncode, popenargs, None)
+    else:
+        return child.wait()
+
+def check_output(popenargs, logger, stdout_log_level=logging.DEBUG,
+         stderr_log_level=logging.ERROR, **kwargs):
+    """
+    Variant of subprocess.check_output that accepts a logger instead of
+    stdout/stderr, logs the child's output at the given log levels, and
+    returns the captured output like subprocess.check_output.
+    """
+    
+    # a caller-supplied stderr argument is taken as a request to also
+    # include stderr lines in the returned output
+    log_stderr = 'stderr' in kwargs
+    kwargs.pop('stderr', None)
+
+    output = ''
+
+    try:
+        child = sp.Popen(popenargs, stdout=sp.PIPE, stderr=sp.PIPE, **kwargs)
+
+        log_level = {
+            child.stdout: stdout_log_level,
+            child.stderr: stderr_log_level}
+
+        def check_io():
+            ready_to_read = select.select(
+                [child.stdout, child.stderr], [], [], 1000)[0]
+            for io in ready_to_read:
+                line = str(io.readline(), 'utf-8')
+                if line and logger is not None:
+                    logger.log(log_level[io], line[:-1])
+                    if io is child.stdout:
+                        return line
+                    elif log_stderr:
+                        return line
+
+        # keep checking stdout/stderr until the child exits
+        while child.poll() is None:
+            s = check_io()
+            if s is not None:
+                output += s      
+        s = check_io()  # check again to catch anything after the process exits
+        if s is not None: 
+            output += s      
+
+    except Exception as e:
+        raise e
+
+    if child.returncode != 0:
+        raise sp.CalledProcessError(child.returncode, popenargs, None)
+    else:
+        child.wait()
+        return output
+
+def write(logger, mode, message):
+    """Write a message to the log.
+
+    Writes message to logger using log level mode.
+
+    Args:
+        logger: Open logger instance.
+        mode: Log level to write message at.
+        message: Message to write to the log.
+    """
+
+    if logger is not None:
+        if message is None:
+            return
+        if mode == 'error':
+            logger.error(message)
+        elif mode == 'critical':
+            logger.critical(message)
+        elif mode == 'debug':
+            logger.debug(message)
+        elif mode == 'warning':
+            logger.warning(message)
+        elif mode == 'info':
+            logger.info(message)
+        elif mode == 'exception':
+            logger.exception(message)
+        else:
+            logger.error(message)
+
+
+def print_write(logger, mode, pmessage, lmessage = None):
+    """Print a message to the screen and write another one to the log.
+
+    Prints pmessage to the screen and writes lmessage to logger using
+    log_level mode.
+
+    Args:
+        pmessage: Message to print to the screen.
+        lmessage: Message to write in the log.
+        logger: Open logger instance.
+        mode: Log level to write lmessage at.
+    """
+
+    print(pmessage)
+    if logger is not None:
+        if lmessage is None:
+            lmessage = pmessage
+        if mode == 'error':
+            logger.error(lmessage)
+        elif mode == 'critical':
+            logger.critical(lmessage)
+        elif mode == 'debug':
+            logger.debug(lmessage)
+        elif mode == 'warning':
+            logger.warning(lmessage)
+        elif mode == 'info':
+            logger.info(lmessage)
+        elif mode == 'exception':
+            logger.exception(lmessage)
+        else:
+            logger.error(lmessage)
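+
+
+# A minimal usage sketch (an illustrative addition, not upstream code):
+# route a shell command's output through a logger and capture its stdout.
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    log = logging.getLogger('custom_logger_demo')
+    call(['echo', 'hello'], log)                 # lines are logged at DEBUG
+    out = check_output(['echo', 'world'], log)   # logged and returned
+    print_write(log, 'info', 'demo finished: ' + out.strip())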
diff --git a/example.config b/example.config
new file mode 100644
index 0000000..b3d7106
--- /dev/null
+++ b/example.config
@@ -0,0 +1,3 @@
+--samtools=/usr/bin/
+--bwa=/usr/bin/
+--bowtie2=/usr/bin/
diff --git a/example.json b/example.json
new file mode 100755
index 0000000..cbb13a0
--- /dev/null
+++ b/example.json
@@ -0,0 +1,8 @@
+{
+    "mapper": "bwa-mem",
+    "reference": ["/path/to/reference", "/path/to/another/reference"],
+    "read1": "/path/to/read1",
+    "output": "/path/to/output/folder",
+    "options": "\"-M\"",
+    "noreport": true
+}
diff --git a/extract_genbank.py b/extract_genbank.py
new file mode 100755
index 0000000..0cccc4e
--- /dev/null
+++ b/extract_genbank.py
@@ -0,0 +1,52 @@
+#!/usr/bin/python3
+def extract_name(infile):
+  definition="UNKNOWN"
+  accession="UNKNOWN"
+  lines = 0
+  for line in infile:
+    if line.startswith("DEFINITION"):
+      definition = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
+    if line.startswith("ACCESSION"):
+      accession = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
+    lines += 1
+    if accession != "UNKNOWN" and definition != "UNKNOWN":
+      break
+    if lines > 4:
+      break
+  return accession + " " + definition
+
+def extract_sequence(infile):
+  res = []
+  # skip ahead to the ORIGIN line that precedes the sequence block
+  for line in infile:
+    if line.startswith("ORIGIN"):
+      break
+  # collect sequence chunks until the record terminator ("//")
+  for line in infile:
+    if line[0] == "/":
+      break
+    res += line.strip().split(" ")[1:]
+  return "".join(res)
+
+def write_sequence(outfile, seq, name, splitlen):
+  with open(outfile, "w") as f:
+    f.write(">" + name + "\n")
+    pos = 0
+    while pos < len(seq):
+      f.write(seq[pos:pos+splitlen])
+      pos += splitlen
+      f.write("\n")
+
+def main(infile, fastafile):
+  f = open(infile, "r")
+  linelength = 80
+  name = extract_name(f)
+  seq = extract_sequence(f)
+  f.close()
+  write_sequence(fastafile, seq, name, linelength)
+  return fastafile
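+
+
+# A minimal command-line entry point (an illustrative addition; upstream
+# imports this module from mapping.py rather than running it directly):
+if __name__ == "__main__":
+  import sys
+  if len(sys.argv) != 3:
+    sys.exit("usage: extract_genbank.py <genbank_file> <output_fasta>")
+  main(sys.argv[1], sys.argv[2])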
+
+
+
diff --git a/fux_filing.py b/fux_filing.py
new file mode 100755
index 0000000..e6338a6
--- /dev/null
+++ b/fux_filing.py
@@ -0,0 +1,1079 @@
+#!/usr/bin/env python3
+#copyright Stephan Fuchs (RKI, FG13, FuchsS at rki.de)
+import os
+import hashlib
+import uuid 
+import re
+import subprocess
+import random
+import sys
+import time
+
+#############################################################################################
+
+class file():
+	'''generic file object:
+	CHILDREN
+	fastafile			standard fasta file object
+	linfastafile		linearized fasta file object
+	fastqfile			standard fastq file object
+	pairedfastqfiles	combines two standard fastq file objects containing paired reads
+	logfile				log file object (as I use it)
+	
+	PARAMETER
+	name:	filename including relative or absolute path [False]. If false, unique 
+			filename not existing is generated
+	tmp:	tmp status[False]. If true, file is handled as temporary file.
+			That means, file is deleted on exit
+	mode:	mode of file opening. If mode is false and file exists, mode r 
+			(reading from beginning) is set by default. If mode is false and file does 
+			not exists or name is false mode is set to w (create and write) by default.
+	descr:	file description [False]
+	
+	ATTRIBUTES:
+	descr:			file description [STRING]
+	self.name:		file name without path [STRING]
+	self.path:		absolute path to file [STRING]
+	self.fullname:	absolute path and filename [STRING]
+	self.handle:	file handle [FILE RESOURCE]
+	self.mode:		mode of file opening. If false, file is not open [STRING|False]
+	self._tmp:		temporary file [BOOL]
+
+	NON-MAGIC METHODS:
+	setFilelocation()	extracts name and path from given filename and sets self.name,
+						self.path, and self.fullname attributes	
+	openFile()			opens file handle with the given mode and sets self.mode attribute
+	reopenFile()		reopens file handle with the given mode. Modes w or w+ are replaced 
+						by a and a+, respectively, to avoid file truncation. Updates 
+						self.mode attribute.
+						Dependency: self.openFile()
+	closeFile()			closes file handle and sets self.mode to false
+	nextLine()			returns next line from file
+	changeMode()		changes file opening mode. By default cursor position is not stored 
+						but set to the file start and truncation of the file avoided. Updates 
+						self.mode attribute.
+						Dependency: self.openFile(), self.closeFile()
+	renameFile()		renames file on disk and keeps opening mode and state. Only modes 
+						w or w+ are replaced by a and a+, respectively, to avoid file 
+						truncation. By default the cursor position is not stored but set to the 
+						file start. If path is set the file is moved to the new path.
+						Updates self.name, self.path, and self.fullname attributes.
+						Dependency: self.reopenFile(), self.setFilelocation()
+	setTmp()			file is handled as temporary file. That means, file is deleted on exit.
+						Filename is extended by .tmp extension. Opening Mode and cursor 
+						position are restored. Modes w or w+ are replaced by a and a+, respectively, 
+						to avoid file truncation. Attributes self.name, self.path, self.fullname,
+						self.mode, self._tmp are updated.
+						Dependency: renameFile(), setFilelocation()
+	unsetTmp()			file handled as non-temporary file. If file extension is .tmp extension is
+						removed. Opening Mode and cursor position are restored. Modes w or w+ 
+						are replaced by a and a+, respectively, to avoid file truncation. 
+						Attributes self.name, self.path, self.fullname,	self.mode, self._tmp 
+						are updated.
+						Dependency: renameFile(), setFilelocation()			
+	checkFile()			[static] checks if file exists. Only parameter is file name including path. 
+						returns True or False.
+	uniqueFilename()	[static] returns a non-existent 8 chars long filename based on uuid4. 
+						If path parameter is not submitted, the actual path is selected.
+	fuseFiles()			[class] fuses files returns full name of the fused file (filename + path). 
+						Files are submitted as list. Name and Path of the fused file can 
+						be also submitted.
+	destroy()			destroys file object by closing file handle and in case of temporary
+						file deleting file from disk.
+						Dependency: self.closeFile()	
+	write()				write to file
+	flush()				write buffer to file
+	'''
+	
+	def __init__(self, name = False, tmp = False, mode = False, descr = False):
+		self._tmp = tmp
+		self.descr = descr
+
+		if name == False:
+			name = self.uniqueFilename()
+			if self._tmp:
+				name += '.tmp'
+				
+		self.setFilelocation(name)
+		
+		if mode:
+			self._mode = mode		
+		else:
+			if self.checkFile(self.fullname):
+				self._mode='r'
+			else:
+				self._mode='w'				
+		if self._mode:
+			self.openFile(self._mode)				
+		
+	def __enter__(self):
+		return self		
+			
+	def setFilelocation(self, fullname = False):
+		'''sets or updates name, path, full name attributes'''		
+		if fullname:
+			self.path, self.name = os.path.split(fullname)
+		self.path = os.path.abspath(self.path)
+		self.fullname = os.path.abspath(os.path.join(self.path, self.name))
+		
+	def openFile(self, mode):
+		'''opens file handle'''
+		self._mode = mode
+		self.handle = open(self.fullname, self._mode)	
+		
+	def reopenFile(self, mode):
+		'''reopens file handle. Modes w or w+ are replaced by a and a+, respectively, to avoid file truncation'''
+		self._mode = mode
+		if mode == 'w':
+			mode = 'a'
+		elif mode == 'w+':
+			mode = 'a+'
+		self.handle = open(self.fullname, mode)
+		
+	def closeFile(self):
+		'''closes file handle'''
+		if self.handle.closed == False:
+			self._mode = False
+			self.handle.close()
+
+	def nextLine(self):		
+		'''returns next line from file. Does not check if file is open for reading!'''
+		return self.handle.readline()
+		
+	def changeMode(self, mode, keepCursor = False, truncate = False):
+		'''
+		changes the mode the file is opened.
+		Parameter:	
+		mode		new mode
+		keepCursor	Pointer within the file is kept. If False (default) cursor is set to beginning of the file.
+		truncate	if False (default) truncating modes (w, w+) are replaced by appending modes (a, a+)
+		'''
+		cursor = 0
+		if truncate == False:
+			if mode == 'w':
+				mode = 'a'
+			elif mode == 'w+':
+				mode = 'a+'		
+		if self.handle.closed == False:
+			if keepCursor:
+				cursor = self.handle.tell()
+			self.closeFile()
+		if mode:
+			self.openFile(mode)
+			if cursor > 0:
+				self.handle.seek(cursor)			
+			
+	def renameFile(self, newfilename, path = False, keepCursor = False):
+		''' Renames a file. File is opened again if it was open (mode is restored). If keepCursor is True cursor is set to the last cursor position within the file.'''
+		if path == False:
+			path = self.path
+		cursor = 0
+		mode = self._mode
+		if self.handle.closed == False:
+			if keepCursor:
+				cursor = self.handle.tell()
+			self.closeFile()
+		newfullname = os.path.join(path, newfilename)		
+		if self.checkFile(self.fullname):
+			os.rename(self.fullname, newfullname)
+		self.setFilelocation(newfullname)		
+		if mode:
+			self.reopenFile(mode)
+		if cursor > 0:
+			self.handle.seek(cursor)				
+	
+	def setTmp(self):
+		'''Set fileObject tmp attribute True'''	
+		if self._tmp == False:
+			if self.name[-4:] != '.tmp':
+				newfilename = self.name + '.tmp'
+				self.renameFile(newfilename, keepCursor = True)
+				self.name = newfilename
+				self.setFilelocation()
+			self._tmp = True
+	
+	def unsetTmp(self):
+		'''Set fileObject tmp attribute False'''
+		if self._tmp == True:
+			if self.name[-4:] == '.tmp':
+				newfilename = self.name[:-4]
+				self.renameFile(newfilename, keepCursor = True)
+				self.name = newfilename
+				self.setFilelocation()				
+		self._tmp = False	
+	
+	def write(self, str):
+		'''write to file'''
+		self.handle.write(str)
+		
+	def flush(self):
+		'''write buffer to file'''
+		self.handle.flush()		
+		os.fsync(self.handle.fileno())
+	
+	@staticmethod  
+	def checkFile(fullname):
+		'''checks if file exists. Only parameter is file name including path. Returns True or False.'''		
+		return os.path.isfile(fullname)
+	
+	@staticmethod  		
+	def uniqueFilename(path=os.getcwd(), ext = ''):
+		'''provides a filename based on uuid not in use. If no path parameter is used, the current working dir is used. Returns proposed filename including path as string.'''
+		if ext != '':
+			ext = '.' + ext
+		while True:
+			fullname = os.path.join(path, str(uuid.uuid4())[:8]) + ext
+			if file.checkFile(fullname) == False:
+				break
+		return fullname		
+		
+	@classmethod    
+	def fuseFiles(cls, childs, name=False, path=os.getcwd()):
+		'''
+		Creates a fused file. Returns a full filename (including path) of fused file.
+		Parameter:
+		childs	list of file names (including paths) whose content should be fused
+		name	filename of fused file (including path). File will be overwritten! If false, a unique filename is created
+		path	path for fused file (default: current working dir).		
+		'''
+		if name == False:
+			fullname = cls.uniqueFilename(path)
+		else:
+			fullname = os.path.join(path, name)
+		
+		with open(fullname, "wb") as outfile:
+			for child in childs:
+				with open(child, "rb") as infile:
+					while True:
+						content = infile.read(100000000)
+						if not content:
+							break
+						outfile.write(content)
+				outfile.write(b'\n')
+		return fullname
+		
+	
+	def destroy(self):
+		if self.handle.closed == False:
+			self.closeFile()	
+		if self._tmp:
+			try:		
+				os.remove(self.fullname) 
+			except:
+				pass
+	
+	def __exit__(self, type, value, traceback):
+		self.destroy()
+
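+# Illustrative use of the generic file object as a context manager (an
+# example added for clarity, not upstream code): a temporary file is
+# created under a unique name and deleted again on exit.
+#
+#     with file(tmp=True, descr='scratch') as f:
+#         f.write('hello\n')
+#         f.flush()
+#         print(f.fullname)   # unique 8-char name with a .tmp extension
+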
+#############################################################################################
+			
+class fastafile(file):	
+	'''standard fasta file object
+	PARENTS
+	file	generic file object
+	
+	PARAMETER:
+	name:	filename including relative or absolute path [False]. If false, unique 
+			filename not existing is generated
+	tmp:	tmp status[False]. If true, file is handled as temporary file.
+			That means, file is deleted on exit
+	mode:	mode of file opening. If mode is false and file exists, mode r 
+			(reading from beginning) is set by default. If mode is false and file does 
+			not exists or name is false mode is set to w (create and write) by default.
+	descr:	file description [False]
+		
+	ATTRIBUTES:
+	self._number_header 		Number of fasta head lines in the file*
+	self._number_singleheader	Number of fasta single headers in the file*
+								Fasta head lines can contain multiple headers separated
+								by SOH (chr(1))*
+	self._number_comments		Number of comment lines in the file*
+	self.number_accessions		Number accessions in the file*
+	self._accessions			set of accessions in the file*
+	
+	* to avoid unnecessary (and sometimes time consuming) parsing, attributes are generated if
+	needed the first time	
+
+	NON-MAGIC METHODS:
+	getHeaderNumber()				returns number of fasta headers in the file
+	getSingleHeaderNumber()			returns number of single headers in the file
+	getCommentNumber()				returns number of comment lines in the file
+	screenFASTA()					calculates number of headers, single headers, and comments in the file.
+									For this opening mode is changed to r and cursor set to the beginning of
+									the file. Importantly, original opening	mode and cursor positions are
+									restored after actions. However, original modes w or w+ are replaced 
+									by a and a+, respectively, to avoid file truncation. Updates 
+									self._number_header, self._number_singleheader, and self._number_comments. 
+									No return value.
+									Dependency: self.changeMode(), self.isComment(), self.isHeader()
+	nextEntry()						Returns next entry of FASTA file as tuple (header, seq, comment). Cursor 
+									is set immediately before the following entry
+									Dependency: self.isComment(), self.isHeader
+	isHeader()						returns true if submitted string is a valid fasta header, otherwise false.
+	isComment()						returns true if submitted string is a fasta comment, otherwise false.
+	makeNR()						fuses fasta entries with the same full-length sequence to one entry with
+									a multiple header. Returns a fasta file object pointing to the generated NR 
+									(non-redundant) file.
+									Dependency: fastafile(), linfastafile(), self.linearize(), self.changeMode()
+	linearize()						generates a linearized representation of a fasta file. Each line represents an
+									entry organized in columns: header, seq, comment, seqlen (optionally)
+									Separator of columns is \t by default and masked in headers and comments using
+									space by default. Returns a non-temporary linfastafile object.
+									Dependency: self.changeMode()
+	getSeqHash()					[static] returns the MD5 hash (hex digest) of the UTF-8 encoded sequence (string)
+	getAccessions()					screens file for accessions of type: accession_prefix|accession_number
+									Accession prefixes have to be submitted as a list. For this the opening mode is changed 
+									to r and cursor set to the beginning of	the file. Importantly, original opening	mode 
+									and cursor positions are restored after actions. However, original modes w or w+ 
+									are replaced by a and a+, respectively, to avoid file truncation. Updates 
+									self.number_accessions, self._accessions
+									No return value.
+	'''
+	
+	def __init__(self, name = False, tmp = False, mode = False, descr = False):		
+		file.__init__(self, name = name, tmp = tmp, mode = mode, descr = descr)
+		self._number_header = False
+		self._number_singleheader = False
+		self._number_comments = False	
+		self.number_accessions = False
+		self._accessions = False	
+
+	def getHeaderNumber(self):
+		if self._number_header == False:
+			self.screenFASTA()
+		return self._number_header
+		
+	def getSingleHeaderNumber(self):
+		if self._number_singleheader == False:
+			self.screenFASTA()
+		return self._number_singleheader		
+		
+	def getCommentNumber(self):
+		if self._number_comments == False:
+			self.screenFASTA()
+		return self._number_comments	
+	
+	def screenFASTA(self):
+		'''provides number of entries and single header as attributes'''
+		prevmode = self._mode
+		cursor = self.handle.tell()
+		self.changeMode('r')
+		self._number_header = 0
+		self._number_singleheader = 0
+		self._number_comments = 0
+		for line in self.handle:
+			if self.isHeader(line):
+				self._number_header += 1
+				self._number_singleheader += 1 + line.count(chr(1))
+			elif self.isComment(line):
+				self._number_comments += 1	
+		self.changeMode(prevmode)				
+		self.handle.seek(cursor)	
+		
+	def nextEntry(self):
+		'''Returns next entry of FASTA file as tuple (header, seq, comment). File pointer is set immediately before the following entry'''
+		e = 0
+		seq = []
+		comment=[]
+		header = False
+		while e < 2:
+			cursor = self.handle.tell()			
+			#read line
+			line = self.nextLine()
+			if not line:
+				break
+			line = line.strip()			
+			#header detection
+			if self.isHeader(line):
+				e += 1
+				# entry start
+				if e == 1:
+					header = line
+				#entry end
+				else:
+					self.handle.seek(cursor)
+					break
+			elif self.isComment(line):
+				comment.append(line)
+			else:
+				seq.append(line)		
+		if header == False:
+			return False
+		else:
+			return (header, ''.join(seq), ' '.join(comment))
+			
+	def isHeader(self, str):
+		#str = str.lstrip()
+		if len(str) > 0 and str[0] == '>':
+			return True
+		else:
+			return False
+			
+	def isComment(self, str):
+		#str = str.lstrip()
+		if len(str) > 0 and str[0] == ';':
+			return True
+		else:
+			return False		
+			
+	def makeNR(self, name=False, path=os.getcwd(), format = 'fasta'):
+		#linearize and add seqlen + seqhash
+		with self.linearize(addSeqLen=True) as linFile:
+			linFile.setTmp()
+			#sort by length and seq
+			with linfastafile(tmp = True, mode = False) as sortedFile:					
+				cmd = ['sort', '-k', '4,2', '-t', '\t', '-o', sortedFile.fullname, linFile.fullname]
+				subprocess.check_call(cmd)
+				
+				#compare seqs in sorted file and make NR fasta
+				if format == 'fasta':
+					nrfile = fastafile()
+				else:
+					nrfile = linfastafile()		
+				
+				sortedFile.changeMode('r')
+				length = -1
+				seq = ''
+				header = []
+				for line in sortedFile.handle:
+					line = line.strip()
+					if len(line) > 0:
+						fields = line.split('\t')
+						if int(fields[3]) == length and fields[1] == seq:
+								header.extend(fields[0][1:].split(chr(1)))
+						else:
+							if len(header) > 0:
+								nrfile.handle.write(">" + chr(1).join(set(header)) + '\n' + seq + '\n')
+							header = fields[0][1:].split(chr(1))
+							seq = fields[1].strip()
+							length = int(fields[3])
+				nrfile.handle.write(">" + chr(1).join(set(header)) + '\n' + seq + '\n')
+		return nrfile	
+		
+
+	def linearize(self, name = False, sep = '\t', sepmask = ' ', addSeqLen = False):
+		'''
+		Saves a linearized copy of a FASTA file and returns a associated linfastafile Object.
+		Options:
+		sep			separator used between columns of the linearized file (by default: \t)
+		sepmask		replacement of naturally occuring separators (by default: space)
+		addSeqLen	Sequence length is added to file
+		Output:
+		linfastafile Object point to a text file with different columns
+		col1	header
+		col2	sequence
+		col3	comment
+		col4	sequence length (if addSeqLen = True)
+		'''
+		prevmode = self._mode
+		cursor = self.handle.tell()
+		self.changeMode('r')
+		out = []
+		i = 0
+		
+		with linfastafile(name, sep, sepmask, tmp=False) as linfile:
+			while True:
+				i += 1
+				entry = self.nextEntry()
+				if entry == False:
+					break			
+				entry=[x.replace(sep, sepmask) for x in list(entry)]
+				if addSeqLen:
+					entry.append(str(len(entry[1])))
+				out.append(sep.join(entry))		
+				if i >= 1000000:
+					linfile.handle.write('\n'.join(out) + '\n')
+					out = []
+					i = 0
+			linfile.handle.write('\n'.join(out) + '\n')
+			self.changeMode(prevmode)
+			self.handle.seek(cursor)
+			linfile.closeFile()
+		return linfile
+		
+	@staticmethod
+	def getSeqHash(seq):
+		'''returns MD5 hash of sequence '''
+		return hashlib.md5(seq.encode('utf-8')).hexdigest()		
+
+	
+	def getAccessions(self, actypes):
+		if self._accessions:
+			return self._accessions
+		self._accessions = set()	
+		prevmode = self._mode
+		cursor = self.handle.tell()
+		self.changeMode('r')
+		self.number_accessions = 0
+		regexp = r"(?:>|" + chr(1) + r"|\|)(?:" + "|".join(actypes) + r")\|[^|\r\n ]+"
+		pattern = re.compile(regexp)
+		for line in self.handle:
+			line = line.strip()
+			if self.isHeader(line):
+				matches = re.findall(pattern, line)
+				if matches:
+					self._accessions.update(matches)
+		self.number_accessions = len(self._accessions)	
+		self.changeMode(prevmode)				
+		self.handle.seek(cursor)		
+		
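+# Illustrative sketch (an example added for clarity, not upstream code):
+# iterate over the entries of a FASTA file.
+#
+#     with fastafile('refs.fasta') as fa:
+#         entry = fa.nextEntry()
+#         while entry != False:
+#             header, seq, comment = entry
+#             entry = fa.nextEntry()
+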
+#############################################################################################
+		
+class linfastafile(file):
+	'''linearized fasta file object
+	these files are separator-delimited representations of FASTA files useful for parsing
+	each line represents one entry
+		
+	PARENTS
+	file	generic file object
+	
+	PARAMETER:
+	name:		filename including relative or absolute path [False]. If false, unique 
+				filename not existing is generated
+	tmp:		tmp status[False]. If true, file is handled as temporary file.
+				That means, file is deleted on exit
+	mode:		mode of file opening. If mode is false and file exists, mode r 
+				(reading from beginning) is set by default. If mode is false and file does 
+				not exists or name is false mode is set to w (create and write) by default.
+	descr:		file description [False]
+	sep:		separator of columns [CHAR]
+	sepmask:	char used to mask separator in headers or comments [CHAR]. Must be different from sep
+	
+		
+	ATTRIBUTES:
+	self._number_header 		Number of fasta head lines in the file*
+	self._number_singleheader	Number of fasta single headers in the file*
+								Fasta head lines can contain multiple headers separated
+								by SOH (chr(1))*
+	self._number_comments		Number of comment lines in the file*
+	self._number_accessions		Number accessions in the file*
+	self._accessions			set of accessions in the file*
+	self.number_seq 			Number of sequences*
+	
+	* to avoid unnecessary (and sometimes time consuming) parsing, attributes are generated if
+	needed the first time	
+
+	NON-MAGIC METHODS:
+	initialize()		cursor is set to position 0
+						attributes self.number_header, self.number_seq, self.number_singleheader,
+						and self.number_comments are set to False
+	getHeaderNumber()				returns number of fasta headers in the file
+									Dependency: self.screenLinFASTA()
+	getSingleHeaderNumber()			returns number of single headers in the file
+									Dependency: self.screenLinFASTA()
+	getCommentNumber()				returns number of comments in the file
+									Dependency: self.screenLinFASTA()
+	getSeqNumber()					returns number of sequences in the file
+									Dependency: self.screenLinFASTA()
+	screenLinFASTA()				calculates number of headers, single headers, and comments in the file.
+									For this opening mode is changed to r and cursor set to the beginning of
+									the file. Importantly, original opening	mode and cursor positions are
+									restored after actions. However, original modes w or w+ are replaced 
+									by a and a+, respectively, to avoid file truncation. Updates 
+									self.number_header, self.number_singleheader, self.number_seq, and self.number_comments. 
+									No return value.
+									Dependency:  self.changeMode()
+	getAccessions()					screens file for accessions of type: accession_prefix|accession_number
+									Accession prefixes have to be submitted as a list. For this the opening mode is changed 
+									to r and cursor set to the beginning of	the file. Importantly, original opening	mode 
+									and cursor positions are restored after actions. However, original modes w or w+ 
+									are replaced by a and a+, respectively, to avoid file truncation. Updates 
+									self.number_accessions, self._accessions
+									No return value.	
+	'''
+	
+	def __init__(self, name = False,  sep = '\t', sepmask = ' ', tmp=False, mode = False, descr = False):	
+		file.__init__(self, name=name, tmp=tmp, mode=mode, descr = descr)
+		self._sep = sep
+		self._sepmask = sepmask
+		self.number_header = False
+		self.number_seq = False
+		self.number_singleheader = False
+		self.number_comments = False
+		self._accessions = False
+		self.number_accessions = False
+		
+		self.initialize()
+
+	def initialize(self):
+		'''resets the cursor to position 0 and the cached counts to False'''
+		self.handle.seek(0)
+		self.number_header = False
+		self.number_seq = False
+		self.number_singleheader = False
+		self.number_comments = False
+		
+	def getHeaderNumber(self):
+		if self.number_header == False:
+			self.screenLinFASTA()
+		return self.number_header
+		
+	def getSingleHeaderNumber(self):
+		if self.number_singleheader == False:
+			self.screenLinFASTA()
+		return self.number_singleheader
+		
+	def getCommentNumber(self):
+		if self.number_comments == False:
+			self.screenLinFASTA()
+		return self.number_comments
+
+	def getSeqNumber(self):
+		if self.number_seq == False:
+			self.screenLinFASTA()
+		return self.number_seq		
+	
+	def screenLinFASTA(self):
+		'''provides number of entries and single header as attributes'''
+		prevmode = self._mode
+		cursor = self.handle.tell()
+		self.changeMode('r')
+		self.number_header = 0
+		self.number_seq = 0
+		self.number_singleheader = 0
+		self.number_comments = 0
+		for line in self.handle:
+			if len(line) > 0:
+				fields = [x.strip() for x in line.split(self._sep)]
+				if len(fields[0]) > 0:
+					self.number_header += 1
+					self.number_singleheader += 1 + fields[0].count(chr(1))
+				if len(fields[1]) > 0:
+					self.number_seq += 1
+				if len(fields[2]) > 0:
+					self.number_comments += 1
+		self.changeMode(prevmode)				
+		self.handle.seek(cursor)			
+
+
+	def getAccessions(self, actypes):
+		if self._accessions:
+			return self._accessions
+		self._accessions = set()			
+		filepointer = self.handle.tell()		
+		self.handle.seek(0)
+		self.number_accessions = 0
+		regexp = r"(?:>|" + chr(1) + r"|\|)(?:" + "|".join(actypes) + r")\|[^|\r\n ]+"
+		pattern = re.compile(regexp)
+		for line in self.handle:
+			matches = re.findall(pattern, line.split(self._sep)[0])
+			if matches:
+				self._accessions.update(matches)
+		self.handle.seek(filepointer)		
+		self.number_accessions = len(self._accessions)
+		
+#############################################################################################
+		
+class fastqfile(file):
+	'''standard fastq file object
+	
+	PARENTS
+	file	generic file object
+	
+	PARAMETER:
+	name:		filename including relative or absolute path [False]. If false, unique 
+				filename not existing is generated
+	tmp:		tmp status[False]. If true, file is handled as temporary file.
+				That means, file is deleted on exit
+	mode:		mode of file opening. If mode is false and file exists, mode r 
+				(reading from beginning) is set by default. If mode is false and file does 
+				not exists or name is false mode is set to w (create and write) by default.
+	descr:		file description [False]
+	
+		
+	ATTRIBUTES:
+	self._number_reads	number of reads in the file
+	
+	* to avoid unnecessary (and sometimes time consuming) parsing, attributes are generated if
+	needed the first time	
+
+	NON-MAGIC METHODS:
+	initialize()					cursor is set to position 0
+									attribute self._number_reads is set to False
+	getReadNumber()					returns number of reads in the file
+									Dependency: self.screenFastq()
+	checkIdentifierLine()			returns true if given string starts with @
+	checkDescriptionLine()			returns true if given string starts with +
+	screenFastq()					calculates the number of reads in the file.
+									For this opening mode is changed to r and cursor set to the beginning of
+									the file. Importantly, original opening mode and cursor positions are
+									restored after actions. However, original modes w or w+ are replaced 
+									by a and a+, respectively, to avoid file truncation. Updates 
+									self._number_reads. 
+									No return value. Can exit and print Exception message if standard format is 
+									violated.
+									Dependency:  self.changeMode()
+	nextRead()						Returns next read of FASTQ file as tuple (identifier, sequence, description, quality scores). 
+									File pointer is set immediately before the following entry
+									Dependency:  self.nextLine()
+	randomSubset()					generates a fastq file with a random read subset. Proportion [0, 1] can be defined.
+									Returns a new fastq file object.
+									Dependency: self.nextRead(), self.getReadNumber()
+	'''
+	
+	def __init__(self, name = False, tmp=False, mode = False, descr = False):	
+		file.__init__(self, name=name, tmp=tmp, mode=mode, descr = descr)	
+		self.initialize()
+
+	def initialize(self):
+		'''resets the cursor to position 0 and the cached read count to False'''
+		self.handle.seek(0)
+		self._number_reads = False
+		
+	def getReadNumber(self):
+		'''returns number of reads stored in the fastq file'''
+		if self._number_reads == False:
+			self.screenFastq()
+		return self._number_reads
+		
+	def checkIdentifierLine(self, line):
+		if line[0] == "@":
+			return True
+		else:
+			return False
+			
+	def checkDescriptionLine(self, line):
+		if line[0] == "+":
+			return True
+		else:
+			return False				
+	
+	def screenFastq(self):
+		'''provides number of reads as attributes'''
+		prevmode = self._mode
+		cursor = self.handle.tell()
+		self.changeMode('r')
+		self._number_reads = 0
+		l = 0
+		while True:
+			identifier = self.handle.readline()
+			l += 1
+			if not identifier:
+				break
+			if identifier == "":
+				continue
+			if self.checkIdentifierLine(identifier) == False:
+				print("FORMAT EXCEPTION in line", l, ": identifier lines should start with @")
+				self._number_reads = False
+				exit(1)
+			self.nextLine()
+			l += 1
+			if self.checkDescriptionLine(self.nextLine()) == False:
+				print("FORMAT EXCEPTION in line", l , ": description lines should start with +")
+				self._number_reads = False
+				exit(1)			
+			l += 1
+			if self.nextLine() == "":
+				print("FORMAT EXCEPTION in line", l , ": empty score line")
+				self._number_reads = False
+				exit(1)			
+			l += 1			
+			self._number_reads += 1
+		self.changeMode(prevmode)				
+		self.handle.seek(cursor)
+
+	def nextRead(self):		
+		'''Returns next read of FASTQ file as tuple (identifier, sequence, description, quality scores). File pointer is set immediately before the following entry'''
+		line = self.nextLine()
+		if not line:
+			return False
+		if line.strip() == "":
+			return self.nextRead()
+		if self.checkIdentifierLine(line) == False:
+			print("FORMAT EXCEPTION: identifier lines should start with @")
+			exit(1)
+		identifier = line
+		
+		line = self.nextLine()
+		if line.strip() == '':
+			print("FORMAT EXCEPTION: no read sequence found")
+			exit(1)			
+		seq = line
+		
+		line = self.nextLine()
+		if self.checkDescriptionLine(line) == False:
+			print("FORMAT EXCEPTION: description lines should start with +")
+			exit(1)			
+		descr = line
+		
+		line = self.nextLine()
+		if line.strip() == '':
+			print("FORMAT EXCEPTION: no quality scores found")
+			exit(1)			
+		scores = line
+		
+		return (identifier, seq, descr, scores)
+		
+	def randomSubset(self, prop, tmp = False, selected = False):
+		'''generates a fastq file with a random read subset. Proportion [0, 1] can be defined. Returns a new fastq file object.'''
+		if selected == False:
+			prop = int(self.getReadNumber()*prop)
+			selected = random.sample(range(1, self.getReadNumber()), prop)
+		selected = set(selected)
+		
+		subsetfastqObj = fastqfile(tmp = tmp)
+		e = 1
+		while e <= self.getReadNumber():
+			readdata = self.nextRead()
+			if e in selected:
+				subsetfastqObj.handle.write(''.join(readdata))
+			e += 1
+		return subsetfastqObj
+		
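+# Illustrative sketch (an example added for clarity, not upstream code):
+# write a ~10% random subset of a fastq file.
+#
+#     fq = fastqfile('reads.fastq')
+#     subset = fq.randomSubset(0.1)
+#     subset.closeFile()
+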
+#############################################################################################	
+
+class pairedfastqfiles():
+	'''combines two standard fastq file objects containing paired reads
+	read numbers have to be the same in both files
+	
+	PARENTS
+	(none)	wraps two fastqfile objects rather than inheriting from file
+	
+	PARAMETER:
+	fname:		filename of forward read file including relative or absolute path [False]. If false, unique 
+				filename not existing is generated
+	rname:		filename of reverse read file including relative or absolute path [False]. If false, unique 
+				filename not existing is generated				
+	tmp:		tmp status[False]. If true, files are handled as temporary files.
+				That means, files are deleted on exit
+	mode:		mode of file opening. If mode is false and file exists, mode r 
+				(reading from beginning) is set by default. If mode is false and file does 
+				not exists or fname/rname is false mode is set to w (create and write) by default.
+	descr:		file description [False]
+	
+		
+	ATTRIBUTES:
+	self.forward		fastq file object of the forward read file
+	self.reverse		fastq file object of the reverse read file
+	self._number_reads	read numbers (same for both files)
+	
+	NON-MAGIC METHODS:
+	initialize()					compares read numbers of the paired files. An exception is raised if they differ.
+									Cursor is NOT set to position 0
+									Updates self._number_reads
+									Dependency: self.getReadNumber()
+	getReadNumber()					returns the number of reads in the files
+									Dependency: self.forward.getReadNumber(), self.reverse.getReadNumber()
+	randomSubset()					generates paired fastq files with random paired read subsets. Proportion [0, 1] can be defined.
+									Returns a new paired fastq file object.
+									Dependency: self.nextRead(), self.getReadNumber()
+	'''	
+	
+	def __init__(self, fname = False, rname = False, tmp=False, mode = False, descr = False):	
+		self.forward = fastqfile(name=fname, tmp=tmp, mode=mode, descr = descr)
+		self.reverse = fastqfile(name=rname, tmp=tmp, mode=mode, descr = descr)
+		self.initialize()
+	
+	def __enter__(self):
+		return self			
+
+	def initialize(self):
+		'''provides number of entries and single header as attributes'''
+		self.getReadNumber()
+		
+	def getReadNumber(self):
+		if self.forward.getReadNumber() != self.reverse.getReadNumber():
+			print('forward file:', self.forward.fullname, '(reads:', self.forward.getReadNumber(),')')
+			print('reverse file:', self.reverse.fullname, '(reads:', self.reverse.getReadNumber(),')')
+			raise ValueError('paired files differ in read numbers')
+		else:
+			self._number_reads = self.forward.getReadNumber()*2
+		return self._number_reads
+			
+	def randomSubset(self, prop, tmp = False):
+		'''generates paired fastq files with random read subsets. Proportion [0, 1] can be defined. Returns a new paired fastq file object.'''
+		count = int(self.forward.getReadNumber()*prop)
+		selected = random.sample(range(1, self.forward.getReadNumber()), count)
+		forward_subset = self.forward.randomSubset(prop, tmp = tmp, selected = selected)
+		forward_subset.closeFile()
+		reverse_subset = self.reverse.randomSubset(prop, tmp = tmp, selected = selected)
+		reverse_subset.closeFile()
+		subsetPairedfastqObj = pairedfastqfiles(fname = forward_subset.fullname, rname = reverse_subset.fullname, tmp = tmp)
+		return subsetPairedfastqObj		
+		
+	def setTmp(self):
+		'''Set fileObject tmp attribute True'''	
+		self.forward.setTmp()		
+		self.reverse.setTmp()
+	
+	def unsetTmp(self):
+		'''Set fileObject tmp attribute False'''
+		self.forward.unsetTmp()		
+		self.reverse.unsetTmp()		
+		
+	def __exit__(self, type, value, traceback):
+		self.forward.destroy()		
+		self.reverse.destroy()		
+			
+			
+	
+#############################################################################################	
+		
+class logfile(file):
+	'''log file object (as I use it)
+	
+	PARENTS
+	file	generic file object
+	
+	ATTRIBUTES
+	catlineChar		Char or string used as category underline [STRING]
+	catlineLen 		Length of the category underlining [INT]
+			
+	PARAMETER:
+	name:		filename including relative or absolute path [False]. If false, unique 
+				filename not existing is generated
+	tmp:		tmp status[False]. If true, file is handled as temporary file.
+				That means, file is deleted on exit
+	mode:		mode of file opening. If mode is false and the file exists, mode r 
+				(reading from beginning) is set by default. If mode is false and the file 
+				does not exist, or name is false, mode is set to w (create and write) by default.
+	descr:		file description [False]
+	
+		
+	ATTRIBUTES:
+	self.catlineLen		length of category underlining
+	self.catlineChar	char used for category underlining
+
+	NON-MAGIC METHODS:
+	initialize()					cursor is set to position 0
+									attribute self._number_reads is set to False
+									
+	writeAsCols()					create columns with individual width, text is left aligned, numbers 
+									right aligned. Formatted string is written to log file. Input is a 2-level 
+									list (1st level: lines, 2nd level: columns). Returns no value.
+	newCat()						writes a new category with a given Name to the log file incl. underlining 
+									(s. attributes)
+	lb()							writes a linebreak to the logfile. For compatibility reasons, Windows line 
+									breaks (\r\n) are used
+									Dependency: self.write()
+	addExecTime()					adds the execution time in hh:mm:ss to the log file. Seconds have to be given.
+									Dependency: self.write(), self.lb()
+	
+	'''
+	
+	def __init__(self, name = False, tmp=False, mode = False, descr = False):	
+		file.__init__(self, name=name, tmp=tmp, mode=mode, descr = descr)	
+		self.catlineLen = 30
+		self.catlineChar = "="
+		
+	def writeAsCols(self, input, sep=' ' * 5):
+		'''create columns with individual width
+		
+		PARAMETER
+		input		2-level list (1st level: lines, 2nd level: columns) [LIST]
+		sep 		separator of columns [STRING]
+		'''
+		#allowed signs in numbers
+		pattern = re.compile(r'([+-])?([0-9]+)([.,]?)([0-9]+)(%?)')
+    
+		#str conversion
+		lines = [[str(x) for x in line] for line in input] 
+    
+		#fill up rows to same number of columns (important for transposing)
+		maxlen = max([len(x) for x in lines])
+		[x.extend(['']*(maxlen-len(x))) for x in lines]    
+		
+		#find format parameters
+		width = [] #column width
+		align = [] #alignment type (number = r; strings = l)
+		prec = [] #number precision
+		for column in zip(*lines):
+			width.append(len(column[0]))
+			prec.append(0)
+			align.append('r')
+			for field in column[1:]:
+				if align[-1] ==  'l':
+					width[-1] = max(width[-1], len(field))
+				else:
+					match = pattern.match(field)
+					if match and match.group(0) == field:
+						if match.group(3) and match.group(4):
+							prec[-1] = max(prec[-1], len(match.group(4)))
+						
+						if prec[-1] > 0:
+							k = 1
+						else:
+							k = 0
+						width[-1] = max(width[-1], len(match.group(2)) + prec[-1] + k + len(match.group(4)))
+					else:
+						align[-1] = 'l'
+						prec[-1] = False
+						width[-1] = max(width[-1], len(field))
+				
+		#formatting
+		output = []
+		for line in lines:
+			f = 0
+			output.append([])
+			for field in line:
+				if align[f] == 'l':
+					output[-1].append(field + ' ' * (width[f] - len(field)))
+				elif align[f] == 'r':
+					match = pattern.match(field)
+					if not match or match.group(0) != field: 
+						output[-1].append(' ' * (width[f] - len(field)) + field)
+					else:
+						if match.group(5):
+							percent = match.group(5)
+						else:
+							percent = ''
+						length = len(percent)
+						
+						intpart = match.group(2)
+						if match.group(1):
+							intpart = match.group(1) + intpart
+						if match.group(3):
+							decpart = match.group(4) 
+						else:
+							intpart += match.group(4)
+							decpart = ''							
+						length += len(intpart)
+						
+						
+						if prec[f] > 0:
+							decpart += '0' * (prec[f] - len(match.group(4)))
+							length += len(decpart) + 1
+							formatted_number = ' ' * (width[f] - length) + intpart + '.' + decpart + percent
+						else:
+							formatted_number = ' ' * (width[f] - length) + intpart + percent
+						output[-1].append(formatted_number)		
+				f += 1
+		self.write('\r\n'.join([sep.join(x) for x in output]))
+		
+	def newCat(self, cat):			
+		if self.catlineLen < len(cat):
+			catline = self.catlineChar * len(cat)
+		else:
+			catline = self.catlineChar * self.catlineLen 
+		self.write('\r\n\r\n' + cat.upper() + '\r\n' + catline + '\r\n')
+
+	def lb(self):			
+		self.write('\r\n')	
+
+	def addExecTime(self, s):
+		self.lb()
+		self.lb()
+		self.write('--------------------------')
+		self.lb()
+		self.write('execution time (hh:mm:ss): ') 
+		self.write('{:02.0f}:{:02.0f}:{:02.0f}'.format(s//3600, s%3600//60, s%60))			
+
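+
+# Usage sketch (illustrative, not part of the upstream code): writeAsCols
+# right-aligns numeric columns and left-aligns text; addExecTime expects
+# seconds ('pipeline.log' is a hypothetical path):
+#
+#	lg = logfile(name='pipeline.log')
+#	lg.newCat('mapping statistics')
+#	lg.writeAsCols([['sample', 'reads', 'mapped'],
+#	                ['S1', '123456', '98.7%'],
+#	                ['S2', '9876', '87.65%']])
+#	lg.addExecTime(3725)   # written as 01:02:05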
+		
+		
+							
+						
+						
+						
+					
+					
diff --git a/mapper.py b/mapper.py
new file mode 100755
index 0000000..a34d68c
--- /dev/null
+++ b/mapper.py
@@ -0,0 +1,520 @@
+from abc import ABCMeta, abstractmethod
+import argparse
+import os
+import gzip
+import custom_logger as log
+import re
+import sys
+if sys.version_info < (3, 2):
+    import subprocess32 as sp
+else:
+    import subprocess as sp
+
+
+class Mapper:
+    """ 
+    This is the abstract class to use when implementing a new mapper. 
+    Try to minimize the output in the console, e.g. redirect the output of a call to /dev/null or save it somewhere.
+    Note that you may want to use os.path.expanduser(PATH) to expand e.g. '~' to the home directory.
+    """
+
+    __metaclass__ = ABCMeta
+
+    @abstractmethod
+    def __init__(self, query, reference, output, options=None, logger=None):
+        """ 
+        Input:
+            query: Path to the query file. 
+                Type: str for single-end reads, list of two str for paired-end reads.
+            reference: Path to the reference file. 
+                Type: str
+            output: Path to the output file without the file extension (e.g. '.sam').
+                Type: str
+            options: Contains all additional options.
+                Type: list of str, organized like sys.argv
+            logger: Optional log.
+                Type: logging.logger object
+
+        Accessed from outside:
+            statistics: String with mapper statistics.
+                Type: str
+            version: Version of the used mapper.
+                Type: str
+
+        Other:
+            self.options may contain a path where the index should be saved. Default: '~/tmp/'
+            e.g.:
+                index_location: Expected path to the index, i.e. where it should be saved. Default: '~/tmp/'
+                    Type: str
+                index: Real path to the index.
+                    Type: str
+
+            verbosity: May also be given in self.options.
+                        1: Report steps.
+                Type: int
+
+        You may add more initialization, e.g. detection of input file types (e.g. .fa or .fastq in Bowtie2).
+
+        Call self.report() at the end.
+        """
+
+        self.query = query
+        self.reference = os.path.expanduser(reference)
+        self.query_paired = len(query) == 2 #isinstance(query, list)
+        self.output = os.path.expanduser(output) # For example, without file extension
+        if self.query_paired:
+            self.query[0] = os.path.expanduser(self.query[0])
+            self.query[1] = os.path.expanduser(self.query[1])
+        else:
+            self.query = os.path.expanduser(self.query[0])
+        self.statistics = None
+        self.version = None
+        self.index = None
+        self.logger = logger
+        self.verbosity = 1
+        # parse options
+        self.report()
+    
+    @abstractmethod
+    def map(self):
+        """ Create index if non exists (self.index). Call mapper to map self.query to self.reference using additional options. 
+            Automatically detects single- or paired-end reads. 
+            Return the absolute path of the location of the generated SAM-file.
+            Return None if an error occurs.
+            If the console output contains statistics (e.g. in Bowtie2), these should be saved in self.statistics."""
+
+    @abstractmethod
+    def get_version(self):
+        """ Return the version. """
+        return self.version
+
+    @abstractmethod
+    def write_unmapped(self, un):
+        """ Write unaligned reads to un. """
+
+    @abstractmethod
+    def get_statistics(self):
+        """ Return the statistics. """
+        return self.statistics 
+    
+    @abstractmethod
+    def cleanup(self):
+        """ Remove the index. You may want to use 'find' instead of 'rm'."""
+
+    @abstractmethod
+    def report(self):
+        """ Write all meaningful variables (at least the one defined in __init__) and their respective values to the logger. """
+
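+# Minimal sketch of a new mapper (illustrative only; 'MyMapper' and the
+# 'mymapper' binary are hypothetical). Real implementations follow below
+# (Bowtie2, BWAMem):
+#
+#   class MyMapper(Mapper):
+#       def __init__(self, query, reference, output, options=None, logger=None):
+#           super().__init__(query, reference, output, options=options, logger=logger)
+#       def map(self):
+#           self.command = ['mymapper', '-r', self.reference, '-o', self.output]
+#           log.call(self.command, self.logger)
+#           return self.output
+#       def get_version(self):
+#           return self.version
+#       def write_unmapped(self, un):
+#           pass
+#       def get_statistics(self):
+#           return self.statistics
+#       def cleanup(self):
+#           pass
+#       def report(self):
+#           log.write(self.logger, 'warning', 'MyMapper: {}'.format(vars(self)))
+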
+class Bowtie2(Mapper):
+
+    def __init__(self, query, reference, output, tmp=None, options=None, logger=None, readgroups=True, unmapped=None, samtools=None, bowtie2=None):
+        self.query = query
+        self.reference = os.path.expanduser(reference)
+        self.output = os.path.expanduser('{}{}'.format(output,'.sam'))
+        self.reference_fasta = reference.endswith('.fa') or reference.endswith('.mfa')
+        self.samtools = samtools
+        self.bowtie2 = bowtie2
+        self.query_paired = len(query) == 2 #isinstance(query, list)
+        if self.query_paired:
+            self.query[0] = os.path.expanduser(self.query[0])
+            self.query[1] = os.path.expanduser(self.query[1])
+        else:
+            self.query = os.path.expanduser(self.query[0])
+        self.query_fasta = query[0].endswith('.fa') or query[0].endswith('.mfa') #if self.query_paired else query.endswith('.fa') or query.endswith('.mfa')
+        self.statistics = None
+        self.version = None
+        self.index = None
+        self.verbosity = 1
+        self.logger = logger
+        self.readgroups = readgroups
+        self.options = options
+        self.unmapped = unmapped
+        if self.unmapped is not None:
+            if self.query_paired:
+                if self.options is not None:
+                    self.options += ' --un-conc {}_{}'.format(self.unmapped, os.path.basename(self.query[0]))
+                else:
+                    self.options = ' --un-conc {}_{}'.format(self.unmapped, os.path.basename(self.query[0]))
+            else:
+                if self.options is not None:
+                    self.options += ' --un {}_{}'.format(self.unmapped, os.path.basename(self.query))
+                else:
+                    self.options = ' --un {}_{}'.format(self.unmapped, os.path.basename(self.query))
+        if tmp is None:
+            self.index_location = os.path.expanduser('~/tmp/{}'.format('.'.join(os.path.basename(reference).split('.')[:-1])))
+        else:
+            self.index_location = os.path.join(tmp, '.'.join(os.path.basename(reference).split('.')[:-1]))
+        try:
+            os.mkdir(os.path.expanduser('~/tmp/'))
+        except OSError:
+            pass
+        self.report()
+        #TODO options may change index_location or verbosity
+    
+    def update(self, query, output, options):
+        self.query = query
+        self.output = os.path.expanduser('{}{}'.format(output,'.sam'))
+        self.query_paired = len(query) == 2 #isinstance(query, list)
+        if self.query_paired:
+            self.query[0] = os.path.expanduser(self.query[0])
+            self.query[1] = os.path.expanduser(self.query[1])
+            if self.unmapped is not None:
+                if options is not None:
+                    self.options = options + ' --un-conc {}_{}'.format(self.unmapped, os.path.basename(self.query[0]))
+                else:
+                    self.options = ' --un-conc {}_{}'.format(self.unmapped, os.path.basename(self.query[0]))
+        else:
+            self.query = os.path.expanduser(self.query[0])
+            if self.unmapped is not None:
+                if options is not None:
+                    self.options = options + ' --un {}_{}'.format(self.unmapped, os.path.basename(self.query))
+                else:
+                    self.options = ' --un {}_{}'.format(self.unmapped, os.path.basename(self.query))
+        self.query_fasta = query[0].endswith('.fa') or query[0].endswith('.mfa') #if self.query_paired else query.endswith('.fa') or query.endswith('.mfa')
+
+    def create_index(self):
+        if self.options is not None and '-x' in self.options:
+            self.index = None
+        else:
+            try:
+                if self.verbosity >= 1:
+                    log.print_write(self.logger, 'info', '=== Building index ===')
+                try:
+                    if self.reference == '':
+                        raise Exception('No reference given.')
+                except:
+                    log.print_write(self.logger, 'exception', 'ERROR: No reference given.')
+                    sys.exit(1)
+                self.command = [x for x in [os.path.realpath(os.path.join(self.bowtie2, 'bowtie2-build')), '-f' if self.reference_fasta else None, self.reference, self.index_location] if x is not None]
+                log.call(self.command, self.logger)
+            except sp.CalledProcessError:
+                log.print_write(self.logger, 'exception', 'ERROR: CPE in Bowtie2.create_index()')
+                return False
+            self.index = self.index_location
+
+    def map(self):
+        if self.index is None:
+            self.create_index()
+        try:
+            if self.readgroups:
+                identifier = ''
+                if (self.query[0] if self.query_paired else self.query).endswith('.gz'):
+                    with gzip.open(self.query[0] if self.query_paired else self.query, 'rt') as read:
+                        for line in read:
+                            if line[0] == '@':
+                                identifier = line[1:].split(' ')[0].strip()
+                                break
+                else:
+                    with open(self.query[0] if self.query_paired else self.query, 'r') as read:
+                        for line in read:
+                            if line[0] == '@':
+                                identifier = line[1:].split(' ')[0].strip()
+                                break
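+                # Illumina read names have the form
+                # <instrument>:<run>:<flowcell>:<lane>:<tile>:<x>:<y>;
+                # flowcell and lane are reused for the read-group ID below.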
+                try:
+                    match = re.match("([\w-]+):(\d+):([\w-]+):(\d+):(\d+):(\d+):(\d+)", identifier)
+                    instrument = match.group(1)
+                    run = match.group(2)
+                    flowcell = match.group(3)
+                    lane = match.group(4)
+                    tile = match.group(5)
+                    x_pos = match.group(6)
+                    y_pos = match.group(7)
+                except:
+                    log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set.')
+                    # Fall back to placeholder values so that building rg_id below cannot raise a NameError
+                    flowcell = lane = 'unknown'
+
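+                # The SM tag is parsed from QCumber-style file names, e.g.
+                # 'r1_P_SampleA_L001_R1_001.fastq.gz' -> SM 'SampleA' (an
+                # illustrative name); otherwise the name up to the first '.' is used.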
+                if self.query_paired:
+                    try:
+                        rg_sm = re.match("r\d_[UP]+_(.*)_L[\d]+_R[\d]+(_[\d]+){0,1}.*", os.path.basename(self.query[0])).group(1)
+                    except AttributeError:
+                        rg_sm = os.path.basename(self.query[0]).split('.')[0]
+                else:
+                    try:
+                        rg_sm = re.match("r\d_[UP]+_(.*)_L[\d]+_R[\d]+(_[\d]+){0,1}.*", os.path.basename(self.query)).group(1)
+                    except AttributeError:
+                        rg_sm = os.path.basename(self.query).split('.')[0]
+                rg_pl = 'ILLUMINA'
+                rg_id = '.'.join([flowcell, rg_sm, lane])
+                rg_pu = rg_id
+                rg_lb = 'lib1'
+            else:
+                rg_sm = None
+                rg_pl = None
+                rg_id = None
+                rg_pu = None
+                rg_lb = None
+
+            if self.verbosity >= 1:
+                log.print_write(self.logger, 'info', '=== Mapping query on reference ===')
+            self.command = [x for x in ([
+                    os.path.realpath(os.path.join(self.bowtie2, 'bowtie2')),
+                    '-f' if self.query_fasta else None,
+                    '-x' if self.index else None,
+                    self.index if self.index else None,
+                    '-1' if self.query_paired else None,
+                    self.query[0] if self.query_paired else None,
+                    '-2' if self.query_paired else None,
+                    self.query[1] if self.query_paired else None,
+                    '-U' if not self.query_paired else None, 
+                    self.query if not self.query_paired else None,
+                    '-S', self.output,
+                    '--rg-id' if self.readgroups else None, rg_id,
+                    '--rg' if self.readgroups else None, 'SM:{}'.format(rg_sm) if self.readgroups else None,
+                    '--rg' if self.readgroups else None, 'PU:{}'.format(rg_pu) if self.readgroups else None,
+                    '--rg' if self.readgroups else None, 'PL:{}'.format(rg_pl) if self.readgroups else None,
+                    '--rg' if self.readgroups else None, 'LB:{}'.format(rg_lb) if self.readgroups else None] 
+                    + (self.options.split(' ') if self.options else list())
+                    )
+                if x is not None] # TODO options
+            self.statistics = log.check_output(self.command, self.logger, stderr=sp.STDOUT)
+            return self.output
+        except sp.CalledProcessError:
+            log.print_write(self.logger, 'exception', 'ERROR: CPE in Bowtie2.map()')
+            return None
+
+    def set_version(self):
+        try:
+            self.command = [os.path.realpath(os.path.join(self.bowtie2, 'bowtie2')), '--version']
+            self.version = re.match('.*version (.*)\n', str(sp.check_output(self.command), 'utf-8')).group(1)
+        except sp.CalledProcessError:
+            log.write(self.logger, 'exception', 'ERROR: CPE in Bowtie2.set_version')
+            return False
+
+    def get_version(self):
+        if self.version is None:
+            self.set_version()
+        return self.version
+
+    def get_statistics(self):
+        return self.statistics 
+
+    def get_fast_options(self):
+        return '-k 10'
+
+    def get_sensitive_options(self):
+        return '-a'
+
+    def cleanup(self):
+        if self.index is not None:
+            if self.verbosity >= 1:
+                log.write(self.logger, 'info', '=== Cleaning up mapper ===')
+            try:
+                self.command = ('find {} -type f -regextype posix-extended '
+                        '-iregex \'.*{}\.(([0-9]+\.bt2)|(rev\.[0-9]+\.bt2)|fasta)$\' -delete'.format(os.path.dirname(self.index), os.path.basename(self.index)))
+                log.call(self.command, self.logger, shell=True)
+                self.index = None
+            except sp.CalledProcessError:
+                log.print_write(self.logger, 'exception', 'ERROR: CPE in Bowtie2.cleanup()')
+                return False
+
+    def report(self):
+        log.write(self.logger, 'warning', 'Bowtie2 Version: {}'.format(self.get_version()))
+        log.write(self.logger, 'warning', 'Parameters: ')
+        for arg in sorted(vars(self)):
+            if arg != 'command':
+                log.write(self.logger, 'warning', '{} = {}'.format(arg, getattr(self, arg)))
+
+class BWAMem:
+
+    def __init__(self, query, reference, output, tmp=None, options=None, logger=None, readgroups=True, unmapped=None, samtools=None, bwa=None):
+        self.query = query
+        self.reference = os.path.expanduser(reference)
+        self.query_paired = len(query) == 2 #isinstance(query, list)
+        self.output = os.path.expanduser('{}.sam'.format(output))
+        self.samtools = samtools
+        self.bwa = bwa
+        self.unmapped_raw = unmapped
+        self.unmapped = ''
+        if self.query_paired:
+            self.query[0] = os.path.expanduser(self.query[0])
+            self.query[1] = os.path.expanduser(self.query[1])
+            self.unmapped = '{}_{}'.format(self.unmapped_raw, os.path.splitext(os.path.basename(self.query[0]))[0])
+        else:
+            self.query = os.path.expanduser(self.query[0])
+            self.unmapped = '{}_{}'.format(self.unmapped_raw, os.path.splitext(os.path.basename(self.query))[0])
+        if self.unmapped_raw is None:
+            self.unmapped = None
+        self.statistics = None
+        self.version = None
+        self.index = None
+        self.options = options
+        self.index_given = '-x' in self.options if self.options is not None else False
+        self.logger = logger
+        self.verbosity = 1
+        self.readgroups = readgroups
+        if tmp is None:
+            self.index_location = os.path.expanduser('~/tmp/{}'.format('.'.join(os.path.basename(reference).split('.')[:-1])))
+        else:
+            self.index_location = os.path.join(tmp, '.'.join(os.path.basename(reference).split('.')[:-1]))
+        self.report()
+
+    def update(self, query, output, options):
+        self.query = query
+        self.output = os.path.expanduser('{}{}'.format(output,'.sam'))
+        self.query_paired = len(query) == 2 #isinstance(query, list)
+        if self.query_paired:
+            self.query[0] = os.path.expanduser(self.query[0])
+            self.query[1] = os.path.expanduser(self.query[1])
+            if self.unmapped_raw is not None:
+                self.unmapped = '{}_{}'.format(self.unmapped_raw, os.path.splitext(os.path.basename(self.query[0]))[0])
+        else:
+            self.query = os.path.expanduser(self.query[0])
+            if self.unmapped_raw is not None:
+                self.unmapped = '{}_{}'.format(self.unmapped_raw, os.path.splitext(os.path.basename(self.query))[0])
+        self.query_fasta = query[0].endswith('.fa') or query[0].endswith('.mfa') #if self.query_paired else query.endswith('.fa') or query.endswith('.mfa')
+    
+    def create_index(self):
+        if self.options is not None and '-x' in self.options:
+            cache = self.options.split(' ')
+            pos = cache.index('-x')
+            self.index_location = cache[pos + 1]
+            del cache[pos:pos+2]
+            self.options = ' '.join(cache)
+        else:
+            try:
+                if self.verbosity >= 1:
+                    log.print_write(self.logger, 'info', '=== Building index ===')
+                try:
+                    if self.reference == '':
+                        raise Exception('No reference given.')
+                except:
+                    log.print_write(self.logger, 'exception', 'ERROR: No reference given.')
+                    sys.exit(1)
+                self.command = [os.path.realpath(os.path.join(self.bwa, 'bwa')), 'index', '-p', self.index_location, self.reference]
+                log.call(self.command, self.logger)
+            except sp.CalledProcessError:
+                log.print_write(self.logger, 'exception', 'ERROR: CPE in BWAMem.create_index()')
+                return False
+        self.index = self.index_location
+
+    def map(self):
+        if self.index is None:
+            self.create_index()
+        try:
+            if self.readgroups:
+                identifier = ''
+                if (self.query[0] if self.query_paired else self.query).endswith('.gz'):
+                    with gzip.open(self.query[0] if self.query_paired else self.query, 'rt') as read:
+                        for line in read:
+                            if line[0] == '@':
+                                identifier = line[1:].split(' ')[0].strip()
+                                break
+                else:
+                    with open(self.query[0] if self.query_paired else self.query, 'r') as read:
+                        for line in read:
+                            if line[0] == '@':
+                                identifier = line[1:].split(' ')[0].strip()
+                                break
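+                # Illumina read names have the form
+                # <instrument>:<run>:<flowcell>:<lane>:<tile>:<x>:<y>;
+                # flowcell and lane are reused for the read-group ID below.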
+                try:
+                    match = re.match("([\w-]+):(\d+):([\w-]+):(\d+):(\d+):(\d+):(\d+)", identifier)
+                    instrument = match.group(1)
+                    run = match.group(2)
+                    flowcell = match.group(3)
+                    lane = match.group(4)
+                    tile = match.group(5)
+                    x_pos = match.group(6)
+                    y_pos = match.group(7)
+                except:
+                    log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set.')
+                    # Fall back to placeholder values so that building rg_id below cannot raise a NameError
+                    flowcell = lane = 'unknown'
+
+                if self.query_paired:
+                    try:
+                        rg_sm = re.match("r\d_[UP]+_(.*)_L[\d]+_R[\d]+(_[\d]+){0,1}.*", os.path.basename(self.query[0])).group(1)
+                    except AttributeError:
+                        rg_sm = os.path.basename(self.query[0]).split('.')[0]
+                else:
+                    try:
+                        rg_sm = re.match("r\d_[UP]+_(.*)_L[\d]+_R[\d]+(_[\d]+){0,1}.*", os.path.basename(self.query)).group(1)
+                    except AttributeError:
+                        rg_sm = os.path.basename(self.query).split('.')[0]
+                rg_pl = 'ILLUMINA'
+                rg_id = '.'.join([flowcell, rg_sm, lane])
+                rg_pu = rg_id
+                rg_lb = 'lib1'
+            else:
+                rg_sm = None
+                rg_pl = None
+                rg_id = None
+                rg_pu = None
+                rg_lb = None
+
+            if self.verbosity >= 1:
+                log.print_write(self.logger, 'info', '=== Mapping query on reference ===')
+            self.command = ' '.join([x for x in [
+                os.path.realpath(os.path.join(self.bwa, 'bwa')),
+                'mem',
+                self.index,
+                self.query[0] if self.query_paired else None,
+                self.query[1] if self.query_paired else None,
+                self.query if not self.query_paired else None,
+                '-R' if self.readgroups else None, '\"@RG\\tID:{}\\tPU:{}\\tSM:{}\\tPL:{}\\tLB:{}\"'.format(rg_id, rg_pu, rg_sm, rg_pl, rg_lb) if self.readgroups else None,
+                self.options if self.options else None,
+                '>', self.output
+                ] if x is not None]) # TODO options
+            self.statistics = log.check_output(self.command, self.logger, shell=True, stderr=sp.STDOUT)
+            if self.unmapped is not None:
+                self.command = '{samtools} view -f4 {output} | {samtools} view -Sb - | {samtools} fastq - -1 {unmapped}.1.fastq -2 {unmapped}.2.fastq -s {unmapped}.singleton.fastq -0 {unmapped}.unpaired.fastq'.format(output=self.output, unmapped=self.unmapped, samtools=os.path.realpath(os.path.join(self.samtools, 'samtools')))
+                log.call(self.command, self.logger, shell=True)
+                try:
+                    if os.path.getsize('{}.1.fastq'.format(self.unmapped)) == 0:
+                        os.remove('{}.1.fastq'.format(self.unmapped))
+                    if os.path.getsize('{}.2.fastq'.format(self.unmapped)) == 0:
+                        os.remove('{}.2.fastq'.format(self.unmapped))
+                    if os.path.getsize('{}.singleton.fastq'.format(self.unmapped)) == 0:
+                        os.remove('{}.singleton.fastq'.format(self.unmapped))
+                    if os.path.getsize('{}.unpaired.fastq'.format(self.unmapped)) == 0:
+                        os.remove('{}.unpaired.fastq'.format(self.unmapped))
+                except:
+                    pass
+            return self.output
+        except sp.CalledProcessError:
+            log.print_write(self.logger, 'exception', 'ERROR: CPE in BWAMem.map()')
+            return None
+
+    def set_version(self):
+        try:
+            self.command = [os.path.realpath(os.path.join(self.bwa, 'bwa'))]
+            p = re.compile('.*Version: (.*?)\n', re.DOTALL)
+            self.version = re.match(p, str(sp.Popen(self.command,stdout=sp.PIPE,stderr=sp.STDOUT).communicate()[0], 'utf-8')).group(1)
+        except sp.CalledProcessError as e:
+            log.write(self.logger, 'exception', 'ERROR: CPE in BWAMem.set_version')
+            return False
+
+    def get_version(self):
+        if self.version is None:
+            self.set_version()
+        return self.version
+
+    def get_statistics(self):
+        return self.statistics 
+    
+    def cleanup(self):
+        if self.index is not None and not self.index_given:
+            if self.verbosity >= 1:
+                log.write(self.logger, 'info', '=== Cleaning up mapper ===')
+            try:
+                self.command = ('find {} -type f -regextype posix-extended '
+                        '-iregex \'.*{}\.(amb|ann|bwt|pac|sa|fasta)$\' -delete'.format(os.path.dirname(self.index), os.path.basename(self.index)))
+                log.call(self.command, self.logger, shell=True)
+                self.index = None
+            except sp.CalledProcessError:
+                log.print_write(self.logger, 'exception', 'ERROR: CPE in BWAMem.cleanup()')
+                return False
+    
+    def report(self):
+        log.write(self.logger, 'warning', 'BWA Version: {}'.format(self.get_version()))
+        log.write(self.logger, 'warning', 'Parameters: ')
+        for arg in sorted(vars(self)):
+            if arg != 'command':
+                log.write(self.logger, 'warning', '{} = {}'.format(arg, getattr(self, arg)))
+
+# add new mappers here
+def list_mapper():
+    return ('bowtie2', 'bwa-mem')
+
+# add new mappers here
+def get_mapper(mapper_key, query, reference, output, tmp=None, options=None, logger=None, readgroups=True, unmapped=None, samtools=None, bwa=None, bowtie2=None): 
+    if mapper_key == 'bowtie2':
+        return Bowtie2(query, reference, output, tmp=tmp, options=options, logger=logger, readgroups=readgroups, unmapped=unmapped, samtools=samtools, bowtie2=bowtie2)
+    if mapper_key == 'bwa-mem':
+        return BWAMem(query, reference, output, tmp=tmp, options=options, logger=logger, readgroups=readgroups, unmapped=unmapped, samtools=samtools, bwa=bwa)
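+
+# Usage sketch (illustrative; all paths and file names are assumptions): with
+# bowtie2 and samtools installed in /usr/bin, paired reads could be mapped like this:
+#
+#   m = get_mapper('bowtie2', ['r1.fastq', 'r2.fastq'], 'ref.fa', 'out',
+#                  samtools='/usr/bin', bowtie2='/usr/bin')
+#   sam_path = m.map()   # 'out.sam' on success
+#   m.cleanup()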
diff --git a/mapping.cwl b/mapping.cwl
new file mode 100755
index 0000000..42e1dde
--- /dev/null
+++ b/mapping.cwl
@@ -0,0 +1,41 @@
+#!/usr/bin/env cwl-runner
+class: CommandLineTool
+inputs:
+  - id: "#mapper"
+    type: Any
+    inputBinding:
+      position: 1
+  - id: "#read1"
+    type: Any
+    inputBinding:
+      position: 2
+  - id: "#reference"
+    type:
+      type: array
+      items: string
+      inputBinding:
+        prefix: --reference
+        separate: true
+  - id: "#read2"
+    type: ["null",Any]
+    inputBinding:
+      prefix: --read2
+  - id: "#noreport"
+    type: ["null",boolean]
+    default: false
+    inputBinding:
+      prefix: --no-report
+  - id: "#output"
+    type: Any
+    inputBinding:
+      prefix: --output
+  - id: "#options"
+    type: ["null",string]
+    inputBinding:
+      prefix: --options
+  - id: "#reffinder_options"
+    type: ["null",Any]
+    inputBinding:
+      prefix: --reffinder_options
+outputs: []
+baseCommand: ["./mapping.py"]
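+# Usage sketch (illustrative; assumes a CWL runner is available and that
+# inputs are passed by their ids):
+#   cwl-runner mapping.cwl --mapper bowtie2 --read1 r1.fastq \
+#     --reference ref.fa --output out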
diff --git a/mapping.py b/mapping.py
new file mode 100755
index 0000000..ef0448a
--- /dev/null
+++ b/mapping.py
@@ -0,0 +1,543 @@
+#!/usr/bin/env python3
+import argparse
+from collections import defaultdict
+import configparser
+import datetime
+import jinja2
+import logging
+import os
+import platform
+import pwd
+import re
+import socket
+import string
+import sys
+if sys.version_info < (3, 2):
+    import subprocess32 as sp
+else:
+    import subprocess as sp
+### 
+import custom_logger as log
+import extract_genbank
+import reffinder
+import mapper
+import metrics
+
+__author__ = "Enrico Seiler"
+__credits__ = ["Wojtek Dabrowski"]
+__license__ = "GPL"
+__version__ = "1.0.0"
+__maintainer__ = "Enrico Seiler"
+__email__ = "seilere at rki.de"
+__status__ = "Development"
+
+class MAPPING:
+    def __init__(self, logger=None):
+        self.name = 'Mapping Pipeline'
+        self.version = __version__
+        self.logger = logger
+        self.python_version = sys.version.split(' ')[0]
+        self._argument_parser()
+        self.args.output = os.path.realpath(self.args.output)
+        self.user = pwd.getpwuid(os.getuid()).pw_name
+        self.server = socket.getfqdn()
+        self.system = platform.system()
+        self.machine = platform.architecture()[0]
+        self.call = ' '.join(sys.argv)
+        if self.system == 'Linux':
+            self.linux_dist = platform.linux_distribution()[0]      # Ubuntu
+            self.linux_version = platform.linux_distribution()[1]   # 14.04
+            self.release = ' '.join([self.linux_dist, self.linux_version])
+        elif self.system == 'Windows':                      # Windows
+            self.windows_version = platform.win32_ver()[0]  # 7
+            self.windows_build = platform.win32_ver()[1]    # 6.1.7601
+            self.windows_csd = platform.win32_ver()[2]      # SP1
+            self.release = ' '.join([self.windows_version, self.windows_csd, self.windows_build])
+        elif self.system == 'Java':
+            # ...
+            pass
+        else:
+            self.mac_release = platform.mac_ver()[0]
+            self.mac_version = platform.mac_ver()[1][0]
+        self.metrics_results = dict()
+        self.mapper = None
+        self.sam_files = list()
+        self.delete = list()
+        self.tmp = os.path.join(os.path.expanduser('~/tmp/'), datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S_%f'))
+        try:
+            os.mkdir(os.path.expanduser('~/tmp/'))
+        except OSError:
+            pass
+        try:
+            os.mkdir(self.tmp)
+        except OSError:
+            pass
+        self.run()
+        if self.args.join:
+            self._join()
+        if self.args.join_sample:
+            self._join_sample()
+        self.run_metrics()
+        self.cleanup()
+
+    def run_metrics(self):
+        if not self.args.noreport:
+            for sam in self.sam_files:
+                self.sam = sam
+                self.metric = metrics.METRICS(sam, output=self.tmp, logger=self.logger, samtools=os.path.realpath(os.path.join(self.args.samtools, 'samtools')))
+                self.current_project = os.path.basename(self._sam_name(os.path.basename(sam))) #read1
+                self.metrics_results[self.current_project] = dict()
+                self.metric.read_stats()
+                self.metric.compute_metrics()
+                self.metrics_results[self.current_project] = self.metric.cov_stats.copy()
+                self.metrics_results[self.current_project].update(self.metric.read_stats)
+                self.end_time = self._timestr(d=datetime.datetime.now())
+                self.report()
+        
+    def report(self):
+        self.report_file = os.path.join(self.args.output, 'mapping_{}_{}.tex'.format(os.path.basename(self._sam_name(self.sam)), datetime.datetime.strftime(datetime.datetime.now(), '%d-%m-%y_%H-%M')))
+        template_dir = os.path.dirname(__file__) #'/usr/share/mapping-pipeline'
+        template_file = 'report.tex'
+        loader = jinja2.FileSystemLoader(template_dir)
+        env = jinja2.Environment(loader=loader)
+        template = env.get_template(template_file)
+        self.args.reference = self.args.reference[0]
+        pdf_latex = template.render(pipeline=self, metric=self.metric)
+
+        with open(self.report_file, 'w') as self.latex:
+            self.latex.write(pdf_latex)
+        
+        process = sp.Popen(['pdflatex', '-interaction=nonstopmode', '-output-directory={}'.format(os.path.dirname(self.report_file)), self.report_file], stdout = sp.PIPE, stderr = sp.PIPE)
+        for line in iter(process.stderr.readline, b''):
+            print(line)
+
+        process.communicate()
+        self.metric.cleanup()
+
+    def cleanup(self):
+        log.print_write(self.logger, 'info', '=== Cleaning up ===')
+        try:
+            self.command = ('find {} -type f -regextype posix-extended '
+                    '-iregex \'.*\.(aux|log|tex)$\' -delete'.format(os.path.realpath(self.args.output)))
+            log.call(self.command, self.logger, shell=True)
+            self.mapper.cleanup()
+        except sp.CalledProcessError:
+            log.print_write(self.logger, 'exception', 'ERROR: CPE in MAPPING.cleanup()')
+            return False
+        try:
+            self.command = 'rm -dfr {}'.format(self.tmp)
+            log.call(self.command, self.logger, shell=True)
+        except sp.CalledProcessError:
+            log.print_write(self.logger, 'exception', 'ERROR: CPE in MAPPING.cleanup()')
+            pass
+        try:
+            for delete in self.delete:
+                self.command = 'rm -f {}'.format(delete)
+                log.call(self.command, self.logger, shell=True)
+        except sp.CalledProcessError:
+            log.print_write(self.logger, 'exception', 'ERROR: CPE in MAPPING.cleanup()')
+            pass
+
+    def run_reffinder(self, mode, walk=None):
+        log.print_write(self.logger, 'info', '=== Running Reffinder ===')
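+        # mode 0: paired reads given on the command line; mode 1: single reads
+        # given on the command line; mode 2: paired reads found during a project
+        # folder walk; mode 3: single reads found during a project folder walk.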
+        if self.args.reffinder_options is None:
+            if mode == 0:
+                self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} {} --ref {} --tmp {}".format(self.args.read1, self.args.read2, ' '.join(self.args.reference), self.tmp), logger=self.logger))
+            elif mode == 1:
+                self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} --ref {} --mode single --tmp {}".format(self.args.read1, ' '.join(self.args.reference), self.tmp), logger=self.logger))
+            elif mode == 2:
+                self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} {} --ref {} --tmp {}".format(os.path.join(walk, self.r), os.path.join(walk, self.r2), ' '.join(self.args.reference), self.tmp), logger=self.logger))
+            else:
+                self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} --ref {} --mode single --tmp {}".format(os.path.join(walk, self.r), ' '.join(self.args.reference), self.tmp), logger=self.logger))
+        else:
+            if mode == 0:
+                self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} {} --ref {} {} --tmp {}".format(self.args.read1, self.args.read2, ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
+            elif mode == 1:
+                self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} --ref {} --mode single {} --tmp {}".format(self.args.read1, ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
+            elif mode == 2:
+                self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} {} --ref {} {} --tmp {}".format(os.path.join(walk, self.r), os.path.join(walk, self.r2), ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
+            else:
+                self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} --ref {} --mode single {} --tmp {}".format(os.path.join(walk, self.r), ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
+
+    def convert_to_accession(self):
+        pass
+        #pattern = re.compile('>gi\|\d*\|[^|]*\|([^|]*)\|(.*)')
+        #with open(self.args.reference[0], 'r+') as f:
+        #    d = f.readlines()
+        #    f.seek(0)
+        #    fl = d[0]
+        #    del(d[0])
+        #    if '>gi' in fl:
+        #        f.write('>{}\n'.format(' '.join(re.match(pattern, fl).groups())))
+        #        for l in d:
+        #            f.write(l)
+        #        f.write('\n')
+        #        f.truncate()
+
+    def run(self):
+        if self.args.reference is not None:
+            # Handle genbank
+            new_ref = []
+            for ref in self.args.reference:
+                if ref.endswith('.gb') or ref.endswith('.gbk'):
+                    new_ref.append(extract_genbank.main(ref, os.path.join(self.tmp, os.path.basename(''.join(ref.split('.')[:-1])+'.fasta'))))
+                else:
+                    new_ref.append(ref)
+            self.args.reference = new_ref
+            # if --multi-fasta is set, split refs
+            if self.args.multifasta:
+                print('=== Splitting multifasta ===')
+                self.split_multifasta()
+            #TODO RefFinder, if set to run, change self.args.reference to the returned reference.
+            self.find_ref = len(self.args.reference) >= 2
+            # Convert gi to accession
+            self.convert_to_accession()
+        else:
+            self.find_ref = False
+        # If only reads were given, run the pipeline on the reads
+        if not os.path.isdir(self.args.read1):
+            self.read_counter = 1
+            if self.find_ref:
+                if self.args.read2 is not None:
+                    self.run_reffinder(0)
+                else:
+                    self.run_reffinder(1)
+            elif self.args.reference:
+                self.args.reference = self.args.reference[0]
+            self._run_once(self.args.read1, self.args.read2)
+        # Else we have a folder containing project folders with reads. Run the pipeline for each project.
+        else:
+            self.read_counter = 1
+            [self.root, self.subdir, self.files] = next(os.walk(self.args.read1))
+            for self.sub in self.subdir:
+                self._run_all(next(os.walk(os.path.join(self.root, self.sub))))
+            self._run_all([self.root, self.subdir, self.files])
+    def _run_all(self, walk):
+        # Run pipeline for each pair of reads.
+        for self.r in [x for x in walk[2] if x.endswith('.fastq') or x.endswith('fastq.gz')]:
+            if self.r.startswith('r1_P'):
+                self.r2 = self.r[0] + '2' + self.r[2:]  #TODO regex?
+                if not os.path.isfile(self.r2):
+                    try:
+                        # QCumber output
+                        pattern = '(r)([1,2])(_P_.*_L\d\d\d_)(R)([1,2])(_\d\d\d\.fastq.*)'
+                        frags = re.match(pattern, self.r).groups()
+                        self.r2 = '{}{}{}{}{}{}'.format(frags[0], '2', frags[2], frags[3], '2', frags[5])
+                    except AttributeError:
+                        # Generic
+                        pattern = '(r)([1,2])(_P_.*)'
+                        frags = re.match(pattern, self.r).groups()
+                        self.r2 = '{}{}{}'.format(frags[0], '2', frags[2])
+                if self.args.reference is not None:
+                    self.cache_ref = self.args.reference
+                    if self.find_ref:
+                        self.run_reffinder(2, walk[0])
+                    else:
+                        self.args.reference = self.args.reference[0]
+                self._run_once(os.path.join(walk[0], self.r), read2=os.path.join(walk[0], self.r2))
+                if self.args.reference is not None:
+                    self.args.reference = self.cache_ref
+                self.read_counter += 1
+            elif self.r.startswith('r2_P'):
+                pass
+            else:
+                if self.args.reference is not None:
+                    self.cache_ref = self.args.reference
+                    if self.find_ref:
+                        self.run_reffinder(3, walk[0])
+                    else:
+                        self.args.reference = self.args.reference[0]
+                self._run_once(os.path.join(walk[0], self.r))
+                if self.args.reference is not None:
+                    self.args.reference = self.cache_ref
+                self.read_counter += 1
+
+    def _sam_name(self, s):
+        if s.startswith('r1_P_') or s.startswith('r2_P_'):
+            st = 'P_' + s[5:]
+        elif s.startswith('r1_UP_') or s.startswith('r2_UP_'):
+            st = 'UP_' + s[6:]
+        else:
+            st = s
+        try:
+            self.sam_name = re.match('(.*)(\.fastq\.gz|\.fastq)$', st).group(1)
+        except:
+            self.sam_name =  '.'.join(st.split('.')[:-1])
+        return os.path.join(self.args.output, self.sam_name)
+
+    def _join(self):
+        log.print_write(self.logger, 'info', '=== Joining SAM files ===')
+        command = '{java} -jar {jar} MergeSamFiles O={o} '.format(java=self.args.java, jar=os.path.join(self.args.picard, 'picard.jar'), o=os.path.join(self.args.output, 'joined.sam'))
+        for sam in self.sam_files:
+            command += 'I={}.sam '.format(sam)
+            self.delete.append(sam + '.sam')
+        log.call(command, self.logger, shell=True)
+        self.sam_files = [os.path.join(self.args.output, 'joined.sam')]
+
+    def _join_sample(self):
+        groups = defaultdict(set)
+        pattern = re.compile('^U{0,1}P_(.*)_L\d\d\d_R[1,2]_\d\d\d')
+        try:
+            for sam in self.sam_files:
+                sam = os.path.basename(sam)
+                sample = re.match(pattern, sam).group(1)
+                groups[sample].add(sam)
+        except:
+            pattern = re.compile('^U{0,1}P_(.*)')
+            for sam in self.sam_files:
+                sam = os.path.basename(sam)
+                sample = re.match(pattern, sam).group(1)
+                groups[sample].add(sam)
+        self.sam_files = list()
+        for (key, values) in groups.items():
+            command = '{java} -jar {jar} MergeSamFiles O={o} '.format(java=self.args.java, jar=os.path.join(self.args.picard, 'picard.jar'), o=os.path.join(self.args.output, key+'.sam'))
+            for sam in values:
+                command += 'I={}.sam '.format(os.path.join(self.args.output, sam))
+                self.delete.append(os.path.join(self.args.output, sam + '.sam'))
+            log.call(command, self.logger, shell=True)
+            self.sam_files.append(os.path.join(self.args.output, key+'.sam'))
+
+    def _run_once(self, read1, read2=None):
+        log.print_write(self.logger, 'info', '=== Processing read #{} ==='.format(self.read_counter))
+        log.write(self.logger, 'info', 'Read1 = {}'.format(read1))
+        log.write(self.logger, 'info', 'Read2 = {}'.format(read2))
+        self.start_time = self._timestr(d=datetime.datetime.now())
+        self.current_read1 = read1
+        self.current_read2 = read2
+        # Mapping
+        self.sam_files.append(self._sam_name(os.path.basename(read1)))
+        if self.mapper is None:
+            self.mapper = mapper.get_mapper(self.args.mapper, [x for x in [read1, read2] if x is not None], self.args.reference if self.args.reference else '', self._sam_name(os.path.basename(read1)), self.tmp, self.args.options, self.logger, not self.args.norg, os.path.realpath(self.args.unmapped) if self.args.unmapped is not None else None, self.args.samtools, self.args.bwa, self.args.bowtie2)
+        else:
+            self.mapper.update([x for x in [read1, read2] if x is not None], self._sam_name(os.path.basename(read1)), self.args.options)
+        self.sam_file = self.mapper.map()
+        self.mapper_version = self.args.mapper + ' ' + self.mapper.get_version()
+        self.sam_files.append(self._sam_name(self.sam_file))
+
+    def split_multifasta(self):
+        i = 0
+        f = None
+        files = list()
+        for ref in self.args.reference:
+            for line in open(ref, 'r'):
+                if '>' in line:
+                    if f is not None:
+                        f.close()
+                    f = open(os.path.join(self.tmp, '{}.fasta'.format(i)), 'w')
+                    files.append(os.path.join(self.tmp, '{}.fasta'.format(i)))
+                    i += 1
+                    f.write('{}'.format(line))
+                else:
+                    f.write('{}'.format(line))
+        if f is not None:
+            f.close()
+        self.args.reference = files
+
+    def _valid_picard(self, f):
+        if os.path.isfile(os.path.join(os.path.realpath(f), 'picard.jar')):
+            return f
+        else:
+            raise argparse.ArgumentTypeError('Cannot find picard.jar in folder {}'.format(os.path.realpath(f)))
+
+    def _valid_unmapped(self, f):
+        if not os.path.isdir(os.path.realpath(f)):
+            return f
+        else:
+            raise argparse.ArgumentTypeError('{} is a directory. Please specify a path containing the prefix for unmapped reads.'.format(os.path.realpath(f)))
+
+    def _valid_samtools(self, f):
+        if os.path.isfile(os.path.join(os.path.realpath(f), 'samtools')):
+            return f
+        else:
+            raise argparse.ArgumentTypeError('Cannot find samtools in folder {}'.format(os.path.realpath(f)))
+
+    def _valid_java(self, f):
+        try:
+            version_string = sp.check_output(f + " -version", shell=True, stderr=sp.STDOUT)
+        except:
+            raise argparse.ArgumentTypeError('It appears that {} is not a java executable (absolute or available in the PATH environment)'.format(f))
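+        # 'java -version' prints e.g. 'java version "1.8.0_131"'; the parsing
+        # below extracts '1.8.0' and compares the major.minor part as a float.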
+        try:
+            version = version_string.decode("ASCII").split("\n")[0].split('"')[1].split("_")[0]
+            vn = float(".".join(version.split(".")[:2]))
+        except:
+            raise argparse.ArgumentTypeError('It appears that {} is not a java executable (absolute or available in the PATH environment)'.format(f))
+        if vn < 1.8:
+            raise argparse.ArgumentTypeError('{} is java version {}, however, at least version 1.8 is required'.format(f, vn))
+        return f
+
+    def _valid_bwa(self, f):
+        if os.path.isfile(os.path.join(os.path.realpath(f), 'bwa')):
+            return f
+        else:
+            raise argparse.ArgumentTypeError('Cannot find bwa in folder {}'.format(os.path.realpath(f)))
+
+    def _valid_bowtie2(self, f):
+        if os.path.isfile(os.path.join(os.path.realpath(f), 'bowtie2')):
+            return f
+        else:
+            raise argparse.ArgumentTypeError('Cannot find bowtie2 in folder {}'.format(os.path.realpath(f)))
+
+    def _argument_parser(self):
+        # Parse any conf_file specification
+        # We make this parser with add_help=False so that
+        # it doesn't parse -h and print help.
+        conf_parser = argparse.ArgumentParser(
+            description=__doc__, # printed with -h/--help
+            # Don't mess with format of description
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            # Turn off help, so we print all options in response to -h
+            add_help=False
+        )
+        config = os.path.join(os.path.dirname(__file__), 'run.config')
+        if '--config' not in sys.argv and os.path.isfile(config):
+            sys.argv += ['--config', config]
+        conf_parser.add_argument("--config", help="Specify config file", metavar="FILE")
+        conf_args, remaining_argv = conf_parser.parse_known_args()
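+        # The config file is read line by line: 'option=argument' lines
+        # contribute '--option argument' pairs and bare lines contribute
+        # flags, e.g. (illustrative): '--output=/path/to/out' or '--no-report'.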
+        if conf_args.config:
+            with open(conf_args.config, 'r') as f:
+                for line in f:
+                    if '=' in line:
+                        [option, argument] = line.strip().split('=')
+                        if option not in sys.argv:
+                            remaining_argv += [option, argument]
+                    else:
+                        if line.strip() not in sys.argv:
+                            remaining_argv += [line.strip()]
+
+        self.parser = argparse.ArgumentParser(description='Mapping pipeline', add_help=False)
+        self.required = self.parser.add_argument_group('required arguments')
+        self.optional = self.parser.add_argument_group('optional arguments')
+        self.required.add_argument('mapper', type=self._is_mapper, help='Select a mapper. Supported: {}'.format(' '.join(mapper.list_mapper())))
+        self.required.add_argument('read1', type=str, help='Either unpaired reads, first mate of paired reads, project folder or folder containing project folders.')
+        self.optional.add_argument('--reference', dest='reference', type=str, help='Reference genome. FASTA or GenBank format. Parameter can be used multiple times.', action='append')
+        self.optional.add_argument('-h', '--help', action='help', help='Show this help message and exit')
+        self.optional.add_argument('--no-report', dest='noreport', action='store_true', default=False, help='Do not generate report.')
+        self.optional.add_argument('--read2', dest='read2', nargs='?', type=str, help='Second mate of paired reads.', default=None)
+        self.optional.add_argument('--output', dest='output', nargs='?', type=str, help='Output location. (Default: %(default)s)', default=os.getcwd())
+        self.optional.add_argument('--multi-fasta', dest='multifasta', action='store_true', default=False, help='If set, split multifasta files into single fasta files.')
+        self.optional.add_argument('--options', dest='options', nargs='?', type=str, help='Options for the mapper. Usage: --options "option(s)"')
+        self.optional.add_argument('--reffinder_options', dest='reffinder_options', nargs='?', type=str, help='Options for reffinder. Usage: --reffinder_options="option(s)"')
+        self.optional.add_argument('--join', dest='join', action='store_true', default=False, help='Join all created alignment files into one file.')
+        self.optional.add_argument('--join_sample', dest='join_sample', action='store_true', default=False, help='Join all created alignment files per sample into one file.')
+        self.optional.add_argument('--no-rg', dest='norg', action='store_true', default=False, help='Do not set read groups.')
+        self.optional.add_argument('--unmapped', dest='unmapped', nargs='?', default=None, type=self._valid_unmapped, help='Write unmapped reads to given file. Read name will be automatically added.')
+        self.optional.add_argument('--java', dest='java', nargs='?', default='java', type=self._valid_java, help='Java version 1.8+ executable.')
+        self.optional.add_argument('--samtools', dest='samtools', nargs='?', default='/usr/bin', type=self._valid_samtools, help='Path to folder containing samtools.')
+        self.optional.add_argument('--bwa', dest='bwa', nargs='?', default='/usr/bin', type=self._valid_bwa, help='Path to folder containing bwa.')
+        self.optional.add_argument('--bowtie2', dest='bowtie2', nargs='?', default='/usr/bin/', type=self._valid_bowtie2, help='Path to folder containing bowtie2.')
+        self.optional.add_argument('--picard', dest='picard', nargs='?', default='/usr/bin/', type=str, help='Path to folder containing picard.jar')
+        self.optional.add_argument('--config', help='Define config file to read parameters from. If run.config exists in the source directory, it is used if --config is not defined.')
+        self.args = self.parser.parse_args(remaining_argv)
+        if self.args.join_sample or self.args.join:
+            self.args.picard = self._valid_picard(os.path.expanduser(self.args.picard))
+        if self.args.options is not None:
+            if self.args.options[0] == self.args.options[-1] == '"':
+                self.args.options = self.args.options[1:-1] # workaround for cwl
+
+    def _is_mapper(self, m):
+        self.mapper_list = mapper.list_mapper()
+        if m in self.mapper_list:
+            return m
+        else:
+            raise argparse.ArgumentTypeError('Mapper {} is not supported. Available mappers are: {}'.format(m, ' '.join(self.mapper_list)))
+
+    def _timestr(self, ger=False, d=None):
+        def lz(i):
+            if i < 10:
+                return '0{}'.format(i)
+            else:
+                return i
+
+        weekdays = {
+                1: 'Monday',
+                2: 'Tuesday',
+                3: 'Wednesday',
+                4: 'Thursday',
+                5: 'Friday',
+                6: 'Saturday',
+                7: 'Sunday',
+                8: 'Montag',
+                9: 'Dienstag',
+                10: 'Mittwoch',
+                11: 'Donnerstag',
+                12: 'Freitag',
+                13: 'Samstag',
+                14: 'Sonntag'
+        }
+        tstr = '{} {}.{}.{} {}:{}:{}'
+        if d is None:
+            d = datetime.datetime.now()
+        if not ger:
+            tstr = tstr.format(weekdays[d.isoweekday()], lz(d.day), lz(d.month), d.year, lz(d.hour), lz(d.minute), lz(d.second))
+        else:
+            tstr = tstr.format(weekdays[d.isoweekday()+7], lz(d.day), lz(d.month), d.year, lz(d.hour), lz(d.minute), lz(d.second))
+        return tstr
+
+def timestr(ger=False, d=None):
+    """Return a timestamp like 'Monday 12.12.2016 09:05:00'; ger=True selects the German weekday name (stored at isoweekday()+7)."""
+    def lz(i):
+        if i < 10:
+            return '0{}'.format(i)
+        else:
+            return i
+
+    weekdays = {
+            1: 'Monday',
+            2: 'Tuesday',
+            3: 'Wednesday',
+            4: 'Thursday',
+            5: 'Friday',
+            6: 'Saturday',
+            7: 'Sunday',
+            8: 'Montag',
+            9: 'Dienstag',
+            10: 'Mittwoch',
+            11: 'Donnerstag',
+            12: 'Freitag',
+            13: 'Samstag',
+            14: 'Sonntag'
+    }
+    tstr = '{} {}.{}.{} {}:{}:{}'
+    if d is None:
+        d = datetime.datetime.now()
+    if not ger:
+        tstr = tstr.format(weekdays[d.isoweekday()], lz(d.day), lz(d.month), d.year, lz(d.hour), lz(d.minute), lz(d.second))
+    else:
+        tstr = tstr.format(weekdays[d.isoweekday()+7], lz(d.day), lz(d.month), d.year, lz(d.hour), lz(d.minute), lz(d.second))
+    return tstr
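+
+# Example (illustrative, assumed date):
+#   timestr(d=datetime.datetime(2016, 12, 12, 9, 5, 0))           -> 'Monday 12.12.2016 09:05:00'
+#   timestr(ger=True, d=datetime.datetime(2016, 12, 12, 9, 5, 0)) -> 'Montag 12.12.2016 09:05:00'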
+
+def valid_filename(s):
+    valid_chars = "-_.: %s%s" % (string.ascii_letters, string.digits)
+    filename = ''.join(c for c in s if c in valid_chars)
+    return filename.replace(' ','_').replace('.','-').replace(':','-')
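+
+# Sketch of the sanitisation above (assumed input): a timestamp such as
+# 'Monday 12.12.2016 09:05:00' becomes 'Monday_12-12-2016_09-05-00', the form
+# used for the log file name below.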
+
+if __name__ == '__main__':
+    if '-h' in sys.argv or '--help' in sys.argv:
+        m = MAPPING(None)
+    else:
+        out = None
+        try:
+            out = sys.argv[sys.argv.index('--output')+1]
+        except Exception:
+            pass
+        if out is None:
+            try:
+                cache = [y for x in sys.argv for y in x.split('=')]
+                out = cache[cache.index('--output')+1]
+            except Exception:
+                pass
+        if out is None:
+            out = ''
+        t0 = datetime.datetime.now()
+        log_file = os.path.join(out, ('{}_log.txt'.format(valid_filename(timestr(d=t0)))))
+        try:
+            os.mkdir(os.path.dirname(log_file))
+        except OSError:
+            pass
+        logging.basicConfig(format='%(levelname)s:%(message)s',filename=log_file,filemode='w',level=logging.DEBUG)
+        logger = logging.getLogger(log_file)
+        logger.info('Log is written to: {}'.format(os.path.abspath(log_file)))
+        logger.info('This pipeline was invoked by: {}'.format(pwd.getpwuid(os.getuid()).pw_name))
+        logger.info('This pipeline started at: {}'.format(timestr(d=t0)))
+        m = MAPPING(logger)
+        t1 = datetime.datetime.now()
+        logger.info('This pipeline finished at: {}'.format(timestr(d=t1)))
+        logger.info('Runtime: {}'.format(t1-t0))
+
+
diff --git a/metrics.py b/metrics.py
new file mode 100755
index 0000000..a9a9c8a
--- /dev/null
+++ b/metrics.py
@@ -0,0 +1,394 @@
+#!/usr/bin/env python
+
+import custom_logger as log
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+import os
+import pysam
+import pylab
+import itertools
+import string
+from scipy import stats
+from collections import Counter
+from collections import deque
+from itertools import islice
+import subprocess as sp
+
+def window(iterable, size=2, step=1, fillvalue=None):
+    """Slide a window of `size` elements over `iterable`, advancing `step` elements at a time (fillvalue is currently unused)."""
+    if size < 0 or step < 1:
+        raise ValueError
+    it = iter(iterable)
+    q = deque(islice(it, size), maxlen=size)
+    if not q:
+        return  # empty iterable or size == 0
+    while True:
+        yield q
+        # Advance the window; stop cleanly once the iterator is exhausted
+        # (letting StopIteration escape a generator is an error on Python 3.7+).
+        try:
+            for _ in range(step):
+                q.append(next(it))
+        except StopIteration:
+            return
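+
+# Usage sketch (illustrative): the same deque object is yielded on every
+# iteration, so copy it if a snapshot is needed, e.g.
+#   [list(w) for w in window([1, 2, 3, 4], size=2)]  ->  [[1, 2], [2, 3], [3, 4]]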
+
+def valid_filename(s):
+    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
+    filename = ''.join(c for c in s if c in valid_chars)
+    filename = filename.replace(' ','_')
+    if len(filename) > 200:
+        return filename[:200]
+    else:
+        return filename
+
+class SAM:
+    def __init__(self, sam_file_path, logger=None, samtools=None):
+        self.samtools = samtools
+        self.sam_file_path = sam_file_path
+        self.sam_file = None
+        self.logger = logger
+        #self.sam_alignments = list()
+        self._read_sam()
+        self.sam_flags = None
+    def _read_sam(self):
+        # Convert the SAM to a coordinate-sorted, indexed BAM so pysam can fetch by reference.
+        sp.check_call('{samtools} view -bS {sam_file} | {samtools} sort - -o {bam} 2> /dev/null'.format(samtools=self.samtools, sam_file=self.sam_file_path, bam=self.sam_file_path[:-3] + 'bam'), shell=True)
+        self.sam_file_path = self.sam_file_path[:-3] + 'bam'
+        sp.check_call('{samtools} index {sam_file} {index}'.format(samtools=self.samtools, sam_file=self.sam_file_path, index=self.sam_file_path[:-1] + 'i'), shell=True)
+        self.sam_file = pysam.AlignmentFile(self.sam_file_path, 'rb')
+        #for alignment in self.sam_file.fetch():
+        #    self.sam_alignments.append(alignment)
+        #TODO return new iterator?
+        #self.sam_alignments = self.sam_file.fetch()
+    def _read_flags(self):
+        self.sam_flags = list()
+        for alignment in self.get_alignments():
+            self.sam_flags.append((alignment.query_name, alignment.flag))
+    def get_alignments(self, reference=None, until_eof=False):
+        return self.sam_file.fetch(reference = reference, multiple_iterators = True, until_eof=until_eof)
+    def get_flags(self):
+        for alignment in self.get_alignments(until_eof=True):
+            yield alignment.flag
+    def get_sam_file(self):
+        return self.sam_file
+    def get_sam_file_path(self):
+        return self.sam_file_path
+    def get_reference_count(self):
+        return self.sam_file.nreferences
+    def get_reference_name(self, ref_id):
+        return self.sam_file.getrname(ref_id)
+    def get_lengths(self):
+        return self.sam_file.lengths
+    def get_references(self):
+        return self.sam_file.references
+    def get_reference_pos(self, reference):
+        return self.sam_file.references.index(reference)
+    def get_reference_length(self, reference):
+        return self.sam_file.lengths[self.get_reference_pos(reference)]
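+
+# Minimal usage sketch (paths are assumptions): SAM() converts, sorts and
+# indexes the input via samtools, then opens the resulting BAM with pysam:
+#   sam = SAM('mapping.sam', samtools='/usr/bin/samtools')
+#   for aln in sam.get_alignments(until_eof=True):
+#       print(aln.query_name, aln.mapping_quality)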
+
+class METRICS:
+    def __init__(self, sam_file_path, output=None, logger=None, samtools=None):
+        self.sam = SAM(sam_file_path, logger, samtools)
+        self.logger = logger
+        if output is not None:
+            self.output = os.path.realpath(output)
+        else:
+            self.output = os.path.realpath('./')
+        self.reference_coverage_count = None
+        self.reference_mapq_avg = None
+        self.reads_per_quality_result = []
+        self.bases_per_coverage_result = []
+        self.mapq_graph_result = []
+        self.coverage_graph_result = []
+        self.cov_stats_result = []
+        #self.read_stats_result = []
+
+    def reads_per_quality(self, reference, i):
+        """Generate one pdf for each reference showing the number of reads (y-axis) that map with a certain mapping quality (x-axis)."""
+        log.print_write(self.logger, 'info', '=== Reads per Quality #{} ==='.format(i))
+        # The SAM convention states that 255 is the highest value for the mapping quality.
+        reads_per_quality = np.zeros(256, np.int)
+        # Go through all alignments in the SAM file and increase the counter for the mapping quality in the respective reference.
+        for alignment in self.sam.get_alignments(reference=reference):
+            reads_per_quality[alignment.mapping_quality] += 1
+        # Trim all zeros at the end of the array.
+        rpq = np.trim_zeros(reads_per_quality, 'b')
+        # If there is something left to plot.
+        if rpq.size > 0:
+            # Plot the reads per quality as a bar chart into a pdf file.
+            with PdfPages(os.path.join(self.output, 'reads_per_quality_{}.pdf'.format(valid_filename(reference)))) as pdf:
+                self.reads_per_quality_result.append([os.path.realpath(os.path.join(self.output, 'reads_per_quality_{}.pdf'.format(valid_filename(reference)))), reference.replace('_', r'\_')])
+                fig, ax = plt.subplots( nrows=1, ncols=1 )  # create figure & 1 axis
+                ax.set_yscale('symlog')
+                ax.bar(range(rpq.size), rpq, align='center')
+                pylab.xlim([-0.5,rpq.size-0.5])
+                pylab.ylim(ymin=-0.05)
+                ax.set_xlabel('Mapping quality')
+                ax.set_ylabel('Read Count')
+                ax.set_title('Reads per Mapping Quality')
+                pdf.savefig()  # saves the current figure into a pdf page
+                plt.close()
+
+    def compute_metrics(self):
+        scored_references = list()
+        score_reference = np.zeros(self.sam.get_reference_count(), np.int)
+        for alignment in self.sam.get_alignments():
+            score_reference[alignment.reference_id] += 1
+        scored_references = [self.sam.get_references()[x] for x in np.argsort(score_reference)[::-1][:10]]
+        for i, reference in enumerate(scored_references):
+            if i == 10:
+                break
+            self.count_coverage(reference)
+            self.count_mapq(reference)
+            self.reads_per_quality(reference, i+1)
+            self.bases_per_coverage(reference, i+1)
+            self.coverage_graph(reference, i+1)
+            self.mapq_graph(reference, i+1)
+            self.coverage_stats(reference, i+1)
+
+    def count_coverage(self, reference):
+        """Count coverage for each position in each reference, i.e. reference_count[reference_id][position] = coverage."""
+        # We will count coverages for all references(self.sam.lengths). One row will contain at most the max(lengths references) entries.
+        self.reference_coverage_count = np.zeros(self.sam.get_reference_length(reference), np.int)
+        # Go through all alignments in the SAM file.
+        for alignment in self.sam.get_alignments(reference=reference):
+            # If the length is None, the query/read does not map anywhere.
+            if alignment.reference_length is not None:
+                # Increase the coverage by one for the positions in the reference that the read maps to.
+                self.reference_coverage_count[alignment.reference_start:alignment.reference_end] += 1
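+        # Interval sketch (toy values): a read with reference_start=2 and
+        # reference_end=5 increments positions 2, 3 and 4 only, matching pysam's
+        # half-open [reference_start, reference_end) convention.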
+
+    def count_mapq(self, reference):
+        """Append mapping quality for each position in each reference, i.e. reference_count[reference_id][position] = [mapqs]."""
+        # We will count coverages for all references(self.sam.lengths). One row will contain at most the max(lengths references) entries.
+        reference_mapq_sum = np.zeros(self.sam.get_reference_length(reference), np.float)
+        reference_mapq_count = np.zeros(self.sam.get_reference_length(reference), np.int)
+        # Go through all alignments in the SAM file.
+        for alignment in self.sam.get_alignments(reference=reference):
+            # If the length is None, the query/read does not map anywhere.
+            if alignment.reference_length is not None:
+                # Add the mapping quality to the running sum and bump the count for every covered position.
+                mapq = alignment.mapping_quality
+                reference_mapq_sum[alignment.reference_start:alignment.reference_end] += mapq
+                reference_mapq_count[alignment.reference_start:alignment.reference_end] += 1
+        with np.errstate(divide='ignore', invalid='ignore'):
+            self.reference_mapq_avg = np.true_divide(reference_mapq_sum, reference_mapq_count)
+            self.reference_mapq_avg = np.nan_to_num(self.reference_mapq_avg)
+            self.reference_mapq_avg[self.reference_mapq_avg == np.inf] = 0
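+        # Safe-division sketch (toy values, assumed): positions no read covers
+        # divide 0/0 -> nan, which nan_to_num() maps back to 0, e.g.
+        #   np.true_divide(np.array([10., 0.]), np.array([2, 0]))  ->  [5., nan]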
+
+    def _integer_mean(self, array):
+        """Calculate float mean of the values in array and return the rounded float as int."""
+        if len(array) == 0:
+            return 0
+        return int(round(np.sum(array)/len(array)))
+
+    def _set_axis(self, ax, factor):
+        """Manipulate the xlabel for a plot."""
+        # The new label should be the "compression" factor caused by the binning * the position.
+        def _factor_label(x, pos):
+            return str(int(round(x * factor)))
+        # Create a new formatter using _factor_label
+        formatter = matplotlib.ticker.FuncFormatter(_factor_label)
+        # Set the formatter
+        ax.xaxis.set_major_formatter(formatter)
+
+    def bases_per_coverage(self, reference, i):
+        """Generate one pdf for each reference showing the number of bases (y-axis) that have a certain coverage (x-axis)."""
+        log.print_write(self.logger, 'info', '=== Bases per Coverage #{} ==='.format(i))
+        # If we have not counted the coverages yet, count them now.
+        if self.reference_coverage_count is None:
+            self.count_coverage(reference)
+        # Trim coverages of 0 at the end.
+        reference_coverage = np.trim_zeros(self.reference_coverage_count, 'b')
+        # If there are some coverages left.
+        if reference_coverage.size > 0:
+            # Count how often a coverage appears. bincount[coverage]=#coverage
+            bases_per_coverage = np.bincount(reference_coverage)
+            # Bin the coverages into 1000 bins and calculate the mean for every bin.
+            binned_coverage = stats.binned_statistic(range(bases_per_coverage.size), bases_per_coverage, statistic = self._integer_mean, bins = 1000)[0]
+            # Trim again in case the maximal coverage is less than 1000.
+            binned_coverage = np.trim_zeros(binned_coverage, 'b')
+            # Get the number of binned coverages.
+            len_coverage = binned_coverage.size
+            # Plot the bases per coverage as a bar chart into a pdf file.
+            if len_coverage > 0:
+                with PdfPages(os.path.join(self.output, 'bases_per_coverage_{}.pdf'.format(valid_filename(reference)))) as pdf:
+                    self.bases_per_coverage_result.append([os.path.realpath(os.path.join(self.output, 'bases_per_coverage_{}.pdf'.format(valid_filename(reference)))), reference.replace('_', r'\_')])
+                    fig, ax = plt.subplots(nrows=1, ncols=1)
+                    # Change labels if there are more unbinned coverages than binned coverages, i.e. the binning influences the labels.
+                    if len_coverage <= bases_per_coverage.size:
+                        factor = float(bases_per_coverage.size)/len_coverage
+                        self._set_axis(ax, factor)
+                    ax.set_yscale('symlog')
+                    ax.bar(range(len_coverage), binned_coverage, align='center', color='b', linewidth=0, edgecolor='b')
+                    pylab.xlim([-0.005*len_coverage, len_coverage+0.005*len_coverage])
+                    ax.set_xlabel('Coverage')
+                    ax.set_ylabel('Base Count')
+                    ax.set_title('Bases per Coverage')
+                    pylab.ylim(ymin=-0.05)
+                    pdf.savefig()
+                    plt.close()
+        if len(self.bases_per_coverage_result) == 0:
+            self.bases_per_coverage_result.append(None)
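+    # binned_statistic sketch (toy data, assumed): reducing the four counts
+    # [5, 7, 1, 3] at coverages 0..3 to two bins yields the per-bin means:
+    #   stats.binned_statistic(range(4), [5, 7, 1, 3], statistic=np.mean, bins=2)[0]  ->  [6., 2.]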
+
+    def coverage_graph(self, reference, i):
+        """Generate one pdf for each reference showing the genome position (y-axis) an the respective coverage (x-axis). The genome is reduced/binned into 1000 points."""
+        log.print_write(self.logger, 'info', '=== Coverage Graph #{} ==='.format(i))
+        if self.reference_coverage_count is None:
+            self.count_coverage(reference)
+        # Trim coverages of 0 at the end.
+        reference_coverage = np.trim_zeros(self.reference_coverage_count, 'b')
+        # If there is something left of the reference.
+        if reference_coverage.size > 0:
+            # Use a sliding window to create ~1000 datapoints out of the reference. 
+            reference_coverage_mean = []
+            reference_coverage_max = []
+            reference_coverage_min = []
+            window_size = min(int(reference_coverage.size/10), 50000)
+            for w in window(reference_coverage, size=window_size, step=max(1, int((reference_coverage.size-window_size)/1000))):
+                reference_coverage_mean.append(int(np.mean(w)))
+                reference_coverage_max.append(max(w))
+                reference_coverage_min.append(min(w))
+            reference_coverage_mean = np.array(reference_coverage_mean)
+            reference_coverage_max = np.array(reference_coverage_max)
+            reference_coverage_min = np.array(reference_coverage_min)
+            # Get the number of binned reference.
+            len_reference = reference_coverage_mean.size
+            # Plot bases per coverage as line diagram.
+            if len_reference > 0:
+                with PdfPages(os.path.join(self.output, 'coverage_graph_{}.pdf'.format(valid_filename(reference)))) as pdf:
+                    self.coverage_graph_result.append([os.path.realpath(os.path.join(self.output, 'coverage_graph_{}.pdf'.format(valid_filename(reference)))), reference.replace('_', r'\_')])
+                    fig, ax = plt.subplots(nrows=1, ncols=1)
+                    # Change labels if there are more unbinned entries than binned entries, i.e. the binning influences the labels.
+                    if len_reference <= reference_coverage.size:
+                        factor = float(reference_coverage.size)/len_reference
+                        self._set_axis(ax, factor)
+                    ax.set_yscale('symlog')
+                    ax.plot(range(len_reference), reference_coverage_mean, 'r', label='mean')
+                    ax.plot(range(len_reference), reference_coverage_max, 'b', label='max')
+                    ax.plot(range(len_reference), reference_coverage_min, 'g', label='min')
+                    ax.legend(bbox_to_anchor=(0., 1.05, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.)
+                    ax.set_xlabel('Position')
+                    ax.set_ylabel('Coverage')
+                    ax.set_title('Coverage Graph')
+                    pylab.xlim([0, len_reference])
+                    pylab.ylim(ymin=-0.05)
+                    pdf.savefig()
+                    plt.close()
+        if len(self.coverage_graph_result) == 0:
+            self.coverage_graph_result.append(None)
+
+    def mapq_graph(self, reference, i):
+        """Generate one pdf for each reference showing the genome position (y-axis) an the respective coverage (x-axis). The genome is reduced/binned into 1000 points."""
+        log.print_write(self.logger, 'info', '=== Mapping Quality Graph #{} ==='.format(i))
+        if self.reference_mapq_avg is None:
+            self.count_mapq(reference)
+        # Trim coverages of 0 at the end.
+        reference_mapq = np.trim_zeros(self.reference_mapq_avg, 'b')
+        # If there is something left of the reference.
+        if reference_mapq.size > 0:
+            # Use a sliding window to create ~1000 datapoints out of the reference. 
+            reference_mapq_mean = []
+            reference_mapq_max = []
+            reference_mapq_min = []
+            window_size = min(int(reference_mapq.size/10), 50000)
+            for w in window(reference_mapq, size=window_size, step=max(1, int((reference_mapq.size-window_size)/1000))):
+                reference_mapq_mean.append(int(np.mean(w)))
+                reference_mapq_max.append(max(w))
+                reference_mapq_min.append(min(w))
+            reference_mapq_mean = np.array(reference_mapq_mean)
+            reference_mapq_max = np.array(reference_mapq_max)
+            reference_mapq_min = np.array(reference_mapq_min)
+            # Get the number of binned reference.
+            len_reference = reference_mapq_mean.size
+            # Plot bases per coverage as line diagram.
+            if len_reference > 0:
+                with PdfPages(os.path.join(self.output, 'mapq_graph_{}.pdf'.format(valid_filename(reference)))) as pdf:
+                    self.mapq_graph_result.append([os.path.realpath(os.path.join(self.output, 'mapq_graph_{}.pdf'.format(valid_filename(reference)))), reference.replace('_', r'\_')])
+                    fig, ax = plt.subplots(nrows=1, ncols=1)
+                    # Change labels if there are more unbinned entries than binned entries, i.e. the binning influences the labels.
+                    if len_reference <= reference_mapq.size:
+                        factor = float(reference_mapq.size)/len_reference
+                        self._set_axis(ax, factor)
+                    #self.ax.set_yscale('symlog')
+                    ax.plot(range(len_reference), reference_mapq_mean, 'r', label='mean')
+                    ax.plot(range(len_reference), reference_mapq_max, 'b', label='max')
+                    ax.plot(range(len_reference), reference_mapq_min, 'g', label='min')
+                    ax.legend(bbox_to_anchor=(0., 1.05, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.)
+                    ax.set_xlabel('Position')
+                    ax.set_ylabel('Mapping Quality')
+                    ax.set_title('Mapping Quality Graph')
+                    pylab.xlim([0, len_reference])
+                    pylab.ylim(ymin=-0.1)
+                    pdf.savefig()
+                    plt.close()
+        if len(self.mapq_graph_result) == 0:
+            self.mapq_graph_result.append(None)
+
+    def coverage_stats(self, reference, i):
+        log.print_write(self.logger, 'info', '=== Coverage Statistics #{} ==='.format(i))
+        self.cov_stats = dict()
+        if self.reference_coverage_count is None:
+            self.count_coverage(reference)
+        # Trim coverages of 0 at the end.
+        reference_coverage = np.trim_zeros(self.reference_coverage_count, 'b')
+        # If there is something left of the reference.
+        if reference_coverage.size > 0:
+            mean_coverage = np.mean(reference_coverage)
+            maximum_coverage = np.max(reference_coverage)
+            minimum_coverage = np.min(reference_coverage)
+            sd_coverage = np.std(reference_coverage)
+            bases_zero_coverage = (reference_coverage == 0).sum()
+            # Threshold of mean + 2*sd; reported as the "mean+2*sd" counts in report.tex.
+            threshold = mean_coverage + 2 * sd_coverage
+            lower_conf_int = (reference_coverage < threshold).sum()
+            greater_conf_int = (reference_coverage > threshold).sum()
+            cov_stats = {'mean': mean_coverage, 'max': maximum_coverage, 'min': minimum_coverage, 'sd': sd_coverage, 'zero': bases_zero_coverage, 'lower': lower_conf_int, 'greater': greater_conf_int}
+            self.cov_stats_result.append(cov_stats)
+        if len(self.cov_stats_result) == 0:
+            self.cov_stats_result.append({'mean': -1, 'max': -1, 'min': -1, 'sd': -1, 'zero': -1, 'lower': -1, 'greater': -1})
+        #return self.cov_stats
+        
+    def read_stats(self):
+        log.print_write(self.logger, 'info', '=== Read Statistics ===')
+        concordant = 0
+        discordant = 0
+        paired = 0
+        unmapped = 0
+        total = 0
+        mapped = 0
+        # Test the SAM flag bits directly instead of slicing the bin(flag) string.
+        for flag in self.sam.get_flags():
+            total += 1
+            if flag & 0x1:  # 0x1: template has multiple segments (paired)
+                paired += 1
+            if flag & 0x2:  # 0x2: each segment properly aligned (concordant)
+                concordant += 1
+            else:
+                discordant += 1
+            if flag & 0x4:  # 0x4: segment unmapped
+                unmapped += 1
+        mapped = total - unmapped
+        self.read_stats = dict()  # NOTE: this attribute shadows the read_stats method after the first call
+        self.read_stats['concordant'] = concordant
+        self.read_stats['discordant'] = discordant
+        self.read_stats['paired'] = paired
+        self.read_stats['mapped'] = mapped
+        self.read_stats['unmapped'] = unmapped
+        self.read_stats['total'] = total
+        #self.read_stats_result.append(self.stats)
+        #return self.stats
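+        # Example (flag value assumed): a properly paired forward first-in-pair
+        # read typically carries flag 99 (0x1|0x2|0x20|0x40); it counts here as
+        # paired, concordant and mapped.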
+
+    def cleanup(self):
+        log.write(self.logger, 'info', '=== Cleaning up metrics ===')
+        try:
+            for mfile in self.reads_per_quality_result+self.bases_per_coverage_result+self.mapq_graph_result+self.coverage_graph_result+[(self.sam.sam_file_path, 'bam'), (self.sam.sam_file_path[:-1] + 'i', 'bai')]:
+                if mfile is not None:
+                    self.command = 'rm {}'.format(mfile[0])
+                    log.call(self.command, self.logger, shell=True)
+        except sp.CalledProcessError:
+            log.print_write(self.logger, 'exception', 'ERROR: CPE in metrics.cleanup()')
+            return False
+        self.reads_per_quality_result = []
+        self.bases_per_coverage_result = []
+        self.mapq_graph_result = []
+        self.coverage_graph_result = []
+        return True
diff --git a/reffinder.py b/reffinder.py
new file mode 100755
index 0000000..4d02b01
--- /dev/null
+++ b/reffinder.py
@@ -0,0 +1,387 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#created by Stephan Fuchs (FG13, RKI), 2015
+version = '0.0.9'
+
+import argparse
+import time
+import re
+import sys
+if sys.version_info < (3, 2):
+    import subprocess32 as sp
+else:
+    import subprocess as sp
+import os
+import random
+import math
+import custom_logger as log
+
+#sys.path.insert(0, "/home/RKID1/fuchss/data/scripts/libs")
+
+import fux_filing as filing
+
+# options is a string containing parameters, e.g. "--fastq sth. --ref sth. -p 0.2"
+def parse_arguments(options=None, logger=None):
+    #processing command line arguments
+    parser = argparse.ArgumentParser(prog="REFFINDER", description='')
+    parser.add_argument('--fastq', metavar="FILE(S)", help="input FASTQ file(s)", type=argparse.FileType('r'), required=True, nargs='+')
+    parser.add_argument('--ref', metavar="FILE(S)", help="input reference FASTA file(s)", type=str, required=True, nargs='+')
+    parser.add_argument('--tmp', type=str, help='Temporary directory to use', default=os.path.expanduser('~/tmp/'))
+    parser.add_argument('-p', metavar="DECIMAL", help="proportion of reads used for mapping (default: 0.1)",  type=float, action="store", default=0.1)
+    parser.add_argument('-t', metavar='INT', help="number of threads/CPUs used for mapping (default: 20)", type=int, action="store", default=20)
+    parser.add_argument('-s', help="if set, subset fastq files are stored in a sub folder. Note that a large amount of data may be generated.", action="store_true")
+    parser.add_argument('-b', help="if set, subset BAM files are stored in a sub folder. Note that a large amount of data may be generated.", action="store_true")
+    parser.add_argument('-i', help="if set, reference index files are not deleted.",  action="store_true")
+    parser.add_argument('--mode', help="select paired (default) or unpaired mode",  choices=['paired', 'single'], action="store", default='paired')
+    parser.add_argument('--mapping', help="select mapping algorithm (default: bwasw).",  choices=['bwasw', 'bwamem'], action="store", default='bwasw')
+    parser.add_argument('--version', action='version', version='%(prog)s ' + version)
+    if options is None:
+        args = parser.parse_args() 
+    else:
+        args = parser.parse_args(options.split(' '))
+    args.logger = logger
+    try:
+        os.mkdir(args.tmp)
+    except OSError:
+        pass
+    return args
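+
+# Minimal invocation sketch (file names are assumptions; the --fastq files must
+# exist because argparse opens them):
+#   args = parse_arguments('--fastq fwd.fastq rev.fastq --ref refA.fasta refB.fasta -p 0.2')
+#   best_ref = find_reference(args)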
+
+
+#FUNCTIONS
+def indexRef(ref, args):
+	if args.mapping in ['bwasw', 'bwamem']:
+		cmd = ['bwa', 'index',  '-p', os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), '-a',  'is', ref]
+	elif args.mapping == 'smalt':
+		cmd = ['smalt', 'index',  '-s',  '6', ref, ref]
+	log.call(cmd, args.logger)
+	cmd = ['samtools', 'faidx', ref]
+	log.call(cmd, args.logger)	
+
+def mapping(ref, fastq, revfastq, samfile, args):
+	#for single mapping set revfastq to False
+	if args.mapping == 'bwasw':
+		cmd = ['bwa', 'bwasw', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]),  fastq, revfastq, '-f', samfile]
+		if revfastq is False:
+			del(cmd[6])
+		log.call(cmd, args.logger)		
+	elif args.mapping == 'bwamem':
+		with open(samfile,"wb") as outfile:
+			cmd = ['bwa', 'mem', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]),  fastq, revfastq]
+			if revfastq is False:
+				del(cmd[6])
+			sp.check_call(cmd, stdout=outfile)		
+	elif args.mapping == 'smalt':
+		cmd = ['smalt', 'map', '-n', str(args.t), '-f', 'sam', '-o', samfile, ref,  fastq, revfastq]	
+		if revfastq is False:
+			del(cmd[-1])		
+		log.call(cmd, args.logger)
+	
+def sam2bam(samfile, bamfile, delSam = True):
+	with open(bamfile, 'w') as bf:
+		cmd = ['samtools', 'view', '-F', '4', '-Sb', samfile]
+		sp.check_call(cmd, stdout=bf)
+	if delSam:
+		os.remove(samfile)	
+		
+def getMappedReads(bamfile):		
+	cmd = ['samtools', 'flagstat', bamfile]
+	out = sp.check_output(cmd).decode("utf-8")
+	mapped_reads = re.compile(r'.*?(\d*) \+ \d* mapped.*', re.DOTALL)
+	m = mapped_reads.match(out)
+	#return int(out[2].split('+')[0])
+	return int(m.group(1))
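+
+# The regex above targets the "mapped" line of `samtools flagstat` output
+# (line shape assumed from samtools 1.x), e.g.
+#   "42 + 0 mapped (97.67% : N/A)"  ->  m.group(1) == '42'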
+
+def getBwaVers():		
+	cmd = ['bwa']
+	process = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
+	lines = process.communicate()[1].decode("utf-8").split('\n')
+	return lines[2].split(': ')[1]
+	
+def getSamtoolsVers():		
+	cmd = ['samtools']
+	process = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
+	lines = process.communicate()[1].decode("utf-8").split('\n')
+	return lines[2].split(': ')[1]
+	
+def clean(args):
+	exts = ['amb', 'ann', 'bwt', 'fai', 'pac', 'sa']
+	for ref in args.ref:
+		for ext in exts:
+			if os.path.isfile(ref + '.' + ext):
+				os.remove(ref + '.' + ext)
+
+def find_reference(args):
+    #START
+    creationtime = time.strftime("%Y-%m-%d_%H-%M-%S")
+    starttime = time.time()
+
+    #consistency checks
+    if args.mode == 'paired' and len(args.fastq) % 2 != 0:
+            log.write(args.logger, 'debug', "EXCEPTION: in paired mode the number of fastq files must be even")
+            exit(1)
+
+    if args.p <= 0 or args.p > 1:
+            log.write(args.logger, 'debug', "EXCEPTION: read proportion must between 0 (exclusive) and 1 (inclusive).")
+            exit(1)	
+            
+    #generate dir
+    if args.s or args.b:
+            storedir = 'REFFINDER_' + creationtime + '/'
+            os.mkdir(storedir)
+    else:
+            storedir = ''
+
+    #files to fastq file objects
+    fastqFileObjs = []
+    for fastqfile in args.fastq:
+            fastqfile.close()
+            fastqFileObjs.append(filing.fastqfile(fastqfile.name))
+            
+    #generating reference index files 
+    log.write(args.logger, 'debug', '')
+    log.write(args.logger, 'debug', "generate reference index files ...")
+    for ref in args.ref:
+            indexRef(ref, args)
+
+    #generating subset fastq files and mapping 
+    fastq_mapped_reads = []  # mapped reads per fastq file (converted to percentages below)
+    ref_mapped_reads = []  # mapped reads per reference file (converted to percentages below)
+    total_reads = 0 #total reads
+
+    if args.mode == 'paired':
+            datasets = [['forward read file', 'reverse read file', 'reads', 'subset reads']]
+    elif args.mode == 'single':	
+            datasets = [['read file', 'reads', 'subset reads']]
+
+    current_task = 0
+
+    #processing paired files
+    if args.mode == 'paired':
+            total_tasks = int((len(args.fastq)/2) * len(args.ref))
+            i = 0
+            while i < len(args.fastq):
+                    log.write(args.logger, 'debug', '')
+                    log.write(args.logger, 'debug', 'processing paired files ...')
+                    log.write(args.logger, 'debug', 'forward read file: {}'.format(args.fastq[i].name))
+                    log.write(args.logger, 'debug', 'reverse read file: {}'.format(args.fastq[i+1].name))
+                    pairedfastqFileObj = filing.pairedfastqfiles(fname = args.fastq[i].name, rname=args.fastq[i+1].name)
+                    log.write(args.logger, 'debug', 'reads in each file: {}'.format(pairedfastqFileObj.getReadNumber()))	
+                    datasets.append([args.fastq[i].name, args.fastq[i+1].name, pairedfastqFileObj.getReadNumber()])
+                                            
+                    #generating subsets
+                    log.write(args.logger, 'debug', 'generating read subsets ...')
+                    mapped_reads = []
+                    with pairedfastqFileObj.randomSubset(prop=args.p) as subsetObj: 
+                            log.write(args.logger, 'debug', 'reads in each subset file: {}'.format(subsetObj.getReadNumber()))
+                            datasets[-1].append(subsetObj.getReadNumber())
+                            total_reads += subsetObj.getReadNumber()
+                            j = 0
+                            for ref in args.ref:
+                                    current_task += 1
+                                    log.write(args.logger, 'debug', '')
+                                    log.write(args.logger, 'debug', 'START MAPPING TASK {} OF {}'.format(current_task, total_tasks))
+                                    
+                                    #mapping
+                                    log.write(args.logger, 'debug', 'mapping to {} using {} {}'.format(ref, args.mapping, '...'))
+                                    samfile = subsetObj.forward.uniqueFilename(ext='sam')
+                                    mapping(ref, subsetObj.forward.fullname, subsetObj.reverse.fullname, samfile, args)
+                                    
+                                    #convert to bam
+                                    log.write(args.logger, 'debug', '')
+                                    log.write(args.logger, 'debug', 'converting sam to bam ...')
+                                    bamfile = subsetObj.forward.uniqueFilename(ext='bam')
+                                    sam2bam(samfile, bamfile)
+                                            
+                                    #get reads aligned with minimal mapping quality
+                                    mapped_reads.append(getMappedReads(bamfile))
+                                    if len(ref_mapped_reads) < j + 1:
+                                            ref_mapped_reads.append(0)					
+                                    ref_mapped_reads[j] += mapped_reads[-1]			
+                                    
+                                    if args.b:
+                                            fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
+                                            reffilepath, reffilename = os.path.split(ref)
+                                            os.rename(bamfile, storedir + 'subset_' + fastqfilename + '__to__' + reffilename + '.bam') 
+                                    else:
+                                            os.remove(bamfile)								
+                                    
+                                    log.write(args.logger, 'debug', '')
+                                    log.write(args.logger, 'debug', 'mapped reads: {}'.format(mapped_reads[-1]))
+                                    j += 1
+                                    
+                            if args.s:
+                                    fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
+                                    subsetObj.forward.renameFile('subset_' + fastqfilename, path=storedir)
+                                    fastqfilepath, fastqfilename = os.path.split(args.fastq[i+1].name)
+                                    subsetObj.reverse.renameFile('subset_' + fastqfilename, path=storedir)
+                            else:
+                                    subsetObj.setTmp()
+                    
+                    #make fastq_mapped_reads relative
+                    for k, reads in enumerate(mapped_reads):
+                            mapped_reads[k] = str(round(reads / subsetObj.getReadNumber() * 100, 2)) + '%'
+                    fastq_mapped_reads.append(mapped_reads)
+                    i += 2
+                    
+                    
+    #processing single files	
+    if args.mode == 'single':
+            total_tasks = int(len(args.fastq) * len(args.ref))
+            i = 0
+            while i < len(args.fastq):
+                    log.write(args.logger, 'debug', '')
+                    log.write(args.logger, 'debug', 'processing fastq file {}'.format(args.fastq[i].name))
+                    fastqFileObj = filing.fastqfile(name = args.fastq[i].name)
+                    log.write(args.logger, 'debug', 'number of reads: {}'.format(fastqFileObj.getReadNumber()))
+                    datasets.append([args.fastq[i].name, fastqFileObj.getReadNumber()])
+                            
+                    log.write(args.logger, 'debug', 'generating read subset ...')
+                    mapped_reads = []
+                    with fastqFileObj.randomSubset(prop=args.p) as subsetObj: 
+                            log.write(args.logger, 'debug', 'reads in subset file: {}'.format(subsetObj.getReadNumber()))
+                            datasets[-1].append(subsetObj.getReadNumber())
+                            j = 0
+                            for ref in args.ref:
+                                    current_task += 1
+                                    log.write(args.logger, 'debug', '')
+                                    log.write(args.logger, 'debug', 'START MAPPING TASK {} OF {}'.format(current_task, total_tasks))
+                                    
+                                    #mapping
+                                    log.write(args.logger, 'debug', 'mapping to {} using {}'.format(ref, args.mapping))
+                                    samfile = subsetObj.uniqueFilename(ext='sam')
+                                    mapping(ref, subsetObj.fullname, False, samfile, args)
+                                    
+                                    #convert to bam
+                                    log.write(args.logger, 'debug', '')
+                                    log.write(args.logger, 'debug', 'converting sam to bam ...')
+                                    bamfile = subsetObj.uniqueFilename(ext='bam')
+                                    sam2bam(samfile, bamfile)
+                                            
+                                    #get reads aligned with minimal mapping quality
+                                    mapped_reads.append(getMappedReads(bamfile))
+                                    if len(ref_mapped_reads) < j + 1:
+                                            ref_mapped_reads.append(0)					
+                                    ref_mapped_reads[j] += mapped_reads[-1]			
+                                    
+                                    if args.b:
+                                            fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
+                                            reffilepath, reffilename = os.path.split(ref)
+                                            os.rename(bamfile, storedir + 'subset_' + fastqfilename + '__to__' + reffilename + '.bam') 
+                                    else:
+                                            os.remove(bamfile)								
+                                    
+                                    log.write(args.logger, 'debug', '')
+                                    log.write(args.logger, 'debug', 'mapped reads: {}'.format(mapped_reads[-1]))
+                                    j += 1
+                                    
+                            if args.s:
+                                    fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
+                                    subsetObj.renameFile('subset_' + fastqfilename, path=storedir)
+                            else:
+                                    subsetObj.setTmp()
+                                    
+                    #make fastq_mapped_reads relative
+                    for k, reads in enumerate(mapped_reads):
+                            mapped_reads[k] = str(round(reads / subsetObj.getReadNumber() * 100, 2)) + '%'
+                    fastq_mapped_reads.append(mapped_reads)
+                    i += 1
+                    
+    #make ref_mapped_reads relative
+    for i, reads in enumerate(ref_mapped_reads):
+            if total_reads == 0:
+                    ref_mapped_reads[i] = '0%'
+            else:
+                    ref_mapped_reads[i] = str(round(reads / total_reads * 100, 2)) + '%'
+            
+                    
+
+    #get reference charts
+    log.write(args.logger, 'debug', '')
+    log.write(args.logger, 'debug', 'prepare output ...')
+
+    refcharts = [['rank', 'reference file(s)', 'avg of mapped reads']]
+
+    total_datasets = len(datasets) - 1 #minus 1 since the first row is the header
+    order = sorted(set(ref_mapped_reads), key=lambda v: float(v.rstrip('%')), reverse=True)  # numeric sort of 'xx.x%' strings
+    n = 0
+            
+    for value in order:
+            n += 1
+            refkeys = [i for i, j in enumerate(ref_mapped_reads) if j == value]
+            refs = []
+            for key in refkeys:
+                    refs.append(args.ref[key])
+            refcharts.append([str(n), ', '.join(refs), value])	
+
+
+    #get rankmatrix
+    if args.mode == 'paired':
+            rankmatrix = [['forward read file', 'reverse read file']]
+    elif args.mode == 'single':
+            rankmatrix = [['read file']]
+
+    for i in range(1, len(args.ref) + 1):
+            rankmatrix[0].append('rank ' + str(i) + ' (mapped reads)')
+
+    i = 0
+    for mapped_reads in fastq_mapped_reads:
+            if args.mode == 'paired':
+                    rankmatrix.append([args.fastq[i].name, args.fastq[i+1].name])
+            elif args.mode == 'single':
+                    rankmatrix.append([args.fastq[i].name])
+            
+            order = sorted(set(mapped_reads), key=lambda v: float(v.rstrip('%')), reverse=True)  # numeric sort of 'xx.x%' strings
+            for value in order:
+                    refkeys = [i for i, j in enumerate(mapped_reads) if j == value]
+                    refs = []
+                    for key in refkeys:
+                            refs.append(args.ref[key])
+                    
+                    rankmatrix[-1].append(', '.join(refs) + ' (' + str(value) + ')')			
+            
+            if args.mode == 'paired':
+                    i += 2
+            elif args.mode == 'single':
+                    i += 1
+            
+    #get refmatrix
+    if args.mode == 'paired':
+            refmatrix = [['forward read file', 'reverse read file']]
+    elif args.mode == 'single':	
+            refmatrix = [['read file']]
+            
+    for ref in args.ref:
+            refmatrix[0].append(ref)
+
+    i = 0
+    for mapped_reads in fastq_mapped_reads:
+            if args.mode == 'paired':
+                    refmatrix.append([args.fastq[i].name, args.fastq[i+1].name])
+                    i += 2
+            elif args.mode == 'single':
+                    refmatrix.append([args.fastq[i].name])
+                    i += 1
+            refmatrix[-1].extend(mapped_reads)		
+
+    #cleaning
+    if not args.i:
+            clean(args)
+
+    #print best reference
+    maxvalue = max(set(ref_mapped_reads), key=lambda v: float(v.rstrip('%')))  # numeric max of 'xx.x%' strings
+    refkeys = [i for i, j in enumerate(ref_mapped_reads) if j == maxvalue]
+    log.write(args.logger, 'debug', 'best matching reference:')
+    log.write(args.logger, 'debug', args.ref[refkeys[0]])
+    return args.ref[refkeys[0]]
+
+if __name__ == '__main__':
+    find_reference(parse_arguments())
+
diff --git a/report.tex b/report.tex
new file mode 100755
index 0000000..a641807
--- /dev/null
+++ b/report.tex
@@ -0,0 +1,135 @@
+\documentclass[a4paper]{article}
+\usepackage[document]{ragged2e}
+\usepackage{python}
+\usepackage{afterpage}
+\usepackage[english]{babel}
+\usepackage{graphicx}
+\usepackage[utf8]{inputenc}
+\usepackage{xcolor}
+\usepackage[T1]{fontenc}
+\renewcommand*\familydefault{\ttdefault} %% Only if the base font of the document is to be typewriter style
+\usepackage{subfigure}
+\usepackage{geometry}
+\usepackage{tcolorbox}
+\usepackage{adjustbox}
+\usepackage{setspace}
+\usepackage{float}
+\usepackage[obeyspaces]{url}
+\usepackage{grffile}
+
+\geometry{a4paper,left=25mm,right=20mm,
+		top=12mm,
+		bottom=20mm}
+
+\setlength\parindent{0pt}
+
+\tolerance=1
+\emergencystretch=\maxdimen
+\hyphenpenalty=1000
+\hbadness=1000
+
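+% Note: this file is a Jinja2 template rather than plain LaTeX; the double-brace
+% expressions and the if/for block tags below are filled in by the pipeline from
+% the `pipeline` and `metric` objects before the resulting .tex is compiled
+% (the exact render call lives in the pipeline code and is assumed here).
+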
+\begin{document}
+
+{\bf {\LARGE{ {{pipeline.name}} } Version {{pipeline.version}}   } }\\
+\line(1,0){ \textwidth }
+
+\begin{tabular}{lp{0.85\textwidth}}
+
+Executor: & {{pipeline.user}} \\
+Dataset: & {\bfseries{\verb|{{pipeline.sam_name}}|} }\\
+Started: &  {{pipeline.start_time}}\\
+Finished: & {{pipeline.end_time}}\\
+ & \\
+Operating System & \\
+Server: & {{pipeline.server}}\\
+System: & {{pipeline.system}}\\
+Release: & {{pipeline.release}}\\
+%Version: & {{pipeline.operating_version}}\\
+Machine: & {{pipeline.machine}}\\
+ & \\
+ Tool versions & \\
+Python: & {{pipeline.python_version}}\\
+Mapper: & {{pipeline.mapper_version}}\\
+\end{tabular}\\
+
+%----------------- Workflow -------------------%
+\line(1,0){ \textwidth } \\
+\begin{tabular}{lp{0.85\textwidth}}
+Processed read(s): & \path{{"{"+pipeline.current_read1+"}"}}\\
+{%if pipeline.current_read2 is not none%} & \path{{"{"+pipeline.current_read2+"}"}}\\{%endif%}
+{%if pipeline.args.reference is defined and pipeline.args.reference is not none%} Reference file: & \path{{"{"+pipeline.args.reference+"}"}}\\{%endif%}
+\end{tabular}\\
+\line(1,0){ \textwidth } \\
+\begin{tabular}{lp{0.85\textwidth}}
+Call: & \path{{"{"+pipeline.call+"}"}}\\
+\end{tabular}\\
+\line(1,0){ \textwidth } \\
+
+%-------------------- Summary -------------------%
+\begin{tcolorbox}
+{\large{Read statistics} } \\
+\begin{tabular}{ll}
+Total reads: & {{metric.read_stats['total']}}\\
+Concordant reads: & {{metric.read_stats['concordant']}}\\
+Discordant reads: & {{metric.read_stats['discordant']}}\\
+Paired reads: & {{metric.read_stats['paired']}}\\
+Mapped reads: & {{metric.read_stats['mapped']}}\\
+Unmapped reads: & {{metric.read_stats['unmapped']}}\\
+\end{tabular}
+\end{tcolorbox}
+
+\vspace{5mm}
+
+{% for i in range(metric.reads_per_quality_result|length) %}
+\afterpage{
+\begin{tcolorbox}
+\begin{center}
+	{\Large Reference}\\ 
+        \textbf{ {{metric.reads_per_quality_result[i][1][:85]}} }\\
+\end{center}
+\end{tcolorbox}
+	\begin{figure}[H]
+		\centering
+		\begin{minipage}{0.5\textwidth}%
+		\begin{subfigure}%
+                    {%if metric.coverage_graph_result[i] is defined and metric.coverage_graph_result[i][0] is not none%} {\includegraphics[width=\textwidth]{/{{metric.coverage_graph_result[i][0]}}}} {%endif%}
+		\end{subfigure}%
+		\end{minipage}%		
+		\begin{minipage}{0.5\textwidth}%
+		\begin{subfigure}%
+			{%if metric.bases_per_coverage_result[i] is defined and metric.bases_per_coverage_result[i][0] is not none%} {\includegraphics[width=\textwidth]{/{{metric.bases_per_coverage_result[i][0]}}}} {%endif%}
+		\end{subfigure}%
+		\end{minipage}%
+	\end{figure}
+	\begin{figure}[H]
+		\centering
+		\begin{minipage}{0.5\textwidth}%
+		\begin{subfigure}%
+			{%if metric.reads_per_quality_result[i] is defined and metric.reads_per_quality_result[i][0] is not none%} {\includegraphics[width=\textwidth]{/{{metric.reads_per_quality_result[i][0]}}}} {%endif%}
+		\end{subfigure}%
+		\end{minipage}%		
+		\begin{minipage}{0.5\textwidth}%
+		\begin{subfigure}%
+			{%if  metric.mapq_graph_result[i] is defined and metric.mapq_graph_result[i][0] is not none%} {\includegraphics[width=\textwidth]{/{{metric.mapq_graph_result[i][0]}}}} {%endif%}
+		\end{subfigure}%
+		\end{minipage}%
+	\end{figure}
+	\begin{tcolorbox}
+	{\large{Coverage statistics} } \\
+	\begin{tabular}{ll}
+        {%if metric.cov_stats_result[i] is defined%}
+	Average: & {{metric.cov_stats_result[i]['mean']}}\\
+	Maximum: & {{metric.cov_stats_result[i]['max']}}\\
+	Minimum: & {{metric.cov_stats_result[i]['min']}}\\
+	Standard Deviation: & {{metric.cov_stats_result[i]['sd']}}\\
+	\#Bases 0 coverage: & {{metric.cov_stats_result[i]['zero']}}\\
+	\#Bases < mean+2*sd: & {{metric.cov_stats_result[i]['lower']}}\\
+	\#Bases > mean+2*sd: & {{metric.cov_stats_result[i]['greater']}}\\
+        {%endif%}
+	\end{tabular}
+	\end{tcolorbox}
+}
+\clearpage
+{% endfor %}
+
+\end{document}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/mapping-pipeline.git


