[med-svn] [qcumber] 01/15: New upstream version 1.1.2
Andreas Tille
tille at debian.org
Wed Nov 23 08:21:08 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository qcumber.
commit f1ce1ebb8a02f383d54eb96bbda0245d6c0c1c52
Author: Andreas Tille <tille at debian.org>
Date: Fri Oct 21 15:33:43 2016 +0200
New upstream version 1.1.2
---
batch_report.html | 591 ++++++++++++++++++++++++++++++++++++++++++++++
classes.py | 681 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
combineFastQC.py | 431 ++++++++++++++++++++++++++++++++++
config.txt | 14 ++
helper.py | 204 ++++++++++++++++
parameter.txt | 42 ++++
readme.txt | 111 +++++++++
report.tex | 184 +++++++++++++++
runQCPipeline.py | 484 ++++++++++++++++++++++++++++++++++++++
9 files changed, 2742 insertions(+)
diff --git a/batch_report.html b/batch_report.html
new file mode 100755
index 0000000..af91d19
--- /dev/null
+++ b/batch_report.html
@@ -0,0 +1,591 @@
+<!doctype html>
+<html>
+<head>
+ <meta charset="utf-8">
+ <title>Batch Report</title>
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"></script>
+ <script src="https://ajax.googleapis.com/ajax/libs/angularjs/1.4.5/angular.min.js"></script>
+ <script src="https://d3js.org/d3.v3.min.js"></script>
+ <!-- Latest compiled and minified CSS -->
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css">
+ <!-- Optional theme -->
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap-theme.min.css">
+ <!-- Latest compiled and minified JavaScript -->
+ <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
<style>
    /* Rows start hidden; the Angular watchers swap these two classes. */
    .row_hidden {
        display: none;
    }

    /* Toggled rows live in the #kraken <table>, so restore table-row display.
       (was `display: show`, which is not a valid CSS value) */
    .show_row {
        display: table-row;
    }

    p.inbox_text {
        text-align: center;
        color: grey;
        margin: 0 0 0px;
    }

    .thumbnail {
        margin-bottom: 5px;
        border: 0px solid #000;
        height: 150px;
        width: 200px;
    }

    .sample_box {
        display: table-cell;
        margin: 10px;
        padding: 10px;
        color: lightgrey;
        border-radius: 5px;
        box-shadow: 0 1px 3px rgba(0, 0, 0, .25);
    }

    #sidebar {
        /* `float: both` was invalid (only `clear` accepts `both`) and has no
           effect on a fixed-position element, so it was removed. */
        position: fixed;
        height: 100%;
        top: 50px;
        z-index: 999;
        padding-right: 0px;
        box-shadow: 0 2px 10px 0 rgba(0, 0, 0, 0.12);
        background: #f7f7f7;
    }

    #header {
        background: #336AA2;
        position: fixed;
        width: 100%;
        height: 50px;
        top: 0;
        z-index: 1000;
        box-shadow: 0 2px 4px 0 rgba(0, 0, 0, 0.16), 0 2px 10px 0 rgba(0, 0, 0, 0.12);
    }

    .node {
        cursor: pointer;
    }

    .node circle {
        fill: #fff;
        stroke: steelblue;
        stroke-width: 1.5px;
    }

    .node text {
        font: 10px sans-serif;
    }

    .link {
        fill: none;
        stroke: #ccc;
        stroke-width: 1.5px;
    }
</style>
+ <script>
// Once all assets have loaded, shift the main panel right so it
// clears the fixed-position sidebar.
$(window).load(function () {
    var sidebarWidth = $('#sidebar').width();
    $("#main").css({left: sidebarWidth});
});
+
$(document).ready(function () {

    loadGallery(true, 'a.thumbnail');

    /**
     * Wire the thumbnail links up to the #image-gallery modal.
     * @param {boolean} setIDs       when true, (re)number every [data-image-id] element
     * @param {string}  setClickAttr selector for the clickable thumbnails
     */
    function loadGallery(setIDs, setClickAttr) {
        var currentImage;
        var counter = 0;

        // Copy the clicked thumbnail's data-* attributes into the modal.
        function updateGallery($sel) {
            currentImage = $sel.data('image-id');
            $('#image-gallery-caption').text($sel.data('caption'));
            $('#image-gallery-title').text($sel.data('title'));
            $('#image-gallery-image').attr('src', $sel.data('image'));
            // NOTE(review): disableButtons is not defined anywhere in this
            // file — confirm it is provided elsewhere or dead code.
            disableButtons(counter, $sel.data('image-id'));
        }

        if (setIDs == true) {
            $('[data-image-id]').each(function () {
                counter++;
                $(this).attr('data-image-id', counter);
            });
        }

        $(setClickAttr).on('click', function () {
            updateGallery($(this));
        });
    }
});
+
+ //Start d3 tree
//Start d3 tree
/**
 * Draw one collapsible d3 taxonomy tree for a Kraken report.
 * Appends a hidden <tr id="kraken_<id>"> to the #kraken table; the
 * SummaryTable controller toggles its visibility via show_row/row_hidden.
 *
 * @param {Object} root hierarchical Kraken data node: {name, perc, children}
 * @param {string} id   sample key used to build the row's DOM id
 */
function d3_kraken(root, id) {
    var margin = {top: 20, right: 120, bottom: 20, left: 120},
        width = $(document).width(),
        height = $(document).height();

    var i = 0,
        duration = 750;

    var tree = d3.layout.tree()
        .size([height, width]);

    // Horizontal tree: swap x/y in the diagonal projection.
    var diagonal = d3.svg.diagonal()
        .projection(function (d) {
            return [d.y, d.x];
        });

    var svg = d3.select("#kraken").append("tr")
        .attr("id", "kraken_" + id)
        .attr("class", "row_hidden")
        .append("svg")
        .attr("width", width + margin.right + margin.left)
        .attr("height", height + margin.top + margin.bottom)
        .append("g")
        .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

    root.x0 = height / 2;
    root.y0 = 0;

    // Recursively hide a node's children by stashing them in _children.
    function collapse(d) {
        if (d.children) {
            d._children = d.children;
            d._children.forEach(collapse);
            d.children = null;
        }
    }

    // Inverse of collapse: recursively restore stashed children.
    function uncollapse(d) {
        if (d._children) {
            d.children = d._children;
            d._children.forEach(uncollapse);
            d._children = null;
        }
    }

    // Start with everything below the root collapsed.
    root.children.forEach(collapse);
    update(root);

    d3.select(self.frameElement).style("height", "800px");

    function update(source) {
        // Grow the layout height with the widest visible level.
        var levelWidth = [1];
        var childCount = function (level, n) {
            if (n.children && n.children.length > 0) {
                if (levelWidth.length <= level + 1) levelWidth.push(0);
                levelWidth[level + 1] += n.children.length;
                n.children.forEach(function (d) {
                    childCount(level + 1, d);
                });
            }
        };
        childCount(0, root);

        var newHeight = d3.max(levelWidth) * 100; // 100 pixels per node
        tree = tree.size([newHeight, width]);

        // Compute the new tree layout.
        var nodes = tree.nodes(root).reverse(),
            links = tree.links(nodes);

        // Normalize for fixed-depth.
        nodes.forEach(function (d) {
            d.y = d.depth * 180;
        });

        // Update the nodes…
        var node = svg.selectAll("g.node")
            .data(nodes, function (d) {
                return d.id || (d.id = ++i);
            });

        // Enter any new nodes at the parent's previous position.
        var nodeEnter = node.enter().append("g")
            .attr("class", "node")
            .attr("transform", function (d) {
                return "translate(" + source.y0 + "," + source.x0 + ")";
            })
            .on("click", click)
            .attr("data-toggle", "tooltip")
            .attr("title", function (d) {
                return d.name + "\n" + d.perc + "%";
            });

        nodeEnter.append("circle")
            .attr("r", function (d) {
                // BUG FIX: the else-branch was `else 0.0;` (missing `return`),
                // so the "unclassified" node got an undefined radius. The
                // parallel nodeUpdate handler below returns 0.0 explicitly.
                if (d.name != "unclassified") return d.perc + 1e-06; else return 0.0;
            })
            .style("fill", function (d) {
                return d._children ? "lightsteelblue" : "#fff";
            });

        nodeEnter.append("text")
            .attr("x", function (d) {
                return d.children || d._children ? -10 : 10;
            })
            .attr("dy", ".35em")
            .attr("text-anchor", function (d) {
                return d.children || d._children ? "end" : "start";
            })
            .text(function (d) {
                return d.name + ", " + d.perc;
            })
            .style("fill-opacity", 1e-6);

        // Transition nodes to their new position.
        var nodeUpdate = node.transition()
            .duration(duration)
            .attr("transform", function (d) {
                return "translate(" + d.y + "," + d.x + ")";
            });

        nodeUpdate.select("circle")
            .attr("r", function (d) {
                if (d.name != "unclassified") return d.perc + 4.5; else return 0.0;
            })
            .style("fill", function (d) {
                return d._children ? "lightsteelblue" : "#fff";
            });

        nodeUpdate.select("text")
            .style("fill-opacity", 1);

        // Transition exiting nodes to the parent's new position.
        var nodeExit = node.exit().transition()
            .duration(duration)
            .attr("transform", function (d) {
                return "translate(" + source.y + "," + source.x + ")";
            })
            .remove();

        nodeExit.select("circle")
            .attr("r", 1e-6);

        nodeExit.select("text")
            .style("fill-opacity", 1e-6);

        // Update the links…
        var link = svg.selectAll("path.link")
            .data(links, function (d) {
                return d.target.id;
            });

        // Enter any new links at the parent's previous position.
        link.enter().insert("path", "g")
            .attr("class", "link")
            .attr("d", function (d) {
                var o = {x: source.x0, y: source.y0};
                return diagonal({source: o, target: o});
            });

        // Transition links to their new position.
        link.transition()
            .duration(duration)
            .attr("d", diagonal);

        // Transition exiting links to the parent's new position.
        link.exit().transition()
            .duration(duration)
            .attr("d", function (d) {
                var o = {x: source.x, y: source.y};
                return diagonal({source: o, target: o});
            })
            .remove();

        // Stash the old positions for transition.
        nodes.forEach(function (d) {
            d.x0 = d.x;
            d.y0 = d.y;
        });
    }

    // Toggle children on click; clicking the root re-expands everything.
    function click(d) {
        if (d.children) {
            d._children = d.children;
            d.children = null;
        } else {
            d.children = d._children;
            d._children = null;
            if (d.name == "root") {
                d.children.forEach(uncollapse);
            }
        }
        update(d);
    }
}
// end d3 tree
+ var myApp = angular.module('BatchReport', []);
+ myApp.controller('SummaryTable', ['$scope', function ($scope) {
+ // create a projects Object
+ $scope.projects = {};
+ // pretend data we just got back from the server
+ // this is an ARRAY of OBJECTS
+ $scope.information = {};
+ $scope.information.commandline = @= commandline =@;
+
+ $scope.projects.samples = @= json_samples =@;
+ $scope.show_summary = {};
+ $scope.show_img = {};
+ $scope.show_raw_data = true;
+
+ $scope.show_kraken = "";
+ $scope.kraken = @= kraken =@;
+ $.each($scope.kraken, function (key, value) {
+ d3_kraken(value, key)
+ });
+ $scope.$watch("show_kraken", function (newVal, oldVal) {
+ d3.select("#kraken_" + newVal).attr("class", "show_row");
+ d3.select("#kraken_" + oldVal).attr("class", "row_hidden");
+
+ });
+ $scope.collapse = true;
+ $scope.$watch("collapse", function (newVal, oldVal) {
+ d3.select("#kraken_" + newVal).attr("class", "show_row");
+ d3.select("#kraken_" + oldVal).attr("class", "row_hidden");
+
+ });
+
+
+ }]);
+
// Keep only those images whose name is switched on in the filter map.
myApp.filter('imgFilter', function () {
    /**
     * @param {Array}  input  image objects ({name, file, color, ...})
     * @param {Object} filter map image-name -> boolean (the show_img model)
     * @returns {Array} images whose name maps to a truthy flag
     */
    return function (input, filter) {
        var result = [];
        angular.forEach(input, function (img) {
            // Direct own-property lookup instead of scanning every filter
            // entry per image (was O(n*m)); same behavior for a boolean map.
            if (filter && Object.prototype.hasOwnProperty.call(filter, img.name) && filter[img.name]) {
                result.push(img);
            }
        });
        return result;
    };
});
// Keep only those samples whose set name is ticked in the summary table.
myApp.filter('typeFilter', function () {
    /**
     * @param {Array}  input  sample objects carrying a `setname`
     * @param {Object} filter map setname -> boolean (the show_summary model)
     * @returns {Array} samples whose setname maps to a truthy flag
     */
    return function (input, filter) {
        var result = [];
        angular.forEach(input, function (sample) {
            // Direct own-property lookup instead of scanning every filter
            // entry per sample (was O(n*m)); same behavior for a boolean map.
            if (filter && Object.prototype.hasOwnProperty.call(filter, sample.setname) && filter[sample.setname]) {
                result.push(sample);
            }
        });
        return result;
    };
});
+
+
+ </script>
+
+</head>
+<body>
+<div ng-app="BatchReport">
+
+ <!-- head Quality Control-->
+ <div class="row" id="header">
+
+ <div class="col-md-3"></div>
+ <div class="col-md-3">
+ <a href="#summary_plots" style="font-size: x-large;color: aliceblue;" data-toggle="tab">Quality Control</a>
+ </div>
+ <div class="col-md-1">
+ <a href="#summary" style="color: aliceblue;">Summary Table</a>
+ </div>
+ <div class="col-md-1">
+ <a href="#fastqc" style="color: aliceblue;" data-toggle="tab">FastQC</a>
+ </div>
+ <div class="col-md-1">
+ <a href="#kraken_report" style="color: aliceblue;" data-toggle="tab">Kraken</a>
+ </div>
+ <div class="col-md-1">
+ <a href="#information" style="color: aliceblue;" data-toggle="tab">Information</a>
+ </div>
+
+ </div>
+ <!-- END head Quality Control-->
+
+ <!-- Summary Table Controller-->
+ <div ng-controller="SummaryTable">
+ <div class="row" style="margin-top: 50px">
+ <!--Sidebar-->
+ <div class="col-md-2" id="sidebar">
+ <h4 style="color: #336AA2">Filter by ...</h4>
+ <br/>
+ <h5 style="color: #336AA2">... dataset</h5>
+ <p>
+ <input type="checkbox" ng-model="show_raw_data"/> show raw data
+ </p>
+ <br/>
+ <!--Filter by image name-->
+ <h5 style="color: #336AA2">..FastQC Images</h5>
+ <p ng-repeat="(type,bla) in projects.samples[0].images[0].raw[0]">
+ <input type="checkbox" ng-model="show_img[bla.name]" ng-init="show_img[bla.name]=true"/>
+ {{bla.name}}
+ </p>
+
+ <!-- END Filter by image name-->
+ <!--Filter by trimmed /untrimmed-->
+
+ <!-- END Filter by trimmed /untrimmed-->
+
+ </div>
+ <!-- END Sidebar-->
+
+ <div class="col-md-10" id="main" style="float: both;padding-left: 30px;">
+ <!-- Summary Table -->
+ <h4 id="summary" style="color:#336AA2">Summary</h4>
+ <table class="table table-striped">
+ <thead style="font-weight:600;color:#336AA2">
+ <tr>
+ <td ng-repeat="(headline,bla) in projects.samples[0]" ng-if="headline!='images'">{{headline}}
+ </td>
+ </tr>
+ </thead>
+ <tr ng-repeat="sample in projects.samples | orderBy:'setname'">
+ <td ng-repeat="(key,value) in sample " ng-if="key!='images'">
+ <input type="checkbox" ng-model="show_summary[value]" ng-if="key=='setname'"
+ ng-init="show_summary[value]=true"/> {{value}}
+ </td>
+
+ </tr>
+ </table>
+ <!-- END Summary Table -->
+
+ <!-- TAB-CONTENT-->
+ <div class="tab-content">
+ <!-- FASTQC Images-->
+ <div class="tab-pane fade " id="kraken_report">
+
+ Show Kraken report of : <select id="select_kraken" ng-model="show_kraken">
+ <option value="" disabled selected hidden>Choose sample...</option>
+ <option ng-repeat="(name, value) in kraken" value="{{name}}">{{name}}</option>
+ </select>
+ <table id="kraken"></table>
+ </div>
+ <div class="tab-pane fade" id="summary_plots">
+ </div>
+ <div class="tab-pane fade in active" id="fastqc">
+ <div style="width: 100%; " class="row">
+
+ <div style="display: table-row;" class="col-md-10">
+
+ <div class="sample_box"
+ ng-repeat="sample in projects.samples | orderBy:'setname'| typeFilter: show_summary ">
+ <p class="inbox_text">{{sample.setname}}</p>
+ <div ng-repeat="sample_img in sample.images " style="display:table-cell;"
+ ng-if="show_raw_data">
+ <p class="inbox_text">Raw data</p>
+ <p class="inbox_text" ng-if="sample_img.raw.length==2">R1</p>
+ <div style="display:inline-block">
+ <a class="thumbnail"
+ ng-repeat="img in sample_img.raw[0]| imgFilter: show_img" href="#"
+ data-image-id="" data-toggle="modal" data-title="Raw {{sample.setname}} R1"
+ data-caption="{{img.name}}" data-image="{{img.file}}"
+ data-target="#image-gallery">
+ <img width="200px" height="150px" ng-src="{{img.file}}"
+ style="border-radius:3px;border:solid 2px {{img.color}};display: block;">
+ </a>
+ </div>
+ <p class="inbox_text" ng-if="sample_img.raw.length==2">R2</p>
+ <div style="display:inline-block">
+ <a class="thumbnail"
+ ng-repeat="img in sample_img.raw[1]| imgFilter: show_img" href="#"
+ data-image-id="" data-toggle="modal" data-title="Raw {{sample.setname}} R2"
+ data-caption="{{img.name}}" data-image="{{img.file}}"
+ data-target="#image-gallery">
+ <img width="200px" height="150px" ng-src="{{img.file}}"
+ style="border-radius:3px;border:solid 2px {{img.color}};display: block;">
+ </a></div>
+ </div>
+ <div ng-repeat="sample_img in sample.images " style="display:table-cell;">
+ <p class="inbox_text">Trimmed data</p>
+ <p class="inbox_text" ng-if="sample_img.raw.length==2">R1</p>
+ <div style="display:inline-block">
+ <a class="thumbnail"
+ ng-repeat="img in sample_img.trimmed[0]| imgFilter: show_img" href="#"
+ data-image-id="" data-toggle="modal" data-title="Trimmed {{sample.setname}} R1"
+ data-caption="{{img.name}}" data-image="{{img.file}}"
+ data-target="#image-gallery">
+ <img width="200px" height="150px" ng-src="{{img.file}}"
+ style="border-radius:3px;border:solid 2px {{img.color}};display: block;">
+ </a>
+ </div>
+ <p class="inbox_text" ng-if="sample_img.raw.length ==2">R2</p>
+ <div style="display:inline-block">
+ <a class="thumbnail"
+ ng-repeat="img in sample_img.trimmed[1]| imgFilter: show_img" href="#"
+ data-image-id="" data-toggle="modal" data-title="Trimmed {{sample.setname}} R2"
+ data-caption="{{img.name}}" data-image="{{img.file}}"
+ data-target="#image-gallery">
+ <img width="200px" height="150px" ng-src="{{img.file}}"
+ style="border-radius:3px;border:solid 2px {{img.color}};display: block;">
+ </a>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+ <!-- END FastQC -->
+ <div class="tab-pane fade" id="information">
+ <h4>Commandline</h4>
+
+ <table class="table">
+ <tr ng-repeat="(attr, value) in information.commandline" ng-if="value !=''">
+ <td>{{attr}}</td>
+ <td>{{value}}</td>
+ </tr>
+ </table>
+ </div>
+ <!-- END tab content-->
+ </div>
+
+ </div>
+ </div>
+ </div>
+ <!-- END div row-->
+ <!-- Trigger the modal with a button -->
+
+ <!-- Modal -->
+ <div class="modal fade" id="image-gallery" tabindex="-1" role="dialog" aria-labelledby="myModalLabel"
+ aria-hidden="true">
+ <div class="modal-dialog">
+ <div class="modal-content">
+ <div class="modal-header">
+ <button type="button" class="close" data-dismiss="modal" aria-label="Close"><span
+ aria-hidden="true">&times;</span></button>
+ <div class="col-md-8 text-justify" id="image-gallery-title">
+ </div>
+
+ </div>
+ <div class="modal-body">
+ <img id="image-gallery-image" class="img-responsive" src="">
+ </div>
+ <div class="modal-footer">
+ <div class="col-md-8 text-justify" id="image-gallery-caption">
+ </div>
+ </div>
+ </div>
+ </div>
+ <!-- END Modal -->
+ </div>
+ <!-- END Summary Table Controller-->
+
+</div>
+
+</div>
+
+
+</body>
+</html>
diff --git a/classes.py b/classes.py
new file mode 100755
index 0000000..fb181fd
--- /dev/null
+++ b/classes.py
@@ -0,0 +1,681 @@
+import runQCPipeline
+import datetime
+import numpy as np
+import getpass
+import csv
+from helper import *
+import re
+from os.path import basename, splitext, join, exists, dirname, isdir
+from os import uname, listdir, remove
+import configparser
+import shutil
+from collections import OrderedDict
+import base64
+
+default_parameter = configparser.ConfigParser()
+default_parameter.read(join(dirname(__file__), "parameter.txt"))
+
def get_tool_path(name, section="PATH"):
    """Return the configured location of an external tool.

    Looks up ``name`` in the given ``section`` of config.txt, which lives
    next to this module.
    """
    config = configparser.ConfigParser()
    config.read(join(dirname(__file__), "config.txt"))
    return config[section][name]
+
class Pipeline:
    """Collects version information about the pipeline and every external tool."""

    name = "Quality Control"
    version = runQCPipeline.__version__
    user = getpass.getuser()
    python_version = toLatex(sys.version)

    def __init__(self):
        # FastQC prints its version string on stdout.
        self.fastqc_version = toLatex(subprocess.check_output(join(get_tool_path("fastqc"), "fastqc") + " --version", shell=True).decode("utf-8"))

        # Trimmomatic wrappers may not support "-version"; fall back to the
        # configured version string, or a hard-coded default.
        try:
            self.trimmo_version = subprocess.check_output(join(get_tool_path("trimmomatic"), "TrimmomaticSE") + " -version", shell=True).decode("utf-8")
        except:
            config = configparser.ConfigParser()
            config.read(join(dirname(__file__), "config.txt"))
            if "VERSION" in config:
                self.trimmo_version = config["VERSION"]["trimmomatic"]
            else:
                self.trimmo_version = "0.32"

        # Bowtie2 prints a multi-line banner; keep everything up to the ")".
        bowtie_banner = subprocess.check_output(join(get_tool_path("bowtie2"), "bowtie2") + " --version", shell=True).decode("utf-8")
        self.bowtie_version = re.match(".*\)", toLatex(bowtie_banner, True)).group()
        self.kraken_version = toLatex(subprocess.check_output(join(get_tool_path('kraken'), "kraken") + " --version", shell=True).decode("utf-8"))

        # Host/OS details for the report.
        host = uname()
        self.system = toLatex(host.sysname)
        self.server = toLatex(host.nodename)
        self.release = toLatex(host.release)
        self.operating_version = toLatex(host.version)
        self.machine = toLatex(host.machine)
+
class Sample:
    """One sequencing sample: bundles its read sets plus mapping and Kraken results."""

    def __init__(self, name, path, reference, threads, technology):
        # Wall-clock start; `stop` is filled in when processing finishes.
        self.start = datetime.datetime.strftime(datetime.datetime.now(), "%d.%m.%Y, %H:%M:%S")
        self.stop = None
        self.name = name
        self.technology = technology
        self.readSets = []        # list of ReadSet
        self.mappingRes = None    # MappingResult, set by run_Bowtie2
        self.krakenRes = None     # KrakenResult, set by run_Kraken
        self.reference = reference
        self.mainResultsPath = path
        self.threads = threads

    def total_reads(self):
        """Total read count summed over all trimmed read sets."""
        return sum(rs.trimRes.total for rs in self.readSets if rs.trimRes)

    def pTrimmed(self):
        """Mean trimming percentage over all read sets (0 when none are trimmed)."""
        percentages = [rs.trimRes.pTrimmed for rs in self.readSets if rs.trimRes]
        if not percentages:
            return 0
        return round(np.mean(percentages), 2)

    def nTrimmed(self):
        """Total number of trimmed reads summed over all read sets."""
        return sum(rs.trimRes.nTrimmed for rs in self.readSets if rs.trimRes)

    def add_readSet(self, readSet):
        """Attach a ReadSet; returns self for chaining."""
        self.readSets.append(readSet)
        return self

    def run_Kraken(self, db):
        """Classify all (trimmed) reads with Kraken against database ``db``."""
        r1, r2 = self.get_all_reads()
        self.krakenRes = KrakenResult(r1, r2, db, self.mainResultsPath, self.threads,
                                      splitext(self.readSets[0].r1.filename)[-1], self.name)
        return self

    def run_Bowtie2(self, bowtie_index, save_mapping):
        """Map all (trimmed) reads against the reference with Bowtie2."""
        r1, r2 = self.get_all_reads()
        self.mappingRes = MappingResult(self.reference, r1, r2,
                                        self.readSets[0].r1.phred, bowtie_index)
        self.mappingRes = self.mappingRes.run_bowtie(self.threads, save_mapping)
        return self.mappingRes

    def get_all_reads(self, trimmed=True):
        """Return space-joined (R1, R2) filename strings over every read set.

        With trimmed=True (default) the trimmed files are used, otherwise the
        raw inputs. Missing R2 entries (single-end sets) are skipped.
        """
        if trimmed:
            r1 = " ".join(rs.trimRes.readset.r1.filename for rs in self.readSets)
            r2 = " ".join(rs.trimRes.readset.r2.filename for rs in self.readSets
                          if rs.trimRes.readset.r2 is not None)
        else:
            r1 = " ".join(rs.r1.filename for rs in self.readSets)
            r2 = " ".join(rs.r2.filename for rs in self.readSets if rs.r2 is not None)
        return r1, r2

    def get_mean_trimRes(self):
        """Mean percentage of fragments that became shorter during trimming."""
        return round(np.mean([rs.trimRes.shorter_fragments_percentage for rs in self.readSets]), 2)
+
class ReadSet:
    """A pair of FASTQ files (R1 plus optional R2) and their QC/trim results."""

    def __init__(self, outputDir, r1, r2=None):
        # Accept either filename strings or ready-made FastQFile objects.
        self.r2 = None
        if type(r1) is str:
            self.r1 = FastQFile(r1)
            if r2 is not None:
                self.r2 = FastQFile(r2)
        else:
            self.r1 = r1
            if r2 is not None:
                self.r2 = r2
        self.numberOfReads = 0
        self.numberofTrimmedReads = 0
        self.trimRes = None
        self.outputDir = outputDir
        # Derive a set name from the R1 filename (Illumina-style "_" fields);
        # short names fall back to the plain basename.
        name_parts = basename(self.r1.filename).split("_")
        if len(name_parts[:-3]) > 4:
            self.setname = "_".join(name_parts[:-3])
        else:
            self.setname = basename(self.r1.filename)
        # Single-end when no R2 was supplied.
        self.paired = r2 is not None

    def run_FastQC(self, threads):
        """Run FastQC on R1 (and R2 when present); returns self."""
        self.r1 = self.r1.run_FastQC(threads, self.outputDir)
        if self.r2 is not None:
            self.r2 = self.r2.run_FastQC(threads, self.outputDir)
        return self

    def run_Trimmomatic(self, adapter, threads, palindrome, outputDir, technology, minlen, trimOption, betterTrimming):
        """Trim this read set with Trimmomatic; stores a TrimResult on self."""
        self.trimRes = TrimResult(adapter, palindrome, self.r1, self.r2, threads,
                                  outputDir, technology, minlen, trimOption, betterTrimming)
        return self

    def get_all_qc(self):
        """Pairs of (raw, trimmed) FastQC results, one tuple per mate."""
        qc_results = [(self.r1.qcRes, self.trimRes.r1.qcRes)]
        if self.r2 is not None:
            qc_results.append((self.r2.qcRes, self.trimRes.r2.qcRes))
        return qc_results

    def get_all_qc_dict(self):
        """FastQC plots as base64-embedded dicts, keyed 'raw'/'trimmed'.

        Each value is a list with one entry per mate (R1, optionally R2).
        """
        qc_dict = OrderedDict({"raw": [], "trimmed": []})
        qc_dict["raw"] = [[p.__dict__ for p in self.r1.qcRes.results_to_base64()]]
        qc_dict["trimmed"] = [[p.__dict__ for p in self.trimRes.readset.r1.qcRes.results_to_base64()]]
        if self.r2 is not None:
            qc_dict["raw"].append([p.__dict__ for p in self.r2.qcRes.results_to_base64()])
            qc_dict["trimmed"].append([p.__dict__ for p in self.trimRes.readset.r2.qcRes.results_to_base64()])
        return qc_dict
+
+
class FastQFile:
    """Wrapper around one FASTQ file; knows how to run FastQC on itself."""

    def __init__(self, absFilename):
        self.filename = absFilename
        self.qcRes = None          # QCResult, filled by run_FastQC
        self.log = ""
        self.phred = "phred33"     # updated if FastQC detects another encoding
        self.concat_files = None

    def run_FastQC(self, threads, outputDir):
        """Run FastQC on this file and store the parsed result on self.

        Echoes FastQC's progress output and picks the detected quality
        encoding ("phredNN") out of its stdout.
        """
        outputDir = join(outputDir, "FastQC")
        makedirs(outputDir, exist_ok=True)
        process = subprocess.Popen(
            [join(get_tool_path('fastqc'), "fastqc"), self.filename, "-o", outputDir,
             "-t", str(threads), "--extract"],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        for raw_line in iter(process.stderr.readline, b''):
            print(raw_line.decode("utf-8").replace("\n", ""), end="\r")
        for raw_line in iter(process.stdout.readline, b''):
            line = raw_line.decode("utf-8")
            print(line, end="\r")
            if not line.startswith("Approx"):
                # Only non-progress lines can carry the encoding message.
                match = re.match("Quality encoding detected as (?P<phred>\w+\d{2})", line)
                if match:
                    self.phred = match.group("phred")
        print("")
        process.communicate()
        self.qcRes = QCResult(join(outputDir, getFilenameWithoutExtension(basename(self.filename), level=5) + "_fastqc"))
        return self

    def get_filename(self):
        """Basename of the file, escaped for LaTeX."""
        return toLatex(basename(self.filename))
+###########
+## Result classes
+###########
+
class QCResult:
    """Parses a FastQC output directory into a list of Plot objects."""

    def __init__(self, resultsPath):
        self.path = resultsPath
        self.results = self.get_results(resultsPath)

    def get_results(self, resultsPath):
        """Read FastQC's summary.txt and build one Plot per plotted module.

        Modules without an image ("Basic Statistics", "Overrepresented
        sequences") are skipped; modules whose plot file cannot be found or
        built yield "NotExisting" placeholder plots.
        """
        summary_list = []
        no_plot = ["Basic Statistics", "Overrepresented sequences"]
        with open(join(resultsPath, "summary.txt"), "r") as summary:
            for row in csv.reader(summary, delimiter="\t"):
                if row[1] in no_plot:
                    continue
                try:
                    plot = Plot(row[0], row[1], resultsPath)
                    if exists(plot.file):
                        summary_list.append(plot)
                    else:
                        print("No plot file ", plot.file, row[1])
                        summary_list.append(Plot("NotExisting", row[1], "NotExisting"))
                except:
                    # Keep going with a placeholder if a plot cannot be built.
                    print(row[1], "does not exist")
                    summary_list.append(Plot("NotExisting", row[1], "NotExisting"))
        return summary_list

    def results_to_base64(self):
        """Inline every plot image as a data URI (mutates the Plot objects)."""
        converted = []
        for plot in self.results:
            with open(plot.file, "rb") as imgfile:
                encoded = base64.b64encode(imgfile.read())
            plot.file = 'data:image/png;base64,' + encoded.decode("utf-8")
            converted.append(plot)
        return converted
+
class Plot:
    """One FastQC plot: traffic-light colour, module name and image path."""

    def __init__(self, value, name, resultsPath):
        self.color = self.get_color(value)
        self.name = name
        self.file = self.name_to_file(resultsPath)

    def get_color(self, value):
        """Map a FastQC verdict (PASS/FAIL/WARN/...) to a display colour.

        WARN and any unrecognised verdict both render as orange.
        """
        verdict_colors = {"PASS": "green", "FAIL": "red"}
        return verdict_colors.get(value, "orange")

    def name_to_file(self, resultsPath):
        """Resolve the module name to its image path via parameter.txt, or ""."""
        fastqc_images = default_parameter["FastQC"]
        if self.name in fastqc_images.keys():
            return join(resultsPath, "Images", fastqc_images[self.name])
        else:
            return ""
+
+
class TrimParameters:
    """Static Trimmomatic settings read from parameter.txt."""

    # Maximum mismatch count that still allows a full adapter match.
    illuminaClip_seedMismatch = str(default_parameter["Trimmomatic"]["illuminaClip_seedMismatch"])
    # Required accuracy of the adapter-vs-read alignment.
    illuminaClip_simpleClip = str(default_parameter["Trimmomatic"]["illuminaClip_simpleClip"])
    # Remove leading bases with quality below this value.
    leading = str(default_parameter["Trimmomatic"]["leading"])
    # Remove trailing bases with quality below this value.
    trailing = str(default_parameter["Trimmomatic"]["trailing"])
    adapter_path = get_tool_path('adapter')

    def make_callstring(self, adapter, palindrome, technology, minlen, trimOption, betterTrimmingOption):
        """Assemble the Trimmomatic step string (ILLUMINACLIP ... MINLEN).

        The ILLUMINACLIP step is only added for Illumina data; the remaining
        steps apply regardless of technology.
        """
        steps = []
        if technology == "Illumina":
            steps.append("ILLUMINACLIP:" + join(self.adapter_path, adapter + ".fa")
                         + ":" + self.illuminaClip_seedMismatch
                         + ":" + str(palindrome)
                         + ":" + self.illuminaClip_simpleClip)
        steps.append(trimOption)
        steps.append(betterTrimmingOption)
        steps.append("MINLEN:" + str(minlen))
        return " ".join(steps)
+
+
class TrimResult:
    """Runs Trimmomatic for one read set and records trimming statistics."""

    parameters = TrimParameters()

    def __init__(self, adapter, palindrome, r1, r2, threads, outputDir, technology, minlen, trimOption, betterTrimming):
        self.readset = ReadSet(outputDir, r1, r2)
        self.adapter = adapter
        self.technology = technology
        # Output locations of the paired/unpaired trimmed files
        # (filled in by make_callstring when save=True).
        self.outputDir = outputDir
        self.r1_paired = None
        self.r2_paired = None
        self.r1_unpaired = None
        self.r2_unpaired = None
        self.logs = ""
        # Survivor-percentage difference between palindrome thresholds.
        self.shorter_fragments_percentage = 0
        self.threads = threads
        self.input_number = 0
        self.nTrimmed = 0
        self.pTrimmed = 0
        self.extraTrimOptions = ""
        # Trimming is kicked off immediately on construction.
        self.run(palindrome, outputDir, minlen, trimOption, betterTrimming)
        self.blobplot = ""
+
+ def make_callstring(self, palindrome, path, technology, minlen, trimOption, save = False, betterTrimmingOption =""):
+ if self.readset.paired:
+ if save:
+ self.r1_paired =join(path, "r1_P_" + getFilenameWithoutExtension(self.readset.r1.filename, level = 2,getBase = True) + ".fastq")
+ self.r1_unpaired = join(path, "r1_UP_" + getFilenameWithoutExtension(self.readset.r1.filename, level=2, getBase = True)+ ".fastq")
+ self.r2_paired =join(path, "r2_P_" + getFilenameWithoutExtension(self.readset.r2.filename, level=2,getBase = True)+ ".fastq")
+ self.r2_unpaired =join(path, "r2_UP_" + getFilenameWithoutExtension(self.readset.r2.filename,level=2, getBase = True)+ ".fastq")
+ output = " ".join([self.r1_paired, self.r1_unpaired, self.r2_paired, self.r2_unpaired])
+ else:
+ output = "/dev/null /dev/null /dev/null /dev/null"
+ callProcess = [join(get_tool_path("trimmomatic"), "TrimmomaticPE "),
+ "-threads", str(self.threads),
+ "-" + self.readset.r1.phred,
+ self.readset.r1.filename, # input 1
+ self.readset.r2.filename, # input 2
+ output,
+ self.parameters.make_callstring(self.adapter, palindrome, technology, minlen, trimOption, betterTrimmingOption),
+ ]
+ else:
+ callProcess = [join(get_tool_path("trimmomatic"),"TrimmomaticSE "),
+ "-threads", str(self.threads),
+ "-" + self.readset.r1.phred,
+ self.readset.r1.filename, # input 1
+ join(path, getFilenameWithoutExtension(self.readset.r1.filename,level=2, getBase = True)+ ".fastq"),#output
+ self.parameters.make_callstring(self.adapter, palindrome, technology, minlen, trimOption, betterTrimmingOption),
+ ]
+ self.r1_unpaired = join(path,getFilenameWithoutExtension(self.readset.r1.filename,level=2, getBase = True)+ ".fastq")
+ return callProcess
+
+ def check_quality(self, path, trim_perc):
+ #makedirs(join(path, "temp"), exist_ok=True)
+ #path= join(path, "temp")
+ if self.readset.paired:
+ try:
+ r1_name = join_reads([self.r1_paired, self.r1_unpaired], path, basename(self.readset.r1.filename))
+
+ print("Run FastQC to optimize trimming.")
+ process = subprocess.Popen( [join(get_tool_path('fastqc'), "fastqc"), r1_name,"--nogroup", "--extract", "-o", path,"--threads", str(self.threads)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ process.communicate()
+ trim_bool, headcrop, crop = optimize_trimming(join(path, getFilenameWithoutExtension(basename(r1_name), level=5) + "_fastqc"),trim_perc)
+
+ #if r1 was ok, go on and check r2
+ print("Check 'per sequence base content' of R2.")
+
+ r2_name = join_reads([self.r2_paired, self.r2_unpaired], path, basename(self.readset.r2.filename))
+ process = subprocess.Popen(
+ [join(get_tool_path('fastqc'), "fastqc"), r2_name,"--nogroup", "--extract", "-o",path, "--threads", str(self.threads)],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ for line in iter(process.stderr.readline, b''):
+ print(line)
+ for line in iter(process.stdout.readline, b''):
+ print(line)
+ process.communicate()
+ trim_bool, r2_headcrop, r2_crop = optimize_trimming(join(path, getFilenameWithoutExtension(basename(r2_name), level=5) + "_fastqc"), trim_perc)
+
+ return trim_bool, max(headcrop, r2_headcrop), min(crop, r2_crop)
+ #if headcrop >= 0 or crop > 0:
+ finally:
+ shutil.rmtree(path)
+ else:
+ process = subprocess.Popen([join(get_tool_path('fastqc'), "fastqc"), self.r1_unpaired,"--nogroup", "--extract", "-o", join(path,"temp"), "--threads", str(self.threads)],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ for line in iter(process.stdout.readline, b''):
+ print(line, end="\r")
+ process.communicate()
+ trim_bool, headcrop, crop = optimize_trimming(join(path,"temp", getFilenameWithoutExtension(basename(self.r1_unpaired), level=5) + "_fastqc"), trim_perc)
+
+ return trim_bool, headcrop, crop
+
    def run(self, palindrome, path, minlen, trimOption, betterTrimming):
        '''Run Trimmomatic on this read set and record trimming statistics.

        palindrome: ILLUMINACLIP palindrome clip threshold (30 or 1000).
        path: output directory for trimmed reads.
        minlen: MINLEN parameter for Trimmomatic.
        trimOption: extra trimming step string (e.g. "SLIDINGWINDOW:4:20").
        betterTrimming: non-empty string enables the HEADCROP/CROP optimization.
        Returns self.
        '''
        # Report Trimmomatic Output
        # Run palindrome parameter which should be kept at last
        survived = {'30': 0, '1000': 0}

        call = self.make_callstring(palindrome, path, self.technology, minlen, trimOption, save=True)
        process = subprocess.Popen(" ".join(call), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        survived1, total, nTrimmedReads, pTrimmedReads, log = trimmomatic_report(process, call[0], True)
        process.communicate()

        ### Optimize trimming
        if betterTrimming != "":
            print("Improve trimming parameter")
            makedirs(join(path, "temp"), exist_ok=True)
            print(join(path, "temp"), betterTrimming)
            trim_bool, final_headcrop, final_crop = self.check_quality(join(path, "temp"), betterTrimming)

            # Re-run Trimmomatic with the additional HEADCROP/CROP options when
            # the per-base content plots suggested cropping.
            if trim_bool:
                self.extraTrimOptions = "HEADCROP:" + str(final_headcrop) + " CROP:" + str(final_crop)
                call = self.make_callstring(palindrome, path, self.technology, minlen, trimOption, save=True, betterTrimmingOption=self.extraTrimOptions)
                process = subprocess.Popen(" ".join(call), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
                survived1, total, nTrimmedReads, pTrimmedReads, log = trimmomatic_report(process, call[0], True)
                process.communicate()

        self.total = total
        self.nTrimmed = nTrimmedReads
        self.pTrimmed = pTrimmedReads
        self.logs += log

        ############################
        survived[str(palindrome)] = survived1
        if self.readset.r2 is not None:
            # Run the complementary palindrome setting once more (discarded
            # output, save=False) to estimate the share of fragments shorter
            # than the read length from the difference in survival rates.
            if palindrome == 30:
                palindrome = 1000
            else:
                palindrome = 30
            process = subprocess.Popen(" ".join(self.make_callstring(palindrome, path, self.technology, minlen, trimOption, save=False, betterTrimmingOption=self.extraTrimOptions)),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE, shell=True)
            survived2, total, nTrimmedReads, pTrimmedReads, temp = trimmomatic_report(process, call[0], False)
            process.communicate()
            survived[str(palindrome)] = survived2
            self.shorter_fragments_percentage = round(survived["1000"] - survived["30"], 2)

        return self
+
    def run_FastQC(self, outputDir, tempdir):
        '''Run FastQC on the trimmed reads; paired reads are concatenated
        (paired + unpaired survivors) into temporary files first.

        Returns self with self.readset replaced by a fresh ReadSet.
        '''
        if self.readset.paired:
            #tempdir = tempfile.mkdtemp()
            r1_name = join_reads([self.r1_paired, self.r1_unpaired], tempdir, basename(self.readset.r1.filename))
            r2_name = join_reads([self.r2_paired, self.r2_unpaired], tempdir, basename(self.readset.r2.filename))
            try:
                self.readset = ReadSet(outputDir, r1_name, r2_name)
                self.readset.run_FastQC(self.threads)
            except:
                # NOTE(review): any failure aborts the whole pipeline here.
                print(sys.exc_info())
                sys.exit()
            finally:
                # The concatenated temp files are only needed by FastQC.
                remove(r1_name)
                remove(r2_name)
            # Record the original (paired + unpaired) file names for reporting.
            self.readset.r1.filename = " ".join([self.r1_paired, self.r1_unpaired])
            self.readset.r2.filename = " ".join([self.r2_paired, self.r2_unpaired])
        else:
            self.readset = ReadSet(outputDir, self.r1_unpaired)
            self.readset.run_FastQC(self.threads)
        return self
+
    def run_blobplot(self, db):
        '''Run the assembly/blast/blobtools chain on all non-empty trimmed
        read files; stores the resulting plot path on self.blobplot.'''
        all_inputs = [x for x in [self.r1_paired, self.r1_unpaired, self.r2_paired, self.r2_unpaired] if x is not None]
        blobplot = BlobResult()
        # setname: strip everything after the first "." of the read-set name.
        self.blobplot = blobplot.run(all_inputs, self.outputDir, self.threads, self.readset.setname.split(".")[0], db)
        return self
    def print_param(self, minlen, trimOption, pali):
        '''Return the Trimmomatic parameter string used, for the report.'''
        return self.parameters.make_callstring(self.adapter, pali, self.technology, minlen, trimOption, self.extraTrimOptions)
+
class MappingParameter:
    # Placeholder: the mapping step currently takes no configurable
    # parameters; the constructor argument is accepted but ignored.
    def __init__(self, mapping):
        pass
+
class MappingResult:
    '''Builds a bowtie2 index for a reference and maps reads against it,
    accumulating aligned-read counts parsed from the bowtie2 summary.'''

    def __init__(self, reference, r1_name, r2_name, phred, bowtie_index):
        self.numberOfAlignedReads = 0
        self.percentAlignedReads = 0
        self.index = self.build_index(reference, bowtie_index)
        self.log = ""          # LaTeX-escaped bowtie2 output for the report
        self.r1_name = r1_name
        self.r2_name = r2_name # may be None for single-end data
        self.phred = phred

    def build_index(self, reference, index):
        '''Build a bowtie2 index under directory `index`; returns the index
        prefix passed to bowtie2 via -x.'''
        index_name = join(index, getFilenameWithoutExtension(basename(reference), level=2) + "_bt2")  # reference name "bowtie2 build index_name"
        # NOTE(review): if `index` is not an existing directory it is returned
        # unchanged — presumably a prebuilt index prefix was passed; confirm.
        if not isdir(index):
            return index
        process = subprocess.Popen([join(get_tool_path("bowtie2"), "bowtie2-build"), reference, index_name], stderr=subprocess.PIPE, stdout=subprocess.PIPE)

        for line in iter(process.stderr.readline, b''):
            line = line.decode("utf-8")
            print(line)
        process.communicate()
        print("Bowtie2-build. Done.")
        return index_name


    def run_bowtie(self, threads, save_mapping):
        '''Run bowtie2 and parse its alignment summary; SAM goes to
        save_mapping. Returns self.'''
        print("Run Bowtie2")
        call = [join(get_tool_path("bowtie2"), "bowtie2"), "-x", self.index, "--no-unal --threads ", str(threads)]
        # Unescaped whitespace in the file list is turned into bowtie2's
        # comma-separated input syntax.
        # NOTE(review): R1 and R2 are both passed with -U (unpaired mode),
        # not -1/-2 — confirm unpaired alignment is intended for paired data.
        call.extend(["-U", re.sub(r"(?<!\\)\s", ",", self.r1_name)])
        if self.r2_name is not None:
            call.extend(['-U', re.sub(r"(?<!\\)\s", ",", self.r2_name)])
        call.extend(["-S", save_mapping, "2>&1"])
        #print("Call ", " ".join(call))
        process = subprocess.Popen(" ".join(call), stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)

        for line in iter(process.stdout.readline, b''):
            line = line.decode("utf-8")
            print(line)
            self.log += toLatex(line) + "\n"
            # Sum reads that aligned exactly once and more than once.
            pattern1 = re.match(".* (?P<aligned_exact>\d+) \((?P<percent_aligned_exact>\d+.\d+)%\) aligned exactly 1 time", line)
            pattern2 = re.match(".* (?P<aligned_more_than_one>\d+) \((?P<percent_aligned_more_than_one>\d+.\d+)%\) aligned >1 times", line)
            if pattern1:
                self.numberOfAlignedReads += int(pattern1.group("aligned_exact"))
                self.percentAlignedReads += float(pattern1.group("percent_aligned_exact").replace(",", "."))
            if pattern2:
                self.numberOfAlignedReads += int(pattern2.group("aligned_more_than_one"))
                self.percentAlignedReads += float(pattern2.group("percent_aligned_more_than_one"))
        self.percentAlignedReads = round(self.percentAlignedReads, 2)

        for line in iter(process.stderr.readline, b''):
            line = line.decode("utf-8")
            # stderr lines are highlighted red in the LaTeX report.
            self.log += "\\textcolor{red}{" + toLatex(line) + "}\n"
            print(line)
        process.communicate()
        print("Bowtie2 done.")
        return self
+
+
class KrakenResult:
    '''Runs Kraken on one read set and stores the classification summary.

    Attributes: report_name (path of the raw kraken report), report (text
    form produced by kraken-report), nClassified/pClassified (parsed from
    kraken's stderr summary), log (LaTeX-escaped output for the report),
    output (filtered kraken stdout).
    '''

    def __init__(self, r1, r2, db, outputDir, threads, fileext, project_name):
        self.report_name = None
        self.report = ""
        self.pClassified = 0
        self.nClassified = 0
        self.log = ""
        # Bug fix: run_kraken appends to self.output, but it was never
        # initialized here, raising AttributeError on the first stdout line
        # that passed the filter.
        self.output = ""
        self.run_kraken(r1, r2, db, outputDir, threads, fileext, project_name)

    def make_callstring(self, r1, r2, db, outputDir, threads, fileext, project_name):
        '''Build the kraken command line as one string and set
        self.report_name to a non-colliding <project>.krakenreport path.
        `fileext` is currently unused (kept for interface compatibility).'''
        call = [join(get_tool_path('kraken'), "kraken"), "--db", db, r1]
        if r2 is not None:
            # NOTE(review): reads are passed as two plain inputs, not with
            # kraken's --paired flag (commented out upstream) — confirm intent.
            call.append(r2)  # + " --paired")
        call.append("--preload")
        call.append("--threads " + str(threads))
        self.report_name = join(outputDir, project_name + ".krakenreport")
        if exists(self.report_name):
            i = 2
            new_name = join(outputDir, project_name + "_" + str(i) + ".krakenreport")
            while exists(new_name):
                i += 1
                new_name = join(outputDir, project_name + "_" + str(i) + ".krakenreport")
            self.report_name = new_name

        call.append("--output " + self.report_name)
        return " ".join(call)

    def run_kraken(self, r1, r2, db, outputDir, threads, fileext, project_name):
        '''Execute kraken, parse the "sequences classified" summary from its
        stderr, collect remaining output, then convert the report.'''
        # Bug fix: the call string was previously built TWICE (once for Popen,
        # once for logging). The second call re-checked exists(report_name)
        # after kraken had already been started with --output, so
        # self.report_name could be re-pointed to a "_2.krakenreport" that is
        # never written, breaking convert_report. Build it once and reuse.
        call = self.make_callstring(r1, r2, db, outputDir, threads, fileext, project_name)
        process = subprocess.Popen(call, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        self.log += "\\textcolor{gray}{Using Parameter " + " ".join(["\\path{" + x + "}" for x in call.split(" ")]) + "}\\\\"

        for line in iter(process.stderr.readline, b''):
            line = line.decode("utf-8")
            print(line)
            # Skip kraken's progress chatter; keep everything else for the log.
            if not (line.startswith("Loading database") or re.search(r"Processed \d+", line)):
                #self.output += line + "\n"
                self.log += toLatex(line) + "\n"
                pattern = re.match(r"\s+(?P<nClassified>\d+) sequences classified \((?P<pClassified>\d+.\d+)%\)", line)
                if pattern:
                    self.nClassified = pattern.group("nClassified")
                    self.pClassified = pattern.group("pClassified")

        for line in iter(process.stdout.readline, b''):
            line = line.decode("utf-8")
            print(line)
            if not (re.search(r"Processed \d+", line) or line.startswith("Loading database")):
                self.output += line + "\n"

        process.communicate()

        print("Convert Kraken-Report..")
        self.convert_report(db)
        return self

    def convert_report(self, db):
        '''Run kraken-report to render the raw report as readable text,
        stored in self.report.'''
        process = subprocess.check_output([join(get_tool_path('kraken'), 'kraken-report'), "--db", db, self.report_name])
        self.report = process.decode("utf-8")
        #converted_kraken = open("kraken_output.csv","w")
        #converted_kraken.write(self.report)
        #converted_kraken.close()
        return self
+
+
class AssemblyParameter:
    # Defaults for the pre-blobplot assembly step.
    method = "velvet"                # assembler to invoke
    kmer = "35"                      # velveth k-mer length
    input_type = "-fmtAuto -short"   # velveth input format flags
+
class BlastParameter:
    '''Defaults for the megablast search that feeds blobtools.'''
    method = "blastn"
    task = "megablast"
    evalue = "1e-5"
    max_target_seqs = "1"
    # Tabular output with taxonomy columns expected by blobtools.
    outfmt = "'6 qseqid staxids bitscore std sscinames sskingdoms stitle' "
    extraParams = "-culling_limit 5"

    def __init__(self):
        # BLAST nt database path from the tool configuration.
        self.db = join(get_tool_path("blast_db", "DEFAULT"), "nt")

    def print(self):
        '''Render the blastn command prefix (without -query/-out).'''
        return " ".join([self.method, "-task", self.task, "-evalue", self.evalue, "-max_target_seqs", self.max_target_seqs, "-outfmt", self.outfmt, self.extraParams, "-db", self.db])
+
class BlobParameter:
    '''Taxonomy inputs for "blobtools create": either a prebuilt BlobDB
    (self.db, preferred when set) or the raw NCBI nodes/names dumps.'''

    def __init__(self, taxonomy_path):
        # No BlobDB by default; point at the taxonomy dump files instead.
        self.db = ""
        self.nodes = join(taxonomy_path, "taxonomy/nodes.dmp")
        self.names = join(taxonomy_path, "taxonomy/names.dmp")

    def print(self):
        '''Render the taxonomy CLI arguments for blobtools.'''
        if self.db != "":
            return "--db " + self.db
        return " ".join(["--nodes", self.nodes, "--names", self.names])
+
class BlobResult:
    '''Assembles reads with velvet, megablasts the contigs, and renders a
    blobplot; run() returns the plot image path (or None on failure).'''
    assParams = AssemblyParameter()
    blastParams = BlastParameter()

    def run(self, input, output, threads, setname, db):
        '''input: list of read files; output: result directory; db: taxonomy
        path for BlobParameter. Returns the path printed by blobtools plot,
        or None if no "Plotting" status line was seen.'''
        print("Run blobtools...")
        self.blobParams = BlobParameter(db)
        contigs = ""
        #ASSEMBLY
        print("...first assemble")
        if self.assParams.method == "velvet":
            # Each read file is passed to velveth as a separate -short input.
            concat_input = " -short ".join(input)
            full_output = join(output, "Velvet", setname)
            makedirs(full_output, exist_ok=True)
            #print(" ".join(["velveth", full_output, self.assParams.kmer, self.assParams.input_type, concat_input]))

            process = subprocess.Popen(" ".join(["velveth", full_output, self.assParams.kmer, self.assParams.input_type, concat_input]), shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            '''for line in iter(process.stderr.readline, b''):
                print(line)
            for line in iter(process.stdout.readline, b''):
                print(line)'''
            process.communicate()
            print(["velvetg", full_output])
            process = subprocess.Popen(" ".join(["velvetg", full_output]), shell=True, stderr=subprocess.PIPE,
                                       stdout=subprocess.PIPE)
            '''for line in iter(process.stderr.readline, b''):
                print(line)
            for line in iter(process.stdout.readline, b''):
                print(line)'''
            process.communicate()

            contigs = join(full_output, "contigs.fa")

        # blast search
        blast_result = join(output, setname + "." + self.blastParams.evalue + ".megablast")
        print("...blastn ")

        process = subprocess.Popen(" ".join([self.blastParams.print(), "-query", contigs, "-out", blast_result, "-num_threads", str(threads)]), shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        for line in iter(process.stderr.readline, b''):
            print(line)
        for line in iter(process.stdout.readline, b''):
            print(line)
        process.communicate()

        # blobtools
        img = None
        blob_output = join(output, setname)
        print("Run blobtools")
        process = subprocess.Popen(" ".join([join(get_tool_path("blobtools"), "blobtools"), "create", self.blobParams.print(), "-y", self.assParams.method, "-t", blast_result, "-i", contigs, "-o", blob_output]), shell=True,
                                   stderr=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        for line in iter(process.stderr.readline, b''):
            print(line)
        for line in iter(process.stdout.readline, b''):
            print(line)
        process.communicate()
        process = subprocess.Popen(" ".join([join(get_tool_path("blobtools"), "blobtools"), "plot", "-i", blob_output + ".BlobDB.json"]), shell=True,
                                   stderr=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        for line in iter(process.stdout.readline, b''):
            line = line.decode()
            # The plot file path is only reported via a [STATUS] line.
            if line.startswith("[STATUS]"):
                if "Plotting" in line:
                    pattern = re.match("\[STATUS\]\t: Plotting (?P<path>.*)", line)
                    img = pattern.group("path")
            print(line)

        process.communicate()

        return img
diff --git a/combineFastQC.py b/combineFastQC.py
new file mode 100755
index 0000000..b0ca7d3
--- /dev/null
+++ b/combineFastQC.py
@@ -0,0 +1,431 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# created by Stephan Fuchs (FG13, RKI), 2015
+
+'''
+Combine FastQC Results
+Input: Project/Sample Folder which was created by runQCPipeline (folder contains QCResults - <FastQC_Result>)
+Output: html file
+'''
+
+# config
+version = '0.2.9'
+logsep = '=====================================\n'
+
+import os
+import sys
+import time
+import shutil
+import base64
+import tempfile
+import matplotlib
+
+matplotlib.use("Agg") #in case no display is set
+import matplotlib.pyplot as plt
+import argparse
+import numpy as np
+
+
+
+#advanced config - HTML output
+htmlheader = '<!doctype html>' \
+ '<html>' \
+ '<head>' \
+ '<meta charset="utf-8">' \
+ '<title>Combined FastQC Report</title>' \
+ '<script language="javascript">top.DSfirstAccess = 1442945125;top.DSmaxTimeout=3600;top.DStimediff=1442945348 - parseInt(""+(new Date()).getTime()/1000);</script><SCRIPT language="javascript" id="dsshim" src="/dana-cached/js/shimdata.cgi"></SCRIPT><SCRIPT language="javascript" id="dsfunc" src="/dana-cached/js/oth.js?a4e535923255f21d1261a51909daf29f"></SCRIPT><SCRIPT language="javascript" id="dstimeout" src="/dana-cached/js/sessiontimeout.js?a4e535923255f21d1261a51909daf29f"></SCRIPT> [...]
+ '//<![CDATA[' \
+ 'var dsnodocwrites = 0 ; var DanaCookie="MstrPgLd1=1; MstrPgLd2=1; UserContext=mBoHOGHwbUOz5yuRe-kSp4w1h6L4yNIICPjPhC-9ifhLTYXFL0EmDebAMytIx2kDia-CCTHmi8Y.; tzid=W. Europe Standard Time"; var DSHost="webmail.rki.de"; var DSDanaInfoStr=""; var DSLang="de"; var DSObfuscateHostname=0;var DSTBSettings=17425;var DSTBLU=\'/dana/home/starter.cgi?startpageonly=1\';;danaSetDSHost();' \
+ '//]]>' \
+ '</SCRIPT></head>' \
+ '<body>'
+
+htmlfooter = '</table>\n</body>\n<SCRIPT language="javascript" id="dstb-id">dstb()</SCRIPT></html>\n'
+
+#advanced config - fastqc img extraction
+order = ['sld', 'sq', 'sgc', 'bq', 'bnc', 'kmer', 'dup']
+imgfiles = {'dup': 'duplication_levels.png',
+ 'kmer': 'kmer_profiles.png',
+ 'bgc': 'per_base_gc_content.png',
+ 'bnc': 'per_base_n_content.png',
+ 'bq': 'per_base_quality.png',
+ 'bsc': 'per_base_sequence_content.png',
+ 'sgc': 'per_sequence_gc_content.png',
+ 'sq': 'per_sequence_quality.png',
+ 'sld': 'sequence_length_distribution.png',
+ "ptsq": "per_tile_quality.png",
+ "ac": "adapter_content.png"
+ }
+#advanced config - fastqc data extraction
+modules = {'sld': '>>Sequence Length Distribution\t', 'sq': '>>Per sequence quality scores\t',
+ 'sgc': '>>Per sequence GC content\t'}
+all_modules = {'bq': ">>Per base sequence quality",
+ 'bsc': ">>Per base sequence content",
+ 'bgc': ">>Per base GC content",
+ 'bnc': ">>Per base N content",
+ 'sld': ">>Sequence Length Distribution",
+ 'dup': ">>Sequence Duplication Levels",
+ 'kmer': ">>Kmer Content",
+ 'sq': '>>Per sequence quality scores',
+ 'sgc': '>>Per sequence GC content',
+ 'ptsq': '>>Per tile sequence quality',
+ 'ac':'>>Adapter Content'
+ }
+codes = {'comment': '#', 'modulestart': '>>', 'moduleend': '>>END_MODULE'}
+
+#advanced config - boxplot design
+
+#processing command line arguments
+'''parser = argparse.ArgumentParser(prog="COMBINE_FASTQC",
+ description='performs FastQC for multiple fastq files and combines results in a single html file.')
+parser.add_argument('file', metavar="FILE(S)", help="input BAM file", type=argparse.FileType('r'), nargs='+')
+parser.add_argument('-t', metavar='INT', help="number of threads/CPUs used for FastQC", type=int, action="store",
+ default=4)
+parser.add_argument('--version', action='version', version='%(prog)s ' + version)
+
+args = parser.parse_args()
+'''
+
+
+#functions
def createCSV(dataset, lines, key, path):
    '''Extract plot values from one FastQC module and append a CSV row.

    dataset: row label (commas are replaced since "," is the separator).
    lines: the module's data lines (tab-separated FastQC columns).
    key: module key — "sld", "sq", "sgc" or "dup" decide the extraction.
    path: directory receiving/containing <key>.csv (opened in append mode).
    '''
    output = []
    for line in lines:
        fields = line.strip().split('\t')

        # Sequence Length Distribution: length may be a range "a-b" -> mean.
        if key == "sld":
            bounds = fields[0].split('-')
            try:
                mean = (int(bounds[0]) + int(bounds[1])) / 2
            except (IndexError, ValueError):
                # single length, no range
                mean = int(bounds[0])
            output.extend([str(mean)] * int(float(fields[1])))

        # Per-sequence quality or GC content: first column is the value.
        if key == "sq" or key == 'sgc':
            v = fields[0]  # quality or gc [%]
            output.extend([str(v)] * int(float(fields[1])))

        # Duplication levels: normalize ">10k+"-style labels to integers;
        # counts are scaled by 100 since FastQC reports percentages.
        if key == 'dup':
            c = fields[0].replace('>', '')
            c = c.replace('k', '000')
            c = c.replace('+', '')
            output.extend([str(c)] * int(float(fields[1]) * 100))

    # "with" guarantees the handle is closed (the original leaked it).
    with open(path + '/' + key + ".csv", 'a') as csvhandle:
        csvhandle.write(dataset.replace(',', '_') + "," + ",".join(output) + '\n')
+
+
def color(line):
    '''Map a FastQC module verdict (the line's trailing pass/warn/fail
    token) to the frame colour used in the combined report.'''
    verdict_colors = (
        ("pass", "#3CBC3C"),
        ("warn", "orange"),
        ("fail", "red"),
    )
    stripped = line.strip()
    for suffix, colour in verdict_colors:
        if stripped.endswith(suffix):
            return colour
    return "black"
+
+
def get_key(modules_map, line):
    '''Return the key of the first entry in modules_map whose value is a
    prefix of `line`, or None when no module header matches.

    (Parameter renamed from `dict`, which shadowed the builtin; all call
    sites pass it positionally, so the interface is unchanged.)
    '''
    for key, value in modules_map.items():
        if line.startswith(value):
            return key
    return None
+
+
def barplot(overview, width, height, tmpdir, type, xlabel, ylabel, title):
    '''Draw a bar chart for one overview column and save it as
    <tmpdir>/<type>.png.

    overview: tuple (reference_name, structured numpy array); the array has
    a 'name' column (bytes) plus the numeric column selected by `type`.
    '''
    bplotfont = {'weight': 'bold', 'size': 5}
    plt.rc('font', **bplotfont)
    fig = plt.figure(1, figsize=(width, height))
    ax = fig.add_subplot(111)
    index = np.arange(overview[1].shape[0])
    opacity = 0.4
    bar_width = 0.35
    bplot = plt.bar(index, overview[1][type], bar_width,
                    alpha=opacity,
                    color='#385d8a')  #, label='Aligned Reads')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    # Names are stored as bytes in the structured array -> decode for labels.
    plt.xticks(index + (bar_width / 2), [x.decode("utf-8") for x in overview[1]['name']], rotation=90)
    # plt.ylim((0,100))

    # Annotate each bar with its numeric value.
    for rect, label in zip(bplot, overview[1][type]):
        ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height(), str(label), ha='center', va='bottom')

    fig.savefig(os.path.join(tmpdir, type + ".png"), bbox_inches='tight', dpi=200)
    fig.clf()
+
+
def combineFastQC(project_folder, overview=None, tmp="tmp"):
    '''Collect all FastQC results below project_folder into one standalone
    HTML report (images embedded as base64), including comparative boxplots
    and optional overview barplots.

    overview: optional (reference_name, structured array) as built in main.
    tmp: suffix for the temporary working directory (removed on exit).
    '''
    # Find an output file name that is not already taken (timestamp based);
    # gives up after 30 collisions.
    err = 1
    while err > 0:
        timecode = time.strftime("%Y-%m-%d_%H-%M-%S")
        html_fname = "FASTQC_COMBINE_" + timecode + ".html"
        if os.path.isfile(html_fname):
            err += 1
            time.sleep(1)
        else:
            err = 0
        if err == 30:
            print("ERROR: giving up to find available output file name")
            sys.exit(1)
    #working
    try:
        tmpdir = tempfile.mkdtemp(tmp)
        os.makedirs(tmpdir, exist_ok=True)
        # The report is written as two chunks: [0] header+summary, [1] body.
        chunkfiles = [tmpdir + '/' + html_fname + '1', tmpdir + '/' + html_fname + '2']
        outhandle = open(chunkfiles[1], "w")

        print("Summarize fastqc ", project_folder)
        fastqc_dirs = []
        #sample_name=[]
        # Locate every *_fastqc result directory, pairing each with its
        # trimmed counterpart when a Trimmed/FastQC directory exists.
        for sample in os.listdir(project_folder):
            if os.path.isdir(os.path.join(project_folder, sample)):
                if sample == "QCResults":
                    sample = ""
                path = os.path.join(project_folder, sample)
                if "QCResults" in os.listdir(path):
                    path = os.path.join(path, "QCResults")

                if "FastQC" in os.listdir(path):
                    path = os.path.join(path, "FastQC")

                print("Found path " + path)
                if os.path.exists(path):
                    fastqc_folder = sorted([x for x in os.listdir(path) if x.endswith("_fastqc")])

                    if len(fastqc_folder) != 0:
                        path_trimmed = os.path.join(path, "..", "Trimmed", "FastQC")
                        if os.path.exists(path_trimmed):
                            for fastqc in fastqc_folder:
                                fastqc_dirs.append(os.path.join(path, fastqc))
                                fastqc_dirs.append(os.path.join(path_trimmed, fastqc))
                        else:
                            fastqc_dirs = fastqc_folder
                        fastqc_folder = None

        i = 0
        outhandle.write('<table border = "1"> <tr>')

        # Parse each fastqc_data.txt: collect module data for the boxplot
        # CSVs and remember each module's pass/warn/fail frame colour.
        for fastqcdir in fastqc_dirs:
            i += 1
            print("extracting data ", fastqcdir)
            store = False
            data = []
            fastqcdatafile = open(os.path.join(fastqcdir, "fastqc_data.txt"), "r")
            frame_color = {}
            for line in iter(fastqcdatafile):
                if line.startswith(codes['comment']):
                    pass #print("comment")
                    #continue
                elif line.startswith(codes['moduleend']):
                    # Flush the collected module (skip its two header lines).
                    if len(data) > 0:
                        name1 = os.path.basename(fastqcdir)
                        if "/Trimmed/" in fastqcdir:
                            name1 = "Trimmed " + name1
                        createCSV(name1, data[2:], key, tmpdir)
                    data = []
                    store = False

                elif line.startswith(codes['modulestart']):
                    key = get_key(all_modules, line)
                    if key in modules.keys():
                        store = True
                    if key is not None:
                        frame_color[key] = color(line)
                if store:
                    data.extend([line])
            fastqcdatafile.close()

            #image processing
            # Embed the module images for this FastQC run as one table row.
            name = os.path.basename(fastqcdir)
            if "/Trimmed/" in fastqcdir:
                name = "Trimmed " + os.path.basename(fastqcdir)
            outhandle.write('<tr>\n<td><b>' + name + "</b></td>\n")
            for imgkey in order:
                try:
                    with open(os.path.join(fastqcdir, "Images", imgfiles[imgkey]), "rb") as imgfile:
                        imgstring = base64.b64encode(imgfile.read())
                    outhandle.write('<td><img style="border: 5px solid ' + frame_color[imgkey] + ';" alt="Embedded Image" src="data:image/png;base64,' + imgstring.decode(
                        "utf-8") + '" /></td>\n')
                except:
                    # missing image or colour -> empty cell
                    outhandle.write('<td></td>\n')
            outhandle.write('</tr>\n')
            if "/Trimmed/" in fastqcdir:
                outhandle.write('</tr><tr></tr><tr></tr><tr></tr><tr></tr><tr></tr><tr></tr>')

        outhandle.write(htmlfooter)
        outhandle.close()

        bplotfont = {'weight': 'bold', 'size': 5}
        plt.rc('font', **bplotfont)

        #generate boxplots
        print("generating comparative boxplots...")
        xclasses = range(1, len(fastqc_dirs) + 1)
        # Figure size grows with the number of result sets, capped at 12x7.
        width = 4.5
        height = 3.5
        if (len(fastqc_dirs) / 3) > width:
            width = len(fastqc_dirs) / 3
            height = len(fastqc_dirs) / 4
        #max width
        if width > 12:
            width = 12
            height = 7
            bplotfont = {'weight': 'bold', 'size': 3}
            plt.rc('font', **bplotfont)

        xlabels = []
        for key in order:
            if key in modules:
                data = []

                plottitle = modules[key].replace(">", '')
                plottitle = plottitle.replace("\t", '')
                print("Plot ", plottitle)

                #csvhandle = open(tmpdir + '/' + key + ".csv", 'r')
                with open(tmpdir + '/' + key + ".csv", 'r') as csvfile:
                    csvhandle = csv.reader(csvfile, delimiter=',', quotechar='|')
                    for row in csvhandle:
                        xlabels.append(row[0])
                        data.append([float(x) for x in row[1:]])

                fig = plt.figure(1, figsize=(width, height))
                ax = fig.add_subplot(111)
                #bp = ax.boxplot(data, notch=True, patch_artist=True)

                # set props
                '''whisker = {'color':'#b4b4b4', 'linestyle':"-", 'linewidth':0.75}
                caps = {'color':'#b4b4b4', 'linewidth':1}
                flier = {"marker":'o', "lw":0, "color":'#e6e6e6', "markersize":0.5}
                box = {"color":'#385d8a', "linewidth":0.75}'''
                bp = ax.boxplot(data)  #, capsprops = caps , flierprops = flier, boxprops =box)

                for box in bp['boxes']:
                    #outline color
                    box.set(color='#385d8a', linewidth=0.75)

                for whisker in bp['whiskers']:
                    whisker.set(color='#b4b4b4', linestyle="-", linewidth=0.75)

                for cap in bp['caps']:
                    cap.set(color='#b4b4b4', linewidth=1)

                for median in bp['medians']:
                    median.set(color='#c0504d', linewidth=1)

                for flier in bp['fliers']:
                    flier.set(marker='o', lw=0, color='#e6e6e6', markersize=0.5)

                plt.title(plottitle)
                plt.xticks(xclasses, xlabels, rotation=90)
                fig.savefig(tmpdir + '/' + key + ".png", bbox_inches='tight', dpi=200)
                fig.clf()

        # Optional overview barplots; each one is best-effort.
        if overview is not None:
            try:
                if len([x for x in overview[1]['mapping'] if x != 0.0]) != 0:
                    barplot(overview, width, height, tmpdir, 'mapping', 'Samples', 'Mapped reads [%]',
                            'Percentage of reads aligned to ' + overview[0])
            except:
                print("Couldnt plot mapping results")
            try:
                barplot(overview, width, height, tmpdir, 'trimming', 'Samples', 'Reads after trimming [%]',
                        'Percentage of reads after trimming')
            except:
                print("Couldnt plot trimming results")
            try:
                if len([x for x in overview[1]['shorter_reads'] if x != 0.0]) != 0:
                    barplot(overview, width, height, tmpdir, 'shorter_reads', 'Samples', 'Shorter fragments [%]',
                            'Percentage of fragments shorter than read length')
            except:
                print("Couldnt plot shorter fragments")
            try:
                if len([x for x in overview[1]['kraken'] if x != 0.0]) != 0:
                    barplot(overview, width, height, tmpdir, 'kraken', 'Samples', 'Classified reads [%]',
                            'Percentage of classified reads')
            except:
                print("Couldnt plot Kraken results")
        #save boxplots to html
        outhandle = open(chunkfiles[0], "w")
        outhandle.write(htmlheader)
        #for line in iter(outhandle):
        #    if line.strip() == "<boxplotline \>":
        outhandle.write("<tr>\n<td><b>summary</b></td>\n")
        for key in order:
            if key not in modules:
                outhandle.write("<td>-</td>")
            else:
                imgfile = open(tmpdir + '/' + key + ".png", "rb")
                imgstring = base64.b64encode(imgfile.read())
                imgfile.close()
                outhandle.write('<td><img alt="Embedded Image" src="data:image/png;base64,' + imgstring.decode(
                    "utf-8") + '" /></td>\n')
        outhandle.write('</tr>\n')

        outhandle.write("<tr>\n<td></td>\n")
        if overview is not None:
            for result in ['mapping', 'trimming', 'shorter_reads', 'kraken']:
                try:
                    imgfile = open(os.path.join(tmpdir, result + ".png"), "rb")
                    imgstring = base64.b64encode(imgfile.read())
                    imgfile.close()
                    outhandle.write('<td><img alt="Embedded Image" src="data:image/png;base64,' + imgstring.decode(
                        "utf-8") + '" /></td>\n')
                except:
                    pass
        outhandle.write('</tr>\n')
        outhandle.close()


        #join sub-htm-files
        print("Create result file: " + os.path.join(project_folder, html_fname))

        htmlhandle = open(os.path.join(project_folder, html_fname), 'w')
        for chunkfile in chunkfiles:
            chunkhandle = open(chunkfile, 'r')
            for line in iter(chunkhandle):
                htmlhandle.write(line)
            chunkhandle.close()
        htmlhandle.close()
    finally:
        #cleaning
        shutil.rmtree(tmpdir)
+
+
+import csv
+
if __name__ == "__main__":
    # CLI entry point: combine FastQC results for one sample folder,
    # optionally with an overview CSV (requires -reference as well).
    start = time.time()
    print(start)
    parser = argparse.ArgumentParser()
    parser.add_argument('-folder', dest='folder', help="Sample folder", required=True)
    parser.add_argument('-csv', dest='csv',
                        help="csv file.Format: Sample name | Mapping Result | Trimming Result | Kraken Result | Shorter Fragments ",
                        required=False)
    parser.add_argument('-reference', dest="reference", help="Name of reference", required=False)
    arguments = vars(parser.parse_args())

    if arguments["csv"] and arguments["reference"]:

        # Load the per-sample overview rows into a structured numpy array.
        overview = []
        with open(arguments["csv"], newline='') as csvfile:
            infosheet = csv.reader(csvfile, delimiter=';', quotechar='|')
            for row in infosheet:
                overview.append(tuple(row))
                print(row)
        combineFastQC(arguments["folder"], (arguments["reference"], np.array(overview, dtype=[('name', '|S100'), ('mapping', float), ('trimming', float), ('kraken', float), ('shorter_reads', float)])))

    elif (arguments["csv"] or arguments["reference"]):
        # Both are needed to draw the overview barplots.
        sys.exit("csv and reference required")
    else:
        combineFastQC(arguments["folder"])
    print(time.time() - start)
diff --git a/config.txt b/config.txt
new file mode 100755
index 0000000..39ba760
--- /dev/null
+++ b/config.txt
@@ -0,0 +1,14 @@
+[DEFAULT]
+kraken_db =/home/andruscha/programs/kraken/minikraken_20141208/
+blast_db = /data/BLAST/nt/
+
+[PATH]
+kraken = /home/andruscha/programs/kraken/kraken_build/
+adapter = /data/GS/tools/Trimmomatic-0.32/adapters/
+fastqc = /home/lieuv/tools/FastQC/
+trimmomatic =
+bowtie2 =
+blobtools = /home/lieuv/tools/blobtools
+
+[VERSION]
+trimmomatic = 0.32
diff --git a/helper.py b/helper.py
new file mode 100755
index 0000000..e75cd69
--- /dev/null
+++ b/helper.py
@@ -0,0 +1,204 @@
+__author__ = 'LieuV'
+
+from os.path import basename, splitext, join, isfile, dirname, abspath, exists
+from os import makedirs
+import subprocess
+import re, sys, numpy
+
def getDir(args, getAbs=False):
    '''Join path components into a directory path, creating it if missing.

    If the first component is an existing file, its parent directory is used
    instead. Unlike the original, the caller's list is no longer mutated.

    :param args: sequence of path components
    :param getAbs: return an absolute path when True
    :return: the (optionally absolute) directory path, guaranteed to exist
    '''
    parts = list(args)  # copy so the caller's list is untouched
    if isfile(parts[0]):
        parts[0] = dirname(parts[0])
    path = join(*parts)
    if getAbs:
        path = abspath(path)
    makedirs(path, exist_ok=True)
    return path
+
def getFilenameWithoutExtension(string, level=1, getBase=False):
    '''Strip the extension(s) from a filename, handling stacked extensions
    such as ".fastq.gz".

    :param string: path or filename to strip
    :param level: unused; kept only for backward compatibility with callers
    :param getBase: when True, drop the directory part first
    :return: the name with the last extension plus any trailing known
             compression/sequence extensions removed
    '''
    known_extensions = (".gz", ".gzip", ".zip", ".bz", ".fasta", ".fastq", ".bam")
    if getBase:
        string = basename(string)
    string = splitext(string)[0]
    # Keep stripping while the remainder still ends in a known extension
    # (e.g. "x.fastq.gz" -> "x.fastq" -> "x").
    while splitext(string)[-1] in known_extensions:
        string = splitext(string)[0]
    return string
+
def bam_to_fastq(bamfile, output):
    '''Convert a BAM file to FASTQ with bamtools and return the output path.

    Bug fix: the original launched Popen without waiting, so callers could
    read a partially written FASTQ; we now block until the conversion ends
    and abort on failure (mirroring join_reads' error handling).
    '''
    print("Convert bam to fastq...", bamfile)
    outfile = join(output, getFilenameWithoutExtension(bamfile, level=2, getBase=True) + ".fastq")
    returncode = subprocess.call(
        "bamtools convert -format fastq -in " + bamfile + " -out " + outfile,
        shell=True)
    if returncode != 0:
        sys.exit("Error: bamtools convert failed for " + bamfile)
    return outfile
+
def toLatex(text, newline=False):
    '''Escape characters LaTeX treats specially (#, _, %), flatten tabs to
    spaces, and optionally flatten newlines too.'''
    replacements = [
        ("#", "\\#"),
        ("_", "\\_"),
        ("\t", " "),
        ("%", "\\%"),
    ]
    if newline:
        replacements.append(("\n", " "))
    for old, new in replacements:
        text = text.replace(old, new)
    return text
+
def join_reads(read_list, outputDir, outputName):
    '''Concatenate several read files into one gzip file in outputDir.

    Returns None for an empty list, the single file unchanged for a
    one-element list, otherwise the path of the new .gz file. Exits the
    program if gzip fails.
    '''
    if len(read_list) == 0:
        return None
    elif len(read_list) == 1:
        return read_list[0]
    # NOTE(review): True is passed positionally as `level` (not getBase).
    # Since `level` is unused inside getFilenameWithoutExtension this is
    # harmless, but getBase=True was probably intended — confirm.
    outfile = join(outputDir, getFilenameWithoutExtension(outputName, True) + ".gz")
    #if not exists(outfile):
    if read_list[0].endswith(".gz"):

        print("Decompress and concatenate files. This may take a while.")
        process = subprocess.call("zcat " + " ".join(read_list) + "|gzip --fast > " + outfile, shell=True)
    else:
        print("Decompress and concatenate files. This may take a while.")
        process = subprocess.call("gzip --fast -c " + " ".join(read_list) + " > " + outfile, shell=True)
    if process != 0:
        sys.exit("Error with gzip")
    return outfile
+
def trimmomatic_report(process, type, write=True):
    '''Parse a running Trimmomatic process' output.

    :param process: subprocess.Popen with PIPEd stdout/stderr
    :param type: the command string; "TrimmomaticPE" switches PE parsing
    :param write: when True, accumulate LaTeX-escaped stderr lines in logs
    :return: (survived%, total reads, surviving reads, surviving %, logs)
    '''
    logs = ""
    survived = 0
    total = 0
    pTrimmedReads = 0
    nTrimmedReads = 0
    # Trimmomatic prints progress to stdout; the summary goes to stderr.
    for line in iter(process.stdout.readline, b''):
        print(line.decode("utf-8"))
    for line in iter(process.stderr.readline, b''):
        line = line.decode("utf-8")
        print(line)
        if not (line.startswith("Using Long Clipping Sequence") or line.startswith("Trimmomatic")):
            if write:
                logs += toLatex(line) + "\n"
            if re.match("Input Read", line):
                if re.search("TrimmomaticPE", type):
                    # Paired-end: counts are per pair, so totals are doubled
                    # and single-mate survivors are added individually.
                    pattern = re.match("Input Read Pairs:\s+(?P<total>\d+)\s+"
                                       "Both Surviving:\s+(?P<nSurvived>\d+) \((?P<pSurvived>\d+.\d+)%\)\s+"
                                       "Forward Only Surviving:\s+(?P<forward>\d+) \(\d+.\d+%\)\s+"
                                       "Reverse Only Surviving:\s+(?P<reverse>\d+) \(\d+.\d+%\).*", line)

                    survived = float(pattern.group("pSurvived").replace(",", "."))
                    total = int(pattern.group("total")) * 2
                    nTrimmedReads = int(pattern.group("nSurvived")) * 2 + int(pattern.group("forward")) + int(pattern.group("reverse"))
                    pTrimmedReads = round(100 * nTrimmedReads / total, 2)
                else:
                    # Single-end: both values appear on the same summary line.
                    pattern = re.match(".*Surviving: (?P<nSurvived>\d+) \((?P<survived>\d+.\d+)%\)", line)
                    pattern2 = re.match("Input Reads: (?P<total>\d+)", line)

                    survived = float(pattern.group("survived").replace(",", "."))
                    total = int(pattern2.group("total"))
                    nTrimmedReads = int(pattern.group("nSurvived"))
                    pTrimmedReads = survived

    return survived, total, nTrimmedReads, pTrimmedReads, logs
+
def perc_slope(a, b, perc):
    '''Return True when the absolute jump from a to b exceeds the given
    fraction (perc) of b, else False.'''
    threshold = b * float(perc)
    return bool(abs(numpy.diff([a, b])) > threshold)
+
def optimize_trimming(fastqc_folder, perc=0.1):
    '''Derive HEADCROP/CROP suggestions from FastQC's "Per base sequence
    content" module.

    :param fastqc_folder: extracted *_fastqc directory with fastqc_data.txt
    :param perc: slope threshold passed to perc_slope
    :return: (trim_bool, headcrop, crop_length) where crop_length is the
             remaining read length (tailcrop - headcrop); (False, 0, 0)
             when the module already passed.
    '''
    mytable = None
    filename = join(fastqc_folder, "fastqc_data.txt")
    with open(filename, "rb") as fastqcdata:
        table = []
        ifwrite = False
        # Collect the per-base A/C/G/T percentage table of the module.
        for line in fastqcdata.readlines():
            line = line.decode()
            line = line.replace("\n", "")
            if line.startswith(">>END_MODULE") and ifwrite:
                try:
                    dtype = {'names': table[0], 'formats': ['|S15', float, float, float, float]}
                    mytable = numpy.asarray([tuple(x) for x in table[1:]], dtype=dtype)
                    break
                except:
                    pass
            elif line.startswith(">>Per base sequence content"):
                ifwrite = True
                temp = line.split("\t")

                # Module passed FastQC -> nothing to crop.
                if temp[1].lower() == "pass":
                    return False, 0, 0  #break
            elif ifwrite:
                table.append(line.split("\t"))

    headcrop = 0
    tailcrop = len(mytable["A"])
    trim_bool = False
    # Masked copy: masked positions are excluded from the running mean that
    # windows are compared against.
    column = numpy.ma.array(mytable)
    for i in range(-4, int(round(len(mytable["A"]) / 3, 0)), 1):
        for nucl in ["A", "C", "G", "T"]:
            column[nucl].mask[max(i, 0):i + 5] = True
            if headcrop > 0:
                column[nucl].mask[:headcrop] = True
            if tailcrop < len(mytable["A"]):
                column[nucl].mask[tailcrop:] = True

            # check heacrop
            if (perc_slope(numpy.mean(mytable[nucl][max(i, 0):i + 5]), numpy.mean(column[nucl]), perc=perc)) & (headcrop < (i + 5)):
                headcrop = i + 5
                trim_bool = True
            elif headcrop < i:
                column[nucl].mask[max(i, 0):i + 5] = False

            # now crop from the end
            column[nucl].mask[-(i + 5):(min(len(mytable[nucl]), len(mytable[nucl]) - i))] = True
            if (perc_slope(numpy.mean(mytable[nucl][-(i + 6): (min(len(mytable[nucl]) - 1, len(mytable[nucl]) - 1 - i))]), numpy.mean(column[nucl]), perc=perc)) & (tailcrop > len(mytable[nucl]) - (i + 5)):
                # if perc_slope(numpy.var(mytable[nucl][-(i+6): (min(len(mytable["A"])-1, len(mytable[nucl])-1-i))]), numpy.mean(numpy.var(column)), perc=perc):
                tailcrop = len(mytable[nucl]) - (i + 5)
                trim_bool = True
            else:
                column[nucl].mask[-(i + 5): (min(len(mytable["A"]) - 1, len(mytable[nucl]) - 1 - i))] = False

    return trim_bool, headcrop, tailcrop - headcrop
+
def str_to_json(report_str):
    '''Convert a kraken-report text table into a nested JSON tree string.

    Each report row's indentation (2 spaces per level in column 5) encodes
    tree depth; rows with 0.00% or "unclassified" are dropped. Nodes carry
    "name", "perc" (column 0) and "taxa" (column 3); children are nested
    via "children" arrays. A stack of pending closers balances braces.
    '''
    closings = []      # pending "]"/"}" tokens to emit
    json_file = ""
    report_str = report_str.split("\n")
    pre_depth = -1     # depth of the previously emitted node
    for row in report_str:
        nNode = 0
        if row != "":
            row = row.split("\t")
            if row[0].strip() != "0.00" and row[5] != "unclassified":
                # Depth = half the leading-space count of the name column.
                temp = len(row[5])
                name = row[5].strip(" ")
                curr_depth = int((temp - len(name)) / 2)

                if pre_depth < curr_depth and curr_depth > 0:
                    # Descend: open one "children" array per missing level.
                    for i in range(curr_depth - pre_depth):
                        if json_file.endswith('"children":[ '):
                            if json_file.endswith("}") or json_file.endswith("]"):
                                json_file += ","
                            json_file += '{"children":[ '
                            closings.append("]")
                            closings.append("}")
                        else:
                            json_file += ',"children":[ '
                            closings.append("]")
                elif pre_depth > curr_depth:
                    # Ascend: close nodes/arrays until back at curr_depth.
                    while pre_depth >= curr_depth:
                        close_with = closings.pop()
                        json_file += close_with  # end node
                        if close_with == "]":
                            close_with = closings.pop()
                            json_file += close_with  # end node
                        pre_depth -= 1
                elif pre_depth != -1:
                    # Sibling at the same depth: close the previous node.
                    json_file += closings.pop() + ','
                if json_file.endswith("}") or json_file.endswith("]"):
                    json_file += ","
                json_file += '{"name":"%s",' % name
                json_file += '"perc":%s,' % row[0]
                json_file += '"taxa":"%s"' % row[3]
                closings.append("}")
                nNode += 1
                pre_depth = curr_depth
    # Close everything still open.
    while len(closings) > 0:
        json_file += closings.pop()  # end node
    return json_file
\ No newline at end of file
diff --git a/parameter.txt b/parameter.txt
new file mode 100755
index 0000000..cc97f05
--- /dev/null
+++ b/parameter.txt
@@ -0,0 +1,42 @@
+[FastQC]
+Per base sequence quality = per_base_quality.png
+Per sequence quality scores = per_sequence_quality.png
+Per base sequence content = per_base_sequence_content.png
+Per base GC content = per_base_gc_content.png
+Per sequence GC content = per_sequence_gc_content.png
+Per base N content = per_base_n_content.png
+Sequence Length Distribution= sequence_length_distribution.png
+Sequence Duplication Levels = duplication_levels.png
+Kmer Content = kmer_profiles.png
+Per tile sequence quality = per_tile_quality.png
+Adapter Content = adapter_content.png
+
+[Trimmomatic]
+illuminaClip_seedMismatch = 2
+illuminaClip_simpleClip = 10
+leading = 3
+trailing = 3
+trimOption = SLIDINGWINDOW:4:20
+trimBetter_threshold = 0.15
+
+[forAssembly.Illumina]
+trimBetter_threshold = 0.15
+trimOption = SLIDINGWINDOW:4:25
+
+[forAssembly.IonTorrent]
+trimBetter_threshold = 0.2
+trimOption = SLIDINGWINDOW:4:15
+
+[forMapping.Illumina]
+trimBetter_threshold = 0.15
+trimOption = SLIDINGWINDOW:4:15
+
+[forMapping.IonTorrent]
+trimBetter_threshold = 0.25
+trimOption = SLIDINGWINDOW:4:15
+
+[Adapter]
+Illumina Universal Adapter = TruSeq2
+
+[Fileextension]
+fastq = .fastq.gz, .fastq, .fq, .fq.gz
\ No newline at end of file
diff --git a/readme.txt b/readme.txt
new file mode 100755
index 0000000..f4943c3
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,111 @@
+------------
+Introduction
+------------
+
+QCPipeline is a tool for quality control. The workflow is as follows:
+
+ 1. Quality control with FastQC
+ 2. Trim Reads with Trimmomatic
+ 3. Quality control of trimmed reads with FastQC
+ 4. Map reads against reference using bowtie2
+ 5. Classify reads with Kraken
+
+------------
+Dependencies
+------------
+
+This tool was implemented in python and needs Python 3.4 or higher. Furthermore, FastQC (>v0.10.1), bowtie2 (> 2.2.3) and Kraken (0.10.5) are required.
+
+Further packages via apt-get install:
+* Python3-pip
+* libfreetype6-dev
+
+Packages via pip3 install:
+* Jinja2
+* matplotlib
+
+
+To change tool or adapter path, change config.txt.
+
+-----
+Usage
+-----
+```
+python3.4 runQCPipeline.py -i <input> -technology <Illumina/IonTorrent> <option(s)>
+```
+Input parameter:
+
+ -i, -input sample folder/file. If an Illumina folder is given, files have to match the pattern <Sample name>_<lane>_<R1/R2>_<number>. E.g. Sample_12_345_R1_001.fastq. Otherwise use -1,-2
+ -1 , -2 alternatively to -i: filename. Must not match Illumina names.
+ -technology sequencing technology (Illumina/IonTorrent)
+
+Options:
+
+ -o , -output output folder, default: input folder
+ -r reference file
+ -a adapter sequence (TruSeq2-PE, TruSeq2-SE, TruSeq3-PE, TruSeq3-SE, TruSeq3-PE-2, NexteraPE-PE). Required for Illumina.
+ -threads threads. Default:4
+ -p palindrome parameter used in Trimmomatic (use 30 or 1000 for further analysis). Default: 30
+ -db Kraken database
+ -b input is a project folder. A project folder contains further sample folder(s)
+ -trimOption Use maxinfo or slidinginfo for Trimmomatic.MAXINFO:<target length>:<strictness> | SLIDINGWINDOW:<window size>:<required quality>.
+ default: SLIDINGWINDOW:4:15
+ -minlen Minlen parameter for Trimmomatic. Default:50
+ -nokraken skip Kraken
+ -nomapping skip mapping
+
+ -version Get version
+
+Output:
+
+<Sample/Output Folder>
+|-- QCResult
+ |-- summary_<sample name>.pdf (summarizes all FastQC, Trimming and Mapping Results)
+ |-- FastQC
+ |-- <output folder(s) from FastQC>
+ |-- Trimmed
+ |-- <trimmed reads>
+ |-- FastQC
+ |-- <output folder(s) from FastQC>
+[-- if project is entered (-b): FASTQC_COMBINE_<datetime>.html (summarizes all (plotted) results in one html file) ]
+
+-------------------
+Program Description
+-------------------
+
+This project consists of 6 files:
+
+runQCPipeline.py main script for running complete pipeline
+classes.py script containing classes
+helper.py small helper functions
+report.tex Template for sample reports
+config.txt configuration for Kraken and Trimmomatic
+
+combineFastQC.py Written by Stefan Fuchs. It combines FastQC, mapping, trimming, classification results in a html file.
+ This file can be called without running runQCPipeline.
+
+ Usage: python3.4 combineFastQC.py -f project_folder [-csv csvtable]
+
+ csv is used to plot mapping, trimming and classification results if the user does not want to run the complete QCPipeline.
+ Format: sample name | mapping result | trimming result | Kraken result | % of fragments shorter than read length
+
+ The project folder must have the output structure (QCResults).
+
+-------
+Example
+-------
+
+1. Simple usage for Illumina:
+```
+python3.4 runQCPipeline.py -input mySampleFolder/ -technology Illumina -adapter NexteraPE-PE -r myReference.fasta
+```
+> Output files will be in SampleFolder/QcResults
+
+
+2. Entering a project:
+```
+python3.4 runQCPipeline.py -input myProjectFolder/ -technology IonTorrent -r myReference.fasta -b
+```
+> Output files will be in myProjectFolder/SampleFolder/QcResults and an overall summary will be in myProjectFolder.
+
+
diff --git a/report.tex b/report.tex
new file mode 100755
index 0000000..ea7f2b7
--- /dev/null
+++ b/report.tex
@@ -0,0 +1,184 @@
+\documentclass[a4paper]{article}
+\usepackage[document]{ragged2e}
+\usepackage{python}
+\usepackage[english]{babel}
+\usepackage{graphicx}
+\usepackage[utf8]{inputenc}
+\usepackage{xcolor}
+\usepackage[T1]{fontenc}
+\renewcommand*\familydefault{\ttdefault} %% Only if the base font of the document is to be typewriter style
+\usepackage{subfigure}
+\usepackage{geometry}
+\usepackage{tcolorbox}
+\usepackage{adjustbox}
+\usepackage{setspace}
+\usepackage{float}
+\usepackage{url}
+\usepackage{grffile}
+
+\geometry{a4paper,left=25mm,right=20mm,
+ top=12mm,
+ bottom=20mm}
+
+\setlength\parindent{0pt}
+
+\tolerance=1
+\emergencystretch=\maxdimen
+\hyphenpenalty=1000
+\hbadness=1000
+
+\begin{document}
+
+{\bf {\LARGE{ {{pipeline.name}} } Version {{pipeline.version}} } }\\
+\line(1,0){ \textwidth }
+
+\begin{tabular}{p{0.25\textwidth} p{0.75\textwidth}}
+
+Executor: & {{pipeline.user}} \\
+Dataset: & {\bfseries{\verb|{{sample.name}}|} }\\
+Started: & {{sample.start}}\\
+Finished: & {{sample.stop}}\\
+ & \\
+Operating System & \\
+Server: & {{pipeline.server}}\\
+System: & {{pipeline.system}}\\
+Release: & {{pipeline.release}}\\
+Version: & {{pipeline.operating_version}}\\
+Machine: & {{pipeline.machine}}\\
+ & \\
+ Tool versions & \\
+Python: & {{pipeline.python_version}}\\
+FastQC: & {{pipeline.fastqc_version}}\\
+Trimmomatic: & {{pipeline.trimmo_version}}\\
+{%if sample.mappingRes !=None%}Bowtie2: & {{pipeline.bowtie_version}}\\{%endif%}
+{%if sample.krakenRes != None%}Kraken: & {{pipeline.kraken_version}}\\{%endif%}
+\end{tabular}\\
+
+%----------------- Workflow -------------------%
+\line(1,0){ \textwidth } \\
+Processed reads: \\
+{%for read in sample.readSets %}
+{{read.r1.get_filename()}}
+{% if read.r2 %}
+{{read.r2.get_filename()}}
+{%endif%}
+{%endfor%}
+
+%-------------------- Summary -------------------%
+\begin{tcolorbox}
+{\large{Summary} } \\
+{{sample.nTrimmed()}} of {{sample.total_reads()}} ({{sample.pTrimmed()}}\%) reads remained after trimming \\
+{{sample.get_mean_trimRes()}} \% of fragments were shorter than read length\\
+
+{%if sample.mappingRes !=None%}{{sample.mappingRes.numberOfAlignedReads}} ({{sample.mappingRes.percentAlignedReads}}\%) of reads aligned to \path{ {{sample.reference}} }\\ {%endif%}
+{%if sample.krakenRes !=None%}{{sample.krakenRes.nClassified}} sequences classified ({{sample.krakenRes.pClassified}}\%) {%endif%}
+
+\end{tcolorbox}
+
+\line(1,0){\textwidth} \\
+\vspace{5mm}
+
+%-------------------- FASTQC Results -------------------%
+
+{\Large{FastQC (Pre | Post Trimming) } } \\
+\vspace{5mm}
+
+{%for read in sample.readSets %}
+
+{\underline{Readname: {{read.r1.get_filename()}} } }\\
+{%if read.r1.concat_files%}
+\vspace{5mm}
+Concatenated files:\\
+\begin{itemize}
+{%for file in read.r1.concat_files%}
+\item {{file}}
+{%endfor%}
+\end{itemize}
+{%endif%}
+{{read.r1.log}} \\
+
+Trimming Log: \\
+\textcolor{gray}{Using parameter: {{trim_param}} }\\
+{{read.trimRes.logs}} \\
+
+{% if read.trimRes.blobplot != "" %}
+ \begin{figure}[H]
+ \centering
+ {\includegraphics[width=0.8 \textwidth]{/{{read.trimRes.blobplot}}} }
+ \caption{Blobplot}
+ \end{figure}
+{% endif %}
+%
+{% for i in range(read.r1.qcRes.results|length) %}
+ \begin{figure}[H]
+ \centering
+ \begin{adjustbox}{minipage=0.4\textwidth-2\fboxrule, cframe = {{read.r1.qcRes.results[i].color}} 2}%
+ \begin{subfigure}%
+ {\includegraphics[width=\textwidth]{/{{read.r1.qcRes.results[i].file}}} }
+ \end{subfigure}
+ \end{adjustbox}\qquad
+ \begin{adjustbox}{minipage=0.4\textwidth-2\fboxrule, cframe = {{read.trimRes.readset.r1.qcRes.results[i].color}} 2}%
+ \begin{subfigure}%
+ {\includegraphics[width=\textwidth]{/{{read.trimRes.readset.r1.qcRes.results[i].file}}} }
+ \end{subfigure}
+ \end{adjustbox}
+ \caption{ {{read.trimRes.readset.r1.qcRes.results[i].name}}}
+ \end{figure}
+{% endfor %}
+
+{% if read.r2 %}
+{\underline{Readname: {{read.r2.get_filename()}} } } \\
+{%if read.r2.concat_files%}
+Concatenated files: \\
+\begin{itemize}
+{%for file in read.r2.concat_files%}
+\item {{file}}
+{%endfor%}
+\end{itemize}
+{%endif%}
+{{read.r2.log}}
+{% for i in range(read.r2.qcRes.results|length) %}
+ \begin{figure}[H]
+ \centering
+ \begin{adjustbox}{minipage=0.4\textwidth -2\fboxrule, cframe = {{read.r2.qcRes.results[i].color}} 2}%
+ \begin{subfigure}
+ {\includegraphics[width=\textwidth]{/{{read.r2.qcRes.results[i].file}}} }
+ \end{subfigure}
+ \end{adjustbox}\qquad
+ \begin{adjustbox}{minipage=0.4\textwidth -2\fboxrule, cframe = {{read.trimRes.readset.r2.qcRes.results[i].color}} 2}%
+ \begin{subfigure}%
+ {\includegraphics[width=\textwidth]{/{{read.trimRes.readset.r2.qcRes.results[i].file}}} }
+ \end{subfigure}
+ \end{adjustbox}
+ \caption{ {{read.trimRes.readset.r2.qcRes.results[i].name}}}
+ \end{figure}
+{% endfor %}
+{% endif %}
+
+{% endfor %}\\
+
+%-------------------- Bowtie Results -------------------%
+{%if sample.mappingRes != None%}
+\line(1,0){\textwidth}
+\vspace{5mm}
+
+{\Large{Bowtie2} } - Map against \path{ {{sample.reference}} } \\
+{{sample.mappingRes.log}}
+{%endif%}
+%-------------------- Kraken Results -------------------%
+{%if sample.krakenRes != None%}
+\line(1,0){\textwidth} \\
+\vspace{5mm}
+
+{\Large{Kraken} } \\
+
+{{sample.krakenRes.log}}
+
+
+\begin{singlespace}
+\begin{verbatim}
+{{sample.krakenRes.report}}
+\end{verbatim}
+\end{singlespace}
+{%endif%}
+\end{document}
\ No newline at end of file
diff --git a/runQCPipeline.py b/runQCPipeline.py
new file mode 100755
index 0000000..e794596
--- /dev/null
+++ b/runQCPipeline.py
@@ -0,0 +1,484 @@
+#!/usr/bin/env python3.4
+# -*- coding: utf-8 -*-
+__author__ = 'LieuV'
+__version__ = "1.1.2"
+
+from classes import *
+from helper import *
+import os
+from itertools import groupby
+from jinja2 import Environment,FileSystemLoader
+import shutil
+from combineFastQC import combineFastQC
+import configparser
+from collections import OrderedDict
+import json
+
# dependencies
# argparse ships with Python >= 3.2; fail early with a clear message on older
# interpreters.  NOTE(review): `sys` is not imported in this module directly —
# presumably provided by the star import from helper; verify.
try:
    import argparse
except ImportError:
    print('The "argparse" module is not available. Use Python >3.2.')
    sys.exit(1)


# Recognised fastq file name endings (compressed and uncompressed), matched
# by substring/endswith checks in check_input() and main().
file_extensions = ["fastq.gz",".fastq", ".fq","fq.gz"]
+
def get_read(values, argMap, read="_R1_"):
    """Build the FastQFile for one read direction of a sample.

    Picks the files in `values` whose name contains `read` ("_R1_"/"_R2_").
    Returns None when no file matches; a FastQFile for a single match; and
    for several matches a FastQFile over their concatenation (with the
    original names recorded in `concat_files` for the report).
    """
    matches = [f for f in values if read in f]
    if not matches:
        return None
    if len(matches) == 1:
        return FastQFile(os.path.join(argMap["input"], os.path.basename(matches[0])))
    # Several lanes/parts: concatenate them into one temp fastq file.
    stem = "_".join(os.path.basename(matches[0]).split("_")[:-1])
    ext = os.path.splitext(matches[0])[-1]
    merged = join_reads(matches, argMap["tmp"], "concat_" + stem + ext)
    fastq = FastQFile(merged)
    fastq.concat_files = [toLatex(os.path.basename(m)) for m in matches]
    return fastq
+
+
def get_illumina_reads(files, argMap, output):
    """Group the sample's fastq files into ReadSet objects.

    A single file becomes one ReadSet; explicit -1/-2 arguments win over the
    folder listing; otherwise files are grouped by everything before the
    _<R1|R2>_<number> suffix and paired into R1/R2 read sets (lanes of the
    same group are concatenated by get_read).
    """
    ordered = sorted(files)
    if len(ordered) == 1:
        return [ReadSet(output, ordered[0])]
    if argMap["r1"]:
        if argMap["r2"]:
            return [ReadSet(output, FastQFile(argMap["r1"]), FastQFile(argMap["r2"]))]
        return [ReadSet(output, FastQFile(argMap["r1"]))]

    readsets = []
    for _, lane_files in groupby(ordered, key=lambda f: f.split("_")[:-2]):
        group = list(lane_files)
        forward = get_read(group, argMap, "_R1_")
        reverse = get_read(group, argMap, read="_R2_")
        readsets.append(ReadSet(output, forward, reverse))
    return readsets
+
def run_analysis(readset, argMap, output):
    """Run FastQC, Trimmomatic and post-trim FastQC on one read set.

    Results are stored on the readset (trimRes etc.); the trimmed reads go to
    <output>/Trimmed.  Optionally runs a blobplot on the trimmed reads.
    Returns the same readset object.
    """
    print("Run FastQC...")
    readset.run_FastQC(argMap["threads"])

    trim_dir = os.path.join(output, "Trimmed")
    os.makedirs(trim_dir, exist_ok=True)

    # An empty threshold disables the trimBetter heuristic downstream.
    trimming_perc = argMap["trimBetter_threshold"] if argMap["trimBetter"] else ""
    readset.run_Trimmomatic(argMap["adapter"], argMap["threads"], argMap["palindrome"],
                            trim_dir, argMap["technology"], argMap["minlen"],
                            argMap["trimOption"], trimming_perc)
    readset.trimRes = readset.trimRes.run_FastQC(trim_dir, argMap["tmp"])
    if argMap["blobplot"]:
        readset.trimRes.run_blobplot(argMap["db"])
    return readset
+
+
def check_input(argMap):
    """Resolve the command-line input to the input file path(s).

    Explicit -1/-2 files take precedence over -input.  For Illumina, a folder
    is expanded to its contained fastq files (exits if none are found) and a
    non-folder input is returned as-is; for IonTorrent a single .bam file is
    required; for Illumina-MiSeq a single fastq file.  Exits via sys.exit()
    with a message on invalid input.

    Returns a list of paths for Illumina, a single path string for
    IonTorrent/Illumina-MiSeq (as the callers expect), or None.
    """
    if argMap["r1"]:
        if argMap["r2"]:
            return [argMap["r1"], argMap["r2"]]
        return [argMap["r1"]]

    if argMap["technology"] == "Illumina":
        if os.path.isdir(argMap["input"]):
            files = os.listdir(argMap["input"])
            files = [os.path.join(argMap["input"], x) for x in files]
            # 'and' instead of the original bitwise '&': equivalent on bools,
            # but short-circuits and is the idiomatic form.
            files = [x for x in files
                     if os.path.isfile(x) and any(ext in x for ext in file_extensions)]
            if len(files) == 0:
                sys.exit(argMap["input"] + " does not contain fastq files.")
            return files
        return [argMap["input"]]
    elif argMap["technology"] == "IonTorrent":
        if os.path.isfile(argMap["input"]):
            if argMap["input"].endswith(".bam"):
                return argMap["input"]
            sys.exit("IonTorrent PGM input has to be a bam-file. Otherwise use -b. Use --help for more information.")
        else:
            sys.exit("Incorrect file input")
    elif argMap["technology"] == "Illumina-MiSeq":
        if os.path.isfile(argMap["input"]):
            if any(argMap["input"].endswith(x) for x in file_extensions):
                return argMap["input"]
            sys.exit("Not supported file extension.")
        else:
            sys.exit("Illumina MiSeq input has to be a fastq-file. Otherwise use -b for a batch run. Use --help for more information.")
    else:
        sys.exit("Incorrect file input")
    return None
+
def fillClasses(argMap, temp_bowtie_path = "", tmp = ""):
    """Run the QC pipeline for one sample and return the filled Sample object.

    Runs FastQC + Trimmomatic (+ post-trim FastQC) on every read set, then —
    unless disabled — Bowtie2 mapping and Kraken classification.  Returns
    None for an Illumina input that yields no files.  The temp folder
    argMap["tmp"] is removed in the `finally` block in all cases.
    NOTE(review): argMap is mutated in place ("tmp"/"input" keys).
    """
    #tempdir = tempfile.mkdtemp(dir = argMap["tmp"])
    try:
        if tmp!="":
            # per-sample temp dir handed in by the batch loop in main()
            argMap["tmp"] = tmp
        if argMap["r1"]:
            argMap["input"] = argMap["r1"]
        argMap["input"] = os.path.abspath(argMap["input"])
        if argMap["output"] == "":
            output = getDir([os.curdir , "QCResults"], True)#getDir([argMap["input"] , "QCResults"], True)
        else:
            output = getDir([argMap["output"], "QCResults"], True)

        sample = Sample(getFilenameWithoutExtension(argMap["input"], level = 2, getBase=True), output, argMap["reference"],argMap["threads"], argMap["technology"])
        if argMap["technology"].startswith("Illumina"):
            files = check_input(argMap)
            if len(files) == 0:
                return None
            readsets = get_illumina_reads( files, argMap, output)

            #os.makedirs(tempdir, exist_ok=True)
            for rs in readsets:
                readset = run_analysis(rs, argMap, output)
                sample = sample.add_readSet(readset)
            if not argMap["nomapping"]:
                if argMap["save_mapping"]:
                    sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, join(argMap["output"], "QCResults", sample.name +".bam"))
                else:
                    # map only for the statistics; discard the alignments
                    sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, "/dev/null")
            if not argMap["nokraken"]:
                print("Run Kraken.")
                sample = sample.run_Kraken(argMap["db"])

        elif argMap["technology"] == "IonTorrent":
            # IonTorrent input is one bam file; convert it to fastq first
            argMap["input"] = check_input(argMap)
            print("IonTorrent", argMap["input"])
            fastqfile = bam_to_fastq(argMap["input"], argMap["tmp"])
            readset = ReadSet(output, fastqfile)
            readset = run_analysis(readset,argMap, output)
            sample.add_readSet(readset)
            if not argMap["nomapping"]:
                if argMap["save_mapping"]:
                    sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, join(argMap["output"], "QCResults", sample.name +".bam"))
                else:
                    sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, "/dev/null")
            if not argMap["nokraken"]:
                print("Run Kraken.")
                sample = sample.run_Kraken(argMap["db"])

        sample.stop = datetime.datetime.strftime(datetime.datetime.now(),"%d.%m.%Y, %H:%M:%S")
    finally:
        shutil.rmtree(argMap["tmp"])
    return sample
+
def writeReport(sample, argMap):
    """Render the LaTeX sample report and compile it to PDF with pdflatex.

    The rendered summary_<sample>_<timestamp>.tex is written into the
    sample's results folder, compiled with pdflatex, and the LaTeX
    intermediate files are removed afterwards (best effort).
    """
    report_name = os.path.join(sample.mainResultsPath,
                               "summary_" + sample.name + "_" + datetime.datetime.strftime(datetime.datetime.now(), "%d-%m-%y_%H-%M"))
    print("Writing latex ", report_name)
    env = Environment()
    env.loader = FileSystemLoader(os.path.dirname(__file__))

    template = env.get_template("report.tex")
    pdf_latex = template.render(sample=sample, pipeline=Pipeline(),
                                trim_param=sample.readSets[0].trimRes.print_param(argMap["minlen"], argMap["trimOption"], argMap["palindrome"]))

    # Context manager guarantees the .tex file is closed before pdflatex reads it.
    with open(report_name + ".tex", "w") as latex:
        latex.write(pdf_latex)

    process = subprocess.Popen(["pdflatex", "-interaction=nonstopmode", "-output-directory=" + sample.mainResultsPath, report_name + ".tex"],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains stdout and stderr concurrently; the original
    # sequential readline() loops over two PIPEd streams can deadlock when
    # pdflatex fills one pipe buffer while we block on the other.
    _, stderr = process.communicate()
    for line in stderr.splitlines():
        print(line)

    # Best-effort cleanup of LaTeX by-products; missing files are fine.
    for ext in (".tex", ".aux", ".log", ".toc", ".lof", ".lot", ".synctex.gz"):
        try:
            os.remove(report_name + ext)
        except OSError:
            pass
+
def check_project(argMap):
    """List the per-sample inputs of a batch (-b) project folder, sorted.

    IonTorrent: the folder's .bam files.  Illumina: the folder's
    subdirectories (one sample folder each).  Exits with a message when the
    folder is missing or empty.  Any other technology yields None.
    """
    folder = argMap["input"]
    if argMap["technology"] == "IonTorrent":
        if not os.path.isdir(folder):
            sys.exit("Enter a valid PGM folder, which contains bam-files.")
        bams = [os.path.join(folder, name) for name in os.listdir(folder)
                if name.endswith(".bam")]
        if not bams:
            sys.exit("Folder does not contain bam-files")
        return sorted(bams)
    elif argMap["technology"] == "Illumina":
        if not os.path.isdir(folder):
            sys.exit("Illumina projects has to contain subfolders with fastq-files.")
        entries = [os.path.join(folder, name) for name in os.listdir(folder)]
        subdirs = [p for p in entries if os.path.isdir(p)]
        if not subdirs:
            sys.exit("Enter a valid folder, which contain sample folders")
        return sorted(subdirs)
+
+
+
def main(arguments):
    """Validate the command-line arguments and run the pipeline.

    Fills missing trimming options from parameter.txt (with -forAssembly /
    -forMapping presets), validates trimOption/adapter/reference/kraken-db
    settings, then runs either a batch of samples (-b) or a single sample,
    writing per-sample PDF reports and a combined batch_report.html.  The
    temp folder is removed in the `finally` block.
    """
    parameter = configparser.ConfigParser()
    parameter.read(join(dirname(__file__), "parameter.txt"))

    # Place the working temp folder inside the output (or user-given) folder.
    if not arguments["tmp"]:
        arguments["tmp"] = join(arguments["output"], "qc_tmp")
    else:
        if os.path.exists(arguments["tmp"]):
            arguments["tmp"] = join(arguments["tmp"], "qc_tmp")

    # Illumina and Illumina-MiSeq share the Illumina trimming presets.
    if arguments["technology"].startswith("Illumina"):
        technology = "Illumina"
    else:
        technology = "IonTorrent"

    # Trimming defaults: preset sections first, generic Trimmomatic section last.
    if arguments["forAssembly"]:
        if not arguments["trimBetter_threshold"]:
            arguments["trimBetter_threshold"] = parameter["forAssembly." + technology]["trimBetter_threshold"]
        if not arguments["trimOption"]:
            arguments["trimOption"] = parameter["forAssembly." + technology]["trimOption"]

    elif arguments["forMapping"]:
        if not arguments["trimBetter_threshold"]:
            arguments["trimBetter_threshold"] = parameter["forMapping." + technology]["trimBetter_threshold"]
        if not arguments["trimOption"]:
            arguments["trimOption"] = parameter["forMapping." + technology]["trimOption"]

    if not arguments["trimOption"]:
        arguments["trimOption"] = parameter["Trimmomatic"]["trimOption"]
    if not arguments["trimBetter_threshold"]:
        arguments["trimBetter_threshold"] = parameter["Trimmomatic"]["trimBetter_threshold"]

    # check input
    # NOTE(review): if trimOption does not match the pattern, `pattern` is None
    # and pattern.group() below raises AttributeError instead of a clean exit.
    pattern = re.match("(?P<option>\w+):(?P<value1>\d+):(?P<value2>\d+)", arguments["trimOption"])
    if(arguments["trimOption"]!=""):
        if not ((pattern.group("option") =="SLIDINGWINDOW") or (pattern.group("option") =="MAXINFO")):
            sys.exit("Check trimOption. Only 'SLIDINGWINDOW' or 'MAXINFO' allowed")
        try:
            int(pattern.group("value1"))
            int(pattern.group("value2"))
        except:
            sys.exit("Check trimOptions.")

    if arguments['technology'] == "Illumina":
        if not arguments['adapter']:
            sys.exit("Illumina projects requires an adapter file")
    if arguments["nomapping"]:
        #print("no mapping")
        arguments["reference"] = ""
    else:
        if not arguments["reference"]:
            sys.exit("Mapping needs a reference.")
        else:
            if not os.path.exists(arguments["reference"]):
                sys.exit("Reference does not exist.")

    if not arguments["nokraken"]:
        if not os.path.exists(arguments["db"]):
            sys.exit(arguments["db"] + " does not exist. Enter a valid database for kraken")
        else:
            if "database.kdb" not in os.listdir(arguments["db"]):
                sys.exit("database "+arguments["db"] +" does not contain necessary file database.kdb")
    if not arguments["input"]:
        if arguments["r1"]:
            print(os.path.abspath(arguments["r1"]),os.path.exists(arguments["r1"]))
            if not os.path.exists(arguments["r1"]):
                sys.exit(arguments["r1"] + " does not exist.")
            if arguments["technology"]=="IonTorrent":
                # IonTorrent takes a single bam, so -1 becomes the input
                arguments["input"] = arguments["r1"]
                arguments["r1"] = ""
        else:
            sys.exit("Input file required. Use option -input or -1 / -2")

    if arguments["r2"]:
        if not os.path.exists(arguments["r2"]):
            sys.exit(arguments["r2"] + " does not exist.")

    os.makedirs(arguments["tmp"], exist_ok=True)
    if arguments["index"]:
        mapping_index_path = arguments["index"]
    else:
        mapping_index_path = join(arguments["tmp"], "bowtie2_index")
        os.makedirs(mapping_index_path, exist_ok=True)
    # RUN
    try:
        jsonfile_dict = []
        kraken_reports =OrderedDict()
        if arguments["b"]:
            print("A project was entered")
            i=1
            project = []
            overview = []

            if arguments["technology"]=="Illumina-MiSeq":
                # Group the folder's fastq files into samples by the filename
                # prefix before _<lane>_<R1|R2>_<number>.
                myFiles=[]
                input = sorted(os.listdir(arguments["input"]), key=lambda x: x.split("_")[:-3])
                #print(input)
                input = [os.path.join(arguments["input"], x) for x in input if os.path.isfile(os.path.join(arguments["input"], x)) and any([x.endswith(ext) for ext in file_extensions])]

                for p, values in groupby(input, key=lambda x: x.split("_")[:-3]):
                    myFiles.append(sorted(list(values)))
                print("Found " + str(len(myFiles)) + " samples.")
                for subset in myFiles:
                    print("Analyse " + str(i) + "/" + str(len(myFiles)))
                    tmp = join(arguments["tmp"], "Sample_" + str(i))
                    os.makedirs(tmp, exist_ok=True)

                    arguments["r1"] = subset[0]
                    if len(subset)>1:
                        arguments["r2"] = subset[1]
                    sample = fillClasses(arguments, mapping_index_path,tmp )
                    if sample is not None:
                        project.append(sample)
                        try:
                            writeReport(sample, arguments)
                        except:
                            print("Couldnt write pdf.")
                        try:
                            if sample.mappingRes is None:
                                mapping = 0
                            else:
                                mapping = sample.mappingRes.percentAlignedReads
                            if sample.krakenRes is None:
                                kraken = 0
                            else:
                                kraken = sample.krakenRes.pClassified
                            jsonfile_dict.append(
                                OrderedDict([("setname", sample.name), ("Map to reference [%]", mapping),
                                             ("Reads after trimming", sample.pTrimmed()),
                                             ("Classified (Kraken)", kraken),
                                             ("Shorter fragments", sample.get_mean_trimRes()),
                                             ("images", [x.get_all_qc_dict() for x in sample.readSets])]))
                            overview.append((sample.name, mapping, sample.pTrimmed(), kraken, sample.get_mean_trimRes()))
                        except:
                            print("Error in appending summary of all samples.")
                    # release the sample before the next iteration
                    sample = None
                    i+=1

            else:
                # Illumina / IonTorrent batch: one sample per subfolder or bam.
                sample_list = check_project(arguments)
                print("Found " + str(len(sample_list)) + " samples.")
                for sub in sample_list:
                    print("Analyse " + str(i) + "/" + str(len(sample_list)))
                    tmp = join(arguments["tmp"], "Sample_" + str(i))
                    os.makedirs(tmp, exist_ok=True)

                    arguments["input"] = sub
                    sample = fillClasses(arguments, mapping_index_path, tmp)
                    if sample is not None:
                        project.append(sample)
                        try:
                            writeReport(sample, arguments)
                        except:
                            print("Couldnt write pdf.")
                        try:
                            if sample.mappingRes is None:
                                mapping = 0
                            else:
                                mapping = sample.mappingRes.percentAlignedReads
                            if sample.krakenRes is None:
                                kraken = 0
                            else:
                                kraken = sample.krakenRes.pClassified
                            jsonfile_dict.append(
                                OrderedDict([("setname", sample.name), ("Map to reference [%]", mapping),
                                             ("Reads after trimming", sample.pTrimmed()),
                                             ("Classified (Kraken)", kraken),
                                             ("Shorter fragments", sample.get_mean_trimRes()),
                                             ("Trim Parameter", sample.readSets[0].trimRes.print_param(arguments["minlen"],arguments["trimOption"],arguments["palindrome"])),
                                             ("images", [x.get_all_qc_dict() for x in sample.readSets])]))
                            overview.append((sample.name, mapping, sample.pTrimmed(), kraken , sample.get_mean_trimRes()))
                        except:
                            print("Error in appending summary of all samples.")

                    sample = None
                    i+=1
            try:
                # Combined HTML over all samples of the batch.
                combineFastQC(join(arguments["output"],"QCResults"), overview= ( getFilenameWithoutExtension(arguments["reference"], level=1, getBase = True), np.array(overview, dtype = [('name', '|S100'), ('mapping', float), ('trimming', float), ('kraken', float), ('shorter_reads', float)])), tmp = arguments["tmp"])
            except:
                print("Couldnt produce html.")

        else: # no batch
            if check_input(arguments):
                sample = fillClasses(arguments, mapping_index_path)
                try:
                    writeReport(sample, arguments)
                except:
                    print("Couldnt write pdf")
                if sample.mappingRes is None:
                    mapping = 0
                else:
                    mapping = sample.mappingRes.percentAlignedReads
                if sample.krakenRes is None:
                    kraken = 0
                else:
                    kraken = sample.krakenRes.pClassified
                # NOTE(review): when krakenRes is None (e.g. -nokraken) the next
                # line raises AttributeError on .report — confirm intended.
                kraken_reports[sample.name] = json.loads(str_to_json(sample.krakenRes.report))
                jsonfile_dict.append(
                    OrderedDict([("setname", sample.name), ("Map to reference [%]", mapping),
                                 ("Reads after trimming", sample.pTrimmed()),
                                 ("Classified (Kraken)", kraken),
                                 ("Shorter fragments", sample.get_mean_trimRes()),
                                 ("Trim Parameter", sample.readSets[0].trimRes.print_param(arguments["minlen"], arguments["trimOption"], arguments["palindrome"])),
                                 ("images", [x.get_all_qc_dict() for x in sample.readSets])
                                 ]))

        try:
            # Custom Jinja delimiters (@@ / @=) so they do not clash with the
            # Angular {{ }} syntax used inside batch_report.html.
            env = Environment(block_start_string='@@', block_end_string='@@', variable_start_string='@=',
                              variable_end_string='=@')
            env.loader = FileSystemLoader(os.path.dirname(__file__))
            template = env.get_template("batch_report.html")
            batch_report = template.render(json_samples=json.dumps(jsonfile_dict),
                                           commandline=json.dumps(arguments), kraken = json.dumps(kraken_reports))
            batch_html = open(join(arguments["output"], "QCResults", "batch_report.html"), "w")
            batch_html.write(batch_report)
            batch_html.close()
        except:
            print("Couldnt produce html file.")
    finally:

        #if not arguments["index"]:
        #    print(mapping_index_path)
        shutil.rmtree(arguments["tmp"])
+
if __name__ == "__main__":
    # The default kraken database comes from config.txt next to this script.
    config = configparser.ConfigParser()
    config.read(join(dirname(__file__), "config.txt"))
    kraken_db = ""
    if "DEFAULT" in config:
        if "kraken_db" in config["DEFAULT"].keys():
            kraken_db = config["DEFAULT"]["kraken_db"]

    # Command line interface; parsed options are passed to main() as a dict.
    parser = argparse.ArgumentParser()
    parser.add_argument('-input', dest='input', help = "input sample folder. Illumina filenames have to end with _<lane>_<R1|R2>_number, e.g. Sample_12_345_R1_001.fastq", required=False)
    parser.add_argument('-1', dest='r1', help = "input file. Illumina filename must not match <project>_<lane>_<R1|R2>_<number> name pattern", required=False)
    parser.add_argument('-2', dest='r2', help = "input file", required=False)

    parser.add_argument('-output', dest='output', default="")
    parser.add_argument('-technology', dest='technology',choices=['Illumina',"IonTorrent", "Illumina-MiSeq"] ,required = True)
    parser.add_argument('-threads', dest='threads', default = 4, type = int)
    parser.add_argument('-adapter', dest = 'adapter', choices = ['TruSeq2-PE', 'TruSeq2-SE' , 'TruSeq3-PE' , 'TruSeq3-SE' , 'TruSeq3-PE-2', 'NexteraPE-PE' ])
    parser.add_argument('-reference', dest='reference', required = False, help = "Map reads against reference")
    parser.add_argument('-index', dest='index', required = False, help = "Mapping index")
    parser.add_argument('-save_mapping', action = "store_true")
    parser.add_argument('-db', dest='db', help='Kraken database', required = False, default= kraken_db) #"/home/andruscha/programs/kraken/minikraken_20141208/"
    parser.add_argument('-palindrome', dest='palindrome', help= 'Use palindrome parameter 30 or 1000 for further analysis. Default:30', default=30, type = int)
    parser.add_argument('-minlen', dest='minlen',help='Minlen parameter for Trimmomatic. Default:50', default=50, type=int)
    parser.add_argument('-trimOption', dest="trimOption", help='Use maxinfo or slidinginfo for Trimmomatic.MAXINFO:<target length>:<strictness> | SLIDINGWINDOW:<window size>:<required quality>. default: SLIDINGWINDOW:4:15', type= str)
    parser.add_argument('-version', action='version', version='%(prog)s v' + __version__)
    parser.add_argument('-b', action = "store_true", help = "Batch analysis. Run QC on a project.")
    parser.add_argument('-nokraken', action = "store_true")
    parser.add_argument('-nomapping', action = "store_true")
    parser.add_argument('-trimBetter', action = "store_true")
    parser.add_argument('-trimBetter_threshold', dest='trimBetter_threshold', help = "Default setting for Illumina: 0.15 and for IonTorrent: 0.25.", required=False, type=float)
    parser.add_argument('-blobplot', action = "store_true")
    parser.add_argument('-forAssembly', action = "store_true", help = "Trim parameter are optimized for assemblies (trim more aggressive).")
    parser.add_argument('-forMapping', action = "store_true", help = "Trim parameter are optimized for mapping (allow more errors).")
    parser.add_argument('-tmp',dest="tmp", required = False, help= "Define Temp folder. Default: output folder.")

    arguments = vars(parser.parse_args())
    main(arguments)
\ No newline at end of file
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/qcumber.git
More information about the debian-med-commit
mailing list