[med-svn] [qcumber] 02/06: New upstream version 1.0.9+dfsg
Andreas Tille
tille at debian.org
Mon Mar 13 14:17:11 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository qcumber.
commit 966a33aad7fbeb70a28f2f08489c8662e01ad4a0
Author: Andreas Tille <tille at debian.org>
Date: Mon Mar 13 14:13:56 2017 +0100
New upstream version 1.0.9+dfsg
---
QCumber.py | 600 ++++++++++++++++++++++++++++--------------------------
barplot.R | 56 +++--
batch_report.html | 10 +-
classes.py | 96 +++------
config.txt | 6 +-
helper.py | 11 +-
parameter.txt | 13 +-
report.tex | 14 +-
8 files changed, 410 insertions(+), 396 deletions(-)
diff --git a/QCumber.py b/QCumber.py
index c56f2a7..c7eccde 100755
--- a/QCumber.py
+++ b/QCumber.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
__author__ = 'LieuV'
-__version__ = "1.0.5"
+__version__ = "1.0.9"
from classes import *
from helper import *
@@ -11,7 +11,6 @@ import shutil
import configparser
from collections import OrderedDict
import json
-import csv
#dependencies
try:
@@ -20,95 +19,102 @@ except ImportError:
print('The "argparse" module is not available. Use Python >3.2.')
sys.exit(1)
+#parameter = configparser.ConfigParser(strict=False, interpolation=None)
+#parameter.read(join(dirname(__file__), "parameter.txt"))
-parameter = configparser.ConfigParser()
-parameter.read(join(dirname(__file__), "parameter.txt"))
+#r1_pattern = parameter["Pattern"]["R1"]
+#r2_pattern = parameter["Pattern"]["R2"]
+#sep_pattern = parameter["Pattern"]["sep"]
+#lane_pattern = parameter["Pattern"]["lane"]
+global parameter
+global r1_pattern
+global r2_pattern
+global sep_pattern
+global lane_pattern
-file_extensions = parameter["Fileextension"]["fastq"]
-r1_pattern = parameter["Pattern"]["R1"]
-r2_pattern = parameter["Pattern"]["R2"]
-lane_pattern = parameter["Pattern"]["lane"]
-
-def get_illumina_reads( output, tmp):
- ## Add readsets to sample
+def get_illumina_reads(tmp):
readsets = []
+
# split by lanes
- if isinstance(arguments["r1"], list):
- for lane in sorted(set([re.search(lane_pattern, x).group() for x in arguments["r1"]])):
- # concat same files
- r1_reads = [x for x in arguments["r1"] if lane in x]
- readname = re.sub(r1_pattern + ".*", "", os.path.basename(r1_reads[0]))
- if len(arguments["r1"]) != 1:
- r1 = FastQFile(join_reads(r1_reads, tmp, readname + "_R1"), [toLatex(os.path.basename(x)) for x in r1_reads] )
- else:
- r1 = FastQFile(r1_reads[0])
- if arguments["r2"]:
- r2_reads = [x for x in arguments["r2"] if lane in x]
+ for lane in sorted(set([re.search(lane_pattern, x).group() for x in arguments["r1"]])):
+ # concat same lanes
+ r1_reads = [x for x in arguments["r1"] if lane in x]
+ readname = re.sub(r1_pattern + ".*", "", os.path.basename(r1_reads[0]))
+ if len(arguments["r1"]) != 1:
+ r1 = FastQFile(join_reads(r1_reads, tmp, readname + "_R1"), [toLatex(os.path.basename(x)) for x in r1_reads] )
+ else:
+ r1 = FastQFile(r1_reads[0])
+ if arguments["r2"]:
+ r2_reads = [x for x in arguments["r2"] if lane in x]
- if len(r2_reads) != 1:
- r2 = FastQFile(join_reads(r2_reads, tmp, readname + "_R2"),[toLatex(os.path.basename(x)) for x in r2_reads] )
- else:
- r2 = FastQFile(r2_reads[0])
- readsets.append(ReadSet(r1,r2))
+ if len(r2_reads) != 1:
+ r2 = FastQFile(join_reads(r2_reads, tmp, readname + "_R2"),[toLatex(os.path.basename(x)) for x in r2_reads] )
else:
- readsets.append(ReadSet(r1))
- else:
- r1 = FastQFile(arguments["r1"])
- if arguments["r2"]:
- r2 = FastQFile(arguments["r2"])
- return [ReadSet(r1,r2)]
+ r2 = FastQFile(r2_reads[0])
+ readsets.append(ReadSet(r1,r2))
else:
- return [ReadSet(r1)]
-
-
+ readsets.append(ReadSet(r1))
return readsets
-def run_analysis (readset, arguments, output, tmp):
- print("Run FastQC...")
- readset.run_FastQC(join(output, "FastQC"))
- if arguments["trimBetter"]:
- trimming_perc = arguments["trimBetter_threshold"]
- else:
- trimming_perc = ""
- readset.run_Trimmomatic(arguments["adapter"], arguments["palindrome"],arguments["minlen"], arguments["trimOption"], trimming_perc, arguments["gz"])
- readset.trimRes = readset.trimRes.run_FastQC( tmp)
- return readset
-
+def get_input():
-def check_input(arguments):
if arguments["r1"]:
if arguments["r2"]:
return [arguments["r1"], arguments["r2"]]
else:
return [arguments["r1"]]
- if arguments["technology"]=="Illumina":
- if os.path.isdir(arguments["input"]):
- files = os.listdir(arguments["input"])
- files = [os.path.join(arguments["input"], x) for x in files]
- files = [x for x in files if os.path.isfile(x) & any([ext in x for ext in file_extensions])]
- if len(files)==0:
- sys.exit(arguments["input"] + " does not contain fastq files.")
- return files
+ all_files = []
+ bam_ext = [x.strip(" ") for x in parameter["Fileextension"]["bam"].split(",")]
+ fastq_ext = [x.strip(" ") for x in parameter["Fileextension"]["fastq"].split(",")]
+ if arguments["technology"]:
+ if arguments["technology"]=="IonTorrent":
+ fastq_ext = []
else:
- return [arguments["input"]]
- elif arguments["technology"]=="IonTorrent":
+ bam_ext = []
- if os.path.isfile(arguments["input"]):
+ for root, dirs, files in os.walk(arguments["input"]):
+ for file in files:
+ if any([file.endswith(ext) for ext in fastq_ext + bam_ext ]):
+ all_files.append(join(root,file))
- if arguments["input"].endswith(".bam"):
- return arguments["input"]
- else:
- sys.exit("IonTorrent PGM input has to be a bam-file. Otherwise use -b. Use --help for more information.")
+ if not arguments["technology"]:
+ if len([x for x in all_files if any([ext in x for ext in bam_ext])]) == len(all_files):
+ arguments["technology"] = "IonTorrent"
else:
- sys.exit("Incorrect file input")
- else:
- sys.exit("Incorrect file input")
- return None
+ arguments["technology"] = "Illumina"
+ #if arguments["technology"]:
+ # if arguments["technology"]=="Illumina":
+ # # fastq only
+ # all_files = [x for x in all_files if any([ext in x for ext in fastq_ext])]
+ # elif arguments["technology"]=="IonTorrent":
+ # # bam only
+ # all_files = [x for x in all_files if any([ext in x for ext in bam_ext])]
+
+ if (len(all_files) == 0):
+ sys.exit(arguments["input"] + " does not contain fastq or bam files.")
+
+ # find read pairs
+ all_files = sorted(list(all_files))
+ paired_reads_pattern =sep_pattern.join([lane_pattern , "(" + r1_pattern + "|" + r2_pattern + ")","\d{3}"])
+ if all([re.search(paired_reads_pattern, x) for x in all_files]):
+ paired_reads = []
+ for setname, files in groupby(all_files, key=lambda x: "_".join(x.split("_")[:-3])):
+ temp = sorted(list(files))
+ r1_reads = [x for x in temp if r1_pattern in x]
+ r2_reads = [x for x in temp if r2_pattern in x]
+ if len(r1_reads) != 0:
+ if len(r2_reads) != 0:
+ paired_reads.append([r1_reads, r2_reads])
+ else:
+ paired_reads.append(r1_reads)
+ return paired_reads
+ else: # treat each file as sample
+ return [[x] for x in all_files ]
+
def getSetname():
try:
- print("Renmae", arguments["rename"], arguments["r1"])
if arguments["rename"]:
new_name = [arguments["rename"][x] for x in arguments["rename"].keys() if basename(arguments["r1"][0]).startswith(x)]
@@ -120,51 +126,57 @@ def getSetname():
else:
print("Renamed to " ,new_name)
return new_name[0]
-
- if arguments["technology"] == "Illumina":
-
- if isinstance(arguments["r1"], list):
- return "_".join(basename(arguments["r1"][0]).split("_")[:-3])
- else:
- return getFilenameWithoutExtension(arguments["r1"], True)
- else:
- return getFilenameWithoutExtension(arguments["r1"], getBase=True)
except:
+ print("Couldnt rename sample files.")
+ try:
+ paired_reads_pattern = sep_pattern.join([sep_pattern + lane_pattern, "(" + r1_pattern + "|" + r2_pattern + ")", "\d{3}.*"])
if isinstance(arguments["r1"], list):
- return getFilenameWithoutExtension(arguments["r1"][0], getBase=True)
+ return re.sub(paired_reads_pattern, "" ,basename(arguments["r1"][0]))
else:
- return getFilenameWithoutExtension(arguments["r1"], getBase=True)
+ return re.sub(paired_reads_pattern,"",basename(arguments["r1"]))
+ except:
+ print("Problems getting samplenames")
+ return arguments["r1"]
-def fillClasses(temp_bowtie_path = "", tmp=''):
+def runAnalyses(temp_bowtie_path, tmp):
try:
output = getDir([arguments["output"], "QCResults"], True)
-
sample = Sample(getSetname(), output, arguments["reference"],arguments["threads"], arguments["technology"])
+
if arguments["technology"].startswith("Illumina"):
- readsets = get_illumina_reads( output, tmp)
+ readsets = get_illumina_reads(tmp)
elif arguments["technology"] == "IonTorrent":
print("IonTorrent", arguments["r1"])
fastqfile = bam_to_fastq(arguments["r1"],tmp)
readsets = [ReadSet( fastqfile)]
for rs in readsets:
- readset = run_analysis(rs, arguments, output, tmp)
- sample = sample.add_readSet(readset)
+ print("Run FastQC...")
+ rs.run_FastQC(join(output, "FastQC"))
+ if not arguments["notrimming"]:
+ if arguments["trimBetter"]:
+ trimming_perc = arguments["trimBetter_threshold"]
+ else:
+ trimming_perc = ""
+ rs.run_Trimmomatic(arguments["adapter"], arguments["palindrome"], arguments["minlen"],
+ arguments["trimOption"], trimming_perc, arguments["gz"])
+ rs.trimRes = rs.trimRes.run_FastQC(tmp)
+ sample = sample.add_readSet(rs)
if not arguments["nomapping"]:
if arguments["save_mapping"]:
- sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, join(arguments["output"], "QCResults", sample.name +".bam"))
+ sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, join(arguments["output"], "QCResults", sample.name +".bam"), not arguments["notrimming"])
else:
- sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, "/dev/null")
+ sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, "/dev/null", not arguments["notrimming"])
if not arguments["nokraken"]:
print("Run Kraken.")
- sample = sample.run_Kraken(arguments["db"])
+ sample = sample.run_Kraken(arguments["db"], not arguments["notrimming"])
sample.stop = datetime.datetime.strftime(datetime.datetime.now(),"%d.%m.%Y, %H:%M:%S")
finally:
shutil.rmtree(tmp)
return sample
-def writeReport(sample, arguments):
+def writeReport(sample):
report_name = os.path.join(sample.mainResultsPath, "Report",
"summary_" + sample.name + "_" + datetime.datetime.strftime(datetime.datetime.now(), "%d-%m-%y_%H-%M"))
print("Writing latex " ,report_name)
@@ -172,15 +184,17 @@ def writeReport(sample, arguments):
env.loader = FileSystemLoader(os.path.dirname(__file__))
template = env.get_template("report.tex")
- pdf_latex = template.render(sample=sample, pipeline=Pipeline(), trim_param = sample.readSets[0].trimRes.print_param(arguments["palindrome"],arguments["minlen"], arguments["trimOption"]))
+ try:
+ trim_param= sample.readSets[0].trimRes.print_param(arguments["palindrome"], arguments["minlen"], arguments["trimOption"])
+ except:
+ trim_param = "None"
+ pdf_latex = template.render(sample=sample, pipeline=Pipeline(), trim_param = trim_param )
latex = open(report_name + ".tex", "w")
latex.write(pdf_latex)
latex.close()
process = subprocess.Popen(["pdflatex", "-interaction=nonstopmode", "-output-directory=" + join(sample.mainResultsPath, "Report"), report_name + ".tex"], stdout = subprocess.PIPE, stderr = subprocess.PIPE)
- for line in iter(process.stdout.readline, b''):
- pass #print(line)
for line in iter(process.stderr.readline, b''):
print(line)
@@ -191,6 +205,140 @@ def writeReport(sample, arguments):
os.remove(report_name + ext)
except OSError:
pass
+def check_input_validity():
+ #
+ # Check options for Trimmomatic
+ #pattern = re.match("(?P<option>\w+):(?P<value1>\d+):(?P<value2>\d+)", arguments["trimOption"])
+ #if (arguments["trimOption"] != ""):
+ # if not ((pattern.group("option") == "SLIDINGWINDOW") or (pattern.group("option") == "MAXINFO")):
+ # sys.exit("Check trimOption. Only 'SLIDINGWINDOW' or 'MAXINFO' allowed")
+ # try:
+ # int(pattern.group("value1"))
+ # int(pattern.group("value2"))
+ # except:
+ # sys.exit("Check trimOptions.")
+
+ #
+ # Check if adapter is required
+ if not (arguments['notrimming'] or arguments['adapter'] or arguments["technology"] == "IonTorrent") :
+ sys.exit("Illumina projects requires an adapter file")
+
+ #
+ # Check if reference is required and valid
+ if arguments["nomapping"]:
+ arguments["reference"] = ""
+ else:
+ if not arguments["reference"]:
+ sys.exit("Mapping needs a reference.")
+ else:
+ if not os.path.exists(arguments["reference"]):
+ sys.exit("Reference does not exist.")
+
+ #
+ #Check validity of Kraken DB
+ if not arguments["nokraken"]:
+ if not os.path.exists(arguments["db"]):
+ sys.exit(arguments["db"] + " does not exist. Enter a valid database for kraken")
+ else:
+ if "database.kdb" not in os.listdir(arguments["db"]):
+ sys.exit("database " + arguments["db"] + " does not contain necessary file database.kdb")
+ #
+ # Check input
+ if arguments["input"]:
+ if not os.path.exists(arguments["input"]):
+ sys.exit(arguments["input"] + " does not exist.")
+ else:
+ if not os.path.exists(arguments["r1"]):
+ sys.exit(arguments["r1"] + " does not exist.Input file required. Use option -input or -1 / -2")
+
+ if arguments["r2"]:
+ if not os.path.exists(arguments["r2"]):
+ sys.exit(arguments["r2"] + " does not exist.")
+
+def get_defaults():
+ if arguments["forAssembly"]:
+ if not arguments["trimBetter_threshold"]:
+ arguments["trimBetter_threshold"] = parameter["forAssembly." + arguments['technology']][
+ "trimBetter_threshold"]
+ if not arguments["trimOption"]:
+ arguments["trimOption"] = parameter["forAssembly." + arguments['technology']]["trimOption"]
+
+ elif arguments["forMapping"]:
+ if not arguments["trimBetter_threshold"]:
+ arguments["trimBetter_threshold"] = parameter["forMapping." + arguments['technology']][
+ "trimBetter_threshold"]
+ if not arguments["trimOption"]:
+ arguments["trimOption"] = parameter["forMapping." + arguments['technology']]["trimOption"]
+
+ if not arguments["trimOption"]:
+ arguments["trimOption"] = parameter["Trimmomatic"]["trimOption"]
+ if not arguments["trimBetter_threshold"]:
+ arguments["trimBetter_threshold"] = parameter["Trimmomatic"]["trimBetter_threshold"]
+
+
+def plot():
+ try:
+ #
+ # Plot BOXPLOTS
+ boxplots = [{"file": "Per_sequence_quality_scores.csv",
+ "output": join(arguments["output"], "QCResults/Report/src/img", "Per_sequence_quality_scores.png"),
+ "title": "Per sequence quality scores",
+ "ylab": "Mean Sequence Quality (Phred Score)",
+ "xlab": "Sample"},
+ {"file": "Sequence_Length_Distribution.csv",
+ "output": join(arguments["output"], "QCResults/Report/src/img", "Sequence_Length_Distribution.png"),
+ "title": "Sequence Length Distribution",
+ "ylab": "Sequence Length (bp)",
+ "xlab": "Sample"},
+ {"file": "Per_sequence_GC_content.csv",
+ "output": join(arguments["output"], "QCResults/Report/src/img", "Per_sequence_GC_content.png"),
+ "title": "Per sequence GC content",
+ "ylab": "Mean GC content (%)",
+ "xlab": "Sample"}]
+ for plot in boxplots:
+ process = subprocess.Popen(" ".join(["Rscript --vanilla ", join(os.path.dirname(__file__), "boxplot.R"),
+ join(arguments["output"], "QCResults", "Report", "src", plot["file"]),
+ plot["output"], '"' + plot["title"] + '"', '"' + plot["xlab"] + '"',
+ '"' + plot["ylab"] + '"']),
+ stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
+ #for line in iter(process.stderr.readline, b''):
+ # print(line)
+ process.communicate()
+
+ #
+ # Plot BARPLOTS
+ process = subprocess.Popen(
+ " ".join(["Rscript --vanilla ", join(os.path.dirname(__file__), "barplot.R"), join(arguments["output"], "QCResults/Report/src", "summary.json"),
+ join(arguments["output"], "QCResults/Report/src/img")]), stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
+ process.communicate()
+ except:
+ print("Couldnt plot summary")
+
+
+def writeSummaryTex(batch_json):
+ try:
+ summary_latex_name = join(arguments["output"], "QCResults", "Report", "summary.tex")
+ with open(summary_latex_name, "w") as summary_latex:
+ summary_latex.write("\\rowcolors{2}{gray!10}{}\n\\begin{table}[H]\\begin{tabular}{lcc}\n")
+ summary_latex.write(
+ "\\rowcolor{tableBlue} \\textbf{Sample} & \\textbf{Reads after trimming [\#]} & \\textbf{Reads after trimming [\%]} &\\textbf{Map to reference [\#]} & \\textbf{Map to reference[\%]} & \\textbf{Classified (Kraken)} & \\textbf{Shorter fragments [\%]} \\ \n")
+
+ for i in range(0, len(batch_json["summary"])):
+ summary_latex.write(
+ str(batch_json["summary"][i]["setname"]) + " & " +
+ str(batch_json["summary"][i]["Reads after trimming [#]"]) + " & " +
+ str(batch_json["summary"][i]["Reads after trimming [%]"]) + " & " +
+ str(batch_json["summary"][i]["Map to reference [#]"]) + " & " +
+ str(batch_json["summary"][i]["Map to reference [%]"]) + " & " +
+ str(batch_json["summary"][i]["Classified (Kraken)"]) + " & " +
+ str(batch_json["summary"][i]["Shorter fragments"]) + "\\\\\n"
+ )
+ summary_latex.write("\\end{tabular}\n")
+ summary_latex.write("\\caption{Statistics on reads after trimming. }")
+ summary_latex.close()
+ except:
+
+ print("Couldnt write summary.tex file.")
def main(arguments):
@@ -204,7 +352,12 @@ def main(arguments):
makedirs(join(arguments["output"], "QCResults", "Report", "src"), exist_ok=True)
makedirs(join(arguments["output"], "QCResults", "Report", "src", "img"), exist_ok=True)
+ if not os.path.exists(join(arguments["output"], "QCResults", "Report", "lib" )):
+ shutil.copytree(join(os.path.dirname(__file__),"lib"),join(arguments["output"], "QCResults", "Report", "lib" ))
+
+ ####
# rename sample names
+ ####
if arguments["rename"]:
rename_dict = {}
with open(arguments["rename"], "r") as csvfile:
@@ -212,159 +365,58 @@ def main(arguments):
row = line.split("\t")
rename_dict[row[0]] = row[1].strip()
arguments["rename"] = rename_dict
+
+ ###
# create new boxplot csv files
+ ###
for csv in ["Per_sequence_quality_scores.csv", "Per_sequence_GC_content.csv", "Sequence_Length_Distribution.csv"]:
new_csv = open(join(arguments["output"], "QCResults", "Report", "src", csv), "w")
new_csv.write("Sample,Read,Lane,Trimmed,Value,Count\n")
new_csv.close()
- ##
- # SAV
+
+ ###
+ # Write SAV
+ ###
try:
if arguments["sav"]:
write_SAV(arguments["sav"], join(arguments["output"], "QCResults", "Report", "src"))
except:
print("Couldnt write SAV.")
- ##
- if not os.path.exists(join(arguments["output"], "QCResults", "Report", "lib" )):
- shutil.copytree(join(os.path.dirname(__file__),"lib"),join(arguments["output"], "QCResults", "Report", "lib" ))
- if arguments["technology"].startswith("Illumina"):
- technology = "Illumina"
- else:
- technology = "IonTorrent"
- if arguments["forAssembly"]:
- if not arguments["trimBetter_threshold"]:
- arguments["trimBetter_threshold"] = parameter["forAssembly." + technology]["trimBetter_threshold"]
- if not arguments["trimOption"]:
- arguments["trimOption"] = parameter["forAssembly." + technology]["trimOption"]
-
- elif arguments["forMapping"]:
- if not arguments["trimBetter_threshold"]:
- arguments["trimBetter_threshold"] = parameter["forMapping." + technology]["trimBetter_threshold"]
- if not arguments["trimOption"]:
- arguments["trimOption"] = parameter["forMapping." + technology]["trimOption"]
-
- if not arguments["trimOption"]:
- arguments["trimOption"] = parameter["Trimmomatic"]["trimOption"]
- if not arguments["trimBetter_threshold"]:
- arguments["trimBetter_threshold"] = parameter["Trimmomatic"]["trimBetter_threshold"]
-
- # check input
- pattern = re.match("(?P<option>\w+):(?P<value1>\d+):(?P<value2>\d+)", arguments["trimOption"])
- if(arguments["trimOption"]!=""):
- if not ((pattern.group("option") =="SLIDINGWINDOW") or (pattern.group("option") =="MAXINFO")):
- sys.exit("Check trimOption. Only 'SLIDINGWINDOW' or 'MAXINFO' allowed")
- try:
- int(pattern.group("value1"))
- int(pattern.group("value2"))
- except:
- sys.exit("Check trimOptions.")
-
- if arguments['technology'] == "Illumina":
- if not arguments['adapter']:
- sys.exit("Illumina projects requires an adapter file")
- if arguments["nomapping"]:
- arguments["reference"] = ""
- else:
- if not arguments["reference"]:
- sys.exit("Mapping needs a reference.")
- else:
- if not os.path.exists(arguments["reference"]):
- sys.exit("Reference does not exist.")
-
- if not arguments["nokraken"]:
- if not os.path.exists(arguments["db"]):
- sys.exit(arguments["db"] + " does not exist. Enter a valid database for kraken")
- else:
- if "database.kdb" not in os.listdir(arguments["db"]):
- sys.exit("database "+arguments["db"] +" does not contain necessary file database.kdb")
- if not arguments["input"]:
- if arguments["r1"]:
- print(os.path.abspath(arguments["r1"]),os.path.exists(arguments["r1"]))
- if not os.path.exists(arguments["r1"]):
- sys.exit(arguments["r1"] + " does not exist.")
- else:
- sys.exit("Input file required. Use option -input or -1 / -2")
-
- if arguments["r2"]:
- if not os.path.exists(arguments["r2"]):
- sys.exit(arguments["r2"] + " does not exist.")
+ #
+ # Check input validity
+ check_input_validity()
if arguments["index"]:
mapping_index_path = arguments["index"]
else:
mapping_index_path = join(tmp, "bowtie2_index")
os.makedirs(mapping_index_path, exist_ok=True)
- # RUN
- try:
- batch_json = json.dump({"commandline":arguments, "summary":[],"kraken":{}, "versions":Pipeline().__dict__}, open(join(arguments["output"], "QCResults/Report/src", "summary.json"),"w"))
- i = 1
- project = []
- tmp_overview = open(join(tmp, "overview.csv"), "w")
- tmp_overview.write("Sample,Trimmed,Mapped,Classified\n")
- myFiles = []
-
- #filter samples and put it into for loop
- # Get folder and filter by last three pattern splitted by "_"
- if arguments["input"]:
- input = [ join(arguments["input"], x) for x in sorted(os.listdir(arguments["input"]))]
- #### check input format ####
- # Case 1: Folder contains fastq files
- if len([x for x in input if isfile(x) and any([x.endswith(ext) for ext in file_extensions]) ])>0:
- input= [x for x in input if isfile(x) and any([x.endswith(ext) for ext in file_extensions]) ]
- if len(input)!=0:
- if arguments["technology"].startswith("Illumina"):
- for setname, files in groupby(input, key=lambda x: "_".join(x.split("_")[:-3])):
- #for setname, files in groupby(files, key=lambda x: "_".join(x.split("_")[:-2])):
- temp_file = sorted(list(files))
- r1_reads = [x for x in temp_file if r1_pattern in x]
- r2_reads = [x for x in temp_file if r2_pattern in x]
- if len(r1_reads) != 0:
- if len(r2_reads) != 0:
- myFiles.append([r1_reads, r2_reads])
- else:
- myFiles.append(r1_reads)
- else:
- # IonTorrent
- myFiles = input
- # Case 2: Folder contains subfolder for each sample
- else:
- input = [x for x in input if os.path.isdir(x)]
-
- if len(input) > 0:
- for sample in input:
- files = sorted([join(sample,x) for x in listdir(sample) if any([x.endswith(ext) for ext in file_extensions])])
- if arguments["technology"].startswith("Illumina"):# and len(files) > 1:
- for setname, files in groupby(files, key=lambda x: "_".join(x.split("_")[:-3])):
- temp_file = sorted(list(files))
- r1_reads = [x for x in temp_file if r1_pattern in x]
- r2_reads = [x for x in temp_file if r2_pattern in x]
- if len(r1_reads)!=0:
- if len(r2_reads)!=0:
- myFiles.append([r1_reads, r2_reads])
- else:
- myFiles.append(r1_reads)
- else:
- myFiles.append(sorted(list(files)))
- files = None
- if len(myFiles) == 0:
- sys.exit("Enter a valid folder, which contain sample folders")
- else:
- if arguments["r1"]:
- if not os.path.exists(arguments["r1"]):
- sys.exit(arguments["r1"] + " does not exist.")
- if arguments["r2"]:
- myFiles.append([arguments["r1"], arguments["r2"]])
- else:
- myFiles.append([arguments["r1"]])
+ myFiles = get_input()
+ print("Found " + str(len(myFiles)) + " sample(s).")
- print("Found " + str(len(myFiles)) + " sample(s).")
+ #
+ # Get parameter defaults
+ get_defaults()
+ batch_json = json.dump({
+ "commandline": arguments,
+ "summary": [],
+ "kraken": {},
+ "versions": Pipeline().__dict__},
+ open(join(arguments["output"], "QCResults/Report/src", "summary.json"), "w"))
+
+ output = join(arguments["output"], "QCResults") # getDir([arguments["output"], "QCResults"], True)
+
+ i = 1
+ #######
+ # Run for each sample
+ #######
+ try:
for subset in myFiles:
print("Analyse " + str(i) + "/" + str(len(myFiles)))
-
sample_tmp = join(tmp, "Sample_" + str(i))
os.makedirs(sample_tmp, exist_ok=True)
@@ -375,11 +427,17 @@ def main(arguments):
if len(subset)>1:
arguments["r2"] = subset[1]
- sample = fillClasses( mapping_index_path, sample_tmp )
+ ####
+ # Run Analyses
+ ####
+ sample = runAnalyses( mapping_index_path, sample_tmp )
+
+ ####
+ # Write output
+ ####
if sample is not None:
- project.append(sample)
try:
- writeReport(sample, arguments)
+ writeReport(sample)
except:
print("Couldnt write pdf.")
try:
@@ -396,88 +454,43 @@ def main(arguments):
else:
kraken = sample.krakenRes.pClassified
batch_json["kraken"][sample.name] = json.loads(str_to_json(sample.krakenRes.report))
- tmp_overview.write(",".join([sample.name, str(sample.nTrimmed()), str(nMapping), str(kraken)]) +"\n")
+ #tmp_overview.write(",".join([sample.name, str(sample.nTrimmed()), str(nMapping), str(kraken)]) +"\n")
+ try:
+ trim_param = sample.readSets[0].trimRes.print_param(arguments["palindrome"], arguments["minlen"],
+ arguments["trimOption"])
+ except:
+ trim_param="None"
batch_json["summary"].append(
OrderedDict([("setname", sample.name),
- ("Total reads [#]", sample.total_reads()),
- ("Reads after trimming [#]", sample.nTrimmed()),
- ("Reads after trimming [%]", sample.pTrimmed()),
+ ("Total reads [#]", sample.total_reads()),
+ ("Reads after trimming [#]", sample.nTrimmed()),
+ ("Reads after trimming [%]", sample.pTrimmed()),
("Map to reference [#]", nMapping),
("Map to reference [%]", pMapping),
("Classified (Kraken)", kraken),
("Shorter fragments", sample.get_mean_trimRes()),
- ("Trim Parameter", sample.readSets[0].trimRes.print_param(arguments["palindrome"],arguments["minlen"],arguments["trimOption"])),
+ ("Trim Parameter",trim_param),
("images", [x.get_all_qc_dict() for x in sample.readSets])]))
json.dump(batch_json, open(join(arguments["output"], "QCResults/Report/src", "summary.json"), "w"))
except:
print("Couldnt produce HTML report.")
+
sample = None
+ arguments["r1"]=None
+ arguments["r2"]=None
i+=1
+ # plot boxplots and barplots
+ plot()
+ writeSummaryTex(batch_json)
+ shutil.copy(join(os.path.dirname(__file__), "batch_report.html"), join(arguments["output"], "QCResults", "Report", "batch_report.html"))
- try:
- # plot boxplots
- boxplots = [{"file":"Per_sequence_quality_scores.csv",
- "output": join(arguments["output"], "QCResults/Report/src/img", "Per_sequence_quality_scores.png"),
- "title": "Per sequence quality scores",
- "ylab": "Mean Sequence Quality (Phred Score)",
- "xlab": "Sample" },
- {"file": "Sequence_Length_Distribution.csv",
- "output": join(arguments["output"], "QCResults/Report/src/img", "Sequence_Length_Distribution.png"),
- "title": "Sequence Length Distribution",
- "ylab": "Sequence Length (bp)",
- "xlab": "Sample"},
- {"file": "Per_sequence_GC_content.csv",
- "output": join(arguments["output"], "QCResults/Report/src/img", "Per_sequence_GC_content.png"),
- "title": "Per sequence GC content",
- "ylab": "Mean GC content (%)",
- "xlab": "Sample"} ]
- for plot in boxplots:
- process = subprocess.Popen(" ".join(["Rscript --vanilla ",join(os.path.dirname(__file__) ,"boxplot.R"),
- join(arguments["output"],"QCResults", "Report", "src", plot["file"]), plot["output"], '"'+ plot["title"]+'"', '"'+plot["xlab"]+'"','"'+ plot["ylab"]+'"']),
- stderr=subprocess.PIPE, stdout= subprocess.PIPE, shell =True)
- for line in iter(process.stderr.readline, b''):
- print(line)
- process.communicate()
- #plot barplots
- tmp_overview.close()
- process = subprocess.Popen(" ".join(["Rscript --vanilla ", join(os.path.dirname(__file__), "barplot.R"),join(tmp, "overview.csv"),
- join(arguments["output"], "QCResults/Report/src/img")]),stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
- for line in iter(process.stderr.readline, b''):
- print(line)
- process.communicate()
- shutil.copy(join(os.path.dirname(__file__), "batch_report.html"), join(arguments["output"], "QCResults", "Report", "batch_report.html"))
- except:
- print("Couldnt plot boxplots.")
- try:
- summary_latex_name = join(arguments["output"], "QCResults", "Report", "summary.tex")
- with open(summary_latex_name, "w") as summary_latex:
- summary_latex.write("\\rowcolors{2}{gray!10}{}\n\\begin{table}[H]\\begin{tabular}{lcc}\n")
- summary_latex.write(
- "\\rowcolor{tableBlue} \\textbf{Sample} & \\textbf{Map to reference [\#]} & \\textbf{Map to reference[\%]} & \\textbf{Reads after trimming [\#]} & \\textbf{Reads after trimming [\%]} & \\textbf{Classified (Kraken)} & \\textbf{Shorter fragments [\%]} \\ \n")
-
- for i in range(0, len(batch_json["summary"])):
- summary_latex.write(
- str(batch_json["summary"][i]["setname"]) + " & " +
- str(batch_json["summary"][i]["Map to reference [#]"]) + " & " +
- str(batch_json["summary"][i]["Map to reference [%]"]) + " & " +
- str(batch_json["summary"][i]["Reads after trimming [#]"]) + " & " +
- str(batch_json["summary"][i]["Reads after trimming [%]"]) + " & " +
- str(batch_json["summary"][i]["Classified (Kraken)"]) + " & " +
- str(batch_json["summary"][i]["Shorter fragments"]) + "\\\\\n"
- )
- summary_latex.write("\\end{tabular}\n")
- summary_latex.write("\\caption{Statistics on reads after trimming. }")
- summary_latex.close()
- except:
-
- print("Couldnt write summary.tex file.")
finally:
shutil.rmtree(tmp)
delete_files(join(arguments["output"],"QCResults"), "FastQC", "_fastqc.html")
delete_files(join(arguments["output"],"QCResults"), "Trimmed/FastQC", "_fastqc.html")
-
+
if __name__ == "__main__":
config = configparser.ConfigParser()
config.read(join(dirname(__file__), "config.txt"))
@@ -492,7 +505,7 @@ if __name__ == "__main__":
parser.add_argument( '-2', dest='r2', help = "input file", required=False)
parser.add_argument('-output', dest='output', default="")
- parser.add_argument('-technology', dest='technology',choices=['Illumina',"IonTorrent"] ,required = True)
+ parser.add_argument('-technology', dest='technology',choices=['Illumina',"IonTorrent"] ,required=False , help = "If not set, automatically determine technology and search for fastq and bam files. Set technology to IonTorrent if all files are bam-files, else set technology to Illumina.")
parser.add_argument('-threads', dest='threads', default = 4, type = int, help= "Number of threads. Default: %(default)s")
parser.add_argument('-adapter', dest = 'adapter', choices = ['TruSeq2-PE', 'TruSeq2-SE' , 'TruSeq3-PE' , 'TruSeq3-SE' , 'TruSeq3-PE-2', 'NexteraPE-PE' ])
parser.add_argument('-reference', dest='reference', required = False, help = "Map reads against reference")
@@ -503,16 +516,25 @@ if __name__ == "__main__":
parser.add_argument('-db', dest='db', help='Kraken database', required = False, default= kraken_db)
parser.add_argument('-palindrome', dest='palindrome', default=30, help= 'Use palindrome parameter 30 or 1000 for further analysis. Default: %(default)s', type = int)
parser.add_argument('-minlen', default=50,dest='minlen',help='Minlen parameter for Trimmomatic. Default: %(default)s', type=int)
- parser.add_argument('-trimOption', dest="trimOption", help='Use maxinfo or slidinginfo for Trimmomatic.MAXINFO:<target length>:<strictness> | SLIDINGWINDOW:<window size>:<required quality>. default: SLIDINGWINDOW:4:15', type= str)
+ parser.add_argument('-trimOption', dest="trimOption", help='Use maxinfo or slidinginfo for Trimmomatic.MAXINFO:<target length>:<strictness> | SLIDINGWINDOW:<window size>:<required quality>.', type= str)
parser.add_argument('-version', action='version', version='%(prog)s v' + __version__)
parser.add_argument('-nokraken', action = "store_true")
parser.add_argument('-nomapping', action = "store_true")
+ parser.add_argument('-notrimming', action = "store_true")
parser.add_argument('-gz', action = "store_true")
parser.add_argument('-trimBetter', action = "store_true", help= "Optimize trimming parameter using 'Per sequence base content' from fastqc. Not recommended for amplicons.")
parser.add_argument('-trimBetter_threshold', dest='trimBetter_threshold', help = "Set -trimBetter to use this option.Default setting for Illumina: 0.15 and for IonTorrent: 0.25.", required=False, type=float)
parser.add_argument('-forAssembly', action = "store_true", help = "Set -trimBetter to use this option.Trim parameter are optimized for assemblies (trim more aggressive).")
parser.add_argument('-forMapping', action = "store_true", help = "Set -trimBetter to use this option.Trim parameter are optimized for mapping (allow more errors).")
parser.add_argument('-rename', dest="rename", required=False, help="TSV File with two columns: <old sample name> <new sample name>")
+ parser.add_argument('-parameters', dest = 'parameters', default = join(dirname(__file__), "parameter.txt"))
arguments = vars(parser.parse_args())
+ parameter = configparser.ConfigParser(strict=False, interpolation=None)
+ parameter.read(arguments['parameters'])
+
+ r1_pattern = parameter["Pattern"]["R1"]
+ r2_pattern = parameter["Pattern"]["R2"]
+ sep_pattern = parameter["Pattern"]["sep"]
+ lane_pattern = parameter["Pattern"]["lane"]
main(arguments)
diff --git a/barplot.R b/barplot.R
index 350342c..93dfd20 100755
--- a/barplot.R
+++ b/barplot.R
@@ -1,29 +1,45 @@
options(warn=-1)
-library(ggplot2)
+require(jsonlite)
+require(ggplot2)
+
args = commandArgs(trailingOnly=TRUE)
-summary<- as.data.frame(read.csv(args[1]))
+convert2filename<- function(string, ext=".png"){
+ string<- gsub("\\[%\\]", "percentage", string)
+ string<- gsub("\\[#\\]", "number", string)
+ string<- gsub(" ", "_", string)
+ return(paste(string, ext, sep=""))
+}
-ggplot(summary, aes(x=Sample, y = Trimmed )) +
- geom_bar(stat = "identity", fill="#67a9cf") +
- theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5)) +
- ggtitle("Number of reads after trimming") +
- xlab("Sample")+
- ylab("Reads [#]")
-ggsave(paste(args[2], "/trimming.png", sep=""))
+summary_json<- jsonlite::fromJSON(args[1])$summary
+tablenames<- names(summary_json)[!names(summary_json) %in% c("images", "Trim Parameter")]
-ggplot(summary, aes(x=Sample, y = Mapped )) +
- geom_bar(stat = "identity", fill="#67a9cf") +
- theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5)) +
- ggtitle("Number of reads mapped to reference") +
+summary_json<- summary_json[tablenames]
+#summary<- as.data.frame(read.csv(args[1]))
+for( i in tablenames[2:length(tablenames)]){
+
+ggplot(summary_json, aes(x=summary_json[,"setname"], y = summary_json[,i]), environment = environment()) +
+ geom_bar(stat = "identity", fill="#67a9cf") +
+ theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5), legend.position = "none") +
+ ggtitle(i) +
xlab("Sample")+
- ylab("Reads [#]")
-ggsave(paste(args[2], "/mapping.png", sep=""))
+ ylab(i)
+ ggsave(paste(args[2], convert2filename(i), sep="/"))
-ggplot(summary, aes(x=Sample, y = Classified )) +
- geom_bar(stat = "identity", fill="#67a9cf") +
+}
+temp_json <- data.frame(rbind(cbind( setname = summary_json$setname, type = "Total reads [#]", value= summary_json$`Total reads [#]`),
+ cbind( setname = summary_json$setname, type = "Reads after trimming [#]", value= summary_json$`Reads after trimming [#]`)
+ ))
+temp_json$value <- as.numeric(as.character(temp_json$value))
+
+ggplot(temp_json, aes(x= ))
+
+ggplot(temp_json, aes(x=setname, y = value, by=type, fill=type))+
+ geom_bar(stat="identity",position = "identity", alpha= 0.9) +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5)) +
- ggtitle("Precentage of reads classified by Kraken") +
+ ggtitle("Number of Reads") +
xlab("Sample")+
- ylab("Reads [%]")
-ggsave(paste(args[2], "/kraken.png", sep=""))
+ ylab("Number of Reads")
+ggsave(paste(args[2], "number_of_reads.png", sep="/"))
+
+
diff --git a/batch_report.html b/batch_report.html
index 23fc0f4..104a42b 100755
--- a/batch_report.html
+++ b/batch_report.html
@@ -573,18 +573,18 @@
</br>
<div class="col-md-4">
- <a ng-click="selected.title='Number of reads remained after trimming'; selected.caption=''; selected.img='src/img/trimming.png'"
+ <a ng-click="selected.title='Number of reads remained after trimming'; selected.caption=''; selected.img='src/img/Reads_after_trimming_number.png'"
data-image-id="" data-toggle="modal"
data-target="#image-gallery">
- <img height="500px" ng-src="src/img/trimming.png">
+ <img height="500px" ng-src="src/img/Reads_after_trimming_number.png">
</a>
</div>
<div class="col-md-4">
- <a ng-click="selected.title='Number of reads mapped to reference'; selected.caption=''; selected.img='src/img/mapping.png'"
+ <a ng-click="selected.title='Number of reads mapped to reference'; selected.caption=''; selected.img='src/img/Map_to_reference_number.png'"
data-image-id="" data-toggle="modal"
data-target="#image-gallery">
- <img height="500px" ng-src="src/img/mapping.png">
+ <img height="500px" ng-src="src/img/Map_to_reference_number.png">
</a>
</div>
@@ -593,7 +593,7 @@
<a ng-click="selected.title='Percentage of reads classified by Kraken'; selected.caption=''; selected.img='src/img/kraken.png'"
data-image-id="" data-toggle="modal"
data-target="#image-gallery">
- <img height="500px" ng-src="src/img/kraken.png">
+ <img height="500px" ng-src="src/img/Classified_(Kraken).png">
</a>
</div>
diff --git a/classes.py b/classes.py
index bbfb42f..3066dde 100755
--- a/classes.py
+++ b/classes.py
@@ -79,18 +79,17 @@ class Sample(object):
def get_all_qc_plots(self):
qc_dict = OrderedDict([("setname", self.name),("raw",[]), ("trimmed",[])])
for rs in self.readSets:
- qc_dict["raw"].append(rs.r1.qcRes.get_plots())
- qc_dict["trimmed"].append(rs.trimRes.readset.r1.qcRes.get_plots())
+ qc_dict["raw"].append(rs.r1.qcRes.plots)
+ qc_dict["trimmed"].append(rs.trimRes.readset.r1.qcRes.plots)
if (rs.r2 is not None):
- qc_dict["raw"].append(rs.r2.qcRes.get_plots())
- qc_dict["trimmed"].append(rs.trimRes.readset.r2.qcRes.get_plots())
+ qc_dict["raw"].append(rs.r2.qcRes.plots)
+ qc_dict["trimmed"].append(rs.trimRes.readset.r2.qcRes.plots)
return qc_dict
def total_reads(self):
total = 0
for rs in self.readSets:
- if rs.trimRes:
- total += rs.trimRes.total
+ total += rs.numberOfReads
return total
def pTrimmed(self):
@@ -112,15 +111,15 @@ class Sample(object):
self.readSets.append(readSet)
return self
- def run_Kraken(self, db):
- r1, r2 = self.get_all_reads()
+ def run_Kraken(self, db, trimmed):
+ r1, r2 = self.get_all_reads(trimmed)
self.krakenRes = KrakenResult(r1,r2,db, self.name)
return self
- def run_Bowtie2(self, bowtie_index, save_mapping ):
- r1, r2 = self.get_all_reads()
+ def run_Bowtie2(self, bowtie_index, save_mapping, trimmed ):
+ r1, r2 = self.get_all_reads(trimmed)
self.mappingRes = MappingResult(self.reference, r1, r2, bowtie_index)
- self.mappingRes = self.mappingRes.run_bowtie(self.threads, save_mapping)
+ self.mappingRes = self.mappingRes.run_bowtie(save_mapping)
return self.mappingRes
def get_all_reads(self, trimmed = True):
@@ -133,7 +132,10 @@ class Sample(object):
return r1,r2
def get_mean_trimRes(self):
- return round(np.mean([x.trimRes.shorter_fragments_percentage for x in self.readSets]),2)
+ try:
+ return round(np.mean([x.trimRes.shorter_fragments_percentage for x in self.readSets]),2)
+ except:
+ return None
class ReadSet:
''' Readset contains of r1 and r2'''
@@ -151,7 +153,6 @@ class ReadSet:
self.numberOfReads = 0
self.numberofTrimmedReads = 0
self.trimRes = None
- #self.outputDir = outputDir
self.paired = True
if len( basename(self.r1.filename).split("_")[:-3]) > 4:
self.setname = "_".join(basename(self.r1.filename).split("_")[:-3])
@@ -162,8 +163,10 @@ class ReadSet:
def run_FastQC(self, output):
self.r1 = self.r1.run_FastQC(output)
+ self.numberOfReads += self.r1.qcRes.total_reads
if(self.r2 is not None):
self.r2 = self.r2.run_FastQC(output)
+ self.numberOfReads += self.r1.qcRes.total_reads
return self
def run_Trimmomatic(self, adapter, palindrome, minlen, trimOption, betterTrimming,gz):
@@ -177,19 +180,14 @@ class ReadSet:
return qc_results
def get_all_qc_dict(self):
- '''qc_dict =OrderedDict( {"raw":[],"trimmed":[]} )
- qc_dict["raw"] = [[x.__dict__ for x in self.r1.qcRes.results_to_base64()]]
- qc_dict["trimmed"] = [[x.__dict__ for x in self.trimRes.readset.r1.qcRes.results_to_base64()]]
- if (self.r2 is not None):
- qc_dict["raw"].append([x.__dict__ for x in self.r2.qcRes.results_to_base64()])
- qc_dict["trimmed"].append([x.__dict__ for x in self.trimRes.readset.r2.qcRes.results_to_base64()])
- return qc_dict'''
qc_dict = OrderedDict({"raw": [], "trimmed": []})
qc_dict["raw"] = [[x.__dict__ for x in self.r1.qcRes.results_for_report()]]
- qc_dict["trimmed"] = [[x.__dict__ for x in self.trimRes.readset.r1.qcRes.results_for_report()]]
+ if self.trimRes:
+ qc_dict["trimmed"] = [[x.__dict__ for x in self.trimRes.readset.r1.qcRes.results_for_report()]]
if (self.r2 is not None):
qc_dict["raw"].append([x.__dict__ for x in self.r2.qcRes.results_for_report()])
- qc_dict["trimmed"].append([x.__dict__ for x in self.trimRes.readset.r2.qcRes.results_for_report()])
+ if self.trimRes:
+ qc_dict["trimmed"].append([x.__dict__ for x in self.trimRes.readset.r2.qcRes.results_for_report()])
return qc_dict
@@ -202,7 +200,6 @@ class FastQFile:
self.concat_files = None
def run_FastQC(self, outputDir):
- print("Run fastqc in fastqfile ", " ".join([join(get_tool_path('fastqc'), "fastqc") , self.filename , "-o" , outputDir, "-t", str(num_threads), "--extract"]))
process = subprocess.Popen([join(get_tool_path('fastqc'), "fastqc") , self.filename , "-o" , outputDir, "-t", str(num_threads), "--extract"], stdout = subprocess.PIPE, stderr = subprocess.PIPE)
for line in iter(process.stderr.readline, b''):
line = line.decode("utf-8")
@@ -217,7 +214,6 @@ class FastQFile:
pattern = re.match("Quality encoding detected as (?P<phred>\w+\d{2})", line)
if pattern:
self.phred = pattern.group("phred")
- print("")
process.communicate()
self.qcRes = QCResult(join(outputDir, getFilenameWithoutExtension(basename(self.filename)) + "_fastqc"), join(mainResultsPath, "Report", "src"))
return self
@@ -231,8 +227,8 @@ class QCResult:
def __init__(self, resultsPath, summary_output):
self.path = resultsPath
self.results = self.get_results()
- #write summary csv for boxplots
- write_fastqc_summary(resultsPath, summary_output)
+ #write summary csv for boxplots and assign total number of reads
+ self.total_reads = write_fastqc_summary(resultsPath, summary_output)
def get_results(self):
summary_list = []
@@ -254,36 +250,6 @@ class QCResult:
summary_list.append(Plot("NotExisting", row[1], "NotExisting"))
return summary_list
- def get_plots(self):
- print("extracting data ", self.path)
- store = False
- data = {"name": basename(self.path).replace("_fastc",""),
- "Sequence_Length_Distribution": [],
- "Per_sequence_GC_content": [],
- "Per_base_sequence_quality": []}
-
- with open(join(self.path, "fastqc_data.txt"), "r") as fastqcdatafile:
- box_values = []
- for line in iter(fastqcdatafile):
- if line.startswith("#"):
- pass
- elif line.startswith(">>END_MODULE"):
- if store==True:
- data[key] = get_distribution(box_values, key)
- store = False
- key = None
- box_values = []
- elif line.startswith(">>"):
- key = line.split("\t")[0]
- key=key.replace(">>","")
- key = key.replace(" ","_")
- if key in data.keys():
- store = True
- else:
- if store:
- box_values.append(line)
- return data
-
def results_to_base64(self):
converted_plot = []
for plot in self.results:
@@ -357,6 +323,7 @@ class TrimResult:
self.logs = ""
#Results
+ self.total=0
self.shorter_fragments_percentage = 0
self.input_number = 0
self.nTrimmed = 0
@@ -454,6 +421,8 @@ class TrimResult:
process = subprocess.Popen(" ".join(call), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
survived1, total, nTrimmedReads, pTrimmedReads, log = trimmomatic_report(process, call[0], True)
process.communicate()
+ if survived1==0:
+ sys.exit("No reads remained after trimming. Check fastqc results before trimming in QCResults/Fastqc. Please rerun without -trimBetter or check -trimOptions and minlen again.")
self.total = total
self.nTrimmed = nTrimmedReads
@@ -521,17 +490,18 @@ class MappingResult:
index_name = join(index, getFilenameWithoutExtension(basename(reference)) +"_bt2") # reference name "bowtie2 build index_name"
if not isdir(index):
return index
- process = subprocess.Popen([join(get_tool_path("bowtie2"),"bowtie2-build"),reference, index_name ], stderr=subprocess.PIPE, stdout = subprocess.PIPE)
+ if not exists(index_name + ".1.bt2"):
+ process = subprocess.Popen([join(get_tool_path("bowtie2"),"bowtie2-build"),reference, index_name ], stderr=subprocess.PIPE, stdout = subprocess.PIPE)
- for line in iter(process.stderr.readline, b''):
- line = line.decode("utf-8")
- print(line)
- process.communicate()
- print("Bowtie2-build. Done.")
+ for line in iter(process.stderr.readline, b''):
+ line = line.decode("utf-8")
+ print(line)
+ process.communicate()
+ print("Bowtie2-build. Done.")
return index_name
- def run_bowtie(self, threads, save_mapping):
+ def run_bowtie(self, save_mapping):
print("Run Bowtie2")
call = [join(get_tool_path("bowtie2"), "bowtie2"), "-x", self.index, "--no-unal --threads ", str(num_threads)]
diff --git a/config.txt b/config.txt
index 124f887..258a35a 100755
--- a/config.txt
+++ b/config.txt
@@ -1,10 +1,10 @@
[DEFAULT]
-kraken_db =/opt/common/pipelines/databases/minikraken_20141208/
+kraken_db = /opt/common/pipelines/databases/minikraken_20141208/
[PATH]
kraken =
-adapter = /data/GS/tools/Trimmomatic-0.32/adapters/
+adapter = /opt/common/pipelines/tools/Trimmomatic-0.36/adapters/
fastqc = /opt/common/pipelines/tools/FastQC_0.11.5/
-trimmomatic =
+trimmomatic = /opt/common/pipelines/tools/Trimmomatic-0.36/
bowtie2 =
diff --git a/helper.py b/helper.py
index d7fb524..c849e5a 100755
--- a/helper.py
+++ b/helper.py
@@ -151,7 +151,7 @@ def optimize_trimming(fastqc_folder,perc=0.1):
column[nucl].mask[-(i + 5):(min(len(mytable[nucl]), len(mytable[nucl]) - i))] = True
if (perc_slope(numpy.mean(mytable[nucl][-(i + 6): (min(len(mytable[nucl]) - 1, len(mytable[nucl]) - 1 - i))]),numpy.mean(column[nucl]), perc=perc)) & (tailcrop > len(mytable[nucl]) - (i + 5)):
# if perc_slope(numpy.var(mytable[nucl][-(i+6): (min(len(mytable["A"])-1, len(mytable[nucl])-1-i))]), numpy.mean(numpy.var(column)), perc=perc):
- print("TAILCROP", tailcrop)
+ #print("TAILCROP", tailcrop)
tailcrop = len(mytable[nucl]) - (i + 5)
trim_bool = True
else:
@@ -215,9 +215,9 @@ def delete_files(path, extended="", pattern="_fastqc"):
else:
shutil.rmtree(filename)
-
+'''
def get_distribution(lines, key):
- '''creates CSV used for boxplotting'''
+ #creates CSV used for boxplotting
output = []
for line in lines:
line = line.strip()
@@ -235,7 +235,7 @@ def get_distribution(lines, key):
v = fields[0] # quality or gc [%]
output.extend(int(v) * [int(float(fields[1]))])
return output
-
+'''
def createCSV(dataset, lines, key, path, trimmed):
'''creates CSV used for boxplotting'''
#output = []
@@ -280,6 +280,8 @@ def write_fastqc_summary(fastqcdir, output):
if line.startswith("#"):
pass # print("comment")
# continue
+ elif line.startswith("Total Sequences"):
+ total_seq = int(line.split("\t")[-1])
elif line.startswith('>>END_MODULE'):
if len(data) > 0:
name1 = basename(fastqcdir)
@@ -300,6 +302,7 @@ def write_fastqc_summary(fastqcdir, output):
if store:
data.extend([line])
fastqcdatafile.close()
+ return total_seq
def write_SAV(folder, output):
print("Plot SAV images...")
diff --git a/parameter.txt b/parameter.txt
index 14aeca2..ea2902e 100755
--- a/parameter.txt
+++ b/parameter.txt
@@ -35,13 +35,12 @@ trimOption = SLIDINGWINDOW:4:15
trimBetter_threshold = 0.25
trimOption = SLIDINGWINDOW:4:15
-[Adapter]
-Illumina Universal Adapter = TruSeq2
-
[Fileextension]
-fastq = .fastq.gz, .fastq, .fq, .fq.gz, .bam
+fastq = .fastq.gz, .fastq, .fq, .fq.gz
+bam = .bam
[Pattern]
-R1 = _R1_
-R2 = _R2_
-lane = _L\d{3}_
\ No newline at end of file
+sep=_
+R1 = R1
+R2 = R2
+lane = L\d{3}
\ No newline at end of file
diff --git a/report.tex b/report.tex
index e687aa0..65558cc 100755
--- a/report.tex
+++ b/report.tex
@@ -67,9 +67,11 @@ Processed reads: \\
%-------------------- Summary -------------------%
\begin{tcolorbox}
{\large{Summary} } \\
-{{sample.nTrimmed()}} of {{sample.total_reads()}} ({{sample.pTrimmed()}}\%) reads remained after trimming \\
+{% if sample.nTrimmed() != 0 %} {{sample.nTrimmed()}} of {{sample.total_reads()}} ({{sample.pTrimmed()}}\%) reads remained after trimming \\
{{sample.get_mean_trimRes()}} \% of fragments were shorter than read length\\
-
+{% else %}
+No trimming was performed \\
+{% endif %}
{%if sample.mappingRes !=None%}{{sample.mappingRes.numberOfAlignedReads}} ({{sample.mappingRes.percentAlignedReads}}\%) of reads aligned to \path{ {{sample.reference}} }\\ {%endif%}
{%if sample.krakenRes !=None%}{{sample.krakenRes.nClassified}} sequences classified ({{sample.krakenRes.pClassified}}\%) {%endif%}
@@ -117,12 +119,13 @@ Trimming Log: \\
{\includegraphics[width=\textwidth]{/{{read.r1.qcRes.results[i].file}}} }
\end{subfigure}
\end{adjustbox}\qquad
- \begin{adjustbox}{minipage=0.4\textwidth-2\fboxrule, cframe = {{read.trimRes.readset.r1.qcRes.results[i].color}} 2}%
- \begin{subfigure}%
+ {% if sample.nTrimmed() != 0 %} \begin{adjustbox}{minipage=0.4\textwidth-2\fboxrule, cframe = {{read.trimRes.readset.r1.qcRes.results[i].color}} 2}%
+ \begin{subfigure}%
{\includegraphics[width=\textwidth]{/{{read.trimRes.readset.r1.qcRes.results[i].file}}} }
\end{subfigure}
\end{adjustbox}
\caption{ {{read.trimRes.readset.r1.qcRes.results[i].name}}}
+ {% endif %}
\end{figure}
{% endfor %}
@@ -145,12 +148,13 @@ Concatenated files: \\
{\includegraphics[width=\textwidth]{/{{read.r2.qcRes.results[i].file}}} }
\end{subfigure}
\end{adjustbox}\qquad
- \begin{adjustbox}{minipage=0.4\textwidth -2\fboxrule, cframe = {{read.trimRes.readset.r2.qcRes.results[i].color}} 2}%
+ {% if sample.nTrimmed() != 0 %} \begin{adjustbox}{minipage=0.4\textwidth -2\fboxrule, cframe = {{read.trimRes.readset.r2.qcRes.results[i].color}} 2}%
\begin{subfigure}%
{\includegraphics[width=\textwidth]{/{{read.trimRes.readset.r2.qcRes.results[i].file}}} }
\end{subfigure}
\end{adjustbox}
\caption{ {{read.trimRes.readset.r2.qcRes.results[i].name}}}
+ {% endif %}
\end{figure}
{% endfor %}
{% endif %}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/qcumber.git
More information about the debian-med-commit
mailing list