[med-svn] [ngsqctoolkit] 05/08: New upstream version 2.3.3
Andreas Tille
tille at debian.org
Wed Dec 13 16:11:33 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository ngsqctoolkit.
commit 18f35832212d705078530d0348728fca0d8e2822
Author: Andreas Tille <tille at debian.org>
Date: Wed Dec 13 17:09:20 2017 +0100
New upstream version 2.3.3
---
Format-converter/FastqTo454.pl | 227 ++
Format-converter/FastqToFasta.pl | 87 +
Format-converter/SangerFastqToIlluFastq.pl | 88 +
Format-converter/SolexaFastqToIlluFastq.pl | 89 +
NGSQCToolkitv2.3.3_manual.pdf | Bin 0 -> 889689 bytes
QC/454QC.pl | 1572 ++++++++++++++
QC/454QC_PE.pl | 1996 ++++++++++++++++++
QC/454QC_PRLL.pl | 2021 ++++++++++++++++++
QC/IlluQC.pl | 2328 +++++++++++++++++++++
QC/IlluQC_PRLL.pl | 2994 +++++++++++++++++++++++++++
QC/lib/454PEhtml.pl | 384 ++++
QC/lib/454html.pl | 347 ++++
QC/lib/Fonts/Dustismo_Sans.ttf | Bin 0 -> 63040 bytes
QC/lib/Fonts/LucidaSansDemiBold.ttf | Bin 0 -> 317896 bytes
QC/lib/Parallel/Changes | 36 +
QC/lib/Parallel/ForkManager.pm | 416 ++++
QC/lib/Parallel/ForkManager/callback.pl | 48 +
QC/lib/Parallel/ForkManager/parallel_get.pl | 17 +
QC/lib/Parallel/MANIFEST | 8 +
QC/lib/Parallel/Makefile.PL | 7 +
QC/lib/Parallel/TODO | 2 +
QC/lib/Parallel/test.pl | 20 +
QC/lib/html.pl | 470 +++++
QC/lib/main.css | 46 +
QC/lib/version | 1 +
QC/lib/version~ | 1 +
Statistics/AvgQuality.pl | 108 +
Statistics/N50Stat.pl | 172 ++
Trimming/AmbiguityFiltering.pl | 405 ++++
Trimming/HomopolymerTrimming.pl | 233 +++
Trimming/TrimmingReads.pl | 446 ++++
debian/changelog | 5 -
debian/compat | 1 -
debian/control | 26 -
debian/copyright | 12 -
debian/docs | 1 -
debian/install | 2 -
debian/rules | 21 -
debian/source/format | 1 -
debian/upstream/metadata | 12 -
debian/watch | 7 -
41 files changed, 14569 insertions(+), 88 deletions(-)
diff --git a/Format-converter/FastqTo454.pl b/Format-converter/FastqTo454.pl
new file mode 100644
index 0000000..386dd7f
--- /dev/null
+++ b/Format-converter/FastqTo454.pl
@@ -0,0 +1,227 @@
+#! /usr/bin/perl
+
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+use Getopt::Long;
+use File::Basename;
+
+# Parameter variables
+my $file;            # Input FASTQ file (-i)
+my $helpAsked;       # Defined when -h|-help is given
+my $outFolder = "";  # Output folder (-o); empty means the folder of the input file
+my $subVal;          # ASCII offset subtracted from every quality character
+
+my $seqFormat = "a"; # 1: Sanger; 2: Solexa; 3: Illumina 1.3+; 4: Illumina 1.5+; 5: Illumina 1.8+; a: automatic detection
+
+GetOptions(
+    "i=s" => \$file,
+    "h|help" => \$helpAsked,
+    "o|outputFolder=s" => \$outFolder,
+    "v|fastqVariant=s" => \$seqFormat,
+    );
+if(defined($helpAsked)) {
+    prtUsage();
+    exit;
+}
+if(!defined($file)) {
+    prtError("No input files are provided");
+}
+
+my ($fileName, $filePath) = fileparse($file);
+$outFolder = $filePath if($outFolder eq "");
+$outFolder .= "/" if($outFolder !~ /\/$/);
+if(! -e $outFolder) {
+    mkdir($outFolder) or die "Can not create output folder: $outFolder\n";
+}
+
+if($seqFormat =~ /a/i) {
+    print "Checking FASTQ format: File $file...\n";
+    # With the second argument true, checkFastQFormat() stores the detected
+    # variant number in $seqFormat (0 when no variant matched).
+    checkFastQFormat($file, 1);
+}
+
+# Variant number => [ASCII offset, display name]
+my %variantInfo = (
+    1 => [33, "Sanger"],
+    2 => [64, "Solexa"],
+    3 => [64, "Illumina 1.3+"],
+    4 => [64, "Illumina 1.5+"],
+    5 => [33, "Illumina 1.8+"],
+    );
+# Refuse to continue without a usable variant: otherwise $subVal would stay
+# undefined and the quality file would contain raw ASCII codes.
+my $variantKey = ($seqFormat =~ /^\d+$/) ? $seqFormat + 0 : 0;
+if(!exists($variantInfo{$variantKey})) {
+    prtError("Unknown or undetectable FASTQ variant '$seqFormat' (use -v 1..5)");
+}
+$subVal = $variantInfo{$variantKey}[0];
+print "Input FASTQ file format: $variantInfo{$variantKey}[1]\n";
+
+
+my $outFnaFile = $outFolder . $fileName . "_fna";
+my $outQualFile = $outFolder . $fileName . "_qual";
+
+# Three-argument open keeps unusual characters in file names from being
+# interpreted as an open mode.
+open(I, "<", $file) or die "Can not open file: $file\n";
+open(OF, ">", $outFnaFile) or die "Can not open file: $outFnaFile\n";
+open(OQ, ">", $outQualFile) or die "Can not open file: $outQualFile\n";
+
+
+# Each FASTQ record is four lines: @id, sequence, +, quality.
+while(my $line = <I>) {
+    next if($line =~ /^\s*$/);   # tolerate blank lines (e.g. at end of file)
+    chomp($line);
+    my $id = $line;
+    $id =~ s/^\@//;
+    my $seq = <I>;
+    my $plusLine = <I>;          # separator line, content not used
+    my $qualLine = <I>;
+    if(!defined($seq) || !defined($qualLine)) {
+        die "Incomplete FASTQ record for read '$id' in file: $file\n";
+    }
+    chomp($seq);
+    chomp($qualLine);
+    print OF ">$id\n";
+    print OF formatSeq($seq), "\n";
+    print OQ ">$id\n";
+    print OQ IlluToPhred($qualLine), "\n";
+}
+
+close(I);
+# Buffered write errors only surface at close time.
+close(OF) or die "Can not close file: $outFnaFile\n";
+close(OQ) or die "Can not close file: $outQualFile\n";
+
+exit;
+
+# Turns one FASTQ quality string into numeric scores: the file-level offset
+# $subVal is subtracted from each character's ASCII code, the values are joined
+# with single spaces and the result is wrapped by formatQualSeq().
+sub IlluToPhred {
+    my ($encodedQual) = @_;
+    my @scores = map { $_ - $subVal } unpack("C*", $encodedQual);
+    return formatQualSeq(join(" ", @scores));
+}
+
+# Prints the option reference of this script to STDOUT.
+sub prtHelp {
+    my @helpText = (
+        "\n$0 options:\n\n",
+        "### Input reads (FASTQ) (Required)\n",
+        " -i <Illumina FASTQ read file>\n",
+        " Read file in Illumina FASTQ format\n",
+        "\n",
+        "### Other options [Optional]\n",
+        " -h | -help\n",
+        " Prints this help\n",
+        " -o | -outputFolder <Output folder name>\n",
+        " Output will be stored in the given folder\n",
+        " default: By default, files will be stored where the input file is\n",
+        " -v | -fastqVariant <FASTQ variant>\n",
+        " FASTQ variants:\n",
+        " 1 = Sanger (Phred+33, 33 to 73)\n",
+        " 2 = Solexa (Phred+64, 59 to 104)\n",
+        " 3 = Illumina (1.3+) (Phred+64, 64 to 104)\n",
+        " 4 = Illumina (1.5+) (Phred+64, 66 to 104)\n",
+        " 5 = Illumina (1.8+) (Phred+33, 33 to 74)\n",
+        " A = Automatic detection of FASTQ variant\n",
+        " default: \"A\"\n",
+        "\n",
+    );
+    print @helpText;
+}
+
+# Reports a fatal error: prints $msg inside a boxed banner on STDERR, shows the
+# usage text and terminates the script.
+#   $_[0] - error message to display
+# Does not return. Exits with status 1 so that calling shells and pipelines
+# can detect the failure (a bare exit would report success).
+sub prtError {
+    my $msg = $_[0];
+    print STDERR "+======================================================================+\n";
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR "+======================================================================+\n";
+    prtUsage();
+    exit(1);
+}
+
+# Prints the one-line usage synopsis followed by the full option help.
+sub prtUsage {
+    printf "\nUsage: perl %s <options>\n", $0;
+    prtHelp();
+}
+
+# Wraps a sequence string into lines of at most 60 characters.
+#   $_[0] - sequence string
+# Returns the wrapped text; lines are separated by "\n" and there is no
+# newline after the last line.
+sub formatSeq {
+    my ($sequence) = @_;
+    # /s lets "." match any character, so every chunk is a plain 60-character slice.
+    my @chunks = ($sequence =~ /.{1,60}/gs);
+    return join("\n", @chunks);
+}
+
+# Wraps a whitespace-separated list of quality values into lines of 60 values.
+#   $_[0] - string of values separated by whitespace
+# Returns the values separated by single spaces, with a "\n" after every 60th
+# value and any trailing whitespace removed.
+sub formatQualSeq {
+    my ($scoreLine) = @_;
+    my $perLine = 60;
+    my @scores = split(/\s+/, $scoreLine);
+    my @rows = ();
+    while(@scores) {
+        # Take the next group of up to $perLine values as one output line.
+        push(@rows, join(" ", splice(@scores, 0, $perLine)));
+    }
+    my $wrapped = join("\n", @rows);
+    $wrapped =~ s/\s+$//;
+    return $wrapped;
+}
+
+# Validates the layout of a FASTQ file and guesses its quality encoding.
+#   $_[0] - path of the FASTQ file
+#   $_[1] - true: store the detected variant in the file-level $seqFormat;
+#           false: only warn when the detected variant differs from $seqFormat
+# Every first line of a four-line record must start with "@" and every third
+# line with "+"; otherwise prtError() reports the problem and exits.
+# The minimum and maximum quality characters are collected from the quality
+# lines seen before line 1,000,000.
+# Returns the number of lines in the file.
+sub checkFastQFormat {
+    my $file = $_[0];
+    my $isVariantIdntfcntOn = $_[1];
+    my $lines = 0;
+    open(my $fqHandle, "<", $file) or die "Can not open file $file\n";
+    my $counter = 0;       # position (1..4) inside the current record
+    my $minVal = 1000;
+    my $maxVal = 0;
+    while(my $line = <$fqHandle>) {
+        $lines++;
+        $counter++;
+        next if($line =~ /^\n$/);
+        # prtError() is the error reporter defined in this script.
+        if($counter == 1 && $line !~ /^\@/) {
+            prtError("Invalid FASTQ file format.\n\t\tFile: $file");
+        }
+        if($counter == 3 && $line !~ /^\+/) {
+            prtError("Invalid FASTQ file format.\n\t\tFile: $file");
+        }
+        if($counter == 4 && $lines < 1000000) {
+            chomp $line;
+            my @ASCII = unpack("C*", $line);
+            $minVal = min(min(@ASCII), $minVal);
+            $maxVal = max(max(@ASCII), $maxVal);
+        }
+        if($counter == 4) {
+            $counter = 0;
+        }
+    }
+    close($fqHandle);
+    # The ranges overlap, so the order of the tests decides the result.
+    # 0 means that no variant matched (also the case when no quality line was read).
+    my $tseqFormat = 0;
+    if($minVal >= 33 && $minVal <= 73 && $maxVal >= 33 && $maxVal <= 73) {
+        $tseqFormat = 1; # Sanger
+    }
+    elsif($minVal >= 66 && $minVal <= 105 && $maxVal >= 66 && $maxVal <= 105) {
+        $tseqFormat = 4; # Illumina 1.5+
+    }
+    elsif($minVal >= 64 && $minVal <= 105 && $maxVal >= 64 && $maxVal <= 105) {
+        $tseqFormat = 3; # Illumina 1.3+
+    }
+    elsif($minVal >= 59 && $minVal <= 105 && $maxVal >= 59 && $maxVal <= 105) {
+        $tseqFormat = 2; # Solexa
+    }
+    elsif($minVal >= 33 && $minVal <= 74 && $maxVal >= 33 && $maxVal <= 74) {
+        $tseqFormat = 5; # Illumina 1.8+
+    }
+    if($isVariantIdntfcntOn) {
+        $seqFormat = $tseqFormat;
+    }
+    else {
+        if($tseqFormat != $seqFormat) {
+            print STDERR "Warning: It seems the specified variant of FASTQ doesn't match the quality values in input FASTQ files.\n";
+        }
+    }
+    return $lines;
+}
+
diff --git a/Format-converter/FastqToFasta.pl b/Format-converter/FastqToFasta.pl
new file mode 100644
index 0000000..c5451c2
--- /dev/null
+++ b/Format-converter/FastqToFasta.pl
@@ -0,0 +1,87 @@
+#! /usr/bin/perl
+
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+use Getopt::Long;
+use File::Basename;
+
+# Parameter variables
+my $file;          # Input FASTQ file (-i)
+my $helpAsked;     # Defined when -h|-help is given
+my $outFile = "";  # Output FASTA file (-o); empty means "<input file>_fasta"
+
+GetOptions(
+    "i=s" => \$file,
+    "h|help" => \$helpAsked,
+    "o|outputFile=s" => \$outFile,
+    );
+if(defined($helpAsked)) {
+    prtUsage();
+    exit;
+}
+if(!defined($file)) {
+    prtError("No input files are provided");
+}
+
+my ($fileName, $filePath) = fileparse($file);
+$outFile = $file . "_fasta" if($outFile eq "");
+
+# Three-argument open keeps unusual characters in file names from being
+# interpreted as an open mode.
+open(I, "<", $file) or die "Can not open file: $file\n";
+open(OF, ">", $outFile) or die "Can not open file: $outFile\n";
+
+
+# Each FASTQ record is four lines: @id, sequence, +, quality.
+while(my $line = <I>) {
+    next if($line =~ /^\s*$/);   # tolerate blank lines (e.g. at end of file)
+    chomp($line);
+    my $id = $line;
+    $id =~ s/^\@//;
+    my $seq = <I>;
+    my $plusLine = <I>;          # separator line, not used
+    my $qualLine = <I>;          # quality line, not used for FASTA output
+    if(!defined($seq)) {
+        die "Incomplete FASTQ record for read '$id' in file: $file\n";
+    }
+    # Strip the line ending before wrapping: otherwise a read whose length is a
+    # multiple of 60 gets the newline as a chunk of its own, which shows up as
+    # a blank line in the FASTA output.
+    chomp($seq);
+    print OF ">$id\n";
+    print OF formatSeq($seq), "\n";
+}
+
+close(I);
+# Buffered write errors only surface at close time.
+close(OF) or die "Can not close file: $outFile\n";
+
+exit;
+
+# Prints the option reference of this script to STDOUT.
+sub prtHelp {
+    my @helpText = (
+        "\n$0 options:\n\n",
+        "### Input reads (FASTQ) (Required)\n",
+        " -i <FASTQ read file>\n",
+        " Read file in FASTQ format\n",
+        "\n",
+        "### Other options [Optional]\n",
+        " -h | -help\n",
+        " Prints this help\n",
+        " -o | -outputFile <Output file name>\n",
+        " Output will be stored in the given file\n",
+        " default: By default, file will be stored where the input file is\n",
+        "\n",
+    );
+    print @helpText;
+}
+
+# Reports a fatal error: prints $msg inside a boxed banner on STDERR, shows the
+# usage text and terminates the script.
+#   $_[0] - error message to display
+# Does not return. Exits with status 1 so that calling shells and pipelines
+# can detect the failure (a bare exit would report success).
+sub prtError {
+    my $msg = $_[0];
+    print STDERR "+======================================================================+\n";
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR "+======================================================================+\n";
+    prtUsage();
+    exit(1);
+}
+
+# Prints the one-line usage synopsis followed by the full option help.
+sub prtUsage {
+    printf "\nUsage: perl %s <options>\n", $0;
+    prtHelp();
+}
+
+# Wraps a sequence string into lines of at most 60 characters.
+#   $_[0] - sequence string
+# Returns the consecutive 60-character slices of the input joined by "\n";
+# nothing is appended after the last slice.
+sub formatSeq {
+    my ($sequence) = @_;
+    my $width = 60;
+    # Start offsets of the slices: 0, 60, 120, ...
+    my @offsets = grep { $_ % $width == 0 } 0 .. length($sequence) - 1;
+    return join("\n", map { substr($sequence, $_, $width) } @offsets);
+}
+
diff --git a/Format-converter/SangerFastqToIlluFastq.pl b/Format-converter/SangerFastqToIlluFastq.pl
new file mode 100644
index 0000000..ec95d9c
--- /dev/null
+++ b/Format-converter/SangerFastqToIlluFastq.pl
@@ -0,0 +1,88 @@
+#! /usr/bin/perl
+
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+use Getopt::Long;
+use File::Basename;
+
+# Parameter variables
+my $file;          # Input FASTQ file (-i)
+my $helpAsked;     # Defined when -h|-help is given
+my $outFile = "";  # Output FASTQ file (-o); empty means "<input file>_illu.fq"
+
+GetOptions(
+    "i=s" => \$file,
+    "h|help" => \$helpAsked,
+    "o|outputFile=s" => \$outFile,
+    );
+if(defined($helpAsked)) {
+    prtUsage();
+    exit;
+}
+if(!defined($file)) {
+    prtError("No input files are provided");
+}
+
+my ($fileName, $filePath) = fileparse($file);
+$outFile = $file . "_illu.fq" if($outFile eq "");
+
+# Three-argument open keeps unusual characters in file names from being
+# interpreted as an open mode.
+open(I, "<", $file) or die "Can not open file: $file\n";
+open(OF, ">", $outFile) or die "Can not open file: $outFile\n";
+
+# Each FASTQ record is four lines: @id, sequence, +, quality.
+while(my $line = <I>) {
+    next if($line =~ /^\s*$/);   # tolerate blank lines (e.g. at end of file)
+    chomp($line);
+    my $id = $line;
+    $id =~ s/^\@//;
+    my $seq = <I>;
+    my $plusLine = <I>;          # separator line; written back as a bare "+"
+    my $qualLine = <I>;
+    if(!defined($seq) || !defined($qualLine)) {
+        die "Incomplete FASTQ record for read '$id' in file: $file\n";
+    }
+    chomp($qualLine);
+    print OF "\@$id\n";
+    print OF "$seq";             # still carries its own line ending
+    print OF "+\n";
+    print OF IlluToSanger($qualLine), "\n";
+}
+
+close(I);
+# Buffered write errors only surface at close time.
+close(OF) or die "Can not close file: $outFile\n";
+
+exit;
+
+# Re-encodes a quality string by adding 31 to the ASCII code of every
+# character.
+#   $_[0] - quality string of one read
+# Returns the re-encoded quality string.
+sub IlluToSanger {
+    my ($inputQual) = @_;
+    return pack("C*", map { $_ + 31 } unpack("C*", $inputQual));
+}
+
+# Prints the option reference of this script to STDOUT.
+sub prtHelp {
+    my @helpText = (
+        "\n$0 options:\n\n",
+        "### Input reads (FASTQ) (Required)\n",
+        " -i <Sanger FASTQ read file>\n",
+        " Read file in Sanger FASTQ format\n",
+        "\n",
+        "### Other options [Optional]\n",
+        " -h | -help\n",
+        " Prints this help\n",
+        " -o | -outputFile <Output file name>\n",
+        " Output will be stored in the given file\n",
+        " default: By default, file will be stored where the input file is\n",
+        "\n",
+    );
+    print @helpText;
+}
+
+# Reports a fatal error: prints $msg inside a boxed banner on STDERR, shows the
+# usage text and terminates the script.
+#   $_[0] - error message to display
+# Does not return. Exits with status 1 so that calling shells and pipelines
+# can detect the failure (a bare exit would report success).
+sub prtError {
+    my $msg = $_[0];
+    print STDERR "+======================================================================+\n";
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR "+======================================================================+\n";
+    prtUsage();
+    exit(1);
+}
+
+# Prints the one-line usage synopsis followed by the full option help.
+sub prtUsage {
+    printf "\nUsage: perl %s <options>\n", $0;
+    prtHelp();
+}
diff --git a/Format-converter/SolexaFastqToIlluFastq.pl b/Format-converter/SolexaFastqToIlluFastq.pl
new file mode 100644
index 0000000..edaaffd
--- /dev/null
+++ b/Format-converter/SolexaFastqToIlluFastq.pl
@@ -0,0 +1,89 @@
+#! /usr/bin/perl
+
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+use Getopt::Long;
+use File::Basename;
+
+# Parameter variables
+my $file;          # Input FASTQ file (-i)
+my $helpAsked;     # Defined when -h|-help is given
+my $outFile = "";  # Output FASTQ file (-o); empty means "<input file>_illu.fq"
+
+GetOptions(
+    "i=s" => \$file,
+    "h|help" => \$helpAsked,
+    "o|outputFile=s" => \$outFile,
+    );
+if(defined($helpAsked)) {
+    prtUsage();
+    exit;
+}
+if(!defined($file)) {
+    prtError("No input files are provided");
+}
+
+my ($fileName, $filePath) = fileparse($file);
+$outFile = $file . "_illu.fq" if($outFile eq "");
+
+# Three-argument open keeps unusual characters in file names from being
+# interpreted as an open mode.
+open(I, "<", $file) or die "Can not open file: $file\n";
+open(OF, ">", $outFile) or die "Can not open file: $outFile\n";
+
+# Each FASTQ record is four lines: @id, sequence, +, quality.
+while(my $line = <I>) {
+    next if($line =~ /^\s*$/);   # tolerate blank lines (e.g. at end of file)
+    chomp($line);
+    my $id = $line;
+    $id =~ s/^\@//;
+    my $seq = <I>;
+    my $plusLine = <I>;          # separator line; written back as a bare "+"
+    my $qualLine = <I>;
+    if(!defined($seq) || !defined($qualLine)) {
+        die "Incomplete FASTQ record for read '$id' in file: $file\n";
+    }
+    chomp($qualLine);
+    print OF "\@$id\n";
+    print OF "$seq";             # still carries its own line ending
+    print OF "+\n";
+    print OF IlluToSanger($qualLine), "\n";
+}
+
+close(I);
+# Buffered write errors only surface at close time.
+close(OF) or die "Can not close file: $outFile\n";
+
+exit;
+
+# Re-encodes a quality string character by character. With s = ASCII code - 64,
+# each character becomes round(10 * log10(1 + 10 ** (s / 10))) + 64.
+#   $_[0] - quality string of one read
+# Returns the re-encoded quality string.
+sub IlluToSanger {
+    my ($inputQual) = @_;
+    my @converted = ();
+    for my $code (unpack("C*", $inputQual)) {
+        # sprintf "%0.0f" rounds the converted score to a whole number.
+        my $rounded = sprintf("%0.0f", 10 * log(1 + 10 ** (($code - 64) / 10.0)) / log(10));
+        push(@converted, $rounded + 64);
+    }
+    return pack("C*", @converted);
+}
+
+# Prints the option reference of this script to STDOUT.
+sub prtHelp {
+    my @helpText = (
+        "\n$0 options:\n\n",
+        "### Input reads (FASTQ) (Required)\n",
+        " -i <Solexa FASTQ read file>\n",
+        " Read file in Solexa FASTQ format\n",
+        "\n",
+        "### Other options [Optional]\n",
+        " -h | -help\n",
+        " Prints this help\n",
+        " -o | -outputFile <Output file name>\n",
+        " Output will be stored in the given file\n",
+        " default: By default, file will be stored where the input file is\n",
+        "\n",
+    );
+    print @helpText;
+}
+
+# Reports a fatal error: prints $msg inside a boxed banner on STDERR, shows the
+# usage text and terminates the script.
+#   $_[0] - error message to display
+# Does not return. Exits with status 1 so that calling shells and pipelines
+# can detect the failure (a bare exit would report success).
+sub prtError {
+    my $msg = $_[0];
+    print STDERR "+======================================================================+\n";
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR "+======================================================================+\n";
+    prtUsage();
+    exit(1);
+}
+
+# Prints the one-line usage synopsis followed by the full option help.
+sub prtUsage {
+    printf "\nUsage: perl %s <options>\n", $0;
+    prtHelp();
+}
diff --git a/NGSQCToolkitv2.3.3_manual.pdf b/NGSQCToolkitv2.3.3_manual.pdf
new file mode 100644
index 0000000..503c2a1
Binary files /dev/null and b/NGSQCToolkitv2.3.3_manual.pdf differ
diff --git a/QC/454QC.pl b/QC/454QC.pl
new file mode 100644
index 0000000..260bba3
--- /dev/null
+++ b/QC/454QC.pl
@@ -0,0 +1,1572 @@
+#! /usr/bin/perl
+
+use File::Basename;
+#BEGIN {
+# my ($tmp, $path) = fileparse($0);
+# push ( @INC,"$path/lib");
+# #use lib "$path";
+#}
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+use Getopt::Long;
+use Cwd qw(abs_path);
+use IO::Zlib;
+use FindBin qw($RealBin);
+use lib "$RealBin/lib";
+require "454html.pl";
+
+eval {
+ require Parallel::ForkManager;
+ require String::Approx;
+ require GD::Graph::linespoints;
+ require GD::Graph::bars;
+ require GD::Graph::pie;
+ require GD::Text::Wrap;
+};
+
+my $isGDMod = 1;
+
+# React to a failure of the optional/required module loading done in the eval
+# block above. The eval stops at the first module that cannot be loaded, so
+# $@ describes exactly one failing module.
+if($@) {
+    my $errorText = join("", $@);
+    if($errorText =~ /Parallel/) {
+        print "Error:\n\tCan not find 'lib' folder with this perl program\n"; #module 'Parallel::ForkManager'\n";
+        print "\tCopy the 'lib' folder, provided with the toolkit, to the directory where this perl program is and try again\n\n";
+        exit(1);
+    }
+    elsif($errorText =~ /String\/Approx/) {
+        print STDERR "Error:\n\tCan not find module 'String::Approx'\n";
+        print STDERR "\tInstall it and try again\n\n";
+        exit(1);
+    }
+    elsif($errorText =~ /\bGD\b/) {
+        # Any missing GD piece (GD itself, GD::Graph::linespoints/bars/pie or
+        # GD::Text::Wrap) disables the graphs, not only GD::Graph::linespoints.
+        print STDERR "Warning:\n\tCan not find module 'GD::Graph'\n";
+        print STDERR "\tGraphs for statistics will not be produced. \n\t\t\tOR \n\tInstall GD::Graph module and try again.\n\n";
+        $isGDMod = 0;
+    }
+    else {
+        # Do not swallow unexpected load failures silently.
+        print STDERR "Warning:\n\tUnexpected error while loading modules:\n\t$errorText\n";
+    }
+}
+
+
+# Setting parameters
+my $lowestValidLen = 100;
+my @files = ();
+my $noOfInp = 3;
+my $helpAsked;
+my $cutOffReadLen4HQ = 70;
+my $cutOffPhScore = 20;
+my $outFolder = "";
+my $isOnlyStat;
+my $statOutFmt = 1;
+my $noOfProcesses = 1;
+my $homoPolyLen = 0;
+my $priAdaLib;
+my $isLenFilterOn = 1;
+my @priAdaLibNames = ("Rapid Library (Standard)", "Paired End Library", "Amplicon PE Library", "Small RNA Library");
+my $priAdaFile;
+my @usrDefinedPriAda = ();
+my $outputDataFmt = "t"; # t/T: Text; g/G: Gzip.
+
+GetOptions(
+ "i=s{$noOfInp}" => \@files,
+ "h|help" => \$helpAsked,
+ "l|cutOffReadLen4HQ=f" => \$cutOffReadLen4HQ,
+ "n|homoPolyLen=i" => \$homoPolyLen,
+ "o|outputFolder=s" => \$outFolder,
+ "z|outputDataCompression=s" => \$outputDataFmt,
+ "t|statOutFmt=i" => \$statOutFmt,
+ "onlyStat" => \$isOnlyStat,
+ "p|processes=i" => \$noOfProcesses,
+ "s|cutOffQualScore=i" => \$cutOffPhScore,
+ "m|minLen=i" => \$lowestValidLen,
+ "f|lenFilter=s" => \$isLenFilterOn,
+ );
+if(defined($helpAsked)) {
+    prtUsage();
+    exit;
+}
+if(@files == 0) {
+    prtError("No input files are provided");
+}
+# Every -i takes $noOfInp values: sequence file, quality file and the
+# primer/adaptor library. Each group is collapsed into one space-separated
+# string that the processing loop splits again.
+my @tempFiles = ();
+prtError("Missing inputs for option -i") if((scalar @files)%$noOfInp != 0);
+for(my $i=0; $i<@files; $i+=$noOfInp) {
+    my $str = "$files[$i] $files[$i+1] $files[$i+2]";
+    # A third value starting with "-" is really the next option.
+    if($files[$i+2] =~ /^-/) {
+        prtError("Missing inputs for option -i: at '-i $str'")
+    }
+    # A single digit selects a built-in library and must be 1..4.
+    if($files[$i+2] =~ /^\d$/) {
+        if($files[$i+2] < 1 || $files[$i+2] > 4) {
+            prtError("Incorrect option for Primer/Adaptor library: at '-i $str'");
+        }
+    }
+    push(@tempFiles, $str);
+}
+@files = @tempFiles;
+if($cutOffReadLen4HQ < 0 || $cutOffReadLen4HQ > 100) {
+    prtError("Incorrect value for -l|cutOffReadLen4HQ option: at '-l $cutOffReadLen4HQ'");
+}
+if($cutOffPhScore < 0 || $cutOffPhScore > 40) {
+    prtError("Incorrect value for -s|cutOffQualScore option: at '-s $cutOffPhScore'");
+}
+if($statOutFmt < 1 || $statOutFmt > 2) {
+    prtError("Incorrect value for -statOutFmt: at '-statOutFmt $statOutFmt'");
+}
+# -f|lenFilter: any value starting with "N"/"n" turns the length filter off.
+if($isLenFilterOn =~ /^N/i) {
+    $isLenFilterOn = 0;
+}
+else {
+    $isLenFilterOn = 1;
+}
+# -z|outputDataCompression accepts t/T (text) or g/G (gzip).
+if($outputDataFmt !~ /^[tg]$/i) {
+    prtError("Incorrect value for -z|outputDataCompression option: at '-z $outputDataFmt'");
+}
+
+my $pm = new Parallel::ForkManager($noOfProcesses);
+
+
+my $trimCount = 0;
+my $seqCount = 0;
+my $ttlSeqCount = 0;
+my $lt100 = 0;
+my $hQCount = 0;
+my $lQCount = 0;
+my $maxRawLen = 0;
+my $minRawLen = 1000000000000;
+my $avgRawLen = 0;
+my $maxHQLen = 0;
+my $minHQLen = 1000000000000;
+my $avgHQLen = 0;
+my @rawLen = ();
+my @hQLen = ();
+my $totalBases = 0;
+my $totalHQBases = 0;
+my $totalBasesAfterHQ = 0;
+my $totalHQBasesAfterHQ = 0;
+my $totalBasesFinal = 0;
+my $totalHQBasesFinal = 0;
+my $totalReadsFinal = 0;
+my $avgQual = 0;
+my $avgQualFinal = 0;
+my $totalValidReadsWithPriAda = 0;
+my $totalValidReadsNoPriAda = 0;
+my $substrlen = 20; # For removePriAda
+my $mismLim = 1; # For removePriAda
+
+my $fastaSeqId = "";
+my $fastaSeq = "";
+my $qualSeqId = "";
+my $qualSeq = "";
+my $prevFastaSeqId = "";
+my $indOfAnalysis = 0;
+
+my @lenDistrib = ();
+my $lenInterval = 40;
+my @qualDistrib = ();
+my $qualInterval = 1;
+my @gcDistrib = ();
+my $gcInterval = 5;
+my @charCount = ();
+
+my $font_spec = getFilePath($0) . "lib/Fonts/Dustismo_Sans.ttf";
+my $f = getFilePath($0) . "lib/Fonts/LucidaSansDemiBold.ttf";
+
+
+foreach my $inpData (@files) {
+ $indOfAnalysis++;
+my $pid = $pm->start and next;
+ $inpData =~ s/\\([A-Za-z_\.])/\/$1/g; # To remove '\' from the path of windows file
+ my @iData = split(" ", $inpData);
+ my $seqFile = $iData[0];
+ my $qualFile = $iData[1];
+ $priAdaLib = $iData[2];
+ print "Analysis has been started for \"$seqFile\": Index: $indOfAnalysis\n";
+ # Interpret the third -i value: "n" (none), a library number or a file name.
+ if($priAdaLib =~ /^n$/i) {
+     # No primer/adaptor library for this input.
+     undef $priAdaLib;
+ }
+ elsif($priAdaLib =~ /^\d$/) {
+     # Library number given 1-based; stored 0-based as index into @priAdaLibNames.
+     $priAdaLib = $priAdaLib - 1;
+ }
+ else {
+     # Anything else is the path of a user-defined primer/adaptor file.
+     $priAdaFile = $priAdaLib;
+     $priAdaLib = "u";
+     open(PRIADA, "<", $priAdaFile) or die "Can not open the user-defined primer/adapter file: $priAdaFile\n";
+     @usrDefinedPriAda = <PRIADA>;
+     close(PRIADA);
+     # Strip all whitespace from every line, including the last one
+     # (the previous bound $i<$#usrDefinedPriAda skipped it).
+     foreach my $priAdaSeq (@usrDefinedPriAda) {
+         $priAdaSeq =~ s/\s+//g;
+     }
+ }
+ my ($seqFileName, $filePath) = fileparse($seqFile);
+ my ($qualFileName) = fileparse($qualFile);
+ $outFolder = $filePath . "454QC_Filtered_files" if($outFolder eq "");
+ $outFolder .= "/" if($outFolder !~ /\/$/);
+ if(! -e $outFolder) {
+ mkdir($outFolder) or die "Can not create output folder: $outFolder\n";
+ }
+ my $outSeqFile = $outFolder . $seqFileName . "_filtered";
+ my $outQualFile = $outFolder . $qualFileName . "_filtered";
+ $outSeqFile .= ".gz" if($outputDataFmt =~ /g/i);
+ $outQualFile .= ".gz" if($outputDataFmt =~ /g/i);
+ my $statFile = $outFolder . $seqFileName . "_stat";
+
+ my $iH;
+ openFileGetHandle($seqFile, "r", \$iH);
+ *I = $iH;
+ my $qH;
+ openFileGetHandle($qualFile, "r", \$qH);
+ *Q = $qH;
+ if(!defined($isOnlyStat)) {
+ my $oiH;
+ openFileGetHandle($outSeqFile, "w", \$oiH);
+ *OI = $oiH;
+ my $oqH;
+ openFileGetHandle($outQualFile, "w", \$oqH);
+ *OQ = $oqH;
+ }
+ open(STAT, ">$statFile") or die "Can not open file: $statFile\n";
+ while(my $line = <I>) {
+ $ttlSeqCount++ if($line =~ /^>/);
+ }
+ close(I);
+ print "$indOfAnalysis: Number of reads processed: " . "0/$ttlSeqCount (0\%)...\n";
+ undef($iH);
+ openFileGetHandle($seqFile, "r", \$iH);
+ *I = $iH;
+
+ while(my $line = <I>) {
+ chomp $line;
+ my $qualLine = <Q>;
+ chomp($qualLine);
+ if($line =~ /^>/) {
+ $seqCount++;
+ $prevFastaSeqId = $fastaSeqId;
+ $fastaSeqId = $line;
+ $qualSeqId = $qualLine;
+ if($fastaSeqId ne $qualSeqId) {
+ print STDERR "Error: Read Id doesn't match in sequence and quality file for read number $seqCount in sequence file.\n";
+ exit(-1);
+ }
+ if($fastaSeq ne "") {
+ processSeq();
+ }
+ $fastaSeq = "";
+ $qualSeq = "";
+ }
+ else {
+ $fastaSeq .= $line;
+ $qualSeq .= $qualLine . " ";
+ }
+ }
+ if($fastaSeq ne "") {
+ $prevFastaSeqId = $fastaSeqId;
+ processSeq();
+ }
+
+ print "$indOfAnalysis: Number of reads processed: " . "$ttlSeqCount/$ttlSeqCount (100\%)...\n";
+ print "$indOfAnalysis: Analysis completed\n";
+
+ print "$indOfAnalysis: Printing Statistics...\n";
+
+ if($statOutFmt == 1) {
+ my $inde = " " x 1;
+ my $tmpPer = 0;
+ printf STAT "Parameters\n";
+ printf STAT "$inde %-40s %s %s\n", "Input files ", $seqFile, $qualFile;
+ printf STAT "$inde %-40s %s\n", "Primer/Adaptor library", defined($priAdaLib)?(($priAdaLib ne "u")?$priAdaLibNames[$priAdaLib]:"User defined ($priAdaFile)"):"NA";
+ printf STAT "$inde %-40s %s\n", "Homopolymer trimming", "Off" if($homoPolyLen == 0);
+ printf STAT "$inde %-40s %s\n", "Homopolymer trimming", "On" if($homoPolyLen != 0);
+ printf STAT "$inde %-40s %s\n", "Length of the homopolymer to be removed", $homoPolyLen if($homoPolyLen != 0);
+ printf STAT "$inde %-40s %s\n", "Length filter", ($isLenFilterOn)?"On":"Off";
+ printf STAT "$inde %-40s %s\n", "Cut-off for minimum read length", $lowestValidLen if($isLenFilterOn);
+ printf STAT "$inde %-40s %s\n", "Cut-off read length for HQ", $cutOffReadLen4HQ."%";
+ printf STAT "$inde %-40s %s\n", "Cut-off quality score", $cutOffPhScore;
+ printf STAT "$inde %-40s %s\n", "Only statistics", defined($isOnlyStat)?"On":"Off";
+ printf STAT "$inde %-40s %s\n", "Number of processes", $noOfProcesses;
+
+ print STAT "\n\n";
+
+ print STAT "QC statistics\n";
+ printf STAT "$inde %-70s %s\n", "File name", $seqFileName;
+ printf STAT "$inde %-70s %d\n", "Total number of reads", $seqCount;
+ printf STAT "$inde %-70s %d\n", "Total number of trimmed reads containing homopolymer", $trimCount;
+ printf STAT "$inde %-70s %d\n", "Total number of trashed reads (<$lowestValidLen bp in length after trimming)", $lt100;
+ printf STAT "$inde %-70s %d\n", "Total number of low quality reads (excluding <$lowestValidLen reads)", $lQCount;
+ printf STAT "$inde %-70s %d\n", "Total number of HQ reads", $hQCount;
+ $tmpPer = sprintf "%0.2f", $hQCount/$seqCount*100;
+ printf STAT "$inde %-70s %s\n", "Percentage of HQ reads", $tmpPer."%";
+ printf STAT "$inde %-70s %.f\n", "Total number of bases", $totalBases;
+ printf STAT "$inde %-70s %.f\n", "Total number of bases in HQ reads", $totalBasesAfterHQ;
+ printf STAT "$inde %-70s %.f\n", "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ;
+ $tmpPer = sprintf "%0.2f", $totalHQBasesAfterHQ/$totalBasesAfterHQ*100;
+ printf STAT "$inde %-70s %s\n", "Percentage of HQ bases in HQ reads", $tmpPer."%";
+ if(defined($priAdaLib)) {
+ printf STAT "$inde %-70s %d\n", "Number of Primer/Adaptor trimmed reads", $totalValidReadsWithPriAda;
+ }
+ else {
+ printf STAT "$inde %-70s %s\n", "Number of Primer/Adaptor trimmed reads", "NA", "NA";
+ }
+ printf STAT "$inde %-70s %d\n", "Total number of HQ filtered reads", $totalReadsFinal;
+ $tmpPer = sprintf "%0.2f", $totalReadsFinal/$seqCount*100;
+ printf STAT "$inde %-70s %s\n", "Percentage of HQ filtered reads", $tmpPer."%";
+
+ print STAT "\n\n";
+
+ print STAT "Detailed QC statistics\n";
+ my @arr = (
+ ["File name", $seqFileName, (fileparse($outSeqFile))[0]],
+ ["Total number of reads", $seqCount, $totalReadsFinal],
+ ["Minimum read length", $minRawLen, $minHQLen],
+ ["Maximum read length", $maxRawLen, $maxHQLen],
+ ["Average read length", (sprintf "%0.2f", $totalBases/$seqCount), (sprintf "%0.2f", $totalBasesFinal/$totalReadsFinal)],
+ ["Median read length", calcMedian(@rawLen), calcMedian(@hQLen)],
+ ["N25 length", calcN50(\@rawLen, 25), calcN50(\@hQLen, 25)],
+ ["N50 length", calcN50(\@rawLen, 50), calcN50(\@hQLen, 50)],
+ ["N75 length", calcN50(\@rawLen, 75), calcN50(\@hQLen, 75)],
+ ["N90 length", calcN50(\@rawLen, 90), calcN50(\@hQLen, 90)],
+ ["N95 length", calcN50(\@rawLen, 95), calcN50(\@hQLen, 95)],
+ ["Total number of bases", $totalBases, $totalBasesFinal],
+ ["Total number of HQ bases", $totalHQBases, $totalHQBasesFinal],
+ ["Percentage of HQ bases", (sprintf "%0.2f", $totalHQBases/$totalBases*100)."%", (sprintf "%0.2f", $totalHQBasesFinal/$totalBasesFinal*100)."%"],
+ ["Average quality score (Overall)", (sprintf "%0.2f", $avgQual/$seqCount), (sprintf "%0.2f", $avgQualFinal/$totalReadsFinal)],
+ );
+ for(my $i=0; $i<@arr; $i++) {
+ if(!defined($isOnlyStat)) {
+ printf STAT "$inde %-50s %-20s %s\n", $arr[$i][0], $arr[$i][1], $arr[$i][2];
+ }
+ else {
+ printf STAT "$inde %-50s %s\n", $arr[$i][0], $arr[$i][1];
+ }
+ }
+
+ print STAT "\n\n";
+ }
+ elsif($statOutFmt == 2) {
+ my $inde = " " x 1;
+ my $tmpPer = 0;
+ printf STAT "Parameters\n";
+ printf STAT "\t%s\t%s\t%s\n", "Input files ", $seqFile, $qualFile;
+ printf STAT "\t%s\t%s\n", "Primer/Adaptor library", defined($priAdaLib)?(($priAdaLib ne "u")?$priAdaLibNames[$priAdaLib]:"User defined ($priAdaFile)"):"NA";
+ printf STAT "\t%s\t%s\n", "Homopolymer trimming", "Off" if($homoPolyLen == 0);
+ printf STAT "\t%s\t%s\n", "Homopolymer trimming", "On" if($homoPolyLen != 0);
+ printf STAT "\t%s\t%s\n", "Length of the homopolymer to be removed", $homoPolyLen if($homoPolyLen != 0);
+ printf STAT "\t%s\t%s\n", "Length filter", ($isLenFilterOn)?"On":"Off";
+ printf STAT "\t%s\t%s\n", "Cut-off for minimum read length", $lowestValidLen if($isLenFilterOn);
+ printf STAT "\t%s\t%s\n", "Cut-off read length for HQ", $cutOffReadLen4HQ."%";
+ printf STAT "\t%s\t%s\n", "Cut-off quality score", $cutOffPhScore;
+ printf STAT "\t%s\t%s\n", "Only statistics", defined($isOnlyStat)?"On":"Off";
+ printf STAT "\t%s\t%s\n", "Number of processes", $noOfProcesses;
+
+ print STAT "\n\n";
+
+ print STAT "QC statistics\n";
+ printf STAT "\t%s\t%s\n", "File name", $seqFileName;
+ printf STAT "\t%s\t%d\n", "Total number of reads", $seqCount;
+ printf STAT "\t%s\t%d\n", "Total number of trimmed reads containing homopolymer", $trimCount;
+ printf STAT "\t%s\t%d\n", "Total number of trashed reads (<$lowestValidLen bp in length after trimming)", $lt100;
+ printf STAT "\t%s\t%d\n", "Total number of low quality reads (excluding <$lowestValidLen reads)", $lQCount;
+ printf STAT "\t%s\t%d\n", "Total number of HQ reads", $hQCount;
+ $tmpPer = sprintf "%0.2f", $hQCount/$seqCount*100;
+ printf STAT "\t%s\t%s\n", "Percentage of HQ reads", $tmpPer."%";
+ printf STAT "\t%s\t%.f\n", "Total number of bases", $totalBases;
+ printf STAT "\t%s\t%.f\n", "Total number of bases in HQ reads", $totalBasesAfterHQ;
+ printf STAT "\t%s\t%.f\n", "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ;
+ $tmpPer = sprintf "%0.2f", $totalHQBasesAfterHQ/$totalBasesAfterHQ*100;
+ printf STAT "\t%s\t%s\n", "Percentage of HQ bases in HQ reads", $tmpPer."%";
+ if(defined($priAdaLib)) {
+ printf STAT "\t%s\t%d\n", "Number of Primer/Adaptor trimmed reads", $totalValidReadsWithPriAda;
+ }
+ else {
+ printf STAT "\t%s\t%s\n", "Number of Primer/Adaptor trimmed reads", "NA", "NA";
+ }
+ printf STAT "\t%s\t%d\n", "Total number of HQ filtered reads", $totalReadsFinal;
+ $tmpPer = sprintf "%0.2f", $totalReadsFinal/$seqCount*100;
+ printf STAT "\t%s\t%s\n", "Percentage of HQ filtered reads", $tmpPer."%";
+
+ print STAT "\n\n";
+
+ print STAT "Detailed QC statistics\n";
+ my @arr = (
+ ["File name", $seqFileName, (fileparse($outSeqFile))[0]],
+ ["Total number of reads", $seqCount, $totalReadsFinal],
+ ["Minimum read length", $minRawLen, $minHQLen],
+ ["Maximum read length", $maxRawLen, $maxHQLen],
+ ["Average read length", (sprintf "%0.2f", $totalBases/$seqCount), (sprintf "%0.2f", $totalBasesFinal/$totalReadsFinal)],
+ ["Median read length", calcMedian(@rawLen), calcMedian(@hQLen)],
+ ["N25 length", calcN50(\@rawLen, 25), calcN50(\@hQLen, 25)],
+ ["N50 length", calcN50(\@rawLen, 50), calcN50(\@hQLen, 50)],
+ ["N75 length", calcN50(\@rawLen, 75), calcN50(\@hQLen, 75)],
+ ["N90 length", calcN50(\@rawLen, 90), calcN50(\@hQLen, 90)],
+ ["N95 length", calcN50(\@rawLen, 95), calcN50(\@hQLen, 95)],
+ ["Total number of bases", $totalBases, $totalBasesFinal],
+ ["Total number of HQ bases", $totalHQBases, $totalHQBasesFinal],
+ ["Percentage of HQ bases", (sprintf "%0.2f", $totalHQBases/$totalBases*100)."%", (sprintf "%0.2f", $totalHQBasesFinal/$totalBasesFinal*100)."%"],
+ ["Average quality score (Overall)", (sprintf "%0.2f", $avgQual/$seqCount), (sprintf "%0.2f", $avgQualFinal/$totalReadsFinal)],
+ );
+ for(my $i=0; $i<@arr; $i++) {
+ if(!defined($isOnlyStat)) {
+ printf STAT "\t%s\t%s\t%s\n", $arr[$i][0], $arr[$i][1], $arr[$i][2];
+ }
+ else {
+ printf STAT "\t%s\t%s\n", $arr[$i][0], $arr[$i][1];
+ }
+ }
+
+ print STAT "\n\n";
+ }
+
+ my $lenDistF1 = getFileName($seqFile)."_lenDistribution.png";
+ my $qualDistF1 = getFileName($seqFile)."_qualDistribution.png";
+ my $sumPieF = getFileName($seqFile). "_summary.png";
+ my $gcDistF1 = getFileName($seqFile)."_gcDistribution.png";
+ my $baseCntF1 = getFileName($seqFile)."_baseCompostion.png";
+
+ my $c = 0;
+ my @lenLabels = ();
+ foreach my $arrRef (@lenDistrib) {
+ my $str = "";
+ foreach my $val (@{$arrRef}) {
+ if($c == 0) {
+ $str = "0-$lenInterval";
+ }
+ else {
+ $str = $lenInterval*$c . "-" . $lenInterval*($c+1);
+ }
+ $c++;
+ push(@lenLabels, $str);
+ }
+ last;
+ }
+
+ unshift(@lenDistrib, \@lenLabels);
+
+ if($isGDMod) {
+ drawLenDist(\@lenDistrib, $outFolder.$lenDistF1, getFileName($seqFile), 550, 350);
+ }
+
+ $c = 0;
+ my @qualLabels = ();
+ foreach my $arrRef (@qualDistrib) {
+ my $str = "";
+ foreach my $val (@{$arrRef}) {
+ if($c == 0) {
+ $str = "0";
+ $str .= "-$qualInterval" if($qualInterval>1);
+ }
+ else {
+ $str = $qualInterval*$c;
+ $str .= "-" . $qualInterval*($c) if($qualInterval>1);
+ }
+ push(@qualLabels, $str);
+ $c++;
+ }
+ last;
+ }
+
+ unshift(@qualDistrib, \@qualLabels);
+
+ if($isGDMod) {
+ drawQualDist(\@qualDistrib, $outFolder.$qualDistF1, getFileName($seqFile), 650, 350);
+ }
+
+ my $trashedReads = $lt100;
+ my $trimmedHP = $trimCount;
+ my $trimmedPA = $totalValidReadsWithPriAda;
+ my $hQreadsExcptHP_PATrimmed = $totalReadsFinal - $trimmedHP - $trimmedPA;
+ my $lQreadsGT100 = $seqCount - $totalReadsFinal - $trashedReads;
+ my @summaryData = (["", "", "", "", ""], [$trashedReads, $trimmedHP, $trimmedPA, $hQreadsExcptHP_PATrimmed, $lQreadsGT100]);
+
+ if($isGDMod) {
+ drawSummaryPie(\@summaryData, $outFolder.$sumPieF, 520, 350);
+ }
+
+ $c=0;
+ my @gcLabel;
+ foreach my $ref (@gcDistrib) {
+ foreach my $val (@{$ref}) {
+ my $str = "";
+ if($c == 0) {
+ $str = "0-$gcInterval";
+ }
+ else {
+ $str = $gcInterval*$c . "-" . $gcInterval*($c+1);
+ }
+ $c++;
+ push(@gcLabel, $str);
+ }
+ last;
+ }
+
+ unshift(@gcDistrib, \@gcLabel);
+ if($isGDMod) {
+ drawGCDist(\@gcDistrib, $outFolder.$gcDistF1, getFileName($seqFile), 550, 350);
+ }
+
+
+ my @file1 = (["A", "T", "G", "C", "Non-ATGC"], $charCount[0]);
+ @file1 = (["A", "T", "G", "C", "Non-ATGC"], $charCount[0], $charCount[1]) if(!$isOnlyStat);
+ if($isGDMod) {
+ drawBaseComp(\@file1, $outFolder.$baseCntF1, getFileName($seqFile), 500, 300);
+ }
+
+
+ close(I);
+ close(Q);
+ close(OI);
+ close(OQ);
+ close(STAT);
+
+ my $iFol = getFilePath(abs_path($seqFile));
+ my $oFol = abs_path($outFolder) . "/";
+ my $inpFs = getFileName($seqFile);
+ $inpFs .= ":::::" . getFileName($qualFile);
+ my $htF = $oFol . "output_" . getFileName($seqFile);
+ $htF .= ".html";
+ my @fileNames4HTML;
+ @fileNames4HTML = ($outSeqFile, $outQualFile, $lenDistF1, $baseCntF1, $gcDistF1, $qualDistF1, $sumPieF);
+ htmlPrint(getFilePath(abs_path($0)), getFileName($0), $htF, $iFol, $isOnlyStat, $inpFs, $statFile, $oFol, \@fileNames4HTML);
+
+$pm->finish;
+}
+# Block until every forked worker has finished before printing the closing banner.
+$pm->wait_all_children;
+
+print "================================================================\n";
+print "Processing has been finished\n";
+if($outFolder eq "") {
+    # No output folder was given, so results sit next to the input files.
+    print "Output files are generated in the folder of input files\n";
+}
+else {
+    print "Output files are generated in $outFolder\n";
+}
+print "================================================================\n";
+
+exit;
+
+
+# Open $file for reading or writing and store the handle in the scalar that
+# $ref points to.
+#   $file - path of the file; names ending in ".gz" (any case) are opened through IO::Zlib
+#   $rOrw - "r" to read an existing file, "w" to create or truncate it
+#   $ref  - reference to the scalar that receives the handle
+# Dies when the file cannot be opened. Any other $rOrw value opens nothing
+# (for ".gz" names the scalar then holds an unopened IO::Zlib object).
+sub openFileGetHandle {
+    my ($file, $rOrw, $ref) = @_;
+    if($file =~ /\.gz$/i) {
+        $$ref = IO::Zlib->new;
+        if($rOrw eq "r") {
+            $$ref->open($file, "rb") or die "Can not open file $file";
+        }
+        elsif($rOrw eq "w") {
+            $$ref->open($file, "wb") or die "Can not create file $file";
+        }
+    }
+    else {
+        # Three-argument open keeps the mode separate from the name, so a name
+        # with leading/trailing blanks or characters such as '>' or '|' is
+        # opened literally instead of being reinterpreted.
+        if($rOrw eq "r") {
+            open($$ref, "<", $file) or die "Can not open file $file: $!";
+        }
+        elsif($rOrw eq "w") {
+            open($$ref, ">", $file) or die "Can not create file $file: $!";
+        }
+    }
+}
+
+
+# Process the read currently held in the file-level $fastaSeq / $qualSeq pair:
+# update the raw-read statistics, run the filtering pipeline (length filter,
+# optional homopolymer trimming, quality filter, optional primer/adaptor
+# trimming), and print a progress line every 10000 reads.
+# Takes no arguments; all input and output goes through file-level variables.
+sub processSeq {
+    $fastaSeq =~ s/\s//g;
+    my $len = length $fastaSeq;
+    $maxRawLen = max($maxRawLen, $len);
+    $minRawLen = min($minRawLen, $len);
+    push(@rawLen, $len);
+    $qualSeq =~ s/\s+$//;		# To remove the last space added in 'else' part;
+    my @tmpArr = getQualBases($qualSeq);
+    $totalBases += $tmpArr[0];
+    $totalHQBases += $tmpArr[1];
+    $avgQual += $tmpArr[2];
+    _tallyReadStats(0, $len, $tmpArr[2]);
+
+    _filterCurrentRead();
+
+    if($seqCount % (10000) == 0) {
+        # Guard the ratio so an unknown (zero) total cannot abort the run.
+        my $tmpP = sprintf "%0.0f", ($ttlSeqCount ? $seqCount/$ttlSeqCount*100 : 0);
+        print "$indOfAnalysis: Number of reads processed: " . $seqCount . "/$ttlSeqCount ($tmpP\%)...\n";
+    }
+}
+
+# True when the length filter is on and the current read is shorter than the
+# minimum valid length.
+sub _isBelowMinLen {
+    return ($isLenFilterOn && length($fastaSeq) < $lowestValidLen);
+}
+
+# Count bases in $seq (case-insensitive).
+# Returns (A, T, G, C, non-ATGC, GC percentage); the percentage is 0 for an
+# empty sequence instead of dividing by zero.
+sub _baseComposition {
+    my ($seq) = @_;
+    my $len = length $seq;
+    # tr/// counts without modifying the string and yields 0 (not "") when absent.
+    my $As = ($seq =~ tr/Aa//);
+    my $Ts = ($seq =~ tr/Tt//);
+    my $Gs = ($seq =~ tr/Gg//);
+    my $Cs = ($seq =~ tr/Cc//);
+    my $others = $len - $As - $Ts - $Gs - $Cs;
+    my $gcPercent = ($len)?(($Gs + $Cs)/$len*100):0;
+    return ($As, $Ts, $Gs, $Cs, $others, $gcPercent);
+}
+
+# Add the current read to the length, quality, GC and base-count tables.
+#   $set         - 0 for the raw data set, 1 for the filtered data set
+#   $len         - read length
+#   $avgReadQual - average quality of the read
+sub _tallyReadStats {
+    my ($set, $len, $avgReadQual) = @_;
+    $lenDistrib[$set][getIndex($len,$lenInterval)]++;
+    $qualDistrib[$set][getIndex($avgReadQual,$qualInterval)]++;
+    my @comp = _baseComposition($fastaSeq);
+    $gcDistrib[$set][getIndex($comp[5],$gcInterval)]++;
+    foreach my $i (0..4) {
+        $charCount[$set][$i] += $comp[$i];
+    }
+}
+
+# Record the current read as having passed every filter and, unless only
+# statistics were requested, write it to the filtered sequence/quality outputs.
+sub _acceptFilteredRead {
+    my $len = length $fastaSeq;
+    $maxHQLen = max($maxHQLen, $len);
+    $minHQLen = min($minHQLen, $len);
+    $avgHQLen += $len;
+    push(@hQLen, $len);
+    $totalReadsFinal++;
+    my @qualStats = getQualBases($qualSeq);
+    $totalBasesFinal += $qualStats[0];
+    $totalHQBasesFinal += $qualStats[1];
+    $avgQualFinal += $qualStats[2];
+    return if(defined($isOnlyStat));
+    _tallyReadStats(1, $len, $qualStats[2]);
+    print OI "$prevFastaSeqId\n";
+    print OI formatSeq($fastaSeq), "\n";
+    print OQ "$prevFastaSeqId\n";
+    print OQ formatQualSeq($qualSeq), "\n";
+}
+
+# Run the filtering pipeline on the current read, updating the matching
+# counter at whichever stage rejects or accepts it.
+sub _filterCurrentRead {
+    if(_isBelowMinLen()) {
+        $lt100++;
+        return;
+    }
+    # hasPolyChar() trims the sequence in place, so the length is re-checked.
+    if($homoPolyLen != 0 && hasPolyChar(\$fastaSeq)) {
+        $trimCount++;
+        if(_isBelowMinLen()) {
+            $lt100++;
+            return;
+        }
+        $qualSeq = trimQualSeq($qualSeq, length $fastaSeq, -1);
+    }
+    if(!isReadOfHQ($qualSeq)) {
+        $lQCount++;
+        return;
+    }
+    $hQCount++;
+    $totalBasesAfterHQ += length $fastaSeq;
+    if(defined $priAdaLib) {
+        # isWOPriAda() trims the sequence in place and returns -1 when nothing matched.
+        my $t = isWOPriAda(\$fastaSeq);
+        if($t > -1) {
+            $qualSeq = trimQualSeq($qualSeq, length $fastaSeq, $t);
+        }
+        if(_isBelowMinLen()) {
+            $lt100++;
+            return;
+        }
+    }
+    _acceptFilteredRead();
+}
+
+# Map a value onto the zero-based index of the histogram bin that holds it.
+#   $_[0] - value to place
+#   $_[1] - bin width
+# With a bin width of 1 the value itself, rounded by "%0.0f", is the index.
+# Otherwise the value/width ratio is rounded to two decimals, a value lying
+# exactly on a bin's upper edge stays in that bin, and the result is never
+# negative.
+sub getIndex {
+    my ($value, $binWidth) = @_;
+    my $ratio = $value/$binWidth;
+    if($binWidth == 1) {
+        return sprintf("%0.0f", $value);
+    }
+    my $bin = int(sprintf("%0.2f", $ratio) + 0.99) - 1;
+    return ($bin < 0) ? 0 : $bin;
+}
+
+# Compute an Nxx length statistic.
+#   $_[0] - reference to an array of lengths
+#   $_[1] - percentage threshold (50 for N50, 95 for N95, ...)
+# Walks the lengths from longest to shortest and returns the first length at
+# which the running total reaches the given percentage of the overall total;
+# returns 0 for an empty list.
+sub calcN50 {
+    my ($lengthsRef, $percent) = @_;
+    my @descending = sort { $b <=> $a } @{$lengthsRef};
+    my $grandTotal = sum(@descending);
+    my $running = 0;
+    foreach my $length (@descending) {
+        $running += $length;
+        return $length if($running >= $grandTotal*$percent/100);
+    }
+    return 0;
+}
+
+# Return the median of the numbers passed as arguments: the middle value for
+# an odd count, the mean of the two middle values for an even count.
+sub calcMedian {
+    my @ordered = sort { $a <=> $b } @_;
+    my $count = scalar @ordered;
+    my $middle = int($count/2);
+    return $ordered[$middle] if($count % 2);
+    return ($ordered[$middle-1] + $ordered[$middle])/2;
+}
+
+# Wrap a sequence string into lines of 60 characters.
+#   $_[0] - the sequence
+# Returns the wrapped text without a trailing newline.
+sub formatSeq {
+    my ($residues) = @_;
+    my $lineWidth = 60;
+    my $total = length $residues;
+    my $wrapped = "";
+    my $offset = 0;
+    while($offset < $total) {
+        $wrapped .= substr($residues, $offset, $lineWidth) . "\n";
+        $offset += $lineWidth;
+    }
+    chomp($wrapped);		# Drop the newline added after the final chunk.
+    return $wrapped;
+}
+
+# Wrap a whitespace-separated list of quality values into lines of 60 values.
+#   $_[0] - the quality string
+# Values on a line are separated by one space; trailing whitespace is removed
+# from the result.
+sub formatQualSeq {
+    my ($scoreLine) = @_;
+    my $perLine = 60;
+    my @scores = split(/\s+/, $scoreLine);
+    my $wrapped = "";
+    foreach my $idx (0..$#scores) {
+        my $separator = (($idx + 1) % $perLine == 0) ? "\n" : " ";
+        $wrapped .= $scores[$idx] . $separator;
+    }
+    $wrapped =~ s/\s+$//;
+    return $wrapped;
+}
+
+# Trim the referenced sequence at homopolymer runs.
+#   $_[0] - reference to the sequence string (modified in place)
+# For A, T, G and C in turn (case-insensitive), the first run of at least
+# $homoPolyLen identical bases is removed together with everything after it.
+# Returns 1 when anything was removed, 0 otherwise.
+sub hasPolyChar {
+    my ($seqRef) = @_;
+    my @homopolymerRuns = (
+        qr/(A{$homoPolyLen,}).*/i,
+        qr/(T{$homoPolyLen,}).*/i,
+        qr/(G{$homoPolyLen,}).*/i,
+        qr/(C{$homoPolyLen,}).*/i,
+    );
+    my $trimmed = 0;
+    foreach my $run (@homopolymerRuns) {
+        $trimmed = 1 if($$seqRef =~ s/$run//);
+    }
+    return $trimmed;
+}
+
+# Cut a quality string down to the values that belong to a trimmed sequence.
+#   $_[0] - whitespace-separated quality values of the untrimmed read
+#   $_[1] - length of the sequence after trimming
+#   $_[2] - position where a primer/adaptor was found, or -1 when the read was
+#           trimmed for another reason
+# A primer/adaptor position below 50 means the start of the read was removed,
+# so the last $_[1] values are kept; in every other case the first $_[1]
+# values are kept. Returns the kept values joined by single spaces ("" when
+# nothing is left).
+# The values are selected by position rather than by regular expression, so a
+# kept length of 0, or one equal to the number of values, is handled and no
+# stale capture from an earlier match can leak into the result.
+sub trimQualSeq {
+    my ($qualSeq, $seqLen, $priAdaStart) = @_;
+    my @scores = split(' ', $qualSeq);
+    $seqLen = scalar @scores if($seqLen > @scores);
+    return "" if($seqLen <= 0);
+    my @kept;
+    if($priAdaStart != -1 && $priAdaStart < 50) {
+        @kept = @scores[-$seqLen .. -1];
+    }
+    else {
+        @kept = @scores[0 .. $seqLen-1];
+    }
+    return join(" ", @kept);
+}
+
+# Decide whether a read is of high quality.
+#   $_[0] - whitespace-separated quality values of the read
+# The read passes when the number of values at or above $cutOffPhScore reaches
+# $cutOffReadLen4HQ percent of the number of values (rounded by "%0.0f").
+# On success the passing count is added to $totalHQBasesAfterHQ.
+# Returns 1 for a high-quality read, 0 otherwise.
+sub isReadOfHQ {
+    my ($scoreLine) = @_;
+    my @scores = split(/\s+/, $scoreLine);
+    my $needed = sprintf("%0.0f", scalar(@scores) * $cutOffReadLen4HQ / 100);
+    my $passing = grep { $_ >= $cutOffPhScore } @scores;
+    return 0 if($passing < $needed);
+    $totalHQBasesAfterHQ += $passing;
+    return 1;
+}
+
+# Summarise the quality values of a read.
+#   $_[0] - whitespace-separated quality values
+# Returns an array: 1) number of values, 2) number of values at or above
+# $cutOffPhScore, 3) average value formatted with two decimals (0 when there
+# are no values).
+sub getQualBases {
+    my ($scoreLine) = @_;
+    my @scores = split(/\s+/, $scoreLine);
+    my $count = scalar @scores;
+    my $total = 0;
+    $total += $_ foreach (@scores);
+    my $highQuality = grep { $_ >= $cutOffPhScore } @scores;
+    my $average = 0;
+    if($count) {
+        $average = sprintf "%0.2f", $total/$count;
+    }
+    my @summary = ($count, $highQuality, $average);
+    return @summary;
+}
+
+# Print a boxed error message on STDERR, show the usage text and terminate.
+#   $_[0] - the error message
+# Exits with status 1 so that calling shells and pipelines can detect the failure.
+sub prtError {
+    my ($msg) = @_;
+    my $border = "+======================================================================+\n";
+    print STDERR $border;
+    printf STDERR "|%-70s|\n", "  Error:";
+    printf STDERR "|%-70s|\n", "       $msg";
+    print STDERR $border;
+    prtUsage();
+    exit 1;
+}
+
+# Print the command-line help: input specification, the numbered built-in
+# primer/adaptor libraries taken from @priAdaLibNames, and the QC, processing
+# and output options.
+sub prtHelp {
+    my @inputHelp = (
+        "\n$0 options:\n\n",
+        "### Input reads (FASTA format; .fna and .qual files) (Required)\n",
+        " -i <Read file> <Quality file> <Primer/Adaptor library>\n",
+        " Read and quality file in FASTA format with primer/adaptor library\n",
+        " User may choose from the provided primer/adaptor library or can give a file containing primer/adaptor sequences, one per line\n",
+        " Multiple libraries can be given using multiple '-i' options\n",
+        " For eg.: -i read1.fna read1.qual 3 -i read2.fna read2.qual 2\n\n",
+        " Primer/Adaptor libraries:\n",
+    );
+    print @inputHelp;
+
+    # Built-in libraries are listed with 1-based numbers.
+    foreach my $idx (0..$#priAdaLibNames) {
+        my $number = $idx + 1;
+        print " $number = $priAdaLibNames[$idx]\n";
+    }
+
+    my @optionHelp = (
+        " N = Do not filter for Primer/Adaptor\n",
+        " <File> = File for user defined primer/adaptor sequences, one per line\n",
+        "\n",
+        "### Other options [Optional]\n",
+        " -h | -help\n",
+        " Prints this help\n",
+        "--------------------------------- QC Options ---------------------------------\n",
+        " -l | -cutOffReadLen4HQ <Real number, 0 to 100>\n",
+        " The cut-off value for percentage of read length that should be of given quality\n",
+        " default: 70\n",
+        " -s | -cutOffQualScore <Integer, 0 to 40>\n",
+        " The cut-off value for PHRED quality score for high-quality filtering\n",
+        " default: 20\n",
+        " -n | -homoPolyLen <Integer>\n",
+        " Minimum length of the homopolymer to be trimmed (0: to skip the homopolymer trimming)\n",
+        " For eg.: -n 8, will trim the right end of read from the homopolymer of at least 8 bases long\n",
+        " default: 0 (homopolymer trimming is off)\n",
+        " -m | -minLen <Integer>\n",
+        " Filter sequences shorter than the given minimum length\n",
+        " default: 100\n",
+        " -f | -lenFilter <Y/N>\n",
+        " Are sequences to be filtered on the basis of length: (Y)es or (N)o\n",
+        " default: Y\n",
+        "----------------------------- Processing Options -----------------------------\n",
+        " -p | -processes <Integer>\n",
+        " Number of processes to be used\n",
+        " default: 1\n",
+        " -onlyStat\n",
+        " Outputs only statistics without filtered data output\n",
+        "------------------------------- Output Options -------------------------------\n",
+        " -t | -statOutFmt <Integer>\n",
+        " Output format for statistics\n",
+        " Formats:\n",
+        " 1 = formatted text\n",
+        " 2 = tab delimited\n",
+        " default: 1\n",
+        " -o | -outputFolder <Output folder name/path>\n",
+        " Output will be stored in the given folder\n",
+        " default: By default, output folder (454QC_Filtered_files) will be generated where the input files are\n",
+        " -z | -outputDataCompression <Character>\n",
+        " Output format for HQ filtered data\n",
+        " Formats:\n",
+        " t = text FASTA files\n",
+        " g = gzip compressed files\n",
+        " default: t\n",
+        "\n",
+    );
+    print @optionHelp;
+}
+
+# Print the one-line usage banner followed by the full option help.
+sub prtUsage {
+    my $usageLine = "\nUsage: perl $0 <options>\n";
+    print $usageLine;
+    prtHelp();
+}
+
+# Return the file-name part of a path (everything after the last "/").
+#   $_[0] - path of a file
+# Returns "" when the path is undefined, empty or ends in "/"; the capture is
+# only read after a successful match, so a stale $1 from an earlier match can
+# never be returned.
+sub getFileName {
+    my ($path) = @_;
+    return "" if(!defined($path));
+    return ($path =~ /([^\/]+)$/) ? $1 : "";
+}
+
+# Return the directory part of a path, always ending in "/".
+#   $_[0] - path of a file
+# Returns "./" when the path contains no "/". A file directly under the root
+# ("/file") yields "/"; the directory part is allowed to be empty so that this
+# case matches instead of leaving a stale capture behind.
+sub getFilePath {
+    my ($name) = @_;
+    if($name =~ /(.*)\//) {
+        return $1 . "/";
+    }
+    return "./";
+}
+
+
+
+
+# Draw the base-composition bar chart and write it as a PNG file.
+#   $_[0] - reference to the chart data: [labels], [counts of the input reads]
+#           and, unless only statistics were requested, [counts of the
+#           filtered reads]
+#   $_[1] - path of the PNG file to create
+#   $_[2] - input file name used in the title and legend
+#   $_[3] - image width in pixels
+#   $_[4] - image height in pixels
+# Below the chart a hand-drawn legend shows the percentage of each base, one
+# row per data series. Percentages are 0 when a series has no bases at all.
+# A failure to plot or to create the file is reported and the sub returns.
+sub drawBaseComp {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        y_label => 'Count',
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        l_margin => 60,
+        r_margin => 60,
+        b_margin => 50,
+        t_margin => 50,
+        show_values => 1,
+        bar_spacing => 1,
+        values_vertical => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+    $mygraph->set_values_font($f, 6);
+
+    my $myImage = $mygraph->plot($dataRef);
+    if(!defined($myImage)) {
+        warn $mygraph->error;
+        return;
+    }
+
+    my $black = $myImage->colorAllocate(0,0,0);			# To set the color for the next time printing on the image.
+    my $lred = $myImage->colorAllocate(255,0,0);
+    my $dgreen = $myImage->colorAllocate(0,127,0);
+    my $dblue = $myImage->colorAllocate(0,0,127);
+
+    my $title = GD::Text::Wrap->new( $myImage,
+        line_space => 4,
+        text => "Base composition for $fileName",
+        color => $dblue,
+    );
+    $title->set(align => 'center', width => $width);
+    $title->set_font($f, 11);
+    $title->draw(0,0);
+
+    # One legend row per data series: [index into the data, y position, swatch colour].
+    my @legendRows = ([1, $height-35, $lred]);
+    push(@legendRows, [2, $height-20, $dgreen]) if(!$isOnlyStat);
+
+    my @baseNames = ("A", "T", "G", "C", "Non-ATGC");
+    my @labelOffsets = (-220, -130, -40, 50, 140);		# x offsets of the labels from the image centre
+
+    foreach my $row (@legendRows) {
+        my ($series, $y, $swatchColor) = @{$row};
+        my @counts = @{ $$dataRef[$series] || [] };
+        my $total = sum(@counts) || 0;
+        foreach my $i (0..$#baseNames) {
+            my $percent = $total ? ($counts[$i] || 0)/$total*100 : 0;
+            my $x = $width/2 + $labelOffsets[$i];
+
+            my $label = GD::Text::Wrap->new( $myImage,
+                line_space => 4,
+                text => $baseNames[$i] . " (" . (sprintf "%0.2f", $percent) . "\%)",
+                color => $black,
+            );
+            $label->set(align => 'left', width => 300);
+            $label->set_font($f, 8);
+            $label->draw($x,$y);
+
+            # Colour swatch: an 8x8 box starting 10 pixels left of the label.
+            $myImage->filledRectangle($x-10,$y,$x-2,$y+8,$swatchColor);
+            $myImage->rectangle($x-10,$y,$x-2,$y+8,$black);
+        }
+    }
+
+    if(open(my $imgFH, ">", $fileNameWPath)) {
+        binmode $imgFH;
+        print $imgFH $myImage->png;
+        close($imgFH);
+    }
+    else {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+    }
+}
+
+
+# Draw the GC-content distribution as a lines-and-points chart and write it
+# as a PNG file.
+#   $_[0] - reference to the chart data ([labels] followed by one array of
+#           read counts per data series)
+#   $_[1] - path of the PNG file to create
+#   $_[2] - input file name used in the title and legend
+#   $_[3] - image width in pixels
+#   $_[4] - image height in pixels
+# A failure to plot or to create the file is reported and the sub returns.
+sub drawGCDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::linespoints->new($width, $height);
+
+    $mygraph->set(
+        x_label => '% GC content',
+        y_label => 'Number of reads',
+        title => "GC content distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        markers => [1],
+        marker_size => 3,
+        dclrs => [ qw(lred dgreen) ],
+        x_labels_vertical => 1,
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef);
+    if(!defined($myImage)) {
+        warn $mygraph->error;
+        return;
+    }
+
+    if(open(my $imgFH, ">", $fileNameWPath)) {
+        binmode $imgFH;
+        print $imgFH $myImage->png;
+        close($imgFH);
+    }
+    else {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+    }
+}
+
+# Draw the pie chart summarising quality check and filtering, and write it as
+# a PNG file.
+#   $_[0] - reference to the chart data; element 1 holds the five read counts
+#           in the order: shorter than the minimum length, homopolymer
+#           trimmed, primer/adaptor trimmed, other high-quality reads,
+#           low-quality reads
+#   $_[1] - path of the PNG file to create
+#   $_[2] - image width in pixels
+#   $_[3] - image height in pixels
+# A legend with the percentage of each category is drawn by hand below the
+# pie; percentages are 0 when all counts are 0. A failure to plot or to create
+# the file is reported and the sub returns.
+sub drawSummaryPie {
+    my ($dataRef, $fileNameWPath, $width, $height) = @_;
+    my $mygraph = GD::Graph::pie->new($width, $height);
+
+    $mygraph->set(
+        title => "Summary of quality check and filtering",
+        axislabelclr => 'black',
+        pie_height => 40,
+
+        l_margin => 15,
+        r_margin => 15,
+        b_margin => 50,
+        start_angle => -10,
+        dclrs => [ qw(lred cyan lyellow lgreen purple) ],
+        transparent => 0,
+    ) or warn $mygraph->error;
+
+    $mygraph->set_label_font($f, 8);
+    $mygraph->set_value_font(['verdana', 'arial'],14);
+    $mygraph->set_title_font($f, 11);
+
+    my $myImage = $mygraph->plot($dataRef);
+    if(!defined($myImage)) {
+        warn $mygraph->error;
+        return;
+    }
+
+    my $black = $myImage->colorAllocate(0,0,0);			# To set the color for the next time printing on the image.
+    my $lred = $myImage->colorAllocate(255,0,0);
+    my $lyellow = $myImage->colorAllocate(255,255,0);
+    my $lgreen = $myImage->colorAllocate(0,255,0);
+    my $cyan = $myImage->colorAllocate(0,255,255);
+    my $purple = $myImage->colorAllocate(191,0,191);
+
+    my @counts = @{$$dataRef[1]};
+    my $sum = sum(@counts);
+
+    my $leftX = 20;
+    my $rightX = $width/2+40;
+    # Legend entries: [label, index into the counts, x, y, swatch colour, text box width].
+    my @legend = (
+        ["Trashed reads (shorter than $lowestValidLen bp)", 0, $leftX, $height-45, $lred, 300],
+        ["Homopolymer trimmed reads", 1, $rightX, $height-45, $cyan, 300],
+        ["Trashed reads (low quality reads)", 4, $leftX, $height-30, $purple, 300],
+        ["Primer/Adaptor trimmed reads", 2, $rightX, $height-30, $lyellow, 300],
+        ["High quality reads other than homopolymer and primer/adaptor trimmed", 3, $leftX, $height-15, $lgreen, 500],
+    );
+
+    foreach my $entry (@legend) {
+        my ($label, $slice, $x, $y, $swatchColor, $boxWidth) = @{$entry};
+        my $percent = $sum ? ($counts[$slice] || 0)/$sum*100 : 0;
+
+        my $wrapbox = GD::Text::Wrap->new( $myImage,
+            line_space => 4,
+            text => sprintf("%s (%0.2f%%)", $label, $percent),
+            color => $black,
+        );
+        $wrapbox->set(align => 'left', width => $boxWidth);
+        $wrapbox->set_font($f, 8);
+        $wrapbox->draw($x,$y);
+
+        # Colour swatch: an 8x8 box starting 10 pixels left of the label.
+        $myImage->filledRectangle($x-10,$y,$x-2,$y+8,$swatchColor);
+        $myImage->rectangle($x-10,$y,$x-2,$y+8,$black);
+    }
+
+    if(open(my $imgFH, ">", $fileNameWPath)) {
+        binmode $imgFH;
+        print $imgFH $myImage->png;
+        close($imgFH);
+    }
+    else {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+    }
+}
+
+# Draw the average-quality distribution as a bar chart and write it as a PNG
+# file.
+#   $_[0] - reference to the chart data ([labels] followed by one array of
+#           read counts per data series)
+#   $_[1] - path of the PNG file to create
+#   $_[2] - input file name used in the title and legend
+#   $_[3] - image width in pixels
+#   $_[4] - image height in pixels
+# A failure to plot or to create the file is reported and the sub returns.
+sub drawQualDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        x_label => 'Average phred quality score',
+        y_label => 'Number of reads',
+        title => "Quality distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        bar_spacing => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef);
+    if(!defined($myImage)) {
+        warn $mygraph->error;
+        return;
+    }
+
+    if(open(my $imgFH, ">", $fileNameWPath)) {
+        binmode $imgFH;
+        print $imgFH $myImage->png;
+        close($imgFH);
+    }
+    else {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+    }
+}
+
+# Draw the read-length distribution as a bar chart and write it as a PNG file.
+#   $_[0] - reference to the chart data ([labels] followed by one array of
+#           read counts per data series)
+#   $_[1] - path of the PNG file to create
+#   $_[2] - input file name used in the title and legend
+#   $_[3] - image width in pixels
+#   $_[4] - image height in pixels
+# A failure to plot or to create the file is reported and the sub returns.
+sub drawLenDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        x_label => 'Read length (bp)',
+        y_label => 'Number of reads',
+        title => "Length distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_labels_vertical => 1,
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        bar_spacing => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 9);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef);
+    if(!defined($myImage)) {
+        warn $mygraph->error;
+        return;
+    }
+
+    if(open(my $imgFH, ">", $fileNameWPath)) {
+        binmode $imgFH;
+        print $imgFH $myImage->png;
+        close($imgFH);
+    }
+    else {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+    }
+}
+
+
+# Look for a primer/adaptor in the referenced read and trim the read in place.
+#   $_[0] - reference to the sequence string
+# The candidates are the user-defined sequences when $priAdaLib is "u",
+# otherwise the built-in library selected by $priAdaLib. The first candidate
+# that findSeq() locates decides the trimming: a position below 50 removes the
+# read up to $substrlen bases past that position, any other position removes
+# everything from that position onwards.
+# Returns the position reported by findSeq() after counting the read in
+# $totalValidReadsWithPriAda, or -1 after counting it in
+# $totalValidReadsNoPriAda when no candidate was found.
+sub isWOPriAda {
+    my ($seq) = @_;
+    chomp($$seq);
+
+    # Built-in libraries, in the order: rapid, paired end, amplicon, small RNA.
+    my @priAdas = (
+        [
+            "CCATCTCATCCCTGCGTGTC",
+            "CCATCTCATCCCTGCGTGTCTCCGACTCAG",
+            "CTGAGTCGGAGA",
+            "CCTATCCCCTGTGTGCCTTG",
+            "CCTATCCCCTGTGTGCCTTGGCAGTCTCAG",
+            "CTGAGACTGCCA",
+        ],
+        [
+            "GCCTCCCTCGCGCCATCAG",
+            "CTGATGGCGCGAGGG",
+            "GCCTTGCCAGCCCGCTCAG",
+            "CTGAGCGGGCTGGCA",
+            "GCCTCCCTCGCGCCA",
+            "GCCTTGCCAGCCCGC",
+            "CCATCTCATCCCTGCGTGTC",
+            "CCTATCCCCTGTGTGCCTTG",
+        ],
+        [
+            "CGTATCGCCTCCCTCGCGCCATCAG",
+            "CGTATCGCCTCCCTCGCGCCATCAG",
+            "CCATCTCATCCCTGCGTGTC",
+            "CCTATCCCCTGTGTGCCTTG",
+        ],
+        [
+            "GCCTCCCTCGCGCCATCAGTATCGTAGGCACCTGAGA",
+            "GCCTTGCCAGCCCGCTCAGTATTGATGGTGCCTACAG",
+            "CCATCTCATCCCTGCGTGTC",
+            "CCTATCCCCTGTGTGCCTTG",
+        ],
+    );
+
+    my @candidates = ($priAdaLib eq "u") ? @usrDefinedPriAda : @{$priAdas[$priAdaLib]};
+
+    # Probes already tried on this read; findSeq() uses it to skip repeats.
+    my %checkedPriStr = ();
+
+    foreach my $priAda (@candidates) {
+        my $priAdaStart = findSeq($priAda, $$seq, \%checkedPriStr);
+        next if(!$priAdaStart);
+        if($priAdaStart >= 50) {
+            $$seq = substr($$seq, 0, $priAdaStart);
+        }
+        else {
+            my $keepFrom = $priAdaStart+$substrlen;
+            $$seq = substr($$seq, $keepFrom, length($$seq)-$keepFrom);
+        }
+        $totalValidReadsWithPriAda++;
+        return $priAdaStart;
+    }
+
+    $totalValidReadsNoPriAda++;
+    return -1;
+}
+
+# Locate a primer/adaptor near either end of a read.
+#   $_[0] - primer/adaptor sequence
+#   $_[1] - read sequence
+#   $_[2] - reference to a hash of probes already tried without success on
+#           this read (updated here)
+# The first and the last $substrlen bases of the primer (the whole primer when
+# it is shorter) are used as probes. Each probe is searched, allowing one
+# substitution, in the first 50 and in the last 50 bases of the read.
+# Returns the position of the hit in the read, or 0 when nothing was found.
+# A hit at the very first base is returned as "0E0", which is numerically 0
+# but true, so callers that test the result for truth still see the hit.
+# When the approximate matcher reports a hit that findStart() cannot place,
+# the search simply continues instead of producing a bogus position.
+sub findSeq {
+    my ($pri, $seq, $hashRef) = @_;
+    my $subsl = $substrlen;
+    $subsl = length $pri if(length($pri) < $substrlen);
+    my $spri = substr($pri, 0, $subsl);
+    my $epri = substr($pri, (length $pri) - $subsl, $subsl);
+    my $sseq = substr($seq, 0, 50);
+    my $tmpInd = (length $seq) - 50;
+    $tmpInd = 0 if($tmpInd < 0);
+    my $eseq = substr($seq, $tmpInd, 50);
+    foreach my $probe ($spri, $epri) {
+        next if(defined($$hashRef{$probe}));
+        # Each window is paired with its offset inside the read.
+        foreach my $window ([$sseq, 0], [$eseq, $tmpInd]) {
+            my ($text, $offset) = @{$window};
+            my @catches = String::Approx::amatch($probe, ['I0 D0 S1'], $text);
+            next if(@catches == 0);
+            my $start = findStart($text, $probe);
+            next if(!defined($start) || $start eq "");
+            my $position = $start + $offset;
+            return ($position == 0) ? "0E0" : $position;
+        }
+        $$hashRef{$probe} = 1;
+    }
+    return 0;
+}
+
+use re qw(eval);
+use vars qw($matchStart);
+
+# Return the start position of the first approximate occurrence of a pattern.
+#   $_[0] - text to search
+#   $_[1] - pattern to look for; up to $mismLim mismatches are allowed
+# Returns the zero-based start offset of the first match, or nothing (undef in
+# scalar context) when the pattern does not occur.
+sub findStart {
+    my $pattern;
+    local $_;
+    ($_, $pattern) = @_;
+    $pattern = fuzzy_pattern($pattern, $mismLim);
+    local $matchStart;
+    # The embedded code block runs at the start of every match attempt, so
+    # after a successful match $matchStart holds the offset where it began.
+    my $instrumentedPattern = qr/(?{ $matchStart = pos() })$pattern/;
+    return $matchStart if(/$instrumentedPattern/);
+    return;
+}
+
+# Build a compiled regular expression that matches a pattern approximately.
+#   $_[0] - the exact pattern
+#   $_[1] - number of mismatches allowed (must not be negative)
+# Dies when the number of mismatches is negative.
+sub fuzzy_pattern {
+    my ($original_pattern, $mismatches_allowed) = @_;
+    die "Number of mismatches must be greater than or equal to zero\n"
+        unless($mismatches_allowed >= 0);
+    my $approximate = make_approximate($original_pattern, $mismatches_allowed);
+    return qr/$approximate/;
+}
+
+# Expand a pattern into regular-expression source that tolerates mismatches.
+#   $_[0] - the pattern
+#   $_[1] - number of mismatches still allowed
+# With no mismatches left the pattern is returned unchanged. When the pattern
+# is no longer than the allowance, every A/C/G/T becomes ".". Otherwise the
+# first character either matches exactly, or (for A/C/G/T only) is replaced by
+# "." at the cost of one mismatch; the rest is expanded recursively.
+sub make_approximate {
+    my ($pattern, $mismatches_allowed) = @_;
+    return $pattern if($mismatches_allowed == 0);
+    if(length($pattern) <= $mismatches_allowed) {
+        $pattern =~ tr/ACTG/./;
+        return $pattern;
+    }
+    my ($first, $rest) = $pattern =~ /^(.)(.*)/;
+    my $after_match = make_approximate($rest, $mismatches_allowed);
+    return "$first$after_match" unless($first =~ /[ACGT]/);
+    my $after_miss = make_approximate($rest, $mismatches_allowed-1);
+    return "(?:$first$after_match|.$after_miss)";
+}
diff --git a/QC/454QC_PE.pl b/QC/454QC_PE.pl
new file mode 100644
index 0000000..3e55c4a
--- /dev/null
+++ b/QC/454QC_PE.pl
@@ -0,0 +1,1996 @@
+#! /usr/bin/perl
+
+use File::Basename;
+#BEGIN {
+# my ($tmp, $path) = fileparse($0);
+# push ( @INC,"$path/lib");
+# #use lib "$path";
+#}
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+use Getopt::Long;
+use Cwd qw(abs_path);
+use IO::Zlib;
+use FindBin qw($RealBin);
+use lib "$RealBin/lib";
+require "454PEhtml.pl";
+
+# Load helper modules at run time so that a missing module can be reported
+# with a readable message.  The eval stops at the first require that fails,
+# which means every module listed after the failing one is not loaded either.
+eval {
+    require Parallel::ForkManager;
+    require String::Approx;
+    require GD::Graph::linespoints;
+    require GD::Graph::bars;
+    require GD::Graph::pie;
+    require GD::Text::Wrap;
+};
+
+# Cleared when the GD modules cannot be loaded; every chart is skipped then.
+my $isGDMod = 1;
+
+if($@) {
+    my $errorText = "$@";
+    if($errorText =~ /Parallel/) {
+        print "Error:\n\tCan not find 'lib' folder with this perl program\n"; #module 'Parallel::ForkManager'\n";
+        print "\tCopy the 'lib' folder, provided with the toolkit, to the directory where this perl program is and try again\n\n";
+        exit;
+    }
+    # Checked before the GD test: the "Can't locate" message also lists the
+    # @INC directories, and the broader GD pattern below must not claim a
+    # String::Approx failure.
+    elsif($errorText =~ /String\/Approx/) {
+        print STDERR "Error:\n\tCan not find module 'String::Approx'\n";
+        print STDERR "\tInstall it and try again\n\n";
+        exit;
+    }
+    # Any GD component (linespoints, bars, pie, GD::Text::Wrap or GD itself)
+    # being unavailable disables the graphs.  Previously only a missing
+    # GD::Graph::linespoints was recognised, so e.g. a missing GD::Graph::pie
+    # left $isGDMod set and the drawing code was still attempted.
+    elsif($errorText =~ m{\bGD(?:/|::|\.pm)}) {
+        print STDERR "Warning:\n\tCan not find module 'GD::Graph'\n";
+        print STDERR "\tGraphs for statistics will not be produced. \n\t\t\tOR \n\tInstall GD::Graph module and try again.\n\n";
+        $isGDMod = 0;
+    }
+    else {
+        # Do not hide load failures that match none of the cases above.
+        print STDERR "Warning:\n\tProblem while loading modules: $errorText\n";
+    }
+}
+
+
+# Setting parameters
+# Defaults for the command-line options; GetOptions below overwrites them.
+# -m|minLen: reads shorter than this fail the length filter (when it is on).
+my $lowestValidLen = 50;
+# Values collected from -i; later regrouped into one string per input set.
+my @files = ();
+# Number of values every -i takes (sequence file, quality file, library).
+my $noOfInp = 3;
+my $helpAsked;
+# -l|cutOffReadLen4HQ: validated to 0..100 and reported as a percentage.
+my $cutOffReadLen4HQ = 70;
+# -s|cutOffQualScore: validated to 0..40.
+my $cutOffPhScore = 20;
+# -o|outputFolder: when empty, a folder next to the input file is used.
+my $outFolder = "";
+# -onlyStat: when defined, no filtered output files are opened.
+my $isOnlyStat;
+# -t|statOutFmt: 1 = space-padded columns, 2 = tab-separated columns.
+my $statOutFmt = 1;
+# -p|processes: handed to Parallel::ForkManager.
+my $noOfProcesses = 1;
+# -n|homoPolyLen: 0 switches homopolymer trimming off.
+my $homoPolyLen = 0;
+# Third value of the current -i set; assigned per input inside the main loop.
+my $priAdaLib;
+# -f|lenFilter: a value starting with N/n switches the length filter off.
+my $isLenFilterOn = 1;
+# Display names of the built-in libraries; library number N maps to index N-1.
+my @priAdaLibNames = ("Rapid Library (Standard)", "Paired End Library", "Amplicon PE Library", "Small RNA Library");
+# Path and contents of a user-supplied primer/adaptor file, if one is given.
+my $priAdaFile;
+my @usrDefinedPriAda = ();
+my $outputDataFmt = "t"; # t/T: Text; g/G: Gzip.
+# $linker is chosen per read in splitPEReads(); $linkerF is the forward
+# linker (-linker) and $linkerR its reverse complement when the two differ.
+my $linker;
+my $linkerF = "GTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC";
+my $linkerR;
+
+# Parse the command line.  Every -i takes $noOfInp values:
+#   <sequence file> <quality file> <primer/adaptor library>
+GetOptions(
+    "i=s{$noOfInp}" => \@files,
+    "h|help" => \$helpAsked,
+    "l|cutOffReadLen4HQ=f" => \$cutOffReadLen4HQ,
+    "n|homoPolyLen=i" => \$homoPolyLen,
+    "o|outputFolder=s" => \$outFolder,
+    "z|outputDataCompression=s" => \$outputDataFmt,
+    "t|statOutFmt=i" => \$statOutFmt,
+    "onlyStat" => \$isOnlyStat,
+    "p|processes=i" => \$noOfProcesses,
+    "s|cutOffQualScore=i" => \$cutOffPhScore,
+    "m|minLen=i" => \$lowestValidLen,
+    "f|lenFilter=s" => \$isLenFilterOn,
+    "linker=s" => \$linkerF,
+);
+if(defined($helpAsked)) {
+    prtUsage();
+    exit;
+}
+if(@files == 0) {
+    prtError("No input files are provided");
+}
+my @tempFiles = ();
+prtError("Missing inputs for option -i") if((scalar @files)%$noOfInp != 0);
+
+# Collapse every -i triple into a single space-separated string; the main
+# loop splits it again.  A third value that looks like another option means
+# the triple was incomplete; a single digit must name one of the built-in
+# libraries (1..4).
+for(my $i=0; $i<@files; $i+=$noOfInp) {
+    my $str = "$files[$i] $files[$i+1] $files[$i+2]";
+    if($files[$i+2] =~ /^-/) {
+        prtError("Missing inputs for option -i: at '-i $str'")
+    }
+    if($files[$i+2] =~ /^\d$/) {
+        if($files[$i+2] < 1 || $files[$i+2] > 4) {
+            prtError("Incorrect option for Primer/Adaptor library: at '-i $str'");
+        }
+    }
+    push(@tempFiles, $str);
+}
+# These assignments had been garbled to " at files" by the mail archiver's
+# address obfuscation; restored as one assignment.
+@files = @tempFiles;
+
+# Range checks.  The messages name the options exactly as declared above.
+if($cutOffReadLen4HQ < 0 || $cutOffReadLen4HQ > 100) {
+    prtError("Incorrect value for -l|cutOffReadLen4HQ option: at '-l $cutOffReadLen4HQ'");
+}
+if($cutOffPhScore < 0 || $cutOffPhScore > 40) {
+    prtError("Incorrect value for -s|cutOffQualScore option: at '-s $cutOffPhScore'");
+}
+if($statOutFmt < 1 || $statOutFmt > 2) {
+    prtError("Incorrect value for -t|statOutFmt option: at '-t $statOutFmt'");
+}
+# Anything that does not start with N/n leaves the length filter on.
+if($isLenFilterOn =~ /^N/i) {
+    $isLenFilterOn = 0;
+}
+else {
+    $isLenFilterOn = 1;
+}
+if($outputDataFmt !~ /^[tg]$/i) {
+    prtError("Incorrect value for -z|outputDataCompression option: at '-z $outputDataFmt'");
+}
+
+# Reverse-complement the forward linker.  $linkerR is only set when the
+# result differs from $linkerF, i.e. it stays undefined for a linker that is
+# its own reverse complement.
+(my $tmpLinker = reverse $linkerF) =~ tr/ATGCatgc/TACGtacg/;
+$linkerR = $tmpLinker unless($linkerF eq $tmpLinker);
+
+# Process manager used by the main loop to fork one child per input set.
+my $pm = Parallel::ForkManager->new($noOfProcesses);
+
+
+# Running totals for the statistics report.  Names ending in "PE" are the
+# paired-read counterparts; the report prints slot 2 of the three-slot arrays
+# in the "Paired" column and slots 0 / 1 as Read1 / Read2.
+my $trimCount = 0;
+my @trimCountPE = (0, 0, 0);
+# Reported as "Total number of Unpaired reads" / "Total number of Paired reads".
+my $seqCount = 0;
+my $seqCountPE = 0;
+# Number of '>' header lines counted in the sequence file before processing.
+my $ttlSeqCount = 0;
+# Reported as reads trashed for being shorter than $lowestValidLen.
+my $lt100 = 0;
+my @lt100PE = (0, 0, 0);
+my $hQCount = 0;
+my @hQCountPE = (0, 0, 0);
+my $lQCount = 0;
+my @lQCountPE = (0, 0, 0);
+# Length extremes; the minima start at a very large value
+# (presumably so the first observed length replaces them - TODO confirm).
+my $maxRawLen = 0;
+my $minRawLen = 1000000000000;
+my @maxRawLenPE = (0, 0);
+my @minRawLenPE = (1000000000000, 1000000000000);
+my $avgRawLen = 0;
+my $maxHQLen = 0;
+my $minHQLen = 1000000000000;
+my $avgHQLen = 0;
+# Read-length lists; the report computes the median and N-lengths from them.
+my @rawLen = ();
+my @rawLenPE = ();
+my @hQLen = ();
+# Base totals before and after filtering.
+my $totalBases = 0;
+my $totalHQBases = 0;
+my @totalBasesPE = (0, 0, 0);
+my @totalHQBasesPE = (0, 0, 0);
+my $totalBasesUPOri = 0;
+my $totalBasesAfterHQ = 0;
+my @totalBasesAfterHQPE = (0, 0, 0);
+my $totalHQBasesAfterHQ = 0;
+my @totalHQBasesAfterHQPE = (0, 0, 0);
+my $totalBasesFinal = 0;
+my $totalHQBasesFinal = 0;
+my $totalReadsFinal = 0;
+my @totalReadsFinalPE = (0, 0, 0);
+my @totalReadsFinalUP = (0, 0);
+# Quality sums; the report divides them by the read counts.
+my $avgQual = 0;
+my @avgQualPE = (0, 0);
+my $avgQualFinal = 0;
+my $totalValidReadsWithPriAda = 0;
+my @totalValidReadsWithPriAdaPE = (0, 0, 0);
+my $totalValidReadsNoPriAda = 0;
+my $substrlen = 20; # For removePriAda
+my $mismLim = 1; # For removePriAda
+
+# Buffers for the record currently being assembled by the main read loop.
+my $fastaSeqId = "";
+my $fastaSeq = "";
+my $qualSeqId = "";
+my $qualSeq = "";
+my $prevFastaSeqId = "";
+# 1-based index of the input set being analysed; used in progress messages.
+my $indOfAnalysis = 0;
+
+# Distribution tables and the bin widths used to label them in the charts.
+my @lenDistrib = ();
+my $lenInterval = 40;
+my @qualDistrib = ();
+my $qualInterval = 1;
+my @gcDistrib = ();
+my $gcInterval = 5;
+my @charCount = ();
+my @charCountPE = ();
+
+# Font files shipped in the toolkit's lib/Fonts folder.
+# NOTE(review): getFilePath() is defined elsewhere; presumably it returns the
+# directory part of the script path - confirm.
+my $font_spec = getFilePath($0) . "lib/Fonts/Dustismo_Sans.ttf";
+my $f = getFilePath($0) . "lib/Fonts/LucidaSansDemiBold.ttf";
+
+
+foreach my $inpData (@files) {
+ $indOfAnalysis++;
+my $pid = $pm->start and next;
+    # Split the "<seq file> <qual file> <library>" string built during option
+    # parsing and work out which primer/adaptor library applies to this set.
+    $inpData =~ s/\\([A-Za-z_\.])/\/$1/g; # To remove '\' from the path of windows file
+    my @iData = split(" ", $inpData);
+    my $seqFile = $iData[0];
+    my $qualFile = $iData[1];
+    $priAdaLib = $iData[2];
+    print "Analysis has been started for \"$seqFile\": Index: $indOfAnalysis\n";
+    if($priAdaLib =~ /^n$/i) {
+        # "N": no primer/adaptor filtering for this input set.
+        undef $priAdaLib;
+    }
+    elsif($priAdaLib =~ /^\d$/) {
+        # Built-in library number -> 0-based index into @priAdaLibNames.
+        $priAdaLib = $priAdaLib - 1;
+    }
+    else {
+        # Anything else is the path of a user-defined primer/adaptor file.
+        $priAdaFile = $priAdaLib;
+        $priAdaLib = "u";
+        open(PRIADA, "<", $priAdaFile) or die "Can not open the user-defined primer/adapter file: $priAdaFile\n";
+        @usrDefinedPriAda = <PRIADA>;
+        close(PRIADA);
+        # Strip all whitespace from EVERY entry.  The old index loop stopped
+        # one element early, so the last sequence kept its trailing newline.
+        s/\s+//g for @usrDefinedPriAda;
+        # Blank lines would otherwise turn into empty sequences.
+        @usrDefinedPriAda = grep { length } @usrDefinedPriAda;
+    }
+ # Derive the output locations from the input file names.  When no output
+ # folder was given, "454QC_Filtered_files" next to the sequence file is used.
+ my ($seqFileName, $filePath) = fileparse($seqFile);
+ my ($qualFileName) = fileparse($qualFile);
+ $outFolder = $filePath . "454QC_Filtered_files" if($outFolder eq "");
+ $outFolder .= "/" if($outFolder !~ /\/$/);
+ if(! -e $outFolder) {
+ mkdir($outFolder) or die "Can not create output folder: $outFolder\n";
+ }
+ my $outSeqFile = $outFolder . $seqFileName . "_filtered";
+ my $outQualFile = $outFolder . $qualFileName . "_filtered";
+ # Gzip output was requested: openFileGetHandle() picks IO::Zlib by the ".gz" suffix.
+ $outSeqFile .= ".gz" if($outputDataFmt =~ /g/i);
+ $outQualFile .= ".gz" if($outputDataFmt =~ /g/i);
+ my $statFile = $outFolder . $seqFileName . "_stat";
+
+ # Open the inputs and alias them to the bareword handles I and Q that the
+ # rest of the script reads from.
+ my $iH;
+ openFileGetHandle($seqFile, "r", \$iH);
+ *I = $iH;
+ my $qH;
+ openFileGetHandle($qualFile, "r", \$qH);
+ *Q = $qH;
+ # Filtered output files (OI/OQ) are only created when filtering is wanted.
+ if(!defined($isOnlyStat)) {
+ my $oiH;
+ openFileGetHandle($outSeqFile, "w", \$oiH);
+ *OI = $oiH;
+ my $oqH;
+ openFileGetHandle($outQualFile, "w", \$oqH);
+ *OQ = $oqH;
+ }
+ open(STAT, ">$statFile") or die "Can not open file: $statFile\n";
+ # First pass over the sequence file: count the '>' header lines so that the
+ # progress messages can show a total.
+ while(my $line = <I>) {
+ $ttlSeqCount++ if($line =~ /^>/);
+ }
+ close(I);
+ print "$indOfAnalysis: Number of reads processed: " . "0/$ttlSeqCount (0\%)...\n";
+ # Reopen the sequence file from the start for the real processing pass.
+ undef($iH);
+ openFileGetHandle($seqFile, "r", \$iH);
+ *I = $iH;
+
+    # Read the sequence and quality files line by line in lock-step.  Lines
+    # that are not headers are appended to the record being assembled; a
+    # header line first checks that both files agree on the read id and then
+    # hands the previously assembled record to processSeq().
+    while(my $line = <I>) {
+        chomp $line;
+        my $qualLine = <Q>;
+        chomp($qualLine);
+        unless($line =~ /^>/) {
+            $fastaSeq .= $line;
+            $qualSeq .= $qualLine . " ";
+            next;
+        }
+        $prevFastaSeqId = $fastaSeqId;
+        $fastaSeqId = $line;
+        $qualSeqId = $qualLine;
+        if($fastaSeqId ne $qualSeqId) {
+            print STDERR "Error: Read Id doesn't match in sequence and quality file for read number $seqCount in sequence file.\n";
+            exit(-1);
+        }
+        processSeq() if($fastaSeq ne "");
+        $fastaSeq = "";
+        $qualSeq = "";
+    }
+    # The last record has no following header line to trigger it.
+    if($fastaSeq ne "") {
+        $prevFastaSeqId = $fastaSeqId;
+        processSeq();
+    }
+
+    print "$indOfAnalysis: Number of reads processed: " . "$ttlSeqCount/$ttlSeqCount (100\%)...\n";
+    print "$indOfAnalysis: Analysis completed\n";
+
+    print "$indOfAnalysis: Printing Statistics...\n";
+
+    # Write the statistics report.  Both output styles print exactly the same
+    # rows; only the printf template of each row differs, so the templates are
+    # looked up by row kind instead of duplicating the whole report.
+    #   style 1: space-padded columns     style 2: tab-separated columns
+    if($statOutFmt == 1 || $statOutFmt == 2) {
+        my $inde = " " x 1;
+        my %fmt = ($statOutFmt == 1) ? (
+            param  => "$inde %-40s %s\n",
+            param2 => "$inde %-40s %s %s\n",
+            qcS    => "$inde %-70s %s\n",
+            qcD    => "$inde %-70s %d\n",
+            peHead => "$inde %-70s %-13s %s\n",
+            peD    => "$inde %-70s %-13d (%d / %d)\n",
+            peS    => "$inde %-70s %-13s (%s / %s)\n",
+            peF    => "$inde %-70s %-13.f (%.f / %.f)\n",
+            upS    => "$inde %-70s %-13s\n",
+            upD    => "$inde %-70s %-13d\n",
+            upF    => "$inde %-70s %-13.f\n",
+            sumD   => "$inde %-70s %-13d (%s)\n",
+            det3   => "$inde %-50s %-20s %s\n",
+            det2   => "$inde %-50s %s\n",
+        ) : (
+            param  => "\t%s\t%s\n",
+            param2 => "\t%s\t%s\t%s\n",
+            qcS    => "\t%s\t%s\n",
+            qcD    => "\t%s\t%d\n",
+            peHead => "\t%s\t%s\t%s\n",
+            peD    => "\t%s\t%d\t(%d / %d)\n",
+            peS    => "\t%s\t%s\t(%s / %s)\n",
+            peF    => "\t%s\t%.f\t(%.f / %.f)\n",
+            upS    => "\t%s\t%s\n",
+            upD    => "\t%s\t%d\n",
+            upF    => "\t%s\t%.f\n",
+            sumD   => "\t%s\t%d\t(%s)\n",
+            det3   => "\t%s\t%s\t%s\n",
+            det2   => "\t%s\t%s\n",
+        );
+        # Print one report row of the given kind to the STAT file.
+        my $row = sub {
+            my $kind = shift;
+            printf STAT $fmt{$kind}, @_;
+        };
+        # Percentage / average with two decimals.  A zero denominator gives
+        # "0" (as the guarded paired-read rows always did) instead of dying
+        # with "Illegal division by zero", e.g. when the input contains no
+        # unpaired reads at all.
+        my $pct = sub {
+            my ($num, $den) = @_;
+            return $den ? sprintf("%0.2f", $num / $den * 100) : "0";
+        };
+        my $avg = sub {
+            my ($num, $den) = @_;
+            return $den ? sprintf("%0.2f", $num / $den) : "0";
+        };
+
+        print STAT "Parameters\n";
+        $row->('param2', "Input files ", $seqFile, $qualFile);
+        $row->('param', "Primer/Adaptor library", defined($priAdaLib)?(($priAdaLib ne "u")?$priAdaLibNames[$priAdaLib]:"User defined ($priAdaFile)"):"NA");
+        $row->('param', "Linker sequence", (($linkerR)?"(+)5'$linkerF 3'/ (-)5'$linkerR 3'":"(+)5'$linkerF 3' / (-)5'$linkerF 3'"));
+        $row->('param', "Homopolymer trimming", ($homoPolyLen == 0)?"Off":"On");
+        $row->('param', "Length of the homopolymer to be removed", $homoPolyLen) if($homoPolyLen != 0);
+        $row->('param', "Length filter", ($isLenFilterOn)?"On":"Off");
+        $row->('param', "Cut-off for minimum read length", $lowestValidLen) if($isLenFilterOn);
+        $row->('param', "Cut-off read length for HQ", $cutOffReadLen4HQ."%");
+        $row->('param', "Cut-off quality score", $cutOffPhScore);
+        $row->('param', "Only statistics", defined($isOnlyStat)?"On":"Off");
+        $row->('param', "Number of processes", $noOfProcesses);
+
+        print STAT "\n\n";
+
+        print STAT "QC statistics\n";
+        $row->('qcS', "File name", $seqFileName);
+        $row->('qcD', "Total number of reads", $seqCount+$seqCountPE);
+        print STAT "\n";
+
+        # Paired reads: slot 2 is the "Paired" column, slots 0/1 are Read1/Read2.
+        $row->('peHead', "QC analysis of Paired reads:", "Paired", "(Read1 / Read2)");
+        $row->('peD', "Total number of Paired reads", $seqCountPE, $seqCountPE, $seqCountPE);
+        $row->('peD', "Total number of trimmed reads containing homopolymer", @trimCountPE[2, 0, 1]);
+        $row->('peD', "Total number of trashed reads (<$lowestValidLen bp in length after trimming)", @lt100PE[2, 0, 1]);
+        $row->('peD', "Total number of low quality reads (excluding <$lowestValidLen reads)", @lQCountPE[2, 0, 1]);
+        $row->('peD', "Total number of HQ reads", @hQCountPE[2, 0, 1]);
+        $row->('peS', "Percentage of HQ reads", map { $pct->($_, $seqCountPE)."%" } @hQCountPE[2, 0, 1]);
+        $row->('peF', "Total number of bases", @totalBasesPE[2, 0, 1]);
+        $row->('peF', "Total number of bases in HQ reads", @totalBasesAfterHQPE[2, 0, 1]);
+        $row->('peF', "Total number of HQ bases in HQ reads", @totalHQBasesAfterHQPE[2, 0, 1]);
+        # Each column is guarded by its own denominator; the old code tested
+        # slot 2 but then divided by slot 0 / slot 1.
+        $row->('peS', "Percentage of HQ bases in HQ reads", map { $pct->($totalHQBasesAfterHQPE[$_], $totalBasesAfterHQPE[$_])."%" } 2, 0, 1);
+        if(defined($priAdaLib)) {
+            $row->('peD', "Number of Primer/Adaptor trimmed reads", @totalValidReadsWithPriAdaPE[2, 0, 1]);
+        }
+        else {
+            $row->('peS', "Number of Primer/Adaptor trimmed reads", "NA", "NA", "NA");
+        }
+        $row->('peD', "Total number of HQ filtered reads", @totalReadsFinalPE[2, 0, 1]);
+        $row->('peS', "Percentage of HQ filtered reads", map { $pct->($_, $seqCountPE)."%" } @totalReadsFinalPE[2, 0, 1]);
+        $row->('qcD', "Total number of HQ filtered reads (Unpaired)", $totalReadsFinalUP[0]);
+        $row->('qcS', "Percentage of HQ filtered reads (Unpaired)", $pct->($totalReadsFinalUP[0], $seqCountPE)."%");
+
+        print STAT "\n";
+        $row->('upS', "QC analysis of Unpaired (UPOri*) reads:", "Total");
+        $row->('upD', "Total number of Unpaired reads", $seqCount);
+        $row->('upD', "Total number of trimmed reads containing homopolymer", $trimCount);
+        $row->('upD', "Total number of trashed reads (<$lowestValidLen bp in length after trimming)", $lt100);
+        $row->('upD', "Total number of low quality reads (excluding <$lowestValidLen reads)", $lQCount);
+        $row->('upD', "Total number of HQ reads", $hQCount);
+        $row->('upS', "Percentage of HQ reads", $pct->($hQCount, $seqCount)."%");
+        $row->('upF', "Total number of bases", $totalBasesUPOri);
+        $row->('upF', "Total number of bases in HQ reads", $totalBasesAfterHQ);
+        $row->('upF', "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ);
+        $row->('upS', "Percentage of HQ bases in HQ reads", $pct->($totalHQBasesAfterHQ, $totalBasesAfterHQ)."%");
+        if(defined($priAdaLib)) {
+            $row->('upD', "Number of Primer/Adaptor trimmed reads", $totalValidReadsWithPriAda);
+        }
+        else {
+            $row->('upS', "Number of Primer/Adaptor trimmed reads", "NA");
+        }
+        $row->('upD', "Total number of HQ filtered reads (Unpaired)", $totalReadsFinal);
+        $row->('upS', "Percentage of HQ filtered reads (Unpaired)", $pct->($totalReadsFinal, $seqCount)."%");
+
+        # Summary over paired reads, surviving mates (UPPair) and originally
+        # unpaired reads (UPOri).
+        my $allReads = $seqCount + $seqCountPE;
+        my $allFinal = $totalReadsFinalPE[2]+$totalReadsFinalUP[0]+$totalReadsFinal;
+        print STAT ("-"x100)."\n";
+        $row->('sumD', "Number of HQ filtered reads (Paired)", $totalReadsFinalPE[2], $pct->($totalReadsFinalPE[2], $allReads)."%");
+        $row->('sumD', "Number of HQ filtered reads (UPPair*)", $totalReadsFinalUP[0], $pct->($totalReadsFinalUP[0], $allReads)."%");
+        $row->('sumD', "Number of HQ filtered reads (UPOri*)", $totalReadsFinal, $pct->($totalReadsFinal, $allReads)."%");
+        print STAT ("-"x100)."\n";
+        $row->('sumD', "Total number of HQ filtered reads", $allFinal, $pct->($allFinal, $allReads)."%");
+        print STAT "\n";
+        print STAT "* UPOri: Unpaired reads (i.e. reads without linker sequence) found in the input file\n";
+        print STAT " UPPair: One of the paired reads which passed QC\n";
+
+        print STAT "\n\n";
+
+######### Adding the statistics of PE, UPPair and UPOri reads
+        # The script-wide counters are changed in place and restored below,
+        # exactly as before, in case the helpers called while the table is
+        # built read them (they are defined outside this section).
+        $seqCount = $seqCount + $seqCountPE;
+        $totalReadsFinal = $totalReadsFinalPE[2]+$totalReadsFinalUP[0]+$totalReadsFinal;
+######### Done adding
+        print STAT "Detailed QC statistics\n";
+        # Columns: label, value for the input, value for the filtered output.
+        my @arr = (
+            ["File name", $seqFileName, (fileparse($outSeqFile))[0]],
+            ["Total number of reads", $seqCount, $totalReadsFinal],
+            ["Minimum read length", $minRawLen, $minHQLen],
+            ["Maximum read length", $maxRawLen, $maxHQLen],
+            ["Average read length", $avg->($totalBases, $seqCount), $avg->($totalBasesFinal, $totalReadsFinal)],
+            ["Median read length", calcMedian(@rawLen), calcMedian(@hQLen)],
+            (map { ["N$_ length", calcN50(\@rawLen, $_), calcN50(\@hQLen, $_)] } 25, 50, 75, 90, 95),
+            ["Total number of bases", $totalBases, $totalBasesFinal],
+            ["Total number of HQ bases", $totalHQBases, $totalHQBasesFinal],
+            ["Percentage of HQ bases", $pct->($totalHQBases, $totalBases)."%", $pct->($totalHQBasesFinal, $totalBasesFinal)."%"],
+            ["Average quality score (Overall)", $avg->($avgQual, $seqCount), $avg->($avgQualFinal, $totalReadsFinal)],
+        );
+        foreach my $detail (@arr) {
+            if(!defined($isOnlyStat)) {
+                $row->('det3', $detail->[0], $detail->[1], $detail->[2]);
+            }
+            else {
+                # Statistics-only run: there is no filtered-output column.
+                $row->('det2', $detail->[0], $detail->[1]);
+            }
+        }
+
+        print STAT "\n\n";
+######### Subtracting the values back
+        $seqCount = $seqCount - $seqCountPE;
+        $totalReadsFinal = $totalReadsFinal-$totalReadsFinalPE[2]-$totalReadsFinalUP[0];
+######### Done
+    }
+
+    # Names of the chart images written next to the filtered files.
+    my $lenDistF1 = getFileName($seqFile)."_lenDistribution.png";
+    my $qualDistF1 = getFileName($seqFile)."_qualDistribution.png";
+    my $sumPieFPE = getFileName($seqFile). "_PE_summary.png";
+    my $sumPieF = getFileName($seqFile). "_UP_summary.png";
+    my $gcDistF1 = getFileName($seqFile)."_gcDistribution.png";
+    my $baseCntF1 = getFileName($seqFile)."_baseCompostion.png";
+
+    # Build one x-axis label per bin of a distribution's first data row.
+    # Bin i is labelled "lo-hi" with lo = interval*i and hi = interval*(i+1);
+    # when $alwaysRange is false and the interval is 1 only "lo" is used.
+    # (The quality labels used to repeat the lower bound as the upper bound
+    # for every bin but the first whenever the interval was larger than 1.)
+    my $binLabels = sub {
+        my ($firstRow, $interval, $alwaysRange) = @_;
+        my $bins = (ref($firstRow) eq 'ARRAY') ? scalar(@{$firstRow}) : 0;
+        return map {
+            my $label = $interval * $_;
+            $label .= "-" . $interval * ($_ + 1) if($alwaysRange || $interval > 1);
+            $label;
+        } 0 .. $bins - 1;
+    };
+
+    # Each distribution gets its label row prepended before it is drawn.
+    my @lenLabels = $binLabels->($lenDistrib[0], $lenInterval, 1);
+    unshift(@lenDistrib, \@lenLabels);
+
+    if($isGDMod) {
+        drawLenDist(\@lenDistrib, $outFolder.$lenDistF1, getFileName($seqFile), 550, 350);
+    }
+
+    my @qualLabels = $binLabels->($qualDistrib[0], $qualInterval, 0);
+    unshift(@qualDistrib, \@qualLabels);
+
+    if($isGDMod) {
+        drawQualDist(\@qualDistrib, $outFolder.$qualDistF1, getFileName($seqFile), 650, 350);
+    }
+
+    # Summary pie for the paired reads.
+    my $trashedReads = $lt100PE[2];
+    my $trimmedHP = $trimCountPE[2];
+    my $trimmedPA = $totalValidReadsWithPriAdaPE[2];
+    my $hQreadsExcptHP_PATrimmed = $totalReadsFinalPE[2] - $trimmedHP - $trimmedPA;
+    my $UPPairCount = $totalReadsFinalUP[0];
+    my $lQreadsGT100 = $seqCountPE - $totalReadsFinalPE[2] - $trashedReads - $UPPairCount;
+    my @summaryData = (["", "", "", "", "", ""], [$trashedReads, $trimmedHP, $trimmedPA, $hQreadsExcptHP_PATrimmed, $lQreadsGT100, $UPPairCount]);
+
+    if($isGDMod) {
+        drawSummaryPiePE(\@summaryData, $outFolder.$sumPieFPE, 520, 350);
+    }
+
+    # Summary pie for the originally unpaired reads.
+    $trashedReads = $lt100;
+    $trimmedHP = $trimCount;
+    $trimmedPA = $totalValidReadsWithPriAda;
+    $hQreadsExcptHP_PATrimmed = $totalReadsFinal - $trimmedHP - $trimmedPA;
+    $lQreadsGT100 = $seqCount - $totalReadsFinal - $trashedReads;
+    @summaryData = (["", "", "", "", ""], [$trashedReads, $trimmedHP, $trimmedPA, $hQreadsExcptHP_PATrimmed, $lQreadsGT100]);
+
+    if($isGDMod) {
+        drawSummaryPie(\@summaryData, $outFolder.$sumPieF, 520, 350);
+    }
+
+    my @gcLabel = $binLabels->($gcDistrib[0], $gcInterval, 1);
+    unshift(@gcDistrib, \@gcLabel);
+    if($isGDMod) {
+        drawGCDist(\@gcDistrib, $outFolder.$gcDistF1, getFileName($seqFile), 550, 350);
+    }
+
+
+    # Base composition: only the first count row for a statistics-only run,
+    # otherwise the first two rows.
+    my @file1 = (["A", "T", "G", "C", "Non-ATGC"], $charCount[0]);
+    @file1 = (["A", "T", "G", "C", "Non-ATGC"], $charCount[0], $charCount[1]) if(!$isOnlyStat);
+    if($isGDMod) {
+        drawBaseComp(\@file1, $outFolder.$baseCntF1, getFileName($seqFile), 500, 300);
+    }
+
+
+    close(I);
+    close(Q);
+    # OI and OQ are only opened when filtered output is written; closing them
+    # unconditionally raised "close() on unopened filehandle" warnings for
+    # statistics-only runs.
+    if(!defined($isOnlyStat)) {
+        close(OI);
+        close(OQ);
+    }
+    close(STAT);
+
+    # Collect the paths and file names for the HTML report and write it.
+    my $iFol = getFilePath(abs_path($seqFile));
+    my $oFol = abs_path($outFolder) . "/";
+    my $inpFs = getFileName($seqFile);
+    $inpFs .= ":::::" . getFileName($qualFile);
+    my $htF = $oFol . "output_" . getFileName($seqFile);
+    $htF .= ".html";
+    my @fileNames4HTML;
+    @fileNames4HTML = ($outSeqFile, $outQualFile, $lenDistF1, $baseCntF1, $gcDistF1, $qualDistF1, $sumPieFPE, $sumPieF);
+    htmlPrint(getFilePath(abs_path($0)), getFileName($0), $htF, $iFol, $isOnlyStat, $inpFs, $statFile, $oFol, \@fileNames4HTML);
+
+    # End of the work done in the forked child for this input set.
+    $pm->finish;
+}
+# Wait for every forked child, then tell the user where the results are.
+$pm->wait_all_children;
+
+print "================================================================\n";
+print "Processing has been finished\n";
+if($outFolder ne "") {
+    print "Output files are generated in $outFolder\n";
+}
+else {
+    print "Output files are generated in the folder of input files\n";
+}
+print "================================================================\n";
+
+
+exit;
+
+
+# openFileGetHandle($file, $rOrw, $ref)
+#
+# Opens $file for reading ($rOrw eq "r") or writing ($rOrw eq "w") and stores
+# the handle in the scalar that $ref points to.  Names ending in ".gz"
+# (case-insensitive) are opened through IO::Zlib, everything else with the
+# built-in open.  Dies when the file cannot be opened or created, or when
+# $rOrw is neither "r" nor "w" (previously such a call silently did nothing).
+#
+# The three-argument form of open is used so that the file name is never
+# parsed for mode characters and keeps any leading or trailing blanks.
+sub openFileGetHandle {
+    my ($file, $rOrw, $ref) = @_;
+    if($rOrw ne "r" && $rOrw ne "w") {
+        die "Unknown mode '$rOrw' requested for file $file\n";
+    }
+    my $forReading = ($rOrw eq "r");
+    my $failure = $forReading ? "Can not open file $file" : "Can not create file $file";
+    if($file =~ /\.gz$/i) {
+        $$ref = IO::Zlib->new;
+        $$ref->open($file, $forReading ? "rb" : "wb") or die $failure;
+    }
+    else {
+        open($$ref, $forReading ? "<" : ">", $file) or die $failure;
+    }
+    return;
+}
+
+# splitPEReads($seq, $qual, $id)
+#
+# Splits one read at the linker sequence into its forward and reverse parts
+# and cuts the quality string into the matching pieces.
+#
+#   $seq  - the read sequence
+#   $qual - its quality values as whitespace-separated 1-2 digit numbers,
+#           each followed by whitespace (the patterns below require that)
+#   $id   - read identifier, only used in the diagnostic message
+#
+# Sets the script-wide $linker to $linkerR when that is defined and occurs in
+# the read, otherwise to $linkerF.
+#
+# Returns ($readStatus, $fRead, $fQual, $linkerCount, $lQual, $rRead, $rQual)
+# where $readStatus is 0 for a pair, 1 when only the part before the linker
+# exists and 2 when only the part after it exists; for status 1 and 2 the
+# single read is returned in $fRead/$fQual.  Returns an empty list (after
+# printing a message) when the read does not split into 1 to 3 pieces.
+#
+# The repeated quality token is now a non-capturing group.  Before, the
+# nested capture groups shifted the list assignment for status 1 and 2:
+# $lQual (status 1) and $fQual (status 2) received the last single score of
+# the first segment instead of the whole second segment.
+sub splitPEReads {
+    my ($seq, $qual, $id) = @_;
+    my $len = length $seq;
+    $linker = $linkerF;
+    if($linkerR) {
+        $linker = $linkerR if($seq =~ /$linkerR/);
+    }
+    # split drops trailing empty fields, so a linker at the very end of the
+    # read yields one piece, while a linker at the very start yields an empty
+    # first piece.
+    my @seqs = split(/$linker/, $seq);
+    my ($fRead, $rRead, $fQual, $rQual, $lQual);
+    my $readStatus = 0; # 0: Paired 1: Forward only 2: Reverse only
+    if(@seqs == 1) {
+        $fRead = $seqs[0];
+        $readStatus = 1;
+    }
+    elsif(@seqs == 2) {
+        if(!$seqs[1]) {
+            $fRead = $seqs[0];
+            $readStatus = 1;
+        }
+        elsif(!$seqs[0]) {
+            $fRead = $seqs[1];
+            $readStatus = 2;
+        }
+        else {
+            $fRead = $seqs[0];
+            $rRead = $seqs[1];
+            $readStatus = 0;
+        }
+    }
+    elsif(@seqs == 3) {
+        # Two linker hits: the middle piece is counted as linker region.
+        if(!$seqs[0] && !$seqs[1]) {
+            $fRead = $seqs[2];
+            $readStatus = 2;
+        }
+        else {
+            $fRead = $seqs[0];
+            $rRead = $seqs[2];
+            $readStatus = 0;
+        }
+    }
+    else {
+        print "There is some problem with the linker sequence found in the read: $id\n";
+        return;
+    }
+    # Cut the quality string into as many scores as each segment has bases;
+    # everything that belongs to neither read is treated as linker.
+    if($readStatus == 0) {
+        my $fLen = length $fRead;
+        my $rLen = length $rRead;
+        my $lLen = $len - $fLen - $rLen;
+        ($fQual, $lQual, $rQual) = $qual =~ /^((?:\d{1,2}\s+){$fLen})((?:\d{1,2}\s+){$lLen})((?:\d{1,2}\s+){$rLen})$/;
+    }
+    elsif($readStatus == 1) {
+        my $fLen = length $fRead;
+        my $lLen = $len - $fLen;
+        if($lLen == 0) {
+            $fQual = $qual;
+        }
+        else {
+            ($fQual, $lQual) = $qual =~ /^((?:\d{1,2}\s+){$fLen})((?:\d{1,2}\s+){$lLen})$/;
+        }
+    }
+    elsif($readStatus == 2) {
+        my $fLen = length $fRead;
+        my $lLen = $len - $fLen;
+        ($lQual, $fQual) = $qual =~ /^((?:\d{1,2}\s+){$lLen})((?:\d{1,2}\s+){$fLen})$/;
+    }
+    $fQual =~ s/\s+$// if($fQual);
+    $rQual =~ s/\s+$// if($rQual);
+    $lQual =~ s/\s+$// if($lQual);
+    # NOTE(review): a match in scalar context yields true/false, not the
+    # number of hits, so despite its name this is 1 or "".  Left unchanged
+    # because the callers are not visible here - confirm what they expect.
+    my $linkerCount = $seq =~ /$linker/gi;
+    return ($readStatus, $fRead, $fQual, $linkerCount, $lQual, $rRead, $rQual);
+}
+
+# Runs the QC steps on one read: length filter, optional homopolymer trimming,
+# high-quality check and optional primer/adaptor trimming.
+#   $read - bases of the read
+#   $qual - whitespace-separated quality values of the read
+# Reads the file-level settings $lowestValidLen, $isLenFilterOn, $homoPolyLen
+# and $priAdaLib.
+# Returns ($read, $qual, $qcStatus, $lenFltrStatus, $primStatus, $homoStatus,
+#          $hqStatus, $basesAfterHQ, $validBases); $read and $qual are the
+# possibly trimmed versions, $basesAfterHQ stays undefined unless the read
+# passed the high-quality check.
+sub doQC {
+    my ($read, $qual) = @_;
+    my $len = length $read;
+    my ($basesAfterHQ);
+    my $qcStatus = 1; # 0: Failed QC (Read trashed), 1: Passed QC
+    my $lenFltrStatus = 1; # 1: Passed length filter
+    my $homoStatus = 0; # 0: absent homopolymer, 1: trimmed homopolymer
+    my $primStatus = 0; # 0: absent contam, 1: trimmed contam
+    my $hqStatus = -1; # 0: low quality read, 1: high quality read, -1: default (that means QC could not reach upto HQ filtering due to previous trashing in length filter)
+    my $validBases = 0;
+    if($len < $lowestValidLen && $isLenFilterOn) {
+        $lenFltrStatus = 0;
+        $qcStatus = 0;
+    }
+    else {
+        if($homoPolyLen != 0) {
+            if(hasPolyChar(\$read)) {
+                $homoStatus = 1;
+                $len = length $read;
+                # The quality string is only cut when the read survives the length filter.
+                if($len >= $lowestValidLen || !$isLenFilterOn) {
+                    $qual = trimQualSeq($qual, $len, -1);
+                }
+            }
+        }
+        if($len < $lowestValidLen && $isLenFilterOn) {
+            $lenFltrStatus = 0;
+            $qcStatus = 0;
+        }
+        else {
+            $validBases = isReadOfHQ($qual);
+            if($validBases) {
+                $hqStatus = 1;
+                $basesAfterHQ = $len; # length before primer/adaptor trimming
+                if(defined $priAdaLib) {
+                    my $t=isWOPriAda(\$read);
+                    $len = length $read;
+                    if($t > -1) {
+                        $qual = trimQualSeq($qual, $len, $t);
+                        $primStatus = 1;
+                    }
+                    # Check the trimmed read itself; the file-level $fastaSeq
+                    # still holds the whole untrimmed input read.
+                    if($len < $lowestValidLen && $isLenFilterOn) {
+                        $lenFltrStatus = 0;
+                        $qcStatus = 0;
+                    }
+                }
+            }
+            else {
+                $hqStatus = 0;
+                $qcStatus = 0;
+            }
+        }
+    }
+    return ($read, $qual, $qcStatus, $lenFltrStatus, $primStatus, $homoStatus, $hqStatus, $basesAfterHQ, $validBases);
+}
+
+# Processes the read currently held in the file-level $fastaSeq, $qualSeq and
+# $prevFastaSeqId: splits it at the linker, runs doQC on the part(s), updates
+# the file-level statistics for input and output data and, unless $isOnlyStat
+# is defined, prints the surviving read to the OI / OQ handles.
+# Takes no arguments and returns nothing.
+#
+# doQC returns ($read, $qual, $qcStatus, $lenFltrStatus, $primStatus,
+#               $homoStatus, $hqStatus, $basesAfterHQ, $validBases),
+# i.e. indices 0..8 in @fReadQC / @rReadQC below.
+sub processSeq {
+    $fastaSeq =~ s/\s//g;
+    # NOTE(review): splitPEReads returns an empty list on a linker problem;
+    # $readStatus is then undefined and compares equal to 0 (paired) below.
+    my ($readStatus, $fRead, $fQual, $linkerCount, $lQual, $rRead, $rQual) = splitPEReads($fastaSeq, $qualSeq, $prevFastaSeqId);
+    my $len = length $fastaSeq;
+##########Calculating statistics for Input Data
+    $maxRawLen = max($maxRawLen, $len);
+    $minRawLen = min($minRawLen, $len);
+    push(@rawLen, $len);
+    $qualSeq =~ s/\s+$//; # To remove the last space added in 'else' part;
+    my @tmpArr = getQualBases($qualSeq); # (total bases, HQ bases, average quality)
+    $totalBases += $tmpArr[0];
+    $totalHQBases += $tmpArr[1];
+    $avgQual += $tmpArr[2];
+    $lenDistrib[0][getIndex($len,$lenInterval)]++;
+    $qualDistrib[0][getIndex($tmpArr[2],$qualInterval)]++;
+    # s/X/X/gi returns the number of replacements, i.e. the base count
+    # (it also upper-cases lower-case occurrences of that base).
+    my $As = $fastaSeq =~ s/A/A/gi;
+    my $Ts = $fastaSeq =~ s/T/T/gi;
+    my $Gs = $fastaSeq =~ s/G/G/gi;
+    my $Cs = $fastaSeq =~ s/C/C/gi;
+    my $Ns = $len - $As - $Ts - $Gs - $Cs;
+    # Guard against an empty read, same as for the output statistics below.
+    my $gcPercent = ($len)?(($Gs + $Cs)/$len*100):0;
+    $gcDistrib[0][getIndex($gcPercent,$gcInterval)]++;
+    $charCount[0][0] += $As;
+    $charCount[0][1] += $Ts;
+    $charCount[0][2] += $Gs;
+    $charCount[0][3] += $Cs;
+    $charCount[0][4] += $Ns;
+#########Done calculating stat
+    my ($readOut, $qualOut);
+    my $outReadType = 0; # 0: Paired, 1: Unpaired
+    if($readStatus == 0) {
+        # Paired read: QC both parts; index 2 of every *PE counter holds the
+        # "both parts" figure.
+        my @fReadQC = doQC($fRead, $fQual);
+        my @rReadQC = doQC($rRead, $rQual);
+        my $fLen = length $fRead;
+        my $rLen = length $rRead;
+        $seqCountPE++;
+        $lt100PE[0]++ if($fReadQC[3] == 0);
+        $lt100PE[1]++ if($rReadQC[3] == 0);
+        $lt100PE[2]++ if($fReadQC[3] == 0 && $rReadQC[3] == 0);
+        $totalValidReadsWithPriAdaPE[0]++ if($fReadQC[4] == 1);
+        $totalValidReadsWithPriAdaPE[1]++ if($rReadQC[4] == 1);
+        $totalValidReadsWithPriAdaPE[2]++ if($fReadQC[4] == 1 && $rReadQC[4] == 1);
+        $trimCountPE[0]++ if($fReadQC[5] == 1);
+        $trimCountPE[1]++ if($rReadQC[5] == 1);
+        $trimCountPE[2]++ if($fReadQC[5] == 1 && $rReadQC[5] == 1);
+        $lQCountPE[0]++ if($fReadQC[6] == 0);
+        $lQCountPE[1]++ if($rReadQC[6] == 0);
+        $lQCountPE[2]++ if($fReadQC[6] == 0 && $rReadQC[6] == 0);
+        $hQCountPE[0]++ if($fReadQC[6] == 1);
+        $hQCountPE[1]++ if($rReadQC[6] == 1);
+        $hQCountPE[2]++ if($fReadQC[6] == 1 && $rReadQC[6] == 1);
+        $totalBasesPE[0] += $fLen;
+        $totalBasesPE[1] += $rLen;
+        $totalBasesPE[2] += $fLen + $rLen;
+        $totalBasesAfterHQPE[0] += $fReadQC[7] if($fReadQC[6] == 1);
+        $totalBasesAfterHQPE[1] += $rReadQC[7] if($rReadQC[6] == 1);
+        $totalBasesAfterHQPE[2] += $fReadQC[7]+$rReadQC[7] if($fReadQC[6] == 1 && $rReadQC[6] == 1);
+        $totalHQBasesAfterHQPE[0] += $fReadQC[8] if($fReadQC[6] == 1);
+        $totalHQBasesAfterHQPE[1] += $rReadQC[8] if($rReadQC[6] == 1);
+        $totalHQBasesAfterHQPE[2] += $fReadQC[8]+$rReadQC[8] if($fReadQC[6] == 1 && $rReadQC[6] == 1);
+        $totalReadsFinalPE[0]++ if($fReadQC[2] == 1);
+        $totalReadsFinalPE[1]++ if($rReadQC[2] == 1);
+        $totalReadsFinalPE[2]++ if($fReadQC[2] == 1 && $rReadQC[2] == 1);
+        if($fReadQC[2] == 1 && $rReadQC[2] == 0) {
+            # Only the forward part passed: emitted as an unpaired read.
+            $totalReadsFinalUP[0]++;
+            $readOut = $fReadQC[0];
+            $qualOut = $fReadQC[1];
+            $outReadType = 1;
+        }
+        elsif($fReadQC[2] == 0 && $rReadQC[2] == 1) {
+            # Only the reverse part passed: emitted as an unpaired read.
+            # NOTE(review): increments the same slot [0] as the branch above - confirm intended.
+            $totalReadsFinalUP[0]++;
+            $readOut = $rReadQC[0];
+            $qualOut = $rReadQC[1];
+            $outReadType = 1;
+        }
+        elsif($fReadQC[2] && $rReadQC[2]) {
+            # Both parts passed: re-join them around the linker.
+            $readOut = $fReadQC[0] . ($linker x $linkerCount) . $rReadQC[0];
+            $qualOut = $fReadQC[1] . " " . $lQual . " " . $rReadQC[1];
+        }
+    }
+    else {
+        # Read with a single part (forward only or reverse only).
+        my @fReadQC = doQC($fRead, $fQual);
+        my $fLen = length $fRead;
+        $seqCount++;
+        $lt100++ if($fReadQC[3] == 0);
+        $totalValidReadsWithPriAda++ if($fReadQC[4] == 1);
+        $trimCount++ if($fReadQC[5] == 1);
+        $lQCount++ if($fReadQC[6] == 0);
+        $hQCount++ if($fReadQC[6] == 1);
+        $totalBasesUPOri += $fLen;
+        $totalBasesAfterHQ += $fReadQC[7] if($fReadQC[6] == 1);
+        $totalHQBasesAfterHQ += $fReadQC[8] if($fReadQC[6] == 1);
+        $totalReadsFinal++ if($fReadQC[2] == 1);
+        if($fReadQC[2]) {
+            $readOut = $fReadQC[0];
+            $qualOut = $fReadQC[1];
+            $outReadType = 1;
+        }
+    }
+    if(defined($readOut)) {
+##########Calculating statistics for Output Data
+        $len = length $readOut;
+        $maxHQLen = max($maxHQLen, $len);
+        $minHQLen = min($minHQLen, $len);
+        push(@hQLen, $len);
+        $qualOut =~ s/\s+$//; # To remove the last space, if present;
+        my @tmpArr = getQualBases($qualOut);
+        $totalBasesFinal += $tmpArr[0];
+        $totalHQBasesFinal += $tmpArr[1];
+        $avgQualFinal += $tmpArr[2];
+        if(!defined($isOnlyStat)) {
+            $lenDistrib[1][getIndex($len,$lenInterval)]++;
+            $qualDistrib[1][getIndex($tmpArr[2],$qualInterval)]++;
+            my $As = $readOut =~ s/A/A/gi;
+            my $Ts = $readOut =~ s/T/T/gi;
+            my $Gs = $readOut =~ s/G/G/gi;
+            my $Cs = $readOut =~ s/C/C/gi;
+            my $Ns = $len - $As - $Ts - $Gs - $Cs;
+            my $gcPercent = ($len)?(($Gs + $Cs)/$len*100):0;
+            $gcDistrib[1][getIndex($gcPercent,$gcInterval)]++;
+            $charCount[1][0] += $As;
+            $charCount[1][1] += $Ts;
+            $charCount[1][2] += $Gs;
+            $charCount[1][3] += $Cs;
+            $charCount[1][4] += $Ns;
+            print OI "$prevFastaSeqId\n";
+            print OI formatSeq($readOut), "\n";
+            print OQ "$prevFastaSeqId\n";
+            print OQ formatQualSeq($qualOut), "\n";
+        }
+#########Done calculating stat
+    }
+    # Progress message after every 10000 processed reads.
+    if(($seqCount+$seqCountPE) % (10000) == 0) {
+        my $tmpP = sprintf "%0.0f", (($seqCount+$seqCountPE)/$ttlSeqCount*100);
+        print "$indOfAnalysis: Number of reads processed: " . ($seqCount+$seqCountPE) . "/$ttlSeqCount ($tmpP\%)...\n";
+    }
+    return;
+}
+
+# Converts a value into the zero-based index of its distribution bin.
+#   $_[0] - the value, $_[1] - the bin width.
+# With a bin width of 1 the value rounded by "%0.0f" is returned. Otherwise
+# value/width is rounded to two decimals and mapped to bin "ceil - 1",
+# never going below bin 0.
+sub getIndex {
+    my ($value, $binWidth) = @_;
+    if($binWidth == 1) {
+        return sprintf("%0.0f", $value);
+    }
+    my $ratio = sprintf("%0.2f", $value / $binWidth);
+    my $bin = int($ratio + 0.99) - 1;
+    return ($bin < 0) ? 0 : $bin;
+}
+
+# Returns the Nx value of a list of lengths.
+#   $_[0] - reference to an array of lengths
+#   $_[1] - x as a percentage (50 gives the N50)
+# The lengths are walked from largest to smallest; the result is the length at
+# which the running sum first reaches x percent of the total, or 0 for an
+# empty list.
+sub calcN50 {
+    my ($lengthsRef, $n) = @_;
+    my @sorted = sort { $b <=> $a } @{$lengthsRef};
+    my $total = sum(@sorted);
+    my $running = 0;
+    foreach my $value (@sorted) {
+        $running += $value;
+        return $value if($running >= $total*$n/100);
+    }
+    return 0;
+}
+
+# Returns the median of the numbers passed as arguments: the middle element of
+# the numerically sorted list, or the mean of the two middle elements when the
+# number of values is even.
+sub calcMedian {
+    my @sorted = sort { $a <=> $b } @_;
+    my $count = scalar @sorted;
+    my $mid = int($count / 2);
+    return $sorted[$mid] if($count % 2);
+    return ($sorted[$mid - 1] + $sorted[$mid]) / 2;
+}
+
+# Wraps a sequence into lines of 60 characters.
+#   $_[0] - the sequence string
+# Returns the wrapped text without a newline after the last line.
+sub formatSeq {
+    my $seq = $_[0];
+    my $lineWidth = 60;
+    my $total = length $seq;
+    my @lines;
+    my $offset = 0;
+    while($offset < $total) {
+        push(@lines, substr($seq, $offset, $lineWidth));
+        $offset += $lineWidth;
+    }
+    my $wrapped = join("", map { $_ . "\n" } @lines);
+    chomp($wrapped); # drop the newline that follows the last line
+    return $wrapped;
+}
+
+# Lays out whitespace-separated quality values with 60 values per line.
+#   $_[0] - the quality string
+# Values on a line are separated by a single space; trailing whitespace of the
+# result is removed.
+sub formatQualSeq {
+    my $qualSeq = $_[0];
+    my $perLine = 60;
+    my @values = split(/\s+/, $qualSeq);
+    my $formatted = "";
+    foreach my $idx (0 .. $#values) {
+        # every 60th value ends its line, all others are followed by a space
+        my $separator = (($idx + 1) % $perLine == 0) ? "\n" : " ";
+        $formatted .= $values[$idx] . $separator;
+    }
+    $formatted =~ s/\s+$//;
+    return $formatted;
+}
+
+# Trims homopolymers from the right end of a read, in place.
+#   $_[0] - reference to the sequence string
+# For each of A, T, G and C in turn (case-insensitive) the first run of at
+# least $homoPolyLen (file-level setting) identical bases is removed together
+# with everything that follows it.
+# Returns 1 if anything was removed, otherwise 0.
+sub hasPolyChar {
+    my $seqRef = $_[0];
+    my $trimmed = 0;
+    foreach my $base ("A", "T", "G", "C") {
+        my $runToEnd = $base . "{" . $homoPolyLen . ",}.*";
+        $trimmed = 1 if($$seqRef =~ s/$runToEnd//i);
+    }
+    return $trimmed;
+}
+
+# Cuts a quality string down to the length of its already trimmed read.
+#   $_[0] - whitespace-separated quality values
+#   $_[1] - number of values to keep (length of the trimmed read)
+#   $_[2] - start position of the primer/adaptor hit, or -1 when the read was
+#           trimmed for another reason
+# For a hit position of 0..49 the LAST $_[1] values are kept, in every other
+# case the FIRST $_[1] values.
+# NOTE(review): presumably a hit within the first 50 bases means the left end
+# of the read was removed - confirm against isWOPriAda.
+# Returns the kept values joined by single spaces; "" when nothing is to be
+# kept. If more values are requested than exist, all values are returned.
+# The values are selected by position rather than through $1 of an unchecked
+# pattern match, so a failed match can no longer return a stale capture.
+sub trimQualSeq {
+    my ($qualSeq, $seqLen, $priAdaStart) = @_;
+    my @values = split(' ', defined($qualSeq) ? $qualSeq : "");
+    return "" if(!defined($seqLen) || $seqLen <= 0 || !@values);
+    $seqLen = scalar(@values) if($seqLen > @values);
+    my @kept;
+    if($priAdaStart != -1 && $priAdaStart < 50) {
+        @kept = @values[(@values - $seqLen) .. $#values];
+    }
+    else {
+        @kept = @values[0 .. ($seqLen - 1)];
+    }
+    return join(" ", @kept);
+}
+
+# Decides whether a read is of high quality.
+#   $_[0] - whitespace-separated quality values of the read
+# A read passes when the number of values >= $cutOffPhScore reaches
+# $cutOffReadLen4HQ percent of the read length (rounded with "%0.0f");
+# both thresholds are file-level settings.
+# Returns the number of passing values when the read passes, otherwise 0.
+sub isReadOfHQ {
+    my $qualString = $_[0];
+    my @scores = split(/\s+/, $qualString);
+    my $required = sprintf("%0.0f", scalar(@scores) * $cutOffReadLen4HQ / 100);
+    my $passing = grep { $_ >= $cutOffPhScore } @scores;
+    return ($passing >= $required) ? $passing : 0;
+}
+
+# Summarises the quality values of one read.
+#   $_[0] - whitespace-separated quality values
+# Returns an array of three elements:
+#   [0] number of values, [1] number of values >= $cutOffPhScore (file-level
+#   setting), [2] average quality formatted with "%0.2f" (0 for no values).
+sub getQualBases {
+    my $qualString = $_[0];
+    my @scores = split(/\s+/, $qualString);
+    my $count = scalar @scores;
+    my ($scoreSum, $hqCount) = (0, 0);
+    for my $score (@scores) {
+        $scoreSum += $score;
+        $hqCount++ if($score >= $cutOffPhScore);
+    }
+    my $average = ($count) ? (sprintf "%0.2f", $scoreSum/$count) : 0;
+    my @summary = ($count, $hqCount, $average);
+    return @summary;
+}
+
+# Prints a boxed error message to STDERR, then the usage text, and terminates
+# the program.
+#   $_[0] - the error message
+# Exits with status 1 so that calling shells and pipelines can detect the
+# failure (a bare "exit" would report success).
+sub prtError {
+    my $msg = $_[0];
+    print STDERR "+======================================================================+\n";
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR "+======================================================================+\n";
+    prtUsage();
+    exit(1);
+}
+
+# Prints the option summary of the program to STDOUT. The list of built-in
+# primer/adaptor libraries is taken from the file-level @priAdaLibNames and
+# numbered from 1.
+sub prtHelp {
+    print "\n$0 options:\n\n";
+    # Each list element is printed by its own print call.
+    print for (
+        "### Input reads (FASTA format; .fna and .qual files) (Required)\n",
+        " -i <Read file> <Quality file> <Primer/Adaptor library>\n",
+        " Read and quality file in FASTA format with primer/adaptor library\n",
+        " User may choose from the provided primer/adaptor library or can give a file containing primer/adaptor sequences, one per line\n",
+        " Multiple libraries can be given using multiple '-i' options\n",
+        " For eg.: -i read1.fna read1.qual 3 -i read2.fna read2.qual 2\n\n",
+        " Primer/Adaptor libraries:\n",
+    );
+    for my $idx (0 .. $#priAdaLibNames) {
+        print " " . ($idx + 1) . " = $priAdaLibNames[$idx]\n";
+    }
+    print for (
+        " N = Do not filter for Primer/Adaptor\n",
+        " <File> = File for user defined primer/adaptor sequences, one per line\n",
+        "\n",
+        "### Other options [Optional]\n",
+        " -h | -help\n",
+        " Prints this help\n",
+        "--------------------------------- QC Options ---------------------------------\n",
+        " -l | -cutOffReadLen4HQ <Real number, 0 to 100>\n",
+        " The cut-off value for percentage of read length that should be of given quality\n",
+        " default: 70\n",
+        " -s | -cutOffQualScore <Integer, 0 to 40>\n",
+        " The cut-off value for PHRED quality score for high-quality filtering\n",
+        " default: 20\n",
+        " -n | -homoPolyLen <Integer>\n",
+        " Minimum length of the homopolymer to be trimmed (0: to skip the homopolymer trimming)\n",
+        " For eg.: -n 8, will trim the right end of read from the homopolymer of at least 8 bases long\n",
+        " default: 0 (homopolymer trimming is off)\n",
+        " -m | -minLen <Integer>\n",
+        " Filter sequences shorter than the given minimum length\n",
+        " default: 100\n",
+        " -f | -lenFilter <Y/N>\n",
+        " Are sequences to be filtered on the basis of length: (Y)es or (N)o\n",
+        " default: Y\n",
+        " -linker <Linker Sequence>\n",
+        " Linker sequence used while preparing the paired-end library for sequencing using Roche 454\n",
+        " default: GTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC\n",
+        "----------------------------- Processing Options -----------------------------\n",
+        " -p | -processes <Integer>\n",
+        " Number of processes to be used\n",
+        " default: 1\n",
+        " -onlyStat\n",
+        " Outputs only statistics without filtered data output\n",
+        "------------------------------- Output Options -------------------------------\n",
+        " -t | -statOutFmt <Integer>\n",
+        " Output format for statistics\n",
+        " Formats:\n",
+        " 1 = formatted text\n",
+        " 2 = tab delimited\n",
+        " default: 1\n",
+        " -o | -outputFolder <Output folder name/path>\n",
+        " Output will be stored in the given folder\n",
+        " default: By default, output folder (454QC_Filtered_files) will be generated where the input files are\n",
+        " -z | -outputDataCompression <Character>\n",
+        " Output format for HQ filtered data\n",
+        " Formats:\n",
+        " t = text FASTA files\n",
+        " g = gzip compressed files\n",
+        " default: t\n",
+        "\n",
+    );
+}
+
+# Prints the one-line usage banner followed by the full option help.
+sub prtUsage {
+    my $banner = "\nUsage: perl " . $0 . " <options>\n";
+    print $banner;
+    prtHelp();
+}
+
+# Returns the last component of a path, i.e. the file name without its
+# directory part.
+#   $_[0] - path of a file, with "/" as separator
+# Returns "" when the path has no trailing name component (for example an
+# empty string or a path ending in "/"). The capture is read from the match
+# itself, so an unrelated earlier match can never leak into the result.
+sub getFileName {
+    my $path = $_[0];
+    my ($name) = $path =~ /([^\/]+)$/;
+    return defined($name) ? $name : "";
+}
+
+# Returns the directory part of a path, always ending in "/".
+#   $_[0] - path of a file, with "/" as separator
+# A name without any "/" yields "./"; a file directly under the root
+# ("/file") yields "/".
+sub getFilePath {
+    my $name = $_[0];
+    # Greedy match up to the last "/"; the part before it may be empty, which
+    # covers root-level paths without relying on a stale $1.
+    if($name =~ /(.*)\//) {
+        return $1 . "/";
+    }
+    return "./";
+}
+
+
+
+
+# Draws the base composition bar chart and writes it as a PNG file.
+#   $_[0] - data reference handed to GD::Graph's plot(); element [1] holds the
+#           A, T, G, C and non-ATGC counts of the input data, element [2] the
+#           same counts for the filtered data
+#   $_[1] - path of the PNG file to create
+#   $_[2] - file name used in the title and the legend
+#   $_[3], $_[4] - width and height of the image
+# Uses the file-level $isOnlyStat, $f and $font_spec.
+# Below the chart one row of "<base> (<percent>%)" labels is drawn per data
+# series; a series whose counts sum to 0 is shown as 0.00% instead of dividing
+# by zero. If the file cannot be created an error is printed to STDERR and
+# nothing is written.
+sub drawBaseComp {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        y_label => 'Count',
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        l_margin => 60,
+        r_margin => 60,
+        b_margin => 50,
+        t_margin => 50,
+        show_values => 1,
+        bar_spacing => 1,
+        values_vertical => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+    $mygraph->set_values_font($f, 6);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    my $black = $myImage->colorAllocate(0,0,0); # To set the color for the next time printing on the image.
+    my $lred = $myImage->colorAllocate(255,0,0);
+    my $dgreen = $myImage->colorAllocate(0,127,0);
+    my $dblue = $myImage->colorAllocate(0,0,127);
+
+    my $wrapbox = GD::Text::Wrap->new( $myImage,
+        line_space => 4,
+        text => "Base composition for $fileName",
+        color => $dblue,
+    );
+
+    $wrapbox->set(align => 'center', width => $width);
+    $wrapbox->set_font($f, 11);
+    $wrapbox->draw(0,0);
+
+    my @baseLabels = ("A", "T", "G", "C", "Non-ATGC");
+    # One legend row per data series: [index into $dataRef, y position, box colour].
+    my @rows = ([1, $height-35, $lred]);
+    push(@rows, [2, $height-20, $dgreen]) if(!$isOnlyStat);
+
+    foreach my $row (@rows) {
+        my ($series, $rowY, $boxColor) = @{$row};
+        my @counts = @{$$dataRef[$series]};
+        my $total = sum(@counts);
+
+        # Labels first, colour boxes afterwards; columns are 90 pixels apart.
+        foreach my $i (0 .. $#baseLabels) {
+            my $percent = ($total) ? ($counts[$i]/$total*100) : 0;
+            $wrapbox = GD::Text::Wrap->new( $myImage,
+                line_space => 4,
+                text => "$baseLabels[$i] (" . (sprintf "%0.2f", $percent) . "\%)",
+                color => $black,
+            );
+            $wrapbox->set(align => 'left', width => 300);
+            $wrapbox->set_font($f, 8);
+            $wrapbox->draw($width/2-220+90*$i, $rowY);
+        }
+        foreach my $i (0 .. $#baseLabels) {
+            my $startRectX = $width/2-230+90*$i;
+            $myImage->filledRectangle($startRectX,$rowY,$startRectX+8,$rowY+8,$boxColor);
+            $myImage->rectangle($startRectX,$rowY,$startRectX+8,$rowY+8,$black);
+        }
+    }
+
+    my $imgFh;
+    if(!open($imgFh, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFh;
+    print $imgFh $myImage->png;
+    close($imgFh);
+}
+
+
+# Draws the GC content distribution as a lines-and-points chart and writes it
+# as a PNG file.
+#   $_[0] - data reference handed to GD::Graph's plot()
+#   $_[1] - path of the PNG file to create
+#   $_[2] - file name used in the title and the legend
+#   $_[3], $_[4] - width and height of the image
+# Uses the file-level $isOnlyStat, $f and $font_spec.
+# If the file cannot be created an error is printed to STDERR and nothing is
+# written.
+sub drawGCDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::linespoints->new($width, $height);
+
+    $mygraph->set(
+        x_label => '% GC content',
+        y_label => 'Number of reads',
+        title => "GC content distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        markers => [1],
+        marker_size => 3,
+        dclrs => [ qw(lred dgreen) ],
+        x_labels_vertical => 1,
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    my $imgFh;
+    if(!open($imgFh, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFh;
+    print $imgFh $myImage->png;
+    close($imgFh);
+}
+
+# Draws the QC summary pie chart for paired reads and writes it as a PNG file.
+#   $_[0] - data reference handed to GD::Graph's plot(); element [1] holds six
+#           counts that the legend below labels, by index, as: 0 shorter than
+#           the minimum length, 1 homopolymer trimmed, 2 primer/adaptor
+#           trimmed, 3 other high quality reads, 4 low quality, 5 unpaired
+#   $_[1] - path of the PNG file to create
+#   $_[2], $_[3] - width and height; the image is 15 pixels higher than $_[3]
+#           to make room for the sixth legend line
+# Uses the file-level $lowestValidLen and $f.
+# When the counts sum to 0 every share is shown as 0.00% instead of dividing
+# by zero. If the file cannot be created an error is printed to STDERR and
+# nothing is written.
+sub drawSummaryPiePE {
+    my ($dataRef, $fileNameWPath, $width, $height) = @_;
+    my $mygraph = GD::Graph::pie->new($width, $height+15);
+
+    $mygraph->set(
+        title => "Summary of quality check and filtering of Paired reads",
+        axislabelclr => 'black',
+        pie_height => 40,
+
+        l_margin => 15,
+        r_margin => 15,
+        b_margin => 70,
+        start_angle => -10,
+        dclrs => [ qw(lred cyan lyellow lgreen purple dblue) ],
+        transparent => 0,
+    ) or warn $mygraph->error;
+
+    $mygraph->set_label_font($f, 8);
+    $mygraph->set_value_font(['verdana', 'arial'],14);
+    $mygraph->set_title_font($f, 11);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    my $black = $myImage->colorAllocate(0,0,0); # To set the color for the next time printing on the image.
+    my $lred = $myImage->colorAllocate(255,0,0);
+    my $lyellow = $myImage->colorAllocate(255,255,0);
+    my $lgreen = $myImage->colorAllocate(0,255,0);
+    my $cyan = $myImage->colorAllocate(0,255,255);
+    my $purple = $myImage->colorAllocate(191,0,191);
+    my $dblue = $myImage->colorAllocate(0,0,127);
+
+    my $sum = sum(@{$$dataRef[1]});
+
+    my $leftX = 20;
+    my $rightX = $width/2+40;
+    # Legend table: [label, index into $$dataRef[1], text x, text y, wrap width, box colour].
+    my @legend = (
+        ["Trashed reads (shorter than $lowestValidLen bp)", 0, $leftX, $height-45, 300, $lred],
+        ["Homopolymer trimmed reads", 1, $rightX, $height-45, 300, $cyan],
+        ["Trashed reads (low quality reads)", 4, $leftX, $height-30, 300, $purple],
+        ["Primer/Adaptor trimmed reads", 2, $rightX, $height-30, 300, $lyellow],
+        ["High quality reads other than homopolymer and primer/adaptor trimmed", 3, $leftX, $height-15, 500, $lgreen],
+        ["Unpaired reads (one of the paired reads which passed QC)", 5, $leftX, $height, 500, $dblue],
+    );
+
+    # All labels first, then all colour boxes (each box sits 10 pixels left of its label).
+    foreach my $entry (@legend) {
+        my ($label, $idx, $textX, $textY, $wrapWidth) = @{$entry};
+        my $percent = ($sum) ? ($$dataRef[1][$idx]/$sum*100) : 0;
+        my $wrapbox = GD::Text::Wrap->new( $myImage,
+            line_space => 4,
+            text => (sprintf "%s (%0.2f", $label, $percent) . "\%)",
+            color => $black,
+        );
+        $wrapbox->set(align => 'left', width => $wrapWidth);
+        $wrapbox->set_font($f, 8);
+        $wrapbox->draw($textX, $textY);
+    }
+    foreach my $entry (@legend) {
+        my ($textX, $textY, $boxColor) = @{$entry}[2, 3, 5];
+        my $startRectX = $textX-10;
+        $myImage->filledRectangle($startRectX,$textY,$startRectX+8,$textY+8,$boxColor);
+        $myImage->rectangle($startRectX,$textY,$startRectX+8,$textY+8,$black);
+    }
+
+    my $imgFh;
+    if(!open($imgFh, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFh;
+    print $imgFh $myImage->png;
+    close($imgFh);
+
+}
+
+# Draws the QC summary pie chart for unpaired reads and writes it as a PNG file.
+#   $_[0] - data reference handed to GD::Graph's plot(); element [1] holds five
+#           counts that the legend below labels, by index, as: 0 shorter than
+#           the minimum length, 1 homopolymer trimmed, 2 primer/adaptor
+#           trimmed, 3 other high quality reads, 4 low quality
+#   $_[1] - path of the PNG file to create
+#   $_[2], $_[3] - width and height of the image
+# Uses the file-level $lowestValidLen and $f.
+# When the counts sum to 0 every share is shown as 0.00% instead of dividing
+# by zero. If the file cannot be created an error is printed to STDERR and
+# nothing is written.
+sub drawSummaryPie {
+    my ($dataRef, $fileNameWPath, $width, $height) = @_;
+    my $mygraph = GD::Graph::pie->new($width, $height);
+
+    $mygraph->set(
+        title => "Summary of quality check and filtering of Unpaired (UPOri) reads",
+        axislabelclr => 'black',
+        pie_height => 40,
+
+        l_margin => 15,
+        r_margin => 15,
+        b_margin => 50,
+        start_angle => -10,
+        dclrs => [ qw(lred cyan lyellow lgreen purple) ],
+        transparent => 0,
+    ) or warn $mygraph->error;
+
+    $mygraph->set_label_font($f, 8);
+    $mygraph->set_value_font(['verdana', 'arial'],14);
+    $mygraph->set_title_font($f, 11);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    my $black = $myImage->colorAllocate(0,0,0); # To set the color for the next time printing on the image.
+    my $lred = $myImage->colorAllocate(255,0,0);
+    my $lyellow = $myImage->colorAllocate(255,255,0);
+    my $lgreen = $myImage->colorAllocate(0,255,0);
+    my $cyan = $myImage->colorAllocate(0,255,255);
+    my $purple = $myImage->colorAllocate(191,0,191);
+
+    my $sum = sum(@{$$dataRef[1]});
+
+    my $leftX = 20;
+    my $rightX = $width/2+40;
+    # Legend table: [label, index into $$dataRef[1], text x, text y, wrap width, box colour].
+    my @legend = (
+        ["Trashed reads (shorter than $lowestValidLen bp)", 0, $leftX, $height-45, 300, $lred],
+        ["Homopolymer trimmed reads", 1, $rightX, $height-45, 300, $cyan],
+        ["Trashed reads (low quality reads)", 4, $leftX, $height-30, 300, $purple],
+        ["Primer/Adaptor trimmed reads", 2, $rightX, $height-30, 300, $lyellow],
+        ["High quality reads other than homopolymer and primer/adaptor trimmed", 3, $leftX, $height-15, 500, $lgreen],
+    );
+
+    # All labels first, then all colour boxes (each box sits 10 pixels left of its label).
+    foreach my $entry (@legend) {
+        my ($label, $idx, $textX, $textY, $wrapWidth) = @{$entry};
+        my $percent = ($sum) ? ($$dataRef[1][$idx]/$sum*100) : 0;
+        my $wrapbox = GD::Text::Wrap->new( $myImage,
+            line_space => 4,
+            text => (sprintf "%s (%0.2f", $label, $percent) . "\%)",
+            color => $black,
+        );
+        $wrapbox->set(align => 'left', width => $wrapWidth);
+        $wrapbox->set_font($f, 8);
+        $wrapbox->draw($textX, $textY);
+    }
+    foreach my $entry (@legend) {
+        my ($textX, $textY, $boxColor) = @{$entry}[2, 3, 5];
+        my $startRectX = $textX-10;
+        $myImage->filledRectangle($startRectX,$textY,$startRectX+8,$textY+8,$boxColor);
+        $myImage->rectangle($startRectX,$textY,$startRectX+8,$textY+8,$black);
+    }
+
+    my $imgFh;
+    if(!open($imgFh, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFh;
+    print $imgFh $myImage->png;
+    close($imgFh);
+
+}
+
+# Draws the average quality distribution as a bar chart and writes it as a
+# PNG file.
+#   $_[0] - data reference handed to GD::Graph's plot()
+#   $_[1] - path of the PNG file to create
+#   $_[2] - file name used in the title and the legend
+#   $_[3], $_[4] - width and height of the image
+# Uses the file-level $isOnlyStat, $f and $font_spec.
+# If the file cannot be created an error is printed to STDERR and nothing is
+# written.
+sub drawQualDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        x_label => 'Average phred quality score',
+        y_label => 'Number of reads',
+        title => "Quality distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        bar_spacing => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    my $imgFh;
+    if(!open($imgFh, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFh;
+    print $imgFh $myImage->png;
+    close($imgFh);
+}
+
+# Draw the read-length distribution as a bar chart and save it as a PNG.
+#
+# Arguments (positional):
+#   1. reference to the GD::Graph data set, handed unchanged to plot()
+#   2. path of the PNG file to create
+#   3. input file name, used in the chart title and the legend
+#   4. image width in pixels
+#   5. image height in pixels
+#
+# Reads the file-level $isOnlyStat (legend gets a second "_filtered" entry
+# only when it is undefined) and the font paths $font_spec and $f.
+#
+# Failures are reported and the sub returns without writing anything; the
+# caller's run continues (graphs are best-effort output).
+sub drawLenDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        x_label => 'Read length (bp)',
+        y_label => 'Number of reads',
+        title => "Length distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_labels_vertical => 1,
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        bar_spacing => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 9);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    # plot() returns undef on failure; calling ->png on that would die.
+    my $myImage = $mygraph->plot($dataRef);
+    if(!defined($myImage)) {
+        warn $mygraph->error;
+        return;
+    }
+
+    # Three-argument open with a lexical handle: the file name can never be
+    # read as a mode, and the handle cannot clash with another IMG handle.
+    my $imgFh;
+    if(!open($imgFh, '>', $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;     # nothing to write to; do not print to a closed handle
+    }
+    binmode $imgFh;
+    print {$imgFh} $myImage->png;
+    close($imgFh);
+    return;
+}
+
+
+# Look for a primer/adaptor in the read and cut it away in place.
+#
+# Argument: reference to the read sequence (it is chomped and, on a hit,
+# overwritten with the trimmed sequence).
+#
+# The candidate list is the user-supplied one when the file-level
+# $priAdaLib is "u", otherwise the built-in library indexed by $priAdaLib.
+# Candidates are tried in order and the first hit wins:
+#   * hit position below 50  -> keep what follows position + $substrlen
+#   * otherwise              -> keep what precedes the hit position
+#
+# Returns the position reported by findSeq() for the hit, or -1 when no
+# candidate matched.
+sub isWOPriAda {
+    my ($seqRef) = @_;
+    chomp($$seqRef);
+
+    my @rapid = (
+        "CCATCTCATCCCTGCGTGTC",
+        "CCATCTCATCCCTGCGTGTCTCCGACTCAG",
+        "CTGAGTCGGAGA",
+        "CCTATCCCCTGTGTGCCTTG",
+        "CCTATCCCCTGTGTGCCTTGGCAGTCTCAG",
+        "CTGAGACTGCCA",
+    );
+
+    my @arrPE = (
+        "GCCTCCCTCGCGCCATCAG",
+        "CTGATGGCGCGAGGG",
+        "GCCTTGCCAGCCCGCTCAG",
+        "CTGAGCGGGCTGGCA",
+        "GCCTCCCTCGCGCCA",
+        "GCCTTGCCAGCCCGC",
+        "CCATCTCATCCCTGCGTGTC",
+        "CCTATCCCCTGTGTGCCTTG",
+    );
+
+    my @arrAmplicon = (
+        "CGTATCGCCTCCCTCGCGCCATCAG",
+        "CGTATCGCCTCCCTCGCGCCATCAG",
+        "CCATCTCATCCCTGCGTGTC",
+        "CCTATCCCCTGTGTGCCTTG",
+    );
+
+    my @arrsmRna = (
+        "GCCTCCCTCGCGCCATCAGTATCGTAGGCACCTGAGA",
+        "GCCTTGCCAGCCCGCTCAGTATTGATGGTGCCTACAG",
+        "CCATCTCATCCCTGCGTGTC",
+        "CCTATCCCCTGTGTGCCTTG",
+    );
+
+    my @libraries = (\@rapid, \@arrPE, \@arrAmplicon, \@arrsmRna);
+
+    # findSeq() records in this hash the primer fragments it has already
+    # tried without success, so identical fragments are not searched twice.
+    my %triedFragments = ();
+
+    my @candidates = ($priAdaLib eq "u")
+        ? @usrDefinedPriAda
+        : @{$libraries[$priAdaLib]};
+
+    foreach my $candidate (@candidates) {
+        my $hitPos = findSeq($candidate, $$seqRef, \%triedFragments);
+        next if(!$hitPos);
+
+        if($hitPos < 50) {
+            # Hit near the start: drop everything up to the end of the fragment.
+            my $keepFrom = $hitPos + $substrlen;
+            $$seqRef = substr($$seqRef, $keepFrom, length($$seqRef) - $keepFrom);
+        }
+        else {
+            # Hit near the end: drop the fragment and whatever follows it.
+            $$seqRef = substr($$seqRef, 0, $hitPos);
+        }
+        return $hitPos;
+    }
+
+    return -1;
+}
+
+# Search the first and the last 50 bases of a read for a primer/adaptor.
+#
+# Arguments (positional):
+#   1. primer/adaptor sequence
+#   2. read sequence
+#   3. reference to a hash of primer fragments already tried without success
+#      on this read; fragments found there are skipped, and fragments that
+#      fail here are added to it
+#
+# Only the leading and the trailing $substrlen bases of the primer are used
+# (the whole primer when it is shorter). Each fragment is matched with at
+# most one substitution (String::Approx, 'I0 D0 S1'); the exact offset is
+# then obtained from findStart().
+#
+# Returns the 0-based offset of the hit within the read, or 0 when nothing
+# matched. A hit at offset 0 is returned as the string "0E0": numerically
+# zero but true, so `if(findSeq(...))` still recognises it as a hit.
+# (Previously such a hit was indistinguishable from "not found", so a primer
+# sitting at the very start of a read was never trimmed.)
+sub findSeq {
+    my ($pri, $seq, $hashRef) = @_;
+
+    my $subsl = $substrlen;
+    $subsl = length $pri if(length($pri) < $substrlen);
+    my $spri = substr($pri, 0, $subsl);
+    my $epri = substr($pri, (length $pri) - $subsl, $subsl);
+
+    my $sseq = substr($seq, 0, 50);
+    my $tmpInd = (length $seq) - 50;
+    $tmpInd = 0 if($tmpInd < 0);
+    my $eseq = substr($seq, $tmpInd, 50);
+
+    # Each window is paired with the offset of its first base in the read.
+    # The tail window uses the clamped $tmpInd, so reads shorter than 50
+    # bases can no longer produce a negative position.
+    my @windows = ([$sseq, 0], [$eseq, $tmpInd]);
+
+    foreach my $fragment ($spri, $epri) {
+        next if(defined($$hashRef{$fragment}));
+        foreach my $window (@windows) {
+            my ($text, $offset) = @{$window};
+            my @catches = String::Approx::amatch($fragment, ['I0 D0 S1'], $text);
+            next if(@catches == 0);
+            my $start = findStart($text, $fragment);
+            # amatch and the fuzzy regex should agree; if they do not,
+            # treat this window as a miss instead of doing undef arithmetic.
+            next if(!defined($start) || $start eq "");
+            my $pos = $start + $offset;
+            return ($pos == 0) ? "0E0" : $pos;
+        }
+        $$hashRef{$fragment} = 1;
+    }
+    return 0;
+}
+
+use re qw(eval);
+use vars qw($matchStart);
+
+# Return the 0-based offset of the first approximate occurrence of a primer
+# fragment in a string.
+#
+# Arguments (positional):
+#   1. text to search
+#   2. primer fragment; it is expanded by fuzzy_pattern() into a regex that
+#      tolerates up to $mismLim (file-level) mismatches
+#
+# Returns the offset of the leftmost match, or nothing (undef in scalar
+# context) when the text does not match.
+#
+# The earlier version wrapped the match in a `while //g` loop that returned
+# on its first iteration, leaving the statements after the `return`
+# unreachable; only the first match was ever reported, which is what this
+# version does directly. $-[0] is the start offset of the last successful
+# match, the same value the embedded (?{ ... pos() }) code block recorded.
+sub findStart {
+    my ($text, $pattern) = @_;
+    my $fuzzy = fuzzy_pattern($pattern, $mismLim);
+    if($text =~ /$fuzzy/) {
+        return $-[0];
+    }
+    return;
+}
+
+# Compile a regex that matches the given pattern with up to the given number
+# of mismatching positions.
+#
+# Arguments: the literal pattern and the number of mismatches allowed.
+# Dies when the number of mismatches is negative.
+# Returns a compiled (qr//) regex built from make_approximate()'s output.
+sub fuzzy_pattern {
+    my ($literal, $allowed) = @_;
+    if(!($allowed >= 0)) {
+        die "Number of mismatches must be greater than or equal to zero\n";
+    }
+    my $relaxed = make_approximate($literal, $allowed);
+    return qr/$relaxed/;
+}
+
+# Build the source text of a regex that matches $pattern while tolerating up
+# to $mismatches_allowed substituted bases.
+#
+#   * no mismatches left          -> the pattern itself
+#   * pattern no longer than the
+#     remaining mismatch budget   -> the pattern with every A/C/T/G replaced by "."
+#   * otherwise                   -> recurse on the first character: an A/C/G/T
+#                                    may either match literally or be spent as
+#                                    a mismatch ("."); any other character is
+#                                    kept as it is and costs nothing
+#
+# Returns the regex source as a plain string.
+sub make_approximate {
+    my ($pattern, $mismatches_allowed) = @_;
+
+    return $pattern if($mismatches_allowed == 0);
+
+    if(length($pattern) <= $mismatches_allowed) {
+        (my $wildcards = $pattern) =~ tr/ACTG/./;
+        return $wildcards;
+    }
+
+    my ($head, $tail) = ($pattern =~ /^(.)(.*)/);
+    my $tail_same_budget = make_approximate($tail, $mismatches_allowed);
+
+    # Characters other than A/C/G/T are passed through untouched.
+    return $head . $tail_same_budget unless($head =~ /[ACGT]/);
+
+    my $tail_after_miss = make_approximate($tail, $mismatches_allowed - 1);
+    return "(?:" . $head . $tail_same_budget . "|." . $tail_after_miss . ")";
+}
diff --git a/QC/454QC_PRLL.pl b/QC/454QC_PRLL.pl
new file mode 100644
index 0000000..7653f66
--- /dev/null
+++ b/QC/454QC_PRLL.pl
@@ -0,0 +1,2021 @@
+#! /usr/bin/perl
+
+use File::Basename;
+#BEGIN {
+# my ($tmp, $path) = fileparse($0);
+# push ( @INC,"$path/lib");
+# #use lib "$path";
+#}
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+use Getopt::Long;
+use Cwd qw(abs_path);
+use IO::Zlib;
+use FindBin qw($RealBin);
+use lib "$RealBin/lib";
+require "454html.pl";
+use threads('yield');
+use File::Path;
+use Thread::Queue;
+# Thread plumbing used by the per-file loop below: $DataQueue and
+# $ProcessingQueue are (re)created as Thread::Queue objects for every input,
+# and $thr holds the writer thread that drains $DataQueue.
+my $DataQueue;
+my $ProcessingQueue;
+my $thr;
+
+
+# Load the optional/bundled modules at run time so that a missing one can be
+# reported in plain words instead of a compile-time failure.
+eval {
+    require Parallel::ForkManager;
+    require String::Approx;
+    require GD::Graph::linespoints;
+    require GD::Graph::bars;
+    require GD::Graph::pie;
+    require GD::Text::Wrap;
+};
+# Copy $@ at once: it is a global and any later eval would overwrite it.
+my $moduleLoadError = $@;
+
+# 1 while the GD graphing modules are usable; 0 disables all graph output.
+my $isGDMod = 1;
+
+if($moduleLoadError) {
+    my $errorText = join("", $moduleLoadError);
+    if($errorText =~ /Parallel/) {
+        print "Error:\n\tCan not find 'lib' folder with this perl program\n"; #module 'Parallel::ForkManager'\n";
+        print "\tCopy the 'lib' folder, provided with the toolkit, to the directory where this perl program is and try again\n\n";
+        exit;
+    }
+    # Any of the GD::Graph::* / GD::Text::* modules may be the one that is
+    # missing; previously only GD::Graph::linespoints switched graphs off,
+    # so a failure in one of the others left $isGDMod set and the run died
+    # later while drawing.
+    elsif($errorText =~ /GD\/(?:Graph|Text)\//) {
+        print STDERR "Warning:\n\tCan not find module 'GD::Graph'\n";
+        print STDERR "\tGraphs for statistics will not be produced. \n\t\t\tOR \n\tInstall GD::Graph module and try again.\n\n";
+        $isGDMod = 0;
+    }
+    elsif($errorText =~ /String\/Approx/) {
+        print "Error:\n\tCan not find module 'String::Approx'\n";
+        print "\tInstall it and try again\n\n";
+        exit;
+    }
+    else {
+        # Unrecognised load failure: do not hide it.
+        print STDERR "Warning:\n\tProblem while loading modules: $errorText\n";
+    }
+}
+
+
+# Setting parameters
+# Defaults for the command-line options; GetOptions() below overrides them.
+my $lowestValidLen = 100;           # -m|minLen
+my @files = ();                     # -i: flat list, $noOfInp values per input
+my $noOfInp = 3;                    # values per -i: sequence file, quality file, primer/adaptor library
+my $helpAsked;                      # -h|help
+my $cutOffReadLen4HQ = 70;          # -l|cutOffReadLen4HQ, a percentage (0-100)
+my $cutOffPhScore = 20;             # -s|cutOffQualScore (0-40)
+my $outFolder = "";                 # -o|outputFolder; "" means "next to the input file"
+my $isOnlyStat;                     # -onlyStat; defined when given
+my $statOutFmt = 1;                 # -t|statOutFmt: 1 = aligned text, 2 = tab-delimited
+my $noOfProcesses = 1;              # -c|cpus
+my $homoPolyLen = 0;                # -n|homoPolyLen; 0 reports homopolymer trimming as "Off"
+my $priAdaLib;                      # primer/adaptor library of the input being processed
+my $isLenFilterOn = 1;              # -f|lenFilter; normalised to 0/1 below
+my @priAdaLibNames = ("Rapid Library (Standard)", "Paired End Library", "Amplicon PE Library", "Small RNA Library");
+my $priAdaFile;                     # path of a user-defined primer/adaptor file, if any
+my @usrDefinedPriAda = ();          # sequences read from $priAdaFile
+my $outputDataFmt = "t"; # t/T: Text; g/G: Gzip.
+
+GetOptions(
+    "i=s{$noOfInp}" => \@files,
+    "h|help" => \$helpAsked,
+    "l|cutOffReadLen4HQ=f" => \$cutOffReadLen4HQ,
+    "n|homoPolyLen=i" => \$homoPolyLen,
+    "o|outputFolder=s" => \$outFolder,
+    "z|outputDataCompression=s" => \$outputDataFmt,
+    "t|statOutFmt=i" => \$statOutFmt,
+    "onlyStat" => \$isOnlyStat,
+    "c|cpus=i" => \$noOfProcesses,
+    "s|cutOffQualScore=i" => \$cutOffPhScore,
+    "m|minLen=i" => \$lowestValidLen,
+    "f|lenFilter=s" => \$isLenFilterOn,
+    );
+if(defined($helpAsked)) {
+    prtUsage();
+    exit;
+}
+if(@files == 0) {
+    prtError("No input files are provided");
+}
+
+# Collapse every group of $noOfInp values given to -i into one
+# space-separated string ("<seq file> <qual file> <library>"); the per-file
+# loop splits it again.
+my @tempFiles = ();
+prtError("Missing inputs for option -i") if((scalar @files)%$noOfInp != 0);
+for(my $i=0; $i<@files; $i+=$noOfInp) {
+    my $str = "$files[$i] $files[$i+1] $files[$i+2]";
+    # A third value starting with "-" means the next option was swallowed.
+    if($files[$i+2] =~ /^-/) {
+        prtError("Missing inputs for option -i: at '-i $str'")
+    }
+    # A single digit selects a built-in library and must be 1..4.
+    if($files[$i+2] =~ /^\d$/) {
+        if($files[$i+2] < 1 || $files[$i+2] > 4) {
+            prtError("Incorrect option for Primer/Adaptor library: at '-i $str'");
+        }
+    }
+    push(@tempFiles, $str);
+}
+# (The archived copy of this script had "@files" mangled to " at files" here.)
+@files = @tempFiles;
+if($cutOffReadLen4HQ < 0 || $cutOffReadLen4HQ > 100) {
+    prtError("Incorrect value for -l|cutOffReadLen4HQ option: at '-l $cutOffReadLen4HQ'");
+}
+if($cutOffPhScore < 0 || $cutOffPhScore > 40) {
+    prtError("Incorrect value for -s|cutOffPhScore option: at '-s $cutOffPhScore'");
+}
+if($statOutFmt < 1 || $statOutFmt > 2) {
+    prtError("Incorrect value for -statOutFmt: at '-statOutFmt $statOutFmt'");
+}
+# Any value starting with N/n switches the length filter off.
+if($isLenFilterOn =~ /^N/i) {
+    $isLenFilterOn = 0;
+}
+else {
+    $isLenFilterOn = 1;
+}
+if($outputDataFmt !~ /^[tg]$/i) {
+    # The option is -z; the message used to name -f, which is the length filter.
+    prtError("Incorrect value for -z|outputDataCompression option: at '-z $outputDataFmt'");
+}
+
+#my $pm = new Parallel::ForkManager($noOfProcesses);
+
+
+# Number of reads in the current input: the count of lines starting with ">"
+# in the sequence file.
+my $seqCount = 0;
+my $substrlen = 20; # For removePriAda
+my $mismLim = 1; # For removePriAda
+
+# Per-input totals. They are reset at the top of the per-file loop and
+# accumulated by updateData() from the results of each processed chunk.
+my $trimCount = 0;
+my $lt100 = 0;
+my $hQCount = 0;
+my $lQCount = 0;
+my $maxRawLen = 0;
+my $minRawLen = 1000000000000;
+#my $avgRawLen = 0;
+my $maxHQLen = 0;
+my $minHQLen = 1000000000000;
+my $avgHQLen = 0;
+my @rawLen = ();
+my @hQLen = ();
+my $totalBases = 0;
+my $totalHQBases = 0;
+my $totalBasesAfterHQ = 0;
+my $totalHQBasesAfterHQ = 0;
+my $totalBasesFinal = 0;
+my $totalHQBasesFinal = 0;
+my $totalReadsFinal = 0;
+my $avgQual = 0;
+my $avgQualFinal = 0;
+my $totalValidReadsWithPriAda = 0;
+my $totalValidReadsNoPriAda = 0;
+# Distributions for the graphs; the *Interval values are the bin widths used
+# when the axis labels are built.
+my @lenDistrib = ();
+my $lenInterval = 40;
+my @qualDistrib = ();
+my $qualInterval = 1;
+my @gcDistrib = ();
+my $gcInterval = 5;
+my @charCount = ();
+
+
+# "c"-prefixed counterparts of the totals above; resetVariables() puts them
+# back to these initial values.
+# NOTE(review): presumably the per-chunk counters filled while one chunk is
+# processed -- confirm against passSeq, which is outside this section.
+my $cmaxRawLen = 0;
+my $cminRawLen = 1000000000000;
+my @crawLen = ();
+my $ctotalBases = 0;
+my $ctotalHQBases = 0;
+my $cavgQual = 0;
+my $clt100 = 0;
+my $ctrimCount = 0;
+my $chQCount = 0;
+my $ctotalBasesAfterHQ = 0;
+my $cmaxHQLen = 0;
+my $cminHQLen = 1000000000000;
+my $cavgHQLen = 0;
+my @chQLen = ();
+my $ctotalReadsFinal = 0;
+my $ctotalBasesFinal = 0;
+my $ctotalHQBasesFinal = 0;
+my $cavgQualFinal = 0;
+my $clQCount = 0;
+my $ctotalHQBasesAfterHQ = 0;
+my $ctotalValidReadsWithPriAda = 0;
+my $ctotalValidReadsNoPriAda = 0;
+my @clenDistrib = ();
+my @cqualDistrib = ();
+my @cgcDistrib = ();
+my @ccharCount = ();
+
+
+# Parser state while walking the sequence and quality files in step: the
+# header and accumulated body of the record being read, and the header of
+# the record before it.
+my $fastaSeqId = "";
+my $fastaSeq = "";
+my $qualSeqId = "";
+my $qualSeq = "";
+my $prevFastaSeqId = "";
+# 1-based index of the input being analysed; printed in progress messages.
+my $indOfAnalysis = 0;
+# Randomly named temporary folder (inside the output folder) for part files.
+my $uniqFolder = "";
+# Set to 1 once an input whose sequence or quality file ends in .gz is seen.
+my $isInpGzip = 0;
+
+# Reads collected for the chunk currently being assembled (ids, sequences,
+# qualities in parallel arrays).
+my @idArr = ();
+my @seqArr = ();
+my @qualArr = ();
+
+# Font files used for graph labels and titles.
+my $font_spec = getFilePath($0) . "lib/Fonts/Dustismo_Sans.ttf";
+my $f = getFilePath($0) . "lib/Fonts/LucidaSansDemiBold.ttf";
+
+
+#Temp
+my $c=0;
+
+# Division helpers for the statistics report. A zero denominator (e.g. no
+# read survived filtering) yields "0.00" instead of dying with "Illegal
+# division by zero" after all the processing has been done.
+my $safePercent = sub {
+    my ($num, $den) = @_;
+    return sprintf("%0.2f", ($den ? $num/$den*100 : 0));
+};
+my $safeRatio = sub {
+    my ($num, $den) = @_;
+    return sprintf("%0.2f", ($den ? $num/$den : 0));
+};
+
+# Main loop: one iteration per "-i <seq file> <qual file> <library>" input.
+# For each input it counts the reads, processes them in chunks (threads),
+# merges the filtered part files, writes the statistics file, draws the
+# graphs and produces the HTML report.
+foreach my $inpData (@files) {
+    $indOfAnalysis++;
+#my $pid = $pm->start and next;
+
+    # Reset the parser state and all per-input totals.
+    $fastaSeqId = "";
+    $fastaSeq = "";
+    $qualSeqId = "";
+    $qualSeq = "";
+    $seqCount = 0;
+
+    $trimCount = 0;
+    $lt100 = 0;
+    $hQCount = 0;
+    $lQCount = 0;
+    $maxRawLen = 0;
+    $minRawLen = 1000000000000;
+    $maxHQLen = 0;
+    $minHQLen = 1000000000000;
+    $avgHQLen = 0;
+    @rawLen = ();
+    @hQLen = ();
+    $totalBases = 0;
+    $totalHQBases = 0;
+    $totalBasesAfterHQ = 0;
+    $totalHQBasesAfterHQ = 0;
+    $totalBasesFinal = 0;
+    $totalHQBasesFinal = 0;
+    $totalReadsFinal = 0;
+    $avgQual = 0;
+    $avgQualFinal = 0;
+    $totalValidReadsWithPriAda = 0;
+    $totalValidReadsNoPriAda = 0;
+    @lenDistrib = ();
+    @qualDistrib = ();
+    @gcDistrib = ();
+    @charCount = ();
+
+    @idArr = ();
+    @seqArr = ();
+    @qualArr = ();
+
+    $inpData =~ s/\\([A-Za-z_\.])/\/$1/g; # To remove '\' from the path of windows file
+    my @iData = split(" ", $inpData);
+    my $seqFile = $iData[0];
+    my $qualFile = $iData[1];
+    if($seqFile =~ /\.gz$/i || $qualFile =~ /\.gz$/i) {
+        $isInpGzip = 1;
+    }
+    # Third value: "n" = no primer/adaptor removal, a digit = built-in
+    # library (converted to a 0-based index), anything else = path of a
+    # user-defined primer/adaptor file.
+    $priAdaLib = $iData[2];
+    print "Analysis has been started for \"$seqFile\": Index: $indOfAnalysis\n";
+    if($priAdaLib =~ /^n$/i) {
+        undef $priAdaLib;
+    }
+    elsif($priAdaLib =~ /^\d$/) {
+        $priAdaLib = $priAdaLib - 1;
+    }
+    else {
+        $priAdaFile = $priAdaLib;
+        $priAdaLib = "u";
+        open(my $priAdaFh, '<', $priAdaFile) or die "Can not open the user-defined primer/adapter file: $priAdaFile\n";
+        @usrDefinedPriAda = <$priAdaFh>;
+        close($priAdaFh);
+        # Strip all whitespace (line terminators included) from EVERY entry;
+        # the earlier loop bound ($i < $#usrDefinedPriAda) skipped the last
+        # line, leaving its newline inside the primer. Blank lines are
+        # dropped so that they are not searched for as empty primers.
+        s/\s+//g foreach (@usrDefinedPriAda);
+        @usrDefinedPriAda = grep { length($_) } @usrDefinedPriAda;
+    }
+    my ($seqFileName, $filePath) = fileparse($seqFile);
+    my ($qualFileName) = fileparse($qualFile);
+    # NOTE(review): $outFolder is only defaulted while it is still empty, so
+    # with several inputs and no -o every later input is written into the
+    # folder derived from the first one.
+    $outFolder = $filePath . "454QC_Filtered_files" if($outFolder eq "");
+    $outFolder .= "/" if($outFolder !~ /\/$/);
+    if(! -e $outFolder) {
+        mkdir($outFolder) or die "Can not create output folder: $outFolder\n";
+    }
+    my $outSeqFile = $outFolder . $seqFileName . "_filtered";
+    my $outQualFile = $outFolder . $qualFileName . "_filtered";
+    $outSeqFile .= ".gz" if($outputDataFmt =~ /g/i);
+    $outQualFile .= ".gz" if($outputDataFmt =~ /g/i);
+    my $statFile = $outFolder . $seqFileName . "_stat";
+
+    # Writer thread: every queued element ends in "s" (sequence output) or
+    # "q" (quality output); that marker is removed and the remainder is
+    # appended to the matching output file. An undef element ends the loop.
+    $DataQueue = Thread::Queue->new();
+    unlink($outSeqFile) if(-e $outSeqFile);
+    unlink($outQualFile) if(-e $outQualFile);
+    $thr = threads->create(sub {
+        while (my $DataElement = $DataQueue->dequeue()) {
+            $DataElement =~ s/([sq]$)//;
+            my $readType = $1;
+            my $outH;
+            openFileGetHandle($outSeqFile, "a", \$outH) if($readType eq "s");
+            openFileGetHandle($outQualFile, "a", \$outH) if($readType eq "q");
+            *OOO = $outH;
+            print OOO "$DataElement";
+            close(OOO);
+        }
+    });
+    $ProcessingQueue = Thread::Queue->new();
+
+    my $iH;
+    openFileGetHandle($seqFile, "r", \$iH);
+    *I = $iH;
+
+    # Pick a 5-digit random folder name that does not exist yet.
+    do {
+        $uniqFolder = "";
+        for(my $i=0; $i<5; $i++) {
+            $uniqFolder .= int(rand(10));
+        }
+        $uniqFolder = $outFolder . $uniqFolder;
+    }
+    while(-e $uniqFolder);
+    mkdir($uniqFolder) or die "Can not create folder for temporary files\n";
+    open(STAT, '>', $statFile) or die "Can not open file: $statFile\n";
+    # First pass: count the reads.
+    while(my $line = <I>) {
+        $seqCount++ if($line =~ /^>/);
+    }
+    close(I);
+
+
+    if($seqFile =~ /\.gz$/i || $qualFile =~ /\.gz$/i) {
+        # Gzipped input: a reader thread splits it into part files while
+        # this thread processes them.
+        my @fileNames = ($seqFile, $qualFile);
+        my $thRef = threads->create('readDivideGzip', @fileNames);
+        threading4Processing();
+        $thRef->join();
+        print "$indOfAnalysis: Number of reads processed: " . $seqCount . "/$seqCount (100\%)...\n";
+    }
+    else {
+        print "$indOfAnalysis: Number of reads processed: " . "0/$seqCount (0\%)...\n";
+        undef $iH;
+        openFileGetHandle($seqFile, "r", \$iH);
+        *I = $iH;
+        my $qH;
+        openFileGetHandle($qualFile, "r", \$qH);
+        *Q = $qH;
+
+        my @thArr = ();
+        my $noOfSeqPerThread = int(($seqCount-1)/$noOfProcesses); #20000;
+        # With no more reads than CPUs the division yields 0: no thread was
+        # ever started, yet the join loop below still ran (join on undef,
+        # modulus by zero). Use chunks of one read instead.
+        $noOfSeqPerThread = 1 if($noOfSeqPerThread < 1 && $seqCount > 0);
+        my $roughSeqCounter = 0;
+        my $sCounter = 0;
+        my $jobCounter = 0;
+        my $ttlJobCounter = 0;
+        my $fileEOF = 0;
+
+        # Each pass of the outer loop starts up to $noOfProcesses threads,
+        # one per chunk of $noOfSeqPerThread reads, then joins them. A read
+        # is complete only when the NEXT header is seen, so the last read of
+        # the file is handled after the loop.
+        while(1) {
+            $jobCounter = 0;
+            OUTERLOOP:
+            for(my $i=0; $i<$noOfProcesses; $i++) {
+                $jobCounter++;
+                $ttlJobCounter++;
+                for(my $j=0; $j<$noOfSeqPerThread;) {
+                    my $line = <I>;
+                    chomp $line;
+                    my $qualLine = <Q>;
+                    chomp($qualLine);
+                    if($line =~ /^>/) {
+                        $sCounter++;
+                        $prevFastaSeqId = $fastaSeqId;
+                        $fastaSeqId = $line;
+                        $qualSeqId = $qualLine;
+                        if($fastaSeqId ne $qualSeqId) {
+                            # Report the read being compared, not the total read count.
+                            print "Error: Read Id doesn't match in sequence and quality file for read number $sCounter in sequence file.\n";
+                            exit(-1);
+                        }
+                        if($fastaSeq ne "") {
+                            push(@idArr, $prevFastaSeqId);
+                            push(@seqArr, $fastaSeq);
+                            push(@qualArr, $qualSeq);
+                            $j++;
+                            if($j == $noOfSeqPerThread) {
+                                my $id = sprintf "%05s", $ttlJobCounter;
+                                my @refArr = (\@idArr, \@seqArr, \@qualArr, $id);
+                                $thArr[$i] = threads->create('passSeq', @refArr);
+                                @idArr = ();
+                                @seqArr = ();
+                                @qualArr = ();
+                            }
+                        }
+                        $fastaSeq = "";
+                        $qualSeq = "";
+                        if($sCounter == $seqCount) {
+                            # Last header reached. If the current job has not
+                            # been handed to a thread ($j below the chunk
+                            # size) it must not be joined below; its pending
+                            # reads are processed together with the last read
+                            # after the loop. Testing $j instead of @idArr
+                            # also covers a job that has collected no read yet
+                            # (e.g. a single-read file).
+                            $jobCounter-- if($j < $noOfSeqPerThread);
+                            $ttlJobCounter-- if($j < $noOfSeqPerThread); #This is not commented because this is used to create part files.
+                            $fileEOF = 1;
+                            last OUTERLOOP;
+                        }
+                    }
+                    else {
+                        $fastaSeq .= $line;
+                        $qualSeq .= $qualLine . " ";
+                    }
+                }
+            }
+            for(my $i=0; $i<$jobCounter; $i++) {
+                my $refArr = $thArr[$i]->join;
+                # Called with "&" on purpose: updateData is declared further
+                # down the file and the ampersand form bypasses any prototype.
+                &updateData(@{$refArr});
+                $roughSeqCounter += $noOfSeqPerThread;
+                if($roughSeqCounter%$noOfSeqPerThread == 0) {
+                    my $tmpP = sprintf "%0.0f", ($roughSeqCounter/$seqCount*100);
+                    print "$indOfAnalysis: Number of reads processed: " . $roughSeqCounter . "/$seqCount ($tmpP\%)...\n";
+                }
+            }
+            last if($fileEOF);
+        }
+        # Remaining lines belong to the last read of the file.
+        while(my $line = <I>) {
+            my $qualLine = <Q>;
+            chomp $line;
+            chomp($qualLine);
+            $fastaSeq .= $line;
+            $qualSeq .= $qualLine . " ";
+        }
+        $ttlJobCounter++;
+        my $id = sprintf "%05s", $ttlJobCounter;
+        $prevFastaSeqId = $fastaSeqId;
+        push(@idArr, $prevFastaSeqId);
+        push(@seqArr, $fastaSeq);
+        push(@qualArr, $qualSeq);
+        my @refArr = (\@idArr, \@seqArr, \@qualArr, $id);
+        #my $thId = threads->create('passSeq', @refArr);
+        #my $refRefArr = $thId->join; #passSeq(@refArr);
+        my $refRefArr = passSeq(@refArr);
+        &updateData(@{$refRefArr});
+        close(I);
+        close(Q);
+        print "$indOfAnalysis: Number of reads processed: " . $seqCount . "/$seqCount (100\%)...\n";
+    }
+    if(!defined($isOnlyStat)) {
+        # Concatenate the part files (sorted by name) into the final
+        # filtered sequence and quality files.
+        my ($sHndl, $qHndl);
+        openFileGetHandle($outSeqFile, "w", \$sHndl);
+        openFileGetHandle($outQualFile, "w", \$qHndl);
+        *OOS = $sHndl;
+        *OOQ = $qHndl;
+        print "$indOfAnalysis: Printing filtered data...\n";
+        opendir(DIR, $uniqFolder);
+        my @partFiles = readdir(DIR);
+        @partFiles = sort @partFiles;
+        foreach my $pFile (@partFiles) {
+            next if($pFile =~ /\./);
+            my $npFile = "$uniqFolder/$pFile";
+            open(P, '<', $npFile) or die "Can not open part file\n";
+            while(<P>) {
+                print OOS if($pFile =~ /seq[^\n]+out/);
+                print OOQ if($pFile =~ /qual[^\n]+out/);
+            }
+            close(P);
+        }
+        closedir(DIR);
+        close(OOS);
+        close(OOQ);
+    }
+    print "$indOfAnalysis: Analysis completed\n";
+    print "$indOfAnalysis: Printing Statistics...\n";
+
+    if($statOutFmt == 1) {
+        # Aligned, human-readable report.
+        my $inde = " " x 1;
+        my $tmpPer = 0;
+        printf STAT "Parameters\n";
+        printf STAT "$inde %-40s %s %s\n", "Input files ", $seqFile, $qualFile;
+        printf STAT "$inde %-40s %s\n", "Primer/Adaptor library", defined($priAdaLib)?(($priAdaLib ne "u")?$priAdaLibNames[$priAdaLib]:"User defined ($priAdaFile)"):"NA";
+        printf STAT "$inde %-40s %s\n", "Homopolymer trimming", "Off" if($homoPolyLen == 0);
+        printf STAT "$inde %-40s %s\n", "Homopolymer trimming", "On" if($homoPolyLen != 0);
+        printf STAT "$inde %-40s %s\n", "Length of the homopolymer to be removed", $homoPolyLen if($homoPolyLen != 0);
+        printf STAT "$inde %-40s %s\n", "Length filter", ($isLenFilterOn)?"On":"Off";
+        printf STAT "$inde %-40s %s\n", "Cut-off for minimum read length", $lowestValidLen if($isLenFilterOn);
+        printf STAT "$inde %-40s %s\n", "Cut-off read length for HQ", $cutOffReadLen4HQ."%";
+        printf STAT "$inde %-40s %s\n", "Cut-off quality score", $cutOffPhScore;
+        printf STAT "$inde %-40s %s\n", "Only statistics", defined($isOnlyStat)?"On":"Off";
+        printf STAT "$inde %-40s %s\n", "Number of CPUs", $noOfProcesses;
+
+        print STAT "\n\n";
+
+        print STAT "QC statistics\n";
+        printf STAT "$inde %-70s %s\n", "File name", $seqFileName;
+        printf STAT "$inde %-70s %d\n", "Total number of reads", $seqCount;
+        printf STAT "$inde %-70s %d\n", "Total number of trimmed reads containing homopolymer", $trimCount;
+        printf STAT "$inde %-70s %d\n", "Total number of trashed reads (<$lowestValidLen bp in length after trimming)", $lt100;
+        printf STAT "$inde %-70s %d\n", "Total number of low quality reads (excluding <$lowestValidLen reads)", $lQCount;
+        printf STAT "$inde %-70s %d\n", "Total number of HQ reads", $hQCount;
+        $tmpPer = $safePercent->($hQCount, $seqCount);
+        printf STAT "$inde %-70s %s\n", "Percentage of HQ reads", $tmpPer."%";
+        printf STAT "$inde %-70s %.f\n", "Total number of bases", $totalBases;
+        printf STAT "$inde %-70s %.f\n", "Total number of bases in HQ reads", $totalBasesAfterHQ;
+        printf STAT "$inde %-70s %.f\n", "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ;
+        $tmpPer = $safePercent->($totalHQBasesAfterHQ, $totalBasesAfterHQ);
+        printf STAT "$inde %-70s %s\n", "Percentage of HQ bases in HQ reads", $tmpPer."%";
+        if(defined($priAdaLib)) {
+            printf STAT "$inde %-70s %d\n", "Number of Primer/Adaptor trimmed reads", $totalValidReadsWithPriAda;
+        }
+        else {
+            # One value for one %s (a second "NA" used to be passed and ignored).
+            printf STAT "$inde %-70s %s\n", "Number of Primer/Adaptor trimmed reads", "NA";
+        }
+        printf STAT "$inde %-70s %d\n", "Total number of HQ filtered reads", $totalReadsFinal;
+        $tmpPer = $safePercent->($totalReadsFinal, $seqCount);
+        printf STAT "$inde %-70s %s\n", "Percentage of HQ filtered reads", $tmpPer."%";
+
+        print STAT "\n\n";
+
+        print STAT "Detailed QC statistics\n";
+        # Rows: label, value for the raw input, value for the filtered output.
+        my @arr = (
+            ["File name", $seqFileName, (fileparse($outSeqFile))[0]],
+            ["Total number of reads", $seqCount, $totalReadsFinal],
+            ["Minimum read length", $minRawLen, $minHQLen],
+            ["Maximum read length", $maxRawLen, $maxHQLen],
+            ["Average read length", $safeRatio->($totalBases, $seqCount), $safeRatio->($totalBasesFinal, $totalReadsFinal)],
+            ["Median read length", calcMedian(@rawLen), calcMedian(@hQLen)],
+            ["N25 length", calcN50(\@rawLen, 25), calcN50(\@hQLen, 25)],
+            ["N50 length", calcN50(\@rawLen, 50), calcN50(\@hQLen, 50)],
+            ["N75 length", calcN50(\@rawLen, 75), calcN50(\@hQLen, 75)],
+            ["N90 length", calcN50(\@rawLen, 90), calcN50(\@hQLen, 90)],
+            ["N95 length", calcN50(\@rawLen, 95), calcN50(\@hQLen, 95)],
+            ["Total number of bases", $totalBases, $totalBasesFinal],
+            ["Total number of HQ bases", $totalHQBases, $totalHQBasesFinal],
+            ["Percentage of HQ bases", $safePercent->($totalHQBases, $totalBases)."%", $safePercent->($totalHQBasesFinal, $totalBasesFinal)."%"],
+            ["Average quality score (Overall)", $safeRatio->($avgQual, $seqCount), $safeRatio->($avgQualFinal, $totalReadsFinal)],
+        );
+        for(my $i=0; $i<@arr; $i++) {
+            if(!defined($isOnlyStat)) {
+                printf STAT "$inde %-50s %-20s %s\n", $arr[$i][0], $arr[$i][1], $arr[$i][2];
+            }
+            else {
+                printf STAT "$inde %-50s %s\n", $arr[$i][0], $arr[$i][1];
+            }
+        }
+
+        print STAT "\n\n";
+    }
+    elsif($statOutFmt == 2) {
+        # Tab-delimited report with the same content.
+        my $inde = " " x 1;
+        my $tmpPer = 0;
+        printf STAT "Parameters\n";
+        printf STAT "\t%s\t%s\t%s\n", "Input files ", $seqFile, $qualFile;
+        printf STAT "\t%s\t%s\n", "Primer/Adaptor library", defined($priAdaLib)?(($priAdaLib ne "u")?$priAdaLibNames[$priAdaLib]:"User defined ($priAdaFile)"):"NA";
+        printf STAT "\t%s\t%s\n", "Homopolymer trimming", "Off" if($homoPolyLen == 0);
+        printf STAT "\t%s\t%s\n", "Homopolymer trimming", "On" if($homoPolyLen != 0);
+        printf STAT "\t%s\t%s\n", "Length of the homopolymer to be removed", $homoPolyLen if($homoPolyLen != 0);
+        # NOTE(review): this unconditional line repeats the one above when
+        # trimming is on and has no counterpart in the aligned format. It is
+        # kept because consumers of the tab-delimited file may rely on it.
+        printf STAT "\t%s\t%s\n", "Length of the homopolymer to be removed", $homoPolyLen;
+        printf STAT "\t%s\t%s\n", "Length filter", ($isLenFilterOn)?"On":"Off";
+        printf STAT "\t%s\t%s\n", "Cut-off for minimum read length", $lowestValidLen if($isLenFilterOn);
+        printf STAT "\t%s\t%s\n", "Cut-off read length for HQ", $cutOffReadLen4HQ."%";
+        printf STAT "\t%s\t%s\n", "Cut-off quality score", $cutOffPhScore;
+        printf STAT "\t%s\t%s\n", "Only statistics", defined($isOnlyStat)?"On":"Off";
+        printf STAT "\t%s\t%s\n", "Number of CPUs", $noOfProcesses;
+
+        print STAT "\n\n";
+
+        print STAT "QC statistics\n";
+        printf STAT "\t%s\t%s\n", "File name", $seqFileName;
+        printf STAT "\t%s\t%d\n", "Total number of reads", $seqCount;
+        printf STAT "\t%s\t%d\n", "Total number of trimmed reads containing homopolymer", $trimCount;
+        printf STAT "\t%s\t%d\n", "Total number of trashed reads (<$lowestValidLen bp in length after trimming)", $lt100;
+        printf STAT "\t%s\t%d\n", "Total number of low quality reads (excluding <$lowestValidLen reads)", $lQCount;
+        printf STAT "\t%s\t%d\n", "Total number of HQ reads", $hQCount;
+        $tmpPer = $safePercent->($hQCount, $seqCount);
+        printf STAT "\t%s\t%s\n", "Percentage of HQ reads", $tmpPer."%";
+        printf STAT "\t%s\t%.f\n", "Total number of bases", $totalBases;
+        printf STAT "\t%s\t%.f\n", "Total number of bases in HQ reads", $totalBasesAfterHQ;
+        printf STAT "\t%s\t%.f\n", "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ;
+        $tmpPer = $safePercent->($totalHQBasesAfterHQ, $totalBasesAfterHQ);
+        printf STAT "\t%s\t%s\n", "Percentage of HQ bases in HQ reads", $tmpPer."%";
+        if(defined($priAdaLib)) {
+            printf STAT "\t%s\t%d\n", "Number of Primer/Adaptor trimmed reads", $totalValidReadsWithPriAda;
+        }
+        else {
+            # One value for one %s (a second "NA" used to be passed and ignored).
+            printf STAT "\t%s\t%s\n", "Number of Primer/Adaptor trimmed reads", "NA";
+        }
+        printf STAT "\t%s\t%d\n", "Total number of HQ filtered reads", $totalReadsFinal;
+        $tmpPer = $safePercent->($totalReadsFinal, $seqCount);
+        printf STAT "\t%s\t%s\n", "Percentage of HQ filtered reads", $tmpPer."%";
+
+        print STAT "\n\n";
+
+        print STAT "Detailed QC statistics\n";
+        # Rows: label, value for the raw input, value for the filtered output.
+        my @arr = (
+            ["File name", $seqFileName, (fileparse($outSeqFile))[0]],
+            ["Total number of reads", $seqCount, $totalReadsFinal],
+            ["Minimum read length", $minRawLen, $minHQLen],
+            ["Maximum read length", $maxRawLen, $maxHQLen],
+            ["Average read length", $safeRatio->($totalBases, $seqCount), $safeRatio->($totalBasesFinal, $totalReadsFinal)],
+            ["Median read length", calcMedian(@rawLen), calcMedian(@hQLen)],
+            ["N25 length", calcN50(\@rawLen, 25), calcN50(\@hQLen, 25)],
+            ["N50 length", calcN50(\@rawLen, 50), calcN50(\@hQLen, 50)],
+            ["N75 length", calcN50(\@rawLen, 75), calcN50(\@hQLen, 75)],
+            ["N90 length", calcN50(\@rawLen, 90), calcN50(\@hQLen, 90)],
+            ["N95 length", calcN50(\@rawLen, 95), calcN50(\@hQLen, 95)],
+            ["Total number of bases", $totalBases, $totalBasesFinal],
+            ["Total number of HQ bases", $totalHQBases, $totalHQBasesFinal],
+            ["Percentage of HQ bases", $safePercent->($totalHQBases, $totalBases)."%", $safePercent->($totalHQBasesFinal, $totalBasesFinal)."%"],
+            ["Average quality score (Overall)", $safeRatio->($avgQual, $seqCount), $safeRatio->($avgQualFinal, $totalReadsFinal)],
+        );
+        for(my $i=0; $i<@arr; $i++) {
+            if(!defined($isOnlyStat)) {
+                printf STAT "\t%s\t%s\t%s\n", $arr[$i][0], $arr[$i][1], $arr[$i][2];
+            }
+            else {
+                printf STAT "\t%s\t%s\n", $arr[$i][0], $arr[$i][1];
+            }
+        }
+
+        print STAT "\n\n";
+    }
+
+    # Names of the graph files (created in $outFolder).
+    my $lenDistF1 = getFileName($seqFile)."_lenDistribution.png";
+    my $qualDistF1 = getFileName($seqFile)."_qualDistribution.png";
+    my $sumPieF = getFileName($seqFile). "_summary.png";
+    my $gcDistF1 = getFileName($seqFile)."_gcDistribution.png";
+    my $baseCntF1 = getFileName($seqFile)."_baseCompostion.png";
+
+    # Build one axis label per bin from the first data row, then put the
+    # label row in front of the data as GD::Graph expects.
+    my $c = 0;
+    my @lenLabels = ();
+    foreach my $arrRef (@lenDistrib) {
+        my $str = "";
+        foreach my $val (@{$arrRef}) {
+            if($c == 0) {
+                $str = "0-$lenInterval";
+            }
+            else {
+                $str = $lenInterval*$c . "-" . $lenInterval*($c+1);
+            }
+            $c++;
+            push(@lenLabels, $str);
+        }
+        last;
+    }
+
+    unshift(@lenDistrib, \@lenLabels);
+
+    if($isGDMod) {
+        drawLenDist(\@lenDistrib, $outFolder.$lenDistF1, getFileName($seqFile), 550, 350);
+    }
+
+    $c = 0;
+    my @qualLabels = ();
+    foreach my $arrRef (@qualDistrib) {
+        my $str = "";
+        foreach my $val (@{$arrRef}) {
+            if($c == 0) {
+                $str = "0";
+                $str .= "-$qualInterval" if($qualInterval>1);
+            }
+            else {
+                $str = $qualInterval*$c;
+                # Upper bound of the bin is the start of the next one
+                # (was $qualInterval*$c, i.e. the lower bound repeated).
+                $str .= "-" . $qualInterval*($c+1) if($qualInterval>1);
+            }
+            push(@qualLabels, $str);
+            $c++;
+        }
+        last;
+    }
+
+    unshift(@qualDistrib, \@qualLabels);
+
+    if($isGDMod) {
+        drawQualDist(\@qualDistrib, $outFolder.$qualDistF1, getFileName($seqFile), 650, 350);
+    }
+
+    # Slices of the summary pie chart.
+    my $trashedReads = $lt100;
+    my $trimmedHP = $trimCount;
+    my $trimmedPA = $totalValidReadsWithPriAda;
+    my $hQreadsExcptHP_PATrimmed = $totalReadsFinal - $trimmedHP - $trimmedPA;
+    my $lQreadsGT100 = $seqCount - $totalReadsFinal - $trashedReads;
+    my @summaryData = (["", "", "", "", ""], [$trashedReads, $trimmedHP, $trimmedPA, $hQreadsExcptHP_PATrimmed, $lQreadsGT100]);
+
+    if($isGDMod) {
+        drawSummaryPie(\@summaryData, $outFolder.$sumPieF, 520, 350);
+    }
+
+    $c=0;
+    my @gcLabel;
+    foreach my $ref (@gcDistrib) {
+        foreach my $val (@{$ref}) {
+            my $str = "";
+            if($c == 0) {
+                $str = "0-$gcInterval";
+            }
+            else {
+                $str = $gcInterval*$c . "-" . $gcInterval*($c+1);
+            }
+            $c++;
+            push(@gcLabel, $str);
+        }
+        last;
+    }
+
+    unshift(@gcDistrib, \@gcLabel);
+    if($isGDMod) {
+        drawGCDist(\@gcDistrib, $outFolder.$gcDistF1, getFileName($seqFile), 550, 350);
+    }
+
+
+    my @file1 = (["A", "T", "G", "C", "Non-ATGC"], $charCount[0]);
+    @file1 = (["A", "T", "G", "C", "Non-ATGC"], $charCount[0], $charCount[1]) if(!$isOnlyStat);
+    if($isGDMod) {
+        drawBaseComp(\@file1, $outFolder.$baseCntF1, getFileName($seqFile), 500, 300);
+    }
+
+
+    close(STAT);
+
+    # Assemble the arguments for the HTML report.
+    my $iFol = getFilePath(abs_path($seqFile));
+    my $oFol = abs_path($outFolder) . "/";
+    my $inpFs = getFileName($seqFile);
+    $inpFs .= ":::::" . getFileName($qualFile);
+    my $htF = $oFol . "output_" . getFileName($seqFile);
+    $htF .= ".html";
+    my @fileNames4HTML;
+    @fileNames4HTML = ($outSeqFile, $outQualFile, $lenDistF1, $baseCntF1, $gcDistF1, $qualDistF1, $sumPieF);
+    htmlPrint(getFilePath(abs_path($0)), getFileName($0), $htF, $iFol, $isOnlyStat, $inpFs, $statFile, $oFol, \@fileNames4HTML);
+
+    # Stop the writer thread and remove the temporary part files.
+    $DataQueue->enqueue(undef);
+    $thr->join();
+    rmtree($uniqFolder, 0, 0);
+#$pm->finish;
+}
+#$pm->wait_all_children;
+
+# Closing banner: tell the user where the results were written, then stop
+# before the subroutine definitions that follow.
+my $finishBanner = "================================================================\n";
+print($finishBanner, "Processing has been finished\n");
+if($outFolder ne "") {
+    print "Output files are generated in $outFolder\n";
+}
+else {
+    print "Output files are generated in the folder of input files\n";
+}
+print($finishBanner);
+
+
+exit;
+
+# Open a plain or gzip-compressed file and hand the handle back through a
+# reference.
+#
+# Arguments (positional):
+#   1. file path; a name ending in ".gz" (any case) is opened through
+#      IO::Zlib, anything else with the built-in open
+#   2. mode: "r" (read), "w" (create/truncate) or "a" (append)
+#   3. reference to the scalar that receives the handle
+#
+# Dies when the file cannot be opened, and also when the mode is not one of
+# r/w/a (such a call used to return silently with an unusable handle).
+#
+# Plain files are opened with the three-argument form of open, so characters
+# in the file name are never interpreted as part of the mode.
+sub openFileGetHandle {
+    my ($file, $rOrw, $ref) = @_;
+
+    # mode => [ IO::Zlib mode, open() mode, text of the failure message ]
+    my %modeInfo = (
+        r => ["rb", "<",  "Can not open file $file"],
+        w => ["wb", ">",  "Can not create file $file"],
+        a => ["ab", ">>", "Can not open-append file $file"],
+    );
+    my $info = $modeInfo{$rOrw};
+    die "Unknown open mode '$rOrw' for file $file" if(!defined($info));
+    my ($zlibMode, $plainMode, $failText) = @{$info};
+
+    if($file =~ /\.gz$/i) {
+        $$ref = IO::Zlib->new;
+        $$ref->open($file, $zlibMode) or die $failText;
+    }
+    else {
+        open($$ref, $plainMode, $file) or die $failText;
+    }
+    return;
+}
+
+
+# Fold the result list of one processed chunk into the per-input totals.
+#
+# Takes the 26 values of a chunk result as a flat argument list:
+#    0  maximum raw read length          (max with $maxRawLen)
+#    1  minimum raw read length          (min with $minRawLen)
+#    2  ref to raw read lengths          (appended to @rawLen)
+#    3  bases                            4  HQ bases
+#    5  quality sum ($avgQual)           6  reads below the length cut-off ($lt100)
+#    7  homopolymer-trimmed reads        8  HQ reads
+#    9  bases in HQ reads
+#   10  maximum HQ read length           (max with $maxHQLen)
+#   11  minimum HQ read length           (min with $minHQLen)
+#   12  HQ length sum ($avgHQLen)
+#   13  ref to HQ read lengths           (appended to @hQLen)
+#   14  final reads                     15  final bases
+#   16  final HQ bases                  17  final quality sum ($avgQualFinal)
+#   18  low-quality reads               19  HQ bases in HQ reads
+#   20  reads with primer/adaptor       21  reads without primer/adaptor
+#   22-25  refs to the length, quality, GC and base-count distributions,
+#          merged through addTwoArrays()
+#
+# The empty prototype "()" that used to be on this sub declared it as taking
+# no arguments, which is wrong; it only worked because every caller uses the
+# "&updateData(...)" form, which ignores prototypes. Without the prototype
+# both call forms are valid.
+sub updateData {
+    my @arr = @_;
+    $maxRawLen = max($maxRawLen, $arr[0]);
+    $minRawLen = min($minRawLen, $arr[1]);
+    push(@rawLen, @{$arr[2]});
+    $totalBases += $arr[3];
+    $totalHQBases += $arr[4];
+    $avgQual += $arr[5];
+    $lt100 += $arr[6];
+    $trimCount += $arr[7];
+    $hQCount += $arr[8];
+    $totalBasesAfterHQ += $arr[9];
+    $maxHQLen = max($maxHQLen, $arr[10]);
+    $minHQLen = min($minHQLen, $arr[11]);
+    $avgHQLen += $arr[12];
+    push(@hQLen, @{$arr[13]});
+    $totalReadsFinal += $arr[14];
+    $totalBasesFinal += $arr[15];
+    $totalHQBasesFinal += $arr[16];
+    $avgQualFinal += $arr[17];
+    $lQCount += $arr[18];
+    $totalHQBasesAfterHQ += $arr[19];
+    $totalValidReadsWithPriAda += $arr[20];
+    $totalValidReadsNoPriAda += $arr[21];
+    addTwoArrays($arr[22], \@lenDistrib);
+    addTwoArrays($arr[23], \@qualDistrib);
+    addTwoArrays($arr[24], \@gcDistrib);
+    addTwoArrays($arr[25], \@charCount);
+    return;
+}
+
+# Put every "c"-prefixed file-level counter back to its initial value:
+# sums and maxima to 0, minima to the large sentinel, lists to empty.
+sub resetVariables() {
+    # Minima start from a value larger than any read length.
+    $cminRawLen = $cminHQLen = 1000000000000;
+
+    # Maxima.
+    $cmaxRawLen = $cmaxHQLen = 0;
+
+    # Base and quality sums.
+    $ctotalBases = $ctotalHQBases = $ctotalBasesAfterHQ = 0;
+    $ctotalHQBasesAfterHQ = $ctotalBasesFinal = $ctotalHQBasesFinal = 0;
+    $cavgQual = $cavgHQLen = $cavgQualFinal = 0;
+
+    # Read counters.
+    $clt100 = $ctrimCount = $chQCount = $clQCount = $ctotalReadsFinal = 0;
+    $ctotalValidReadsWithPriAda = $ctotalValidReadsNoPriAda = 0;
+
+    # Length lists and distributions.
+    @crawLen = ();
+    @chQLen = ();
+    @clenDistrib = ();
+    @cqualDistrib = ();
+    @cgcDistrib = ();
+    @ccharCount = ();
+}
+
+# Split a FASTA read file and its matching quality file into numbered chunk
+# files ("$uniqFolder/part_seq_NNNNN" / "part_qual_NNNNN") and push each
+# finished pair onto $ProcessingQueue as "seqPath<TAB>qualPath". A final
+# undef is enqueued to tell the consumer that no more chunks will follow.
+#
+#   $seqFile  - path of the read (FASTA) file
+#   $qualFile - path of the quality file, line-for-line parallel to $seqFile
+#
+# Both inputs are opened through openFileGetHandle(). A record is only
+# written once the NEXT header line is seen, so the last record is flushed
+# after the main loop. The sub exits the program if a header in the read file
+# differs from the header on the same line of the quality file.
+#
+# Fixes relative to the previous version:
+#   * the ID-mismatch message reported the total read count ($seqCount)
+#     instead of the number of the offending read ($sCounter);
+#   * a chunk size of 0 (fewer reads than processes) made the outer loop
+#     create empty chunk files forever, so it is now clamped to 1;
+#   * reaching end-of-file before $seqCount headers were seen looped forever,
+#     so the read loop now stops at EOF.
+sub readDivideGzip {
+    my ($seqFile, $qualFile) = @_;
+    my $chunkCounter = 0;
+    my $sCounter = 0;    # number of header lines seen so far
+    my $fileEOF = 0;
+    my ($seqH, $qualH, $id);
+    my $isFileOpen = 0;  # true while the current chunk pair is open for writing
+    my $iH;
+    openFileGetHandle($seqFile, "r", \$iH);
+    my $qH;
+    openFileGetHandle($qualFile, "r", \$qH);
+    my $noOfSeqPerThread = int(($seqCount-1)/$noOfProcesses); #20000;
+    $noOfSeqPerThread = 1 if($noOfSeqPerThread < 1);
+    while(1) {
+        OUTERLOOP:
+        for(my $i=0; $i<$noOfProcesses; $i++) {
+            $chunkCounter++;
+            $id = sprintf "%05s", $chunkCounter;
+            undef $seqH;
+            openFileGetHandle("$uniqFolder/part_seq_$id", "w", \$seqH);
+            undef $qualH;
+            openFileGetHandle("$uniqFolder/part_qual_$id", "w", \$qualH);
+            $isFileOpen = 1;
+            # $j counts records WRITTEN to this chunk, not lines read.
+            for(my $j=0; $j<$noOfSeqPerThread;) {
+                my $line = <$iH>;
+                my $qualLine = <$qH>;
+                if(!defined($line)) {
+                    # Input ended early: stop splitting and let the code
+                    # after the loop flush whatever record is pending.
+                    $fileEOF = 1;
+                    last OUTERLOOP;
+                }
+                chomp $line;
+                $qualLine = "" if(!defined($qualLine));
+                chomp($qualLine);
+                if($line =~ /^>/) {
+                    $sCounter++;
+                    $prevFastaSeqId = $fastaSeqId;
+                    $fastaSeqId = $line;
+                    $qualSeqId = $qualLine;
+                    if($fastaSeqId ne $qualSeqId) {
+                        print "Error: Read Id doesn't match in sequence and quality file for read number $sCounter in sequence file.\n";
+                        exit(-1);
+                    }
+                    # A new header completes the previous record (if any).
+                    if($fastaSeq ne "") {
+                        print $seqH "$prevFastaSeqId\n";
+                        print $seqH "$fastaSeq\n";
+                        print $qualH "$prevFastaSeqId\n";
+                        print $qualH "$qualSeq\n";
+                        $j++;
+                        if($j == $noOfSeqPerThread) {
+                            close($seqH);
+                            close($qualH);
+                            $ProcessingQueue->enqueue("$uniqFolder/part_seq_$id"."\t"."$uniqFolder/part_qual_$id");
+                            $isFileOpen = 0;
+                        }
+                    }
+                    $fastaSeq = "";
+                    $qualSeq = "";
+                    if($sCounter == $seqCount) {
+                        $chunkCounter-- if((scalar @idArr) != 0); #This is not commented because this is used to create part files.
+                        $fileEOF = 1;
+                        last OUTERLOOP;
+                    }
+                }
+                else {
+                    # Sequence lines are concatenated; quality lines are
+                    # joined with a single trailing space per line.
+                    $fastaSeq .= $line;
+                    $qualSeq .= $qualLine . " ";
+                }
+            }
+        }
+        last if($fileEOF);
+    }
+    # Collect the body of the last record (its header was already consumed).
+    while(my $line = <$iH>) {
+        my $qualLine = <$qH>;
+        chomp $line;
+        $qualLine = "" if(!defined($qualLine));
+        chomp($qualLine);
+        $fastaSeq .= $line;
+        $qualSeq .= $qualLine . " ";
+    }
+    if(! $isFileOpen) {
+        $chunkCounter++;
+        $id = sprintf "%05s", $chunkCounter;
+        undef $seqH;
+        openFileGetHandle("$uniqFolder/part_seq_$id", "w", \$seqH);
+        undef $qualH;
+        openFileGetHandle("$uniqFolder/part_qual_$id", "w", \$qualH);
+    }
+    $prevFastaSeqId = $fastaSeqId;
+    print $seqH "$prevFastaSeqId\n";
+    print $seqH "$fastaSeq\n";
+    print $qualH "$prevFastaSeqId\n";
+    print $qualH "$qualSeq\n";
+    close($seqH);
+    close($qualH);
+    $ProcessingQueue->enqueue("$uniqFolder/part_seq_$id"."\t"."$uniqFolder/part_qual_$id");
+
+    close($iH);
+    close($qH);
+    $ProcessingQueue->enqueue(undef);    # end-of-work marker for fireMyJob()
+}
+
+# Take the next chunk pair off $ProcessingQueue, load it into memory and start
+# a worker thread (passSeq) on it.
+#
+#   $_[0] - reference to a counter; incremented once per record loaded
+#
+# Returns the thread object, or undef when the queue delivers its undef
+# end-of-work marker. Chunk files hold two lines per record (ID line, then
+# data line), the layout written by readDivideGzip().
+#
+# The chunk files are opened with three-argument open() on lexical handles so
+# that an unusual path cannot be misread as an open mode and no global
+# bareword handle is left behind.
+sub fireMyJob {
+    my $lineCount = $_[0];
+    my $fileName = $ProcessingQueue->dequeue();
+    return undef if(!defined($fileName));
+    my ($file1, $file2) = split(/\t/, $fileName);
+    my @idArr = ();
+    my @seqArr = ();
+    my @qualArr = ();
+    open(my $chunkSeqFH, "<", $file1) or die "Can't open chunk file containing input reads for processing: $file1\n";
+    open(my $chunkQualFH, "<", $file2) or die "Can't open chunk file containing input quality for processing: $file2\n";
+    while(my $id = <$chunkSeqFH>) {
+        scalar(<$chunkQualFH>);    # skip the duplicate ID line of the quality chunk
+        chomp $id;
+        my $seq = <$chunkSeqFH>;
+        my $qual = <$chunkQualFH>;
+        chomp $seq;
+        chomp $qual;
+        push(@idArr, $id);
+        push(@seqArr, $seq);
+        push(@qualArr, $qual);
+        $$lineCount++;
+    }
+    close($chunkSeqFH);
+    close($chunkQualFH);
+    # The numeric suffix of the chunk file name identifies the chunk; passSeq
+    # uses it to name its output files.
+    my ($chunkId) = $file1=~/(\d+)$/;
+    my @reads = (\@idArr, \@seqArr, \@qualArr, $chunkId);
+    my $thRef = threads->create('passSeq', @reads);
+    return $thRef;
+}
+
+
+# Consumer loop: repeatedly start up to $noOfProcesses worker threads through
+# fireMyJob(), wait for all of them, and merge each thread's result array
+# into the global statistics with updateData(). The loop ends once
+# fireMyJob() returns undef (queue exhausted).
+#
+# Changes relative to the previous version:
+#   * progress was only printed when the processed-read count happened to be
+#     an exact multiple of 10000; it is now printed whenever the count has
+#     crossed the next 10000 boundary;
+#   * the percentage no longer divides by zero when $seqCount is 0;
+#   * five never-used locals were removed.
+sub threading4Processing {
+    my @thArr = ();
+    my $done = 0;
+    my $processedSeqCount = 0;
+    my $nextReportAt = 0;    # processed-read count at which to print progress next
+
+    while(1) {
+        if($processedSeqCount >= $nextReportAt) {
+            my $tmpP = sprintf "%0.0f", ($seqCount ? ($processedSeqCount/$seqCount*100) : 0);
+            print "$indOfAnalysis: Number of reads processed: " . $processedSeqCount . "/$seqCount ($tmpP\%)...\n";
+            $nextReportAt = (int($processedSeqCount/10000) + 1) * 10000;
+        }
+        # Start one batch of workers; $i ends up as the number started.
+        my $i;
+        for($i=0; $i<$noOfProcesses; $i++) {
+            my $thRef = fireMyJob(\$processedSeqCount);
+            if(!defined($thRef)) {
+                $done = 1;
+                last;
+            }
+            $thArr[$i] = $thRef;
+        }
+        # Wait for the batch and fold in its statistics. updateData is
+        # declared with an empty prototype, hence the '&' call form.
+        for(my $j=0; $j<$i; $j++) {
+            my $refArr = $thArr[$j]->join;
+            &updateData(@{$refArr});
+        }
+        last if($done);
+    }
+}
+
+# Thread entry point: run the QC pipeline over one in-memory chunk.
+#
+#   $idArrRef, $seqArrRef, $qualArrRef - parallel array refs (ID, sequence,
+#                                        quality string) for the chunk
+#   $id                                - chunk number used in the output names
+#
+# Filtered reads are written by processSeq() to the PSEQ / PQUAL handles
+# opened here ("$uniqFolder/part_seq_<id>_out" and "part_qual_<id>_out").
+# Returns a reference to an array holding the chunk's statistics in the
+# positional order that updateData() expects.
+#
+# The handles stay barewords because processSeq() prints to them by name, but
+# they are now opened with three-argument open(). Three unused local arrays
+# that merely shadowed outer names were dropped.
+sub passSeq {
+    yield;
+    my ($idArrRef, $seqArrRef, $qualArrRef, $id) = @_;
+    resetVariables();
+    open(PSEQ, ">", "$uniqFolder/part_seq_$id"."_out") or die "Can not open part_seq_$id"."_out file\n";
+    open(PQUAL, ">", "$uniqFolder/part_qual_$id"."_out") or die "Can not open part_qual_$id"."_out file\n";
+    $c++;
+    for(my $i=0; $i<@{$idArrRef}; $i++) {
+        processSeq($$idArrRef[$i], $$seqArrRef[$i], $$qualArrRef[$i]);
+    }
+    close(PSEQ);
+    close(PQUAL);
+    my @retArrRef = ($cmaxRawLen, $cminRawLen, \@crawLen, $ctotalBases, $ctotalHQBases, $cavgQual, $clt100, $ctrimCount, $chQCount, $ctotalBasesAfterHQ, $cmaxHQLen, $cminHQLen, $cavgHQLen, \@chQLen, $ctotalReadsFinal, $ctotalBasesFinal, $ctotalHQBasesFinal, $cavgQualFinal, $clQCount, $ctotalHQBasesAfterHQ, $ctotalValidReadsWithPriAda, $ctotalValidReadsNoPriAda, \@clenDistrib, \@cqualDistrib, \@cgcDistrib, \@ccharCount);
+    return \@retArrRef;
+}
+
+# Run one read through the QC pipeline and update the per-chunk statistics.
+#
+#   $prevFastaSeqId - header line of the read
+#   $fastaSeq       - sequence (whitespace is removed here)
+#   $qualSeq        - whitespace-separated quality scores
+#
+# Steps, in order:
+#   1. record raw-read statistics (row 0 of the distribution tables);
+#   2. drop the read ($clt100) if it is shorter than $lowestValidLen and the
+#      length filter is on;
+#   3. if $homoPolyLen is non-zero, trim at the first homopolymer run and
+#      re-apply the length filter;
+#   4. drop the read ($clQCount) if isReadOfHQ() rejects it;
+#   5. if $priAdaLib is defined, trim primer/adaptor and re-apply the length
+#      filter;
+#   6. record the surviving read (row 1) and, unless $isOnlyStat is defined,
+#      write it to PSEQ / PQUAL.
+#
+# Changes relative to the previous version: the GC percentage is guarded
+# against a zero-length sequence everywhere (previously only one of three
+# computations was), and the two identical "surviving read" branches are
+# shared through _recordFinalRead().
+sub processSeq {
+    my ($prevFastaSeqId, $fastaSeq, $qualSeq) = @_;
+    $fastaSeq =~ s/\s//g;
+    my $len = length $fastaSeq;
+    $cmaxRawLen = max($cmaxRawLen, $len);
+    $cminRawLen = min($cminRawLen, $len);
+    push(@crawLen, $len);
+    $qualSeq =~ s/\s+$//;    # drop the trailing separator added while reading
+    my @tmpArr = getQualBases($qualSeq);
+    $ctotalBases += $tmpArr[0];
+    $ctotalHQBases += $tmpArr[1];
+    $cavgQual += $tmpArr[2];
+    $clenDistrib[0][getIndex($len,$lenInterval)]++;
+    $cqualDistrib[0][getIndex($tmpArr[2],$qualInterval)]++;
+    _tallyBaseComposition(\$fastaSeq, $len, 0);
+
+    if(length($fastaSeq) < $lowestValidLen && $isLenFilterOn) {
+        $clt100++;
+        return;
+    }
+
+    if($homoPolyLen != 0 && hasPolyChar(\$fastaSeq)) {
+        $ctrimCount++;
+        # Only bother trimming the qualities if the read will be kept.
+        if(length($fastaSeq) >= $lowestValidLen || !$isLenFilterOn) {
+            $qualSeq = trimQualSeq($qualSeq, length($fastaSeq), -1);
+        }
+    }
+    if(length($fastaSeq) < $lowestValidLen && $isLenFilterOn) {
+        $clt100++;
+        return;
+    }
+
+    if(!isReadOfHQ($qualSeq)) {
+        $clQCount++;
+        return;
+    }
+    $chQCount++;
+    $ctotalBasesAfterHQ += length $fastaSeq;
+
+    if(defined $priAdaLib) {
+        my $t = isWOPriAda(\$fastaSeq);
+        if($t > -1) {
+            $qualSeq = trimQualSeq($qualSeq, length($fastaSeq), $t);
+        }
+        if(length($fastaSeq) < $lowestValidLen && $isLenFilterOn) {
+            $clt100++;
+            return;
+        }
+    }
+
+    _recordFinalRead($prevFastaSeqId, $fastaSeq, $qualSeq);
+    return;
+}
+
+# Count A/T/G/C in the sequence behind $seqRef and add the result to row
+# $stage (0 = raw reads, 1 = filtered reads) of @cgcDistrib and @ccharCount.
+# $len is the sequence length; everything that is not A/T/G/C is counted as
+# "N". The s///gi counting idiom also upper-cases the matched bases in place,
+# which is kept because later steps work on the modified sequence.
+sub _tallyBaseComposition {
+    my ($seqRef, $len, $stage) = @_;
+    my $As = $$seqRef =~ s/A/A/gi;
+    my $Ts = $$seqRef =~ s/T/T/gi;
+    my $Gs = $$seqRef =~ s/G/G/gi;
+    my $Cs = $$seqRef =~ s/C/C/gi;
+    my $Ns = $len - $As - $Ts - $Gs - $Cs;
+    my $gcPercent = ($len)?(($Gs + $Cs)/$len*100):0;
+    $cgcDistrib[$stage][getIndex($gcPercent,$gcInterval)]++;
+    $ccharCount[$stage][0] += $As;
+    $ccharCount[$stage][1] += $Ts;
+    $ccharCount[$stage][2] += $Gs;
+    $ccharCount[$stage][3] += $Cs;
+    $ccharCount[$stage][4] += $Ns;
+    return;
+}
+
+# Book-keeping for a read that passed every filter: update the "final"
+# counters and, unless $isOnlyStat is defined, the row-1 distributions, and
+# write the read to the PSEQ / PQUAL chunk output handles.
+sub _recordFinalRead {
+    my ($seqId, $seq, $qual) = @_;
+    my $len = length $seq;
+    $cmaxHQLen = max($cmaxHQLen, $len);
+    $cminHQLen = min($cminHQLen, $len);
+    $cavgHQLen += $len;
+    push(@chQLen, $len);
+    $ctotalReadsFinal++;
+    my @qualStats = getQualBases($qual);
+    $ctotalBasesFinal += $qualStats[0];
+    $ctotalHQBasesFinal += $qualStats[1];
+    $cavgQualFinal += $qualStats[2];
+    return if(defined($isOnlyStat));
+
+    $clenDistrib[1][getIndex($len,$lenInterval)]++;
+    $cqualDistrib[1][getIndex($qualStats[2],$qualInterval)]++;
+    _tallyBaseComposition(\$seq, $len, 1);
+    print PSEQ "$seqId\n";
+    print PSEQ formatSeq($seq), "\n";
+    print PQUAL "$seqId\n";
+    print PQUAL formatQualSeq($qual), "\n";
+    return;
+}
+
+# Map a value onto the zero-based index of the histogram bin that holds it.
+#
+#   $_[0] - the value to bin
+#   $_[1] - the bin width
+#
+# With a width of 1 the value is simply rounded ("%0.0f"). Otherwise the
+# ratio value/width is rounded to two decimals and turned into
+# ceil(ratio) - 1 (never below 0), so exact multiples of the width fall into
+# the lower bin.
+sub getIndex {
+    my ($value, $binWidth) = @_;
+    if($binWidth == 1) {
+        return sprintf("%0.0f", $value);
+    }
+    my $ratio = sprintf("%0.2f", $value/$binWidth);
+    my $bin = int($ratio + 0.99) - 1;
+    return ($bin < 0) ? 0 : $bin;
+}
+
+# Add a two-dimensional table of counts into another one, cell by cell.
+#
+#   $_[0] - reference to the source table (array of array refs)
+#   $_[1] - reference to the destination table, modified in place
+#
+# Only true (non-zero, defined) source values are added. Every destination
+# cell that corresponds to a source cell is left defined afterwards (0 if
+# nothing was added). Cells are addressed as scalars; the previous version
+# used one-element array slices, also inside defined().
+sub addTwoArrays {
+    my ($srcRef, $dstRef) = @_;
+    my $row = 0;
+    foreach my $srcRow (@{$srcRef}) {
+        my $col = 0;
+        foreach my $val (@{$srcRow}) {
+            $$dstRef[$row][$col] += $val if($val);
+            $$dstRef[$row][$col] = 0 if(!defined($$dstRef[$row][$col]));
+            $col++;
+        }
+        $row++;
+    }
+    return;
+}
+
+
+# Compute an Nxx statistic (N50 for $n = 50) over a list of lengths.
+#
+#   $_[0] - reference to the list of lengths
+#   $_[1] - the percentage xx
+#
+# Walks the lengths from longest to shortest and returns the first length at
+# which the running total reaches xx percent of the overall total. Returns 0
+# for an empty list.
+sub calcN50 {
+    my ($lengthsRef, $n) = @_;
+    my @longestFirst = sort { $b <=> $a } @{$lengthsRef};
+    my $total = sum(@longestFirst);
+    my $running = 0;
+    foreach my $length (@longestFirst) {
+        $running += $length;
+        return $length if($running >= $total*$n/100);
+    }
+    return 0;
+}
+
+# Return the median of the numbers passed as arguments: the middle value for
+# an odd count, the mean of the two middle values for an even count.
+sub calcMedian {
+    my @ascending = sort { $a <=> $b } @_;
+    my $count = scalar @ascending;
+    my $upperMid = int($count/2);
+    if($count % 2) {
+        return $ascending[$upperMid];
+    }
+    return ($ascending[$upperMid-1] + $ascending[$upperMid])/2;
+}
+
+# Wrap a sequence string into lines of at most 60 characters. Lines are
+# joined with "\n"; there is no trailing newline.
+sub formatSeq {
+    my $seq = $_[0];
+    my $ch = 60;
+    my @lines = ($seq =~ /(.{1,$ch})/gs);
+    return join("\n", @lines);
+}
+
+# Re-flow a whitespace-separated list of quality values into lines of at most
+# 60 values. Values on a line are separated by one space, lines by "\n";
+# there is no trailing whitespace.
+sub formatQualSeq {
+    my $qualSeq = $_[0];
+    my $ch = 60;
+    my @values = split(" ", $qualSeq);
+    my @lines = ();
+    while(my @lineValues = splice(@values, 0, $ch)) {
+        push(@lines, join(" ", @lineValues));
+    }
+    return join("\n", @lines);
+}
+
+# Trim a read at its homopolymer runs.
+#
+#   $_[0] - reference to the sequence; modified in place
+#
+# For each base in the order A, T, G, C (case-insensitive), the first run of
+# at least $homoPolyLen copies of that base is removed together with
+# everything after it on the same line. Each base is tested against the
+# sequence as already shortened by the previous bases. Returns 1 if anything
+# was trimmed, otherwise 0.
+sub hasPolyChar {
+    my $seqRef = $_[0];
+    my $wasTrimmed = 0;
+    foreach my $base ("A", "T", "G", "C") {
+        my $run = $base . "{" . $homoPolyLen . ",}";
+        $wasTrimmed = 1 if($$seqRef =~ s/($run).*//i);
+    }
+    return $wasTrimmed;
+}
+
+# Cut a quality string down to the scores that belong to a trimmed sequence.
+#
+#   $_[0] - whitespace-separated quality scores of the untrimmed read
+#   $_[1] - length of the trimmed sequence (number of scores to keep)
+#   $_[2] - primer/adaptor start position, or -1 for homopolymer trimming
+#
+# When a primer/adaptor was found before position 50 the sequence was cut at
+# its 5' end, so the LAST $_[1] scores are kept; in every other case the
+# FIRST $_[1] scores are kept. The result is joined with single spaces and
+# has no trailing whitespace.
+#
+# The previous version extracted the scores with a regular expression and
+# used $1 without checking that the match had succeeded. That returned a
+# stale capture from some earlier match whenever the pattern failed, e.g.
+# when every score was to be kept and the last one had no trailing
+# whitespace. Slicing the split list cannot fail that way.
+sub trimQualSeq {
+    my ($qualSeq, $seqLen, $priAdaStart) = @_;
+    my @scores = split(" ", $qualSeq);
+    return "" if($seqLen <= 0 || !@scores);
+    $seqLen = scalar(@scores) if($seqLen > @scores);
+    my @kept;
+    if($priAdaStart != -1 && $priAdaStart < 50) {
+        @kept = @scores[(@scores - $seqLen) .. $#scores];
+    }
+    else {
+        @kept = @scores[0 .. $seqLen - 1];
+    }
+    return join(" ", @kept);
+}
+
+# Decide whether a read is of high quality.
+#
+#   $_[0] - whitespace-separated quality scores of the read
+#
+# A read passes when the number of scores that are >= $cutOffPhScore reaches
+# $cutOffReadLen4HQ percent of the number of scores (rounded with "%0.0f").
+# For a passing read that number is also added to $ctotalHQBasesAfterHQ.
+# Returns 1 (pass) or 0 (fail).
+sub isReadOfHQ {
+    my $read = $_[0];
+    my @scores = split(/\s+/, $read);
+    my $cutOffLen = sprintf("%0.0f", scalar(@scores) * $cutOffReadLen4HQ / 100);
+    my $validBaseCount = grep { $_ >= $cutOffPhScore } @scores;
+    unless($validBaseCount >= $cutOffLen) {
+        return 0;
+    }
+    $ctotalHQBasesAfterHQ += $validBaseCount;
+    return 1;
+}
+
+# Summarise the quality string of one read.
+#
+#   $_[0] - whitespace-separated quality scores
+#
+# Returns a three-element array:
+#   [0] number of scores,
+#   [1] number of scores that are >= $cutOffPhScore,
+#   [2] mean score formatted with two decimals (0 when there are no scores).
+sub getQualBases {
+    my $read = $_[0];
+    my @scores = split(/\s+/, $read);
+    my $readLen = scalar @scores;
+    my $qualSum = 0;
+    $qualSum += $_ foreach (@scores);
+    my $validBaseCount = grep { $_ >= $cutOffPhScore } @scores;
+    my $avgQual = 0;
+    $avgQual = sprintf("%0.2f", $qualSum/$readLen) if($readLen);
+    my @retArr = ($readLen, $validBaseCount, $avgQual);
+    return @retArr;
+}
+
+# Print an error message in a boxed banner on STDERR, show the usage text via
+# prtUsage(), and terminate the program.
+#
+#   $_[0] - the message to display
+sub prtError {
+    my $msg = $_[0];
+    my $border = "+======================================================================+\n";
+    print STDERR $border;
+    print STDERR sprintf("|%-70s|\n", " Error:");
+    print STDERR sprintf("|%-70s|\n", " $msg");
+    print STDERR $border;
+    prtUsage();
+    exit;
+}
+
+# Return the last component of a path (the text after the final "/").
+#
+#   $_[0] - a file path
+#
+# Returns "" when the path has no final component (it is empty or ends in
+# "/"). The previous version read $1 without checking the match and could
+# therefore return a capture left over from an unrelated earlier match.
+sub getFileName { # This sub takes a path of a file and returns just its name after separating the path from it.
+    my $path = $_[0];
+    my $name = "";
+    if($path =~ /([^\/]+)$/) {
+        $name = $1;
+    }
+    return $name;
+}
+
+# Return the directory part of a path, always ending in "/".
+#
+#   $_[0] - a file path
+#
+# A path without any "/" yields "./". Everything up to and including the
+# last "/" is returned otherwise. The directory part may be empty, so that
+# "/file" yields "/"; the previous pattern required at least one character
+# before the slash, failed to match such paths and then used a stale $1.
+sub getFilePath {
+    my $name = $_[0];
+    my $path = "./";
+    if($name =~ /(.*)\//) {
+        $path = $1 . "/";
+    }
+    return $path;
+}
+
+
+
+
+# Draw the base-composition bar chart and save it as a PNG file.
+#
+#   $_[0] - data for GD::Graph: [ x labels, raw counts, filtered counts ],
+#           with the counts ordered A, T, G, C, non-ATGC
+#   $_[1] - path of the PNG file to create
+#   $_[2] - input file name, used in the title and the legend
+#   $_[3] - image width in pixels
+#   $_[4] - image height in pixels
+#
+# Below the chart a hand-drawn legend shows the percentage of each base: one
+# row (red swatches) for the raw data and, unless $isOnlyStat is true, a
+# second row (green swatches) for the filtered data.
+#
+# Changes relative to the previous version:
+#   * a data series that sums to zero (e.g. no read survived filtering) no
+#     longer causes a division by zero; its percentages are shown as 0.00;
+#   * when the PNG cannot be created the error is reported and nothing is
+#     written, instead of printing to an unopened handle;
+#   * the ten copy-pasted legend stanzas are generated by a loop. Drawing
+#     order (labels of a row first, then its swatches) is unchanged.
+sub drawBaseComp {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        y_label => 'Count',
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        l_margin => 60,
+        r_margin => 60,
+        b_margin => 50,
+        t_margin => 50,
+        show_values => 1,
+        bar_spacing => 1,
+        values_vertical => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+    $mygraph->set_values_font($f, 6);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    my $black = $myImage->colorAllocate(0,0,0); # To set the color for the next time printing on the image.
+    my $lred = $myImage->colorAllocate(255,0,0);
+    my $dgreen = $myImage->colorAllocate(0,127,0);
+    my $dblue = $myImage->colorAllocate(0,0,127);
+
+    # Title, centred at the top of the image.
+    my $wrapbox = GD::Text::Wrap->new( $myImage,
+        line_space => 4,
+        text => "Base composition for $fileName",
+        color => $dblue,
+    );
+    $wrapbox->set(align => 'center', width => $width);
+    $wrapbox->set_font($f, 11);
+    $wrapbox->draw(0,0);
+
+    # One legend column per base; offsets are relative to the image centre.
+    my @baseLabels = ("A", "T", "G", "C", "Non-ATGC");
+    my @labelXOffsets = (-220, -130, -40, 50, 140);
+
+    # One legend row per plotted series: [series index in $dataRef, y, swatch colour].
+    my @legendRows = ([1, $height-35, $lred]);
+    push(@legendRows, [2, $height-20, $dgreen]) if(!$isOnlyStat);
+
+    foreach my $legendRow (@legendRows) {
+        my ($series, $rowY, $swatchClr) = @{$legendRow};
+        my @counts = @{$$dataRef[$series] || []};
+        my $total = sum(@counts);
+
+        for(my $k=0; $k<@baseLabels; $k++) {
+            my $share = ($total) ? (($counts[$k] || 0)/$total*100) : 0;
+            $wrapbox = GD::Text::Wrap->new( $myImage,
+                line_space => 4,
+                text => $baseLabels[$k] . " (" . (sprintf "%0.2f", $share) . "\%)",
+                color => $black,
+            );
+            $wrapbox->set(align => 'left', width => 300);
+            $wrapbox->set_font($f, 8);
+            $wrapbox->draw($width/2+$labelXOffsets[$k],$rowY);
+        }
+
+        # Colour swatch 10 pixels to the left of each label.
+        for(my $k=0; $k<@baseLabels; $k++) {
+            my $startRectX = $width/2+$labelXOffsets[$k]-10;
+            $myImage->filledRectangle($startRectX,$rowY,$startRectX+8,$rowY+8,$swatchClr);
+            $myImage->rectangle($startRectX,$rowY,$startRectX+8,$rowY+8,$black);
+        }
+    }
+
+    if(open(my $imgFH, ">", $fileNameWPath)) {
+        binmode $imgFH;
+        print $imgFH $myImage->png;
+        close($imgFH);
+    }
+    else {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+    }
+}
+
+
+# Draw the GC-content distribution as a lines-and-points chart and save it as
+# a PNG file.
+#
+#   $_[0] - data for GD::Graph (x labels followed by one or two series)
+#   $_[1] - path of the PNG file to create
+#   $_[2] - input file name, used in the title and the legend
+#   $_[3] - image width in pixels
+#   $_[4] - image height in pixels
+#
+# Changes relative to the previous version: the PNG is written through a
+# lexical handle opened with three-argument open(), nothing is written when
+# that open fails (the error is still reported on STDERR), and the duplicated
+# 'x_labels_vertical' option is given once.
+sub drawGCDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::linespoints->new($width, $height);
+
+    $mygraph->set(
+        x_label => '% GC content',
+        y_label => 'Number of reads',
+        title => "GC content distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        markers => [1],
+        marker_size => 3,
+        dclrs => [ qw(lred dgreen) ],
+        x_labels_vertical => 1,
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef);
+    if(open(my $imgFH, ">", $fileNameWPath)) {
+        binmode $imgFH;
+        print $imgFH $myImage->png;
+        close($imgFH);
+    }
+    else {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+    }
+}
+
+# Draw the QC summary pie chart with a hand-drawn legend and save it as a PNG
+# file.
+#
+#   $_[0] - data for GD::Graph: [ labels, values ]. The legend below reads
+#           the values in this order: 0 = reads shorter than
+#           $lowestValidLen, 1 = homopolymer-trimmed reads,
+#           2 = primer/adaptor-trimmed reads, 3 = other high-quality reads,
+#           4 = low-quality reads
+#   $_[1] - path of the PNG file to create
+#   $_[2] - image width in pixels
+#   $_[3] - image height in pixels
+#
+# Changes relative to the previous version:
+#   * values that sum to zero no longer cause a division by zero; the
+#     percentages are then shown as 0.00;
+#   * the graph object is created with Class->new rather than the
+#     indirect-object form;
+#   * when the PNG cannot be created the error is reported and nothing is
+#     written, instead of printing to an unopened handle;
+#   * the five legend entries are described in one table. They are drawn in
+#     the same order as before: all texts first, then all swatches.
+sub drawSummaryPie {
+    my ($dataRef, $fileNameWPath, $width, $height) = @_;
+    my $mygraph = GD::Graph::pie->new($width, $height);
+
+    $mygraph->set(
+        title => "Summary of quality check and filtering",
+        axislabelclr => 'black',
+        pie_height => 40,
+
+        l_margin => 15,
+        r_margin => 15,
+        b_margin => 50,
+        start_angle => -10,
+        dclrs => [ qw(lred cyan lyellow lgreen purple) ],
+        transparent => 0,
+    ) or warn $mygraph->error;
+
+    $mygraph->set_label_font($f, 8);
+    $mygraph->set_value_font(['verdana', 'arial'],14);
+    $mygraph->set_title_font($f, 11);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    my $black = $myImage->colorAllocate(0,0,0); # To set the color for the next time printing on the image.
+    my $lred = $myImage->colorAllocate(255,0,0);
+    my $lyellow = $myImage->colorAllocate(255,255,0);
+    my $lgreen = $myImage->colorAllocate(0,255,0);
+    my $cyan = $myImage->colorAllocate(0,255,255);
+    my $purple = $myImage->colorAllocate(191,0,191);
+
+    my @values = @{$$dataRef[1] || []};
+    my $sum = sum(@values);
+
+    # Legend layout: two columns (left swatch at x=10, right swatch at
+    # width/2+30, text 10 pixels further right) and three rows.
+    my $leftX = 10;
+    my $rightX = $width/2+30;
+    # [label, index into the data values, swatch x, y, text box width, swatch colour]
+    my @legend = (
+        ["Trashed reads (shorter than $lowestValidLen bp)", 0, $leftX, $height-45, 300, $lred],
+        ["Homopolymer trimmed reads", 1, $rightX, $height-45, 300, $cyan],
+        ["Trashed reads (low quality reads)", 4, $leftX, $height-30, 300, $purple],
+        ["Primer/Adaptor trimmed reads", 2, $rightX, $height-30, 300, $lyellow],
+        ["High quality reads other than homopolymer and primer/adaptor trimmed", 3, $leftX, $height-15, 500, $lgreen],
+    );
+
+    foreach my $entry (@legend) {
+        my ($label, $valueIdx, $swatchX, $rowY, $boxWidth) = @{$entry};
+        my $share = ($sum) ? (($values[$valueIdx] || 0)/$sum*100) : 0;
+        my $wrapbox = GD::Text::Wrap->new( $myImage,
+            line_space => 4,
+            text => sprintf("%s (%0.2f", $label, $share) . "\%)",
+            color => $black,
+        );
+        $wrapbox->set(align => 'left', width => $boxWidth);
+        $wrapbox->set_font($f, 8);
+        $wrapbox->draw($swatchX+10,$rowY);
+    }
+
+    foreach my $entry (@legend) {
+        my ($swatchX, $rowY, $swatchClr) = @{$entry}[2, 3, 5];
+        $myImage->filledRectangle($swatchX,$rowY,$swatchX+8,$rowY+8,$swatchClr);
+        $myImage->rectangle($swatchX,$rowY,$swatchX+8,$rowY+8,$black);
+    }
+
+    if(open(my $imgFH, ">", $fileNameWPath)) {
+        binmode $imgFH;
+        print $imgFH $myImage->png;
+        close($imgFH);
+    }
+    else {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+    }
+
+}
+
+# Draw the average-quality distribution as a bar chart and save it as a PNG
+# file.
+#
+#   $_[0] - data for GD::Graph (x labels followed by one or two series)
+#   $_[1] - path of the PNG file to create
+#   $_[2] - input file name, used in the title and the legend
+#   $_[3] - image width in pixels
+#   $_[4] - image height in pixels
+#
+# Changes relative to the previous version: the PNG is written through a
+# lexical handle opened with three-argument open(), and nothing is written
+# when that open fails (the error is still reported on STDERR).
+sub drawQualDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        x_label => 'Average phred quality score',
+        y_label => 'Number of reads',
+        title => "Quality distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        bar_spacing => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef);
+    if(open(my $imgFH, ">", $fileNameWPath)) {
+        binmode $imgFH;
+        print $imgFH $myImage->png;
+        close($imgFH);
+    }
+    else {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+    }
+}
+
+# Draw the read-length distribution as a bar chart and save it as a PNG file.
+#
+#   $_[0] - data for GD::Graph (x labels followed by one or two series)
+#   $_[1] - path of the PNG file to create
+#   $_[2] - input file name, used in the title and the legend
+#   $_[3] - image width in pixels
+#   $_[4] - image height in pixels
+#
+# Changes relative to the previous version: the PNG is written through a
+# lexical handle opened with three-argument open(), and nothing is written
+# when that open fails (the error is still reported on STDERR).
+sub drawLenDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        x_label => 'Read length (bp)',
+        y_label => 'Number of reads',
+        title => "Length distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_labels_vertical => 1,
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        bar_spacing => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 9);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef);
+    if(open(my $imgFH, ">", $fileNameWPath)) {
+        binmode $imgFH;
+        print $imgFH $myImage->png;
+        close($imgFH);
+    }
+    else {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+    }
+}
+
+
+
+# Print the command-line help to STDOUT. The numbered list of built-in
+# primer/adaptor libraries is generated from @priAdaLibNames; everything else
+# is fixed text.
+sub prtHelp {
+    # Input section, up to the heading of the library list.
+    print(
+        "\n$0 options:\n\n",
+        "### Input reads (FASTA format; .fna and .qual files) (Required)\n",
+        " -i <Read file> <Quality file> <Primer/Adaptor library>\n",
+        " Read and quality file in FASTA format with primer/adaptor library\n",
+        " User may choose from the provided primer/adaptor library or can give a file containing primer/adaptor sequences, one per line\n",
+        " Multiple libraries can be given using multiple '-i' options\n",
+        " For eg.: -i read1.fna read1.qual 3 -i read2.fna read2.qual 2\n\n",
+        " Primer/Adaptor libraries:\n",
+    );
+
+    # Built-in libraries, numbered from 1.
+    my $libNo = 1;
+    foreach my $lib (@priAdaLibNames) {
+        print " $libNo = $lib\n";
+        $libNo++;
+    }
+
+    print(
+        " N = Do not filter for Primer/Adaptor\n",
+        " <File> = File for user defined primer/adaptor sequences, one per line\n",
+        "\n",
+        "### Other options [Optional]\n",
+        " -h | -help\n",
+        " Prints this help\n",
+    );
+
+    print(
+        "--------------------------------- QC Options ---------------------------------\n",
+        " -l | -cutOffReadLen4HQ <Real number, 0 to 100>\n",
+        " The cut-off value for percentage of read length that should be of given quality\n",
+        " default: 70\n",
+        " -s | -cutOffQualScore <Integer, 0 to 40>\n",
+        " The cut-off value for PHRED quality score for high-quality filtering\n",
+        " default: 20\n",
+        " -n | -homoPolyLen <Integer>\n",
+        " Minimum length of the homopolymer to be trimmed (0: to skip the homopolymer trimming)\n",
+        " For eg.: -n 8, will trim the right end of read from the homopolymer of at least 8 bases long\n",
+        " default: 0 (homopolymer trimming is off)\n",
+        " -m | -minLen <Integer>\n",
+        " Filter sequences shorter than the given minimum length\n",
+        " default: 100\n",
+        " -f | -lenFilter <Y/N>\n",
+        " Are sequences to be filtered on the basis of length: (Y)es or (N)o\n",
+        " default: Y\n",
+    );
+
+    print(
+        "----------------------------- Processing Options -----------------------------\n",
+        " -c | -cpus <Integer>\n",
+        " Number of CPUs to be used\n",
+        " default: 1\n",
+        " -onlyStat\n",
+        " Outputs only statistics without filtered data output\n",
+    );
+
+    print(
+        "------------------------------- Output Options -------------------------------\n",
+        " -t | -statOutFmt <Integer>\n",
+        " Output format for statistics\n",
+        " Formats:\n",
+        " 1 = formatted text\n",
+        " 2 = tab delimited\n",
+        " default: 1\n",
+        " -o | -outputFolder <Output folder name/path>\n",
+        " Output will be stored in the given folder\n",
+        " default: By default, output folder (454QC_Filtered_files) will be generated where the input files are\n",
+        " -z | -outputDataCompression <Character>\n",
+        " Output format for HQ filtered data\n",
+        " Formats:\n",
+        " t = text FASTA files\n",
+        " g = gzip compressed files\n",
+        " default: t\n",
+        "\n",
+    );
+}
+
+# Print the one-line usage banner followed by the full option help.
+sub prtUsage {
+    printf("\nUsage: perl %s <options>\n", $0);
+    prtHelp();
+}
+
+
+# Look for a primer/adaptor in a read and cut it off.
+#
+#   $_[0] - reference to the sequence; chomped and trimmed in place
+#
+# The candidates come from @usrDefinedPriAda when $priAdaLib is "u",
+# otherwise from the built-in table selected by the numeric $priAdaLib. They
+# are tried in order through findSeq(); the first one for which findSeq()
+# returns a true position wins:
+#   * position < 50: the read is cut at its 5' end, keeping everything from
+#     position + $substrlen onwards;
+#   * otherwise: the read is cut at its 3' end, keeping everything before
+#     the position.
+# Returns the position reported by findSeq() and counts the read in
+# $ctotalValidReadsWithPriAda, or returns -1 and counts it in
+# $ctotalValidReadsNoPriAda when nothing matched.
+#
+# Relative to the previous version the never-used locals @stat and $priInd
+# were removed and the match loop returns directly.
+sub isWOPriAda {
+    my $seq = $_[0];
+    chomp($$seq);
+
+    my @rapid = (
+        "CCATCTCATCCCTGCGTGTC",
+        "CCATCTCATCCCTGCGTGTCTCCGACTCAG",
+        "CTGAGTCGGAGA",
+        "CCTATCCCCTGTGTGCCTTG",
+        "CCTATCCCCTGTGTGCCTTGGCAGTCTCAG",
+        "CTGAGACTGCCA",
+    );
+
+    my @arrPE = (
+        "GCCTCCCTCGCGCCATCAG",
+        "CTGATGGCGCGAGGG",
+        "GCCTTGCCAGCCCGCTCAG",
+        "CTGAGCGGGCTGGCA",
+        "GCCTCCCTCGCGCCA",
+        "GCCTTGCCAGCCCGC",
+        "CCATCTCATCCCTGCGTGTC",
+        "CCTATCCCCTGTGTGCCTTG",
+    );
+
+    my @arrAmplicon = (
+        "CGTATCGCCTCCCTCGCGCCATCAG",
+        "CGTATCGCCTCCCTCGCGCCATCAG",
+        "CCATCTCATCCCTGCGTGTC",
+        "CCTATCCCCTGTGTGCCTTG",
+    );
+
+    my @arrsmRna = (
+        "GCCTCCCTCGCGCCATCAGTATCGTAGGCACCTGAGA",
+        "GCCTTGCCAGCCCGCTCAGTATTGATGGTGCCTACAG",
+        "CCATCTCATCCCTGCGTGTC",
+        "CCTATCCCCTGTGTGCCTTG",
+    );
+
+    my @priAdas = (\@rapid, \@arrPE, \@arrAmplicon, \@arrsmRna);
+
+    # Primer fragments that findSeq() has already searched for in THIS read
+    # without success; it uses the hash to skip fragments shared by several
+    # primers.
+    my %checkedPriStr = ();
+
+    my @priAdaSeqs = ();
+    if($priAdaLib eq "u") {
+        @priAdaSeqs = @usrDefinedPriAda;
+    }
+    else {
+        @priAdaSeqs = @{$priAdas[$priAdaLib]};
+    }
+
+    foreach my $priAda (@priAdaSeqs) {
+        my $priAdaStart = findSeq($priAda, $$seq, \%checkedPriStr);
+        next if(!$priAdaStart);
+        if($priAdaStart < 50) {
+            $$seq = substr($$seq, $priAdaStart+$substrlen);
+        }
+        else {
+            $$seq = substr($$seq, 0, $priAdaStart);
+        }
+        $ctotalValidReadsWithPriAda++;
+        return $priAdaStart;
+    }
+
+    $ctotalValidReadsNoPriAda++;
+    return -1;
+}
+
+# Search a read for one primer/adaptor.
+#
+#   $_[0] - primer/adaptor sequence
+#   $_[1] - read sequence
+#   $_[2] - reference to a hash of primer fragments already searched for in
+#           this read without success (updated here)
+#
+# Only the first and the last $substrlen bases of the primer (the whole
+# primer if it is shorter) are searched for, and only within the first and
+# the last 50 bases of the read. A candidate is accepted when
+# String::Approx::amatch() finds it with at most one substitution and
+# findStart() can locate it.
+#
+# Returns the 0-based start position of the hit within the read as a TRUE
+# value, or 0 when nothing was found. A hit at the very first base is
+# returned as the string "0E0", which is numerically 0 but true, so that
+# callers testing the result for truth no longer mistake it for "not found".
+#
+# Other fixes relative to the previous version:
+#   * positions in the 3' window were offset by length-50 even for reads
+#     shorter than 50 bases, although the window then starts at 0; the real
+#     window start is used now;
+#   * when amatch() reported a hit that findStart() could not locate, the
+#     window offset alone was returned as if it were a match position (or the
+#     search stopped early); such a window is now skipped.
+sub findSeq {
+    my ($pri, $seq, $hashRef) = @_;
+    my $subsl = $substrlen;
+    $subsl = length $pri if(length($pri) < $substrlen);
+    my $spri = substr($pri, 0, $subsl);
+    my $epri = substr($pri, (length $pri) - $subsl, $subsl);
+    my $sseq = substr($seq, 0, 50);
+    my $tmpInd = (length $seq) - 50;
+    $tmpInd = 0 if($tmpInd < 0);
+    my $eseq = substr($seq, $tmpInd, 50);
+
+    # Each window is [text, offset of the window within the read].
+    my @windows = ([$sseq, 0], [$eseq, $tmpInd]);
+
+    foreach my $probe ($spri, $epri) {
+        next if(defined($$hashRef{$probe}));
+        foreach my $window (@windows) {
+            my ($text, $offset) = @{$window};
+            my @catches = String::Approx::amatch($probe, ['I0 D0 S1'], $text);
+            next if(@catches == 0);
+            my $start = findStart($text, $probe);
+            next if(!defined($start) || $start eq "");
+            my $position = $start + $offset;
+            return ($position == 0) ? "0E0" : $position;
+        }
+        $$hashRef{$probe} = 1;
+    }
+    return 0;
+}
+
+use re qw(eval);
+use vars qw($matchStart);
+
+# Locate the first approximate occurrence of a pattern in a string.
+#
+#   $_[0] - the text to search
+#   $_[1] - the sequence to look for; turned into a mismatch-tolerant regex
+#           by fuzzy_pattern() using $mismLim
+#
+# Returns the 0-based start offset of the leftmost match, or nothing (undef
+# in scalar context) when there is no match.
+#
+# The previous version recorded the start offset with an embedded (?{ })
+# code block and had statements after its 'return' that could never run. The
+# start offset of the last successful match is available as $-[0], which
+# gives the same value without run-time code in the pattern.
+sub findStart {
+    my ($text, $pattern) = @_;
+    $pattern = fuzzy_pattern($pattern, $mismLim);
+    if($text =~ /$pattern/) {
+        return $-[0];
+    }
+    return;
+}
+
+# Build a compiled regex that matches the given sequence while tolerating up
+# to the given number of mismatching bases. Dies when that number is not
+# greater than or equal to zero. The pattern text itself comes from
+# make_approximate().
+sub fuzzy_pattern {
+    my ($original_pattern, $mismatches_allowed) = @_;
+    unless($mismatches_allowed >= 0) {
+        die "Number of mismatches must be greater than or equal to zero\n";
+    }
+    my $approximate = make_approximate($original_pattern, $mismatches_allowed);
+    return qr/$approximate/;
+}
+
+# Recursively expand a sequence into regex source text that tolerates up to
+# $mismatches_allowed substituted bases.
+#
+#   * no mismatches left: the pattern is returned unchanged;
+#   * pattern no longer than the remaining budget: every A/C/T/G becomes ".";
+#   * otherwise the first character either matches exactly (budget unchanged
+#     for the rest) or, if it is one of A/C/G/T, is replaced by "." (budget
+#     reduced by one for the rest). Characters other than A/C/G/T are always
+#     kept literally.
+sub make_approximate {
+    my ($pattern, $mismatches_allowed) = @_;
+    return $pattern if($mismatches_allowed == 0);
+
+    if(length($pattern) <= $mismatches_allowed) {
+        $pattern =~ tr/ACTG/./;
+        return $pattern;
+    }
+
+    my ($head, $tail) = $pattern =~ /^(.)(.*)/;
+    my $exact_tail = make_approximate($tail, $mismatches_allowed);
+    if($head !~ /[ACGT]/) {
+        return "$head$exact_tail";
+    }
+    my $relaxed_tail = make_approximate($tail, $mismatches_allowed-1);
+    return "(?:$head$exact_tail|.$relaxed_tail)";
+}
diff --git a/QC/IlluQC.pl b/QC/IlluQC.pl
new file mode 100644
index 0000000..d8cfb0b
--- /dev/null
+++ b/QC/IlluQC.pl
@@ -0,0 +1,2328 @@
+#! /usr/bin/perl
+
+use File::Basename;
+#BEGIN {
+# my ($tmp, $path) = fileparse($0);
+# push ( @INC,"$path/lib");
+# #use lib "$path";
+#}
+use strict;
+use warnings;
+use Getopt::Long;
+use List::Util qw(sum min max);
+use Cwd qw(abs_path);
+use IO::Zlib;
+use FindBin qw($RealBin);
+use lib "$RealBin/lib";
+require "html.pl";
+
+ # Load the non-core modules at run time so that a missing module can be
+ # reported with an actionable message. The eval stops at the first require
+ # that fails, so $@ names exactly one missing module.
+ eval {
+     require Parallel::ForkManager;
+     require String::Approx;
+     require GD::Graph::linespoints;
+     require GD::Graph::bars;
+     require GD::Graph::pie;
+     require GD::Text::Wrap;
+ };
+
+ my $isGDMod = 1;    # Cleared when the GD graphing modules cannot be loaded.
+
+ if($@) {
+     my $errorText = join("", $@);
+     if($errorText =~ /Parallel/) {
+         print STDERR "Error:\n\tCan not find 'lib' folder with this perl program\n"; #module 'Parallel::ForkManager'\n";
+         print STDERR "\tCopy the 'lib' folder, provided with the toolkit, to the directory where this perl program is and try again\n\n";
+         exit 1;
+     }
+     elsif($errorText =~ /String\/Approx/) {
+         print STDERR "Error:\n\tCan not find module 'String::Approx'\n";
+         print STDERR "\tInstall it and try again\n\n";
+         exit 1;
+     }
+     # Any of the GD modules may be the one that is missing, not only
+     # GD::Graph::linespoints; all of them are needed for drawing.
+     elsif($errorText =~ /GD\/(?:Graph|Text)\//) {
+         print STDERR "Warning:\n\tCan not find module 'GD::Graph'\n";
+         print STDERR "\tGraphs for statistics will not be produced. \n\t\t\tOR \n\tInstall GD::Graph module and try again.\n\n";
+         $isGDMod = 0;
+     }
+ }
+
+
+ # Stat variables.
+ # These file-level counters are reset at the top of the per-input loop and
+ # filled in by the process*/isReadOfHQ subs. For the two-element arrays,
+ # index 0 is the forward (or only) file and index 1 the reverse file.
+ my @totalBases = (0, 0);
+ my @totalHQBases = (0, 0); ### This is for HQ reads, does not include NO_VEC stat.
+ my @totalBasesAfterHQ = (0, 0);
+ my @totalHQBasesAfterHQ = (0, 0);
+ my @totalBasesFinal = (0, 0);
+ my @totalHQBasesFinal = (0, 0);
+ my @totalReadsAfterHQ = (0, 0); ### This is for HQ reads, does not include NO_VEC stat.
+ my @totalReads = (0, 0);
+ my @totalValidReadsNoPriAda = (0, 0);
+ my @totalValidReadsWithPriAda = (0, 0);
+ my @minLen = (1000, 1000, 1000, 1000);
+ my @maxLen = (0, 0, 0, 0);
+ my @positionSpecificBaseCount = ();
+ my @positionSpecificBaseCountHQ = (); #### This is for final output reads only... Which may include NO_VEC according to the user input.
+ my @positionSpecificBaseCountWithRanges = ();
+ my @positionSpecificBaseCountHQWithRanges = ();
+ my @totalReadsFinal = ();
+ my @fileName = ();
+ my @outFileName = ();
+ my @readsWithN = (0, 0, 0, 0);
+ my @totalNs = (0, 0, 0, 0);
+ my @totalTrimmedReads = (0, 0);
+ my @priAdaLibNames = ("Genomic DNA/Chip-seq Library", "Paired End DNA Library", "DpnII gene expression Library", "NlaIII gene expression Library", "Small RNA Library", "Multiplexing DNA Library");
+ my $nLines = 0;
+ my $seqFormat = 0; # 1: Sanger; 2: Solexa; 3: Illumina 1.3+; 4: Illumina 1.5+; 5: Illumina 1.8+
+ my $subVal = 0; # ASCII offset subtracted from quality characters. 33: Sanger, Illumina 1.8+; 64: Solexa, Illumina 1.3+/1.5+
+ my @qualDistribRaw = ();
+ my @qualDistribFinal = ();
+ my $qualDistribInterval = 1;
+ my @qualLabel = ();
+ my @gcDistribRaw = ();
+ my @gcDistribFinal = ();
+ my $gcDistribInterval = 5;
+ my @gcLabel = ();
+ my @baseCountRaw = ();
+ my @baseCountFinal = ();
+ my @charCountRaw = ();
+ my @charCountFinal = ();
+
+ #my @monoRepeat = (); ### Poly A, Poly T, Poly G, Poly C
+ #my @diRepeat = ();
+ #my @triRepeat = ();
+ #my @tetraRepeat = ();
+
+ # Font files are looked up relative to the directory part of $0.
+ my $font_spec = getFilePath($0) . "lib/Fonts/Dustismo_Sans.ttf";
+ my $f = getFilePath($0) . "lib/Fonts/LucidaSansDemiBold.ttf";
+
+
+ # Misc variables.
+ my $isPairedEnd = 0;
+ my $outFolder = "";
+ my $substrlen = 20; # For removePriAda
+ my $mismLim = 1; # For removePriAda
+ my $indOfAnalysis = 0;
+
+ # Parameter variables.
+ my @peFiles = ();
+ my @seFiles = ();
+ my @allFiles = ();
+ my $noOfInp4PE = 4; # Values consumed by each -pe option.
+ my $noOfInp4SE = 3; # Values consumed by each -se option.
+ my $isOnlyStat;
+ my $priAdaLib = "";
+ my $cutOffReadLen4HQ = 70; # Percentage of a read's bases that must reach $cutOffPhScore (see isReadOfHQ).
+ my $cutOffPhScore = 20; # Minimum quality score for a base to count as high quality (see isReadOfHQ).
+ #my $trimAfterUnknownCall;
+ my $noOfProcesses = 1;
+ my $helpAsked;
+ my $statOutFmt = 1; # 1: Text format; 2: Tab-delimited format.
+ my $priAdaFile;
+ my @usrDefinedPriAda = ();
+ my $outputDataFmt = "t"; # t/T: Text; g/G: Gzip.
+ # Parse the command line. Each -pe occurrence consumes $noOfInp4PE values and
+ # each -se occurrence consumes $noOfInp4SE values; the values of all
+ # occurrences are appended to @peFiles / @seFiles in order.
+ GetOptions(
+     "pe=s{$noOfInp4PE}" => \@peFiles,
+     "se=s{$noOfInp4SE}" => \@seFiles,
+     "h|help" => \$helpAsked,
+     "l|cutOffReadLen4HQ=f" => \$cutOffReadLen4HQ,
+     "o|outputFolder=s" => \$outFolder,
+     "z|outputDataCompression=s" => \$outputDataFmt,
+     "t|statOutFmt=i" => \$statOutFmt,
+     "onlyStat" => \$isOnlyStat,
+     "p|processes=i" => \$noOfProcesses,
+     "s|cutOffQualScore=i" => \$cutOffPhScore,
+ );
+ if($helpAsked) {
+     prtUsage();
+     exit;
+ }
+ if(@peFiles == 0 && @seFiles == 0) {
+     prtError("No input files are provided");
+ }
+ # Validating inputs
+ # Each group of option values is checked and then collapsed into a single
+ # space-separated string; the main loop later splits that string again.
+ my @tempFiles = ();
+ prtError("Missing inputs for paired-end files") if((scalar @peFiles)%$noOfInp4PE != 0);
+ for(my $i=0; $i<@peFiles; $i+=$noOfInp4PE) {
+     my $str = "$peFiles[$i] $peFiles[$i+1] $peFiles[$i+2] $peFiles[$i+3]";
+     # Third value: primer/adaptor library (a single digit is range-checked;
+     # anything else is accepted here and interpreted by the main loop).
+     if($peFiles[$i+2] =~ /^-/) {
+         prtError("Missing inputs for paired-end files: at '-pe $str'")
+     }
+     if($peFiles[$i+2] =~ /^\d$/) {
+         if($peFiles[$i+2] < 1 || $peFiles[$i+2] > 6) {
+             prtError("Incorrect option for Primer/Adaptor library: at '-pe $str'");
+         }
+     }
+     # Fourth value: FASTQ variant, either a number 1-5 or a value containing
+     # the letter 'a'.
+     if($peFiles[$i+3] =~ /^-/) {
+         prtError("Missing inputs for paired-end files: at '-pe $str'")
+     }
+     if($peFiles[$i+3] !~ /\d/ && $peFiles[$i+3] !~ /a/i) {
+         prtError("Incorrect option for FASTQ variant: at '-pe $str'")
+     }
+     if($peFiles[$i+3] !~ /a/i) {
+         if($peFiles[$i+3] < 1 || $peFiles[$i+3] > 5) {
+             prtError("Incorrect option for FASTQ variant: at '-pe $str'");
+         }
+     }
+     push(@tempFiles, $str);
+ }
+ @peFiles = @tempFiles;
+ @tempFiles = ();
+ prtError("Missing inputs for single-end files") if((scalar @seFiles)%$noOfInp4SE != 0);
+ for(my $i=0; $i<@seFiles; $i+=$noOfInp4SE) {
+     my $str = "$seFiles[$i] $seFiles[$i+1] $seFiles[$i+2]";
+     # Second value: primer/adaptor library.
+     if($seFiles[$i+1] =~ /^-/) {
+         prtError("Missing inputs for single-end files: at '-se $str'")
+     }
+     if($seFiles[$i+1] =~ /^\d$/i) {
+         if($seFiles[$i+1] < 1 || $seFiles[$i+1] > 6) {
+             prtError("Incorrect option for Primer/Adaptor library: at '-se $str'");
+         }
+     }
+     # Third value: FASTQ variant.
+     if($seFiles[$i+2] =~ /^-/) {
+         prtError("Missing inputs for single-end files: at '-se $str'")
+     }
+     if($seFiles[$i+2] !~ /\d/ && $seFiles[$i+2] !~ /a/i) {
+         prtError("Incorrect option for FASTQ variant: at '-se $str'")
+     }
+     if($seFiles[$i+2] !~ /a/i) {
+         if($seFiles[$i+2] < 1 || $seFiles[$i+2] > 5) {
+             prtError("Incorrect option for FASTQ variant: at '-se $str'");
+         }
+     }
+     push(@tempFiles, $str);
+ }
+ @seFiles = @tempFiles;
+ @tempFiles = ();
+ if($cutOffReadLen4HQ < 0 || $cutOffReadLen4HQ > 100) {
+     prtError("Incorrect value for -l|cutOffReadLen4HQ option: at '-l $cutOffReadLen4HQ'");
+ }
+ if($statOutFmt < 1 || $statOutFmt > 2) {
+     prtError("Incorrect value for -statOutFmt: at '-statOutFmt $statOutFmt'");
+ }
+ if($outputDataFmt !~ /^[tg]$/i) {
+     # The option is declared above as z|outputDataCompression.
+     prtError("Incorrect value for -z|outputDataCompression option: at '-z $outputDataFmt'");
+ }
+
+ # Main driver: every validated input set is analysed in its own forked worker
+ # (at most $noOfProcesses at a time). The parent resets the statistics,
+ # decodes the primer/adaptor library and FASTQ variant, then forks; the child
+ # filters the reads, draws the graphs, and writes the stat and HTML reports.
+ my $pm = Parallel::ForkManager->new($noOfProcesses);
+
+ @allFiles = (@peFiles, @seFiles);
+ my $pid;
+
+ foreach my $file (@allFiles) {
+     # Reset all per-input statistics.
+     @totalBases = (0, 0);
+     @totalHQBases = (0, 0);
+     @totalBasesAfterHQ = (0, 0);
+     @totalHQBasesAfterHQ = (0, 0);
+     @totalBasesFinal = (0, 0);
+     @totalHQBasesFinal = (0, 0);
+     @totalReadsAfterHQ = (0, 0);
+     @totalReads = (0, 0);
+     @totalValidReadsNoPriAda = (0, 0);
+     @totalValidReadsWithPriAda = (0, 0);
+     @minLen = (1000, 1000, 1000, 1000);
+     @maxLen = (0, 0, 0, 0);
+     @positionSpecificBaseCount = ();
+     @positionSpecificBaseCountHQ = ();
+     @positionSpecificBaseCountWithRanges = ();
+     @positionSpecificBaseCountHQWithRanges = ();
+     @totalReadsFinal = ();
+     @fileName = ();
+     @outFileName = ();
+     @readsWithN = (0, 0, 0, 0);
+     @totalNs = (0, 0, 0, 0);
+     @totalTrimmedReads = (0, 0);
+     @qualDistribRaw = ();
+     @qualDistribFinal = ();
+     @qualLabel = ();
+     @gcDistribRaw = ();
+     @gcDistribFinal = ();
+     @gcLabel = ();
+     @charCountRaw = ();
+     @charCountFinal = ();
+     $priAdaFile = "";
+     @usrDefinedPriAda = ();
+
+     $file =~ s/\\([A-Za-z_\.])/\/$1/g; # To remove '\' from the path of windows file
+     $isPairedEnd = 0;
+     my @inpData = split(/\s+/, $file);
+     # Second-to-last value: 'n' (no screening), a library number, or the path
+     # of a user-defined primer/adaptor file.
+     if($inpData[$#inpData-1] =~ /^n$/i) {
+         undef $priAdaLib;
+     }
+     elsif($inpData[$#inpData-1] =~ /^\d$/) {
+         $priAdaLib = $inpData[$#inpData-1] - 1;
+     }
+     else {
+         $priAdaLib = "u";
+         $priAdaFile = $inpData[$#inpData-1];
+         open(PRIADA, "<", $priAdaFile) or die "Can not open the user-defined primer/adapter file: $priAdaFile\n";
+         @usrDefinedPriAda = <PRIADA>;
+         close(PRIADA);
+         for(my $i=0; $i<=$#usrDefinedPriAda; $i++) {
+             $usrDefinedPriAda[$i] =~ s/\s+//g;
+         }
+     }
+     # Last value: FASTQ variant (1-5, or a value containing 'a').
+     $seqFormat = $inpData[$#inpData];
+
+     $indOfAnalysis++;
+     $pid = $pm->start and next;
+     # ---- Everything below runs in the forked worker. ----
+     print "Analysis has been started for \"$file\": Index: $indOfAnalysis\n";
+     # Display names of the FASTQ variants, indexed by variant number.
+     my @seqFormatNames = (undef, "Sanger", "Solexa", "Illumina 1.3+", "Illumina 1.5+", "Illumina 1.8+");
+     my $statFile = "";
+     my $outFile1;
+     my $outFile2;
+     my $unPaired;
+     my $outFile;
+
+     # The output folder is needed in every mode: even with -onlyStat the
+     # statistics file, graphs and HTML report are written into it.
+     $outFolder = getFilePath($inpData[0]) . "IlluQC_Filtered_files" if($outFolder eq "");
+     $outFolder .= "/" if($outFolder !~ /\/$/);
+     if(! -e $outFolder) {
+         # Another worker may create the folder between the test and mkdir.
+         mkdir($outFolder) or -d $outFolder or die "Can not create output folder: $outFolder\n";
+     }
+
+     if((scalar @inpData) == $noOfInp4PE) {
+         @fileName = ($inpData[0], $inpData[1]);
+         $outFile1 = $outFolder . getFileName($fileName[0]) . "_filtered";
+         $outFile2 = $outFolder . getFileName($fileName[1]) . "_filtered";
+         $outFile1 .= ".gz" if($outputDataFmt =~ /g/i);
+         $outFile2 .= ".gz" if($outputDataFmt =~ /g/i);
+         $statFile = $outFolder . getFileName($fileName[0]) . "_" . getFileName($fileName[1]) . "_stat";
+         $outFileName[0] = $outFile1;
+         $outFileName[1] = $outFile2;
+         if($seqFormat =~ /a/i) {
+             # Automatic detection: checkFastQFormat overwrites $seqFormat.
+             print "$indOfAnalysis: Checking FASTQ format: File $fileName[0]...\n";
+             $nLines = checkFastQFormat($fileName[0], 1);
+             print "$indOfAnalysis: Checking FASTQ format: File $fileName[1]...\n";
+             if($nLines != checkFastQFormat($fileName[1], 1)) {
+                 prtErrorExit("Number of reads in paired end files are not same.\n\t\tFiles: $fileName[0], $fileName[1]");
+             }
+             if($seqFormat >= 1 && $seqFormat <= 5) {
+                 $subVal = ($seqFormat == 1 || $seqFormat == 5) ? 33 : 64;
+                 print "$indOfAnalysis: Input FASTQ file format: $seqFormatNames[$seqFormat]\n";
+             }
+         }
+         else {
+             $nLines = checkFastQFormat($fileName[0], 0);
+             if($nLines != checkFastQFormat($fileName[1], 0)) {
+                 prtErrorExit("Number of reads in paired end files are not same.\n\t\tFiles: $fileName[0], $fileName[1]");
+             }
+             if($seqFormat == 1 || $seqFormat == 5) {
+                 $subVal = 33;
+             }
+             else {
+                 $subVal = 64;
+             }
+         }
+         print "$indOfAnalysis: Processing input files...\n";
+         $unPaired = getFilePath($outFile1) . getFileName($fileName[0]) . "_" . getFileName($fileName[1]) . "_unPaired_HQReads";
+         $unPaired .= ".gz" if($outputDataFmt =~ /g/i);
+         my $t = sprintf("%0.0f", $nLines/4);
+         print "$indOfAnalysis: Number of reads processed: " . "0/$t (0\%)...\n";
+         processPairedEndFiles($fileName[0], $fileName[1], $outFile1, $outFile2, $unPaired);
+         print "$indOfAnalysis: Number of reads processed: " . "$totalReads[0]/$totalReads[0] (100\%)...\n";
+         $isPairedEnd = 1;
+     }
+     else {
+         $fileName[0] = $inpData[0]; #$arg;
+         $outFile = $outFolder . getFileName($fileName[0]) . "_filtered";
+         $outFile .= ".gz" if($outputDataFmt =~ /g/i);
+         $outFileName[0] = $outFile;
+         $statFile = $outFolder . getFileName($fileName[0]) . "_stat";
+         if($seqFormat =~ /a/i) {
+             # Automatic detection: checkFastQFormat overwrites $seqFormat.
+             print "$indOfAnalysis: Checking FASTQ format: File $fileName[0]...\n";
+             $nLines = checkFastQFormat($fileName[0], 1);
+             if($seqFormat >= 1 && $seqFormat <= 5) {
+                 $subVal = ($seqFormat == 1 || $seqFormat == 5) ? 33 : 64;
+                 print "$indOfAnalysis: Input FASTQ file format: $seqFormatNames[$seqFormat]\n";
+             }
+         }
+         else {
+             $nLines = checkFastQFormat($fileName[0], 0);
+             if($seqFormat == 1 || $seqFormat == 5) {
+                 $subVal = 33;
+             }
+             else {
+                 $subVal = 64;
+             }
+         }
+         print "$indOfAnalysis: Processing input files...\n";
+         my $t = sprintf("%0.0f", $nLines/4);
+         print "$indOfAnalysis: Number of reads processed: " . "0/$t (0\%)...\n";
+         processSingleEndFiles($fileName[0], $outFile);
+         print "$indOfAnalysis: Number of reads processed: " . "$totalReads[0]/$totalReads[0] (100\%)...\n";
+         $isPairedEnd = 0;
+     }
+
+     print "$indOfAnalysis: Analysis completed\n";
+
+     print "$indOfAnalysis: Printing Statistics...\n";
+
+     # Names of the graph files. Variables are declared first and assigned
+     # conditionally afterwards, because "my ... if ..." has undefined
+     # behaviour in Perl.
+     my $qualDistF1 = getFileName($fileName[0])."_qualDistribution.png";
+     my $qualDistF2;
+     $qualDistF2 = getFileName($fileName[1])."_qualDistribution.png" if($isPairedEnd);
+     my $sumPieF;
+     $sumPieF = getFileName($fileName[0]). "_summary.png";
+     $sumPieF = getFileName($fileName[0]). "_" . getFileName($fileName[1]) ."_summary.png" if($isPairedEnd);
+     my $gcDistF1 = getFileName($fileName[0])."_gcDistribution.png";
+     my $gcDistF2;
+     $gcDistF2 = getFileName($fileName[1])."_gcDistribution.png" if($isPairedEnd);
+     my $baseCntF1 = getFileName($fileName[0])."_baseCompostion.png";
+     my $baseCntF2;
+     $baseCntF2 = getFileName($fileName[1])."_baseCompostion.png" if($isPairedEnd);
+     my $avgQF1 = getFileName($fileName[0]) . "_avgQual.png";
+     my $avgQF2;
+     $avgQF2 = getFileName($fileName[1]) . "_avgQual.png" if($isPairedEnd);
+     my $QRangeRawF1 = getFileName($fileName[0]) . "_QualRangePerBase.png";
+     my $QRangeFilteredF1;
+     $QRangeFilteredF1 = getFileName($outFileName[0]) . "_QualRangePerBase.png" if(!defined($isOnlyStat));
+     my $QRangeF1 = "$QRangeRawF1";
+     $QRangeF1 .= ":::$QRangeFilteredF1" if(!defined($isOnlyStat));
+     my $QRangeF2;
+     if($isPairedEnd) {
+         my $QRangeRawF2 = getFileName($fileName[1]) . "_QualRangePerBase.png";
+         my $QRangeFilteredF2;
+         $QRangeFilteredF2 = getFileName($outFileName[1]) . "_QualRangePerBase.png" if(!defined($isOnlyStat));
+         $QRangeF2 = "$QRangeRawF2";
+         $QRangeF2 .= ":::$QRangeFilteredF2" if(!defined($isOnlyStat));
+     }
+
+     # Axis labels for the quality distribution: one label per bin of the
+     # first file's distribution.
+     my $c=0;
+     foreach my $ref (@qualDistribRaw) {
+         my $str = "";
+         foreach my $val (@{$ref}) {
+             if($c == 0) {
+                 $str = "0";
+                 $str .= "-$qualDistribInterval" if($qualDistribInterval>1);
+             }
+             else {
+                 $str = $qualDistribInterval*$c;
+                 $str .= "-" . $qualDistribInterval*($c+1) if($qualDistribInterval>1);
+             }
+             $c++;
+             push(@qualLabel, $str);
+         }
+         last;
+     }
+     my @file1 = (\@qualLabel, $qualDistribRaw[0]);
+     my @file2;
+     @file2 = (\@qualLabel, $qualDistribRaw[1]) if($isPairedEnd);
+     if(!$isOnlyStat) {
+         push(@file1, $qualDistribFinal[0]);
+         push(@file2, $qualDistribFinal[1]) if($isPairedEnd);
+     }
+     if($isGDMod) {
+         drawQualDist(\@file1, $outFolder.$qualDistF1, getFileName($fileName[0]), 650, 350);
+         drawQualDist(\@file2, $outFolder.$qualDistF2, getFileName($fileName[1]), 650, 300) if($isPairedEnd);
+     }
+
+     my $readsWPriAda = $totalReadsAfterHQ[0] - $totalReadsFinal[0]; # For Paired end, different number of contaminated sequences will be filtered in both the files. And we have to report total reads contaminated including both end files.
+     my $readsLowQual = $totalReads[0] - $readsWPriAda - $totalReadsFinal[0];
+
+     @file1 = (["", "", ""], [$readsWPriAda, $totalReadsFinal[0], $readsLowQual]);
+     if($isGDMod) {
+         drawSummaryPie(\@file1, $outFolder.$sumPieF, 500, 350);
+     }
+
+     # Axis labels for the GC-content distribution.
+     $c=0;
+     foreach my $ref (@gcDistribRaw) {
+         foreach my $val (@{$ref}) {
+             my $str = "";
+             if($c == 0) {
+                 $str = "0-$gcDistribInterval";
+             }
+             else {
+                 $str = $gcDistribInterval*$c . "-" . $gcDistribInterval*($c+1);
+             }
+             $c++;
+             push(@gcLabel, $str);
+         }
+         last;
+     }
+
+     @file1 = (\@gcLabel, $gcDistribRaw[0]);
+     @file2 = (\@gcLabel, $gcDistribRaw[1]) if($isPairedEnd);
+     if(!$isOnlyStat) {
+         push(@file1, $gcDistribFinal[0]);
+         push(@file2, $gcDistribFinal[1]) if($isPairedEnd);
+     }
+
+     if($isGDMod) {
+         drawGCDist(\@file1, $outFolder.$gcDistF1, getFileName($fileName[0]), 550, 350);
+         drawGCDist(\@file2, $outFolder.$gcDistF2, getFileName($fileName[1]), 550, 350) if($isPairedEnd);
+     }
+
+     @file1 = (["A", "T", "G", "C", "Non-ATGC"], $charCountRaw[0]);
+     @file2 = (["A", "T", "G", "C", "Non-ATGC"], $charCountRaw[1]) if($isPairedEnd);
+     if(!$isOnlyStat) {
+         @file1 = (["A", "T", "G", "C", "Non-ATGC"], $charCountRaw[0], $charCountFinal[0]);
+         @file2 = (["A", "T", "G", "C", "Non-ATGC"], $charCountRaw[1], $charCountFinal[1]) if($isPairedEnd);
+     }
+     if($isGDMod) {
+         drawBaseComp(\@file1, $outFolder.$baseCntF1, getFileName($fileName[0]), 500, 300);
+         drawBaseComp(\@file2, $outFolder.$baseCntF2, getFileName($fileName[1]), 500, 300) if($isPairedEnd);
+     }
+
+     open(STAT, ">", $statFile) or die "Can not create statistics file $statFile\n";
+     printStat(*STAT) if($statOutFmt == 1);
+     printStatTab(*STAT) if($statOutFmt == 2);
+     close(STAT);
+
+     my $iFol = getFilePath(abs_path($fileName[0]));
+     my $oFol = abs_path($outFolder) . "/";
+     my $inpFs = getFileName($fileName[0]);
+     my $seqFormatName;
+     $inpFs .= ":::::" . getFileName($fileName[1]) if($isPairedEnd);
+     my $htF = $oFol . "output_" . getFileName($fileName[0]);
+     $htF .= "_" . getFileName($fileName[1]) if($isPairedEnd);
+     $htF .= ".html";
+     # Covers all five variants, including Illumina 1.8+ (variant 5).
+     if($seqFormat >= 1 && $seqFormat <= 5) {
+         $seqFormatName = $seqFormatNames[$seqFormat];
+     }
+     my @fileNames4HTML;
+     @fileNames4HTML = ($outFile, $avgQF1, $baseCntF1, $gcDistF1, $qualDistF1, $sumPieF, $QRangeF1);
+     @fileNames4HTML = ($outFile1, $outFile2, $unPaired, $avgQF1, $avgQF2, $baseCntF1, $baseCntF2, $gcDistF1, $gcDistF2, $qualDistF1, $qualDistF2, $sumPieF, $QRangeF1, $QRangeF2) if($isPairedEnd);
+     htmlPrint(getFilePath(abs_path($0)), getFileName($0), $htF, $iFol, $isPairedEnd, $isOnlyStat, $inpFs, $seqFormatName, $statFile, $oFol, \@fileNames4HTML);
+     $pm->finish;
+ }
+ $pm->wait_all_children;
+
+ print "================================================================\n";
+ print "Processing has been finished\n";
+ print "Output files are generated in $outFolder\n" if($outFolder ne "");
+ print "Output files are generated in the folder of input files\n" if($outFolder eq "");
+ print "================================================================\n";
+
+ exit;
+
+ # Opens a plain or gzip-compressed file and returns the handle.
+ #   $_[0] : file path; a name ending in ".gz" (any case) is opened through
+ #           IO::Zlib, anything else with the built-in open
+ #   $_[1] : "r" to read, "w" to create/overwrite
+ # Dies when the file cannot be opened. For any other mode value the returned
+ # handle is left unopened, as before.
+ # The three-argument form of open is used so that characters in the file name
+ # (leading/trailing blanks, "<", ">", "|") are never interpreted as a mode.
+ sub openFileGetHandle {
+     my ($file, $rOrw) = @_;
+     my $fh;
+     if($file =~ /\.gz$/i) {
+         $fh = IO::Zlib->new;
+         if($rOrw eq "r") {
+             $fh->open($file, "rb") or die "Can not open file $file";
+         }
+         elsif($rOrw eq "w") {
+             $fh->open($file, "wb") or die "Can not create file $file";
+         }
+     }
+     else {
+         if($rOrw eq "r") {
+             open($fh, "<", $file) or die "Can not open file $file";
+         }
+         elsif($rOrw eq "w") {
+             open($fh, ">", $file) or die "Can not create file $file";
+         }
+     }
+     return $fh;
+ }
+
+ # Reads the two mate files in lock-step, one four-line FASTQ record from each
+ # per iteration, updates the file-level statistics for raw and filtered reads
+ # and, unless -onlyStat was given, writes the surviving reads.
+ #   $_[0], $_[1] : forward and reverse input files
+ #   $_[2], $_[3] : output files for pairs in which both mates pass
+ #   $_[4]        : output file for single mates that pass on their own
+ # Uses the file-level $nLines (number of lines to process) set by the caller,
+ # plus $subVal, $priAdaLib and $isOnlyStat.
+ sub processPairedEndFiles {
+ my $file1 = $_[0];
+ my $file2 = $_[1];
+ my $outFile1 = $_[2];
+ my $outFile2 = $_[3];
+ my $unPaired = $_[4];
+ $totalReads[0] = sprintf("%0.0f", $nLines/4);
+ $totalReads[1] = sprintf("%0.0f", $nLines/4);
+
+ # The returned handles are aliased to bareword globs so that they can be
+ # read with <F1>/<F2> and written with "print OF1 ...".
+ my $fH1 = openFileGetHandle($file1, "r");
+ *F1 = $fH1;
+ my $fH2 = openFileGetHandle($file2, "r");
+ *F2 = $fH2;
+
+ if(!defined($isOnlyStat)) {
+ my $ofH1 = openFileGetHandle($outFile1, "w");
+ *OF1 = $ofH1;
+ my $ofH2 = openFileGetHandle($outFile2, "w");
+ *OF2 = $ofH2;
+ my $ofupH = openFileGetHandle($unPaired, "w");
+ *OFUP = $ofupH;
+ }
+
+ my $isEOF = 1;
+ if($nLines/4 > 0) {
+ $isEOF = 0;
+ }
+
+ my $lineCount = 0;
+
+ while(!$isEOF) {
+ # Record layout: [0] header, [1] sequence, [2] '+' line, [3] qualities.
+ my @fRead = ();
+ my @rRead = ();
+ for(my $i=0; $i<4; $i++) {
+ $fRead[$i] = <F1>;
+ $rRead[$i] = <F2>;
+ }
+ # Stop when a blank line appears where a record header is expected.
+ last if($fRead[0]=~ /^\n$/);
+ last if($rRead[0]=~ /^\n$/);
+ chomp(my $fQualLine = $fRead[3]);
+ chomp(my $rQualLine = $rRead[3]);
+ chomp(my $fSeqLine = $fRead[1]);
+ chomp(my $rSeqLine = $rRead[1]);
+ my $fNs = getNoOfNs($fSeqLine);
+ my $rNs = getNoOfNs($rSeqLine);
+ $totalNs[0] += $fNs;
+ $totalNs[1] += $rNs;
+ if($fNs) {
+ $readsWithN[0]++;
+ }
+ if($rNs) {
+ $readsWithN[1]++;
+ }
+
+ # isReadOfHQ fills @qualArr ([mate][position] = score) and returns the
+ # number of bases at or above the cutoff when the read passes, else 0.
+ my @qualArr = ();
+ my $isFReadOfHQ = isReadOfHQ($fQualLine, 0, \@qualArr);
+ my $isRReadOfHQ = isReadOfHQ($rQualLine, 1, \@qualArr);
+ my $fSeqLineLen = length $fSeqLine;
+ my $rSeqLineLen = length $rSeqLine;
+ # Average quality: mean character code (sum divided by the sequence
+ # length) minus the variant's offset.
+ my @ASCII = unpack("C*", $fQualLine);
+ my $fAvgQual = sprintf "%.0f", (sum(@ASCII)/$fSeqLineLen);
+ my @rASCII = unpack("C*", $rQualLine);
+ my $rAvgQual = sprintf "%.0f", (sum(@rASCII)/$rSeqLineLen);
+ $fAvgQual -= $subVal;
+ $rAvgQual -= $subVal;
+ $qualDistribRaw[0][getIndex($fAvgQual,$qualDistribInterval)]++;
+ $qualDistribRaw[1][getIndex($rAvgQual,$qualDistribInterval)]++;
+ # s///g returns the number of substitutions made, so each statement
+ # counts one base; with /i the local sequence copy is also upper-cased.
+ my $fAs = $fSeqLine =~ s/A/A/gi;
+ my $fTs = $fSeqLine =~ s/T/T/gi;
+ my $fGs = $fSeqLine =~ s/G/G/gi;
+ my $fCs = $fSeqLine =~ s/C/C/gi;
+ my $fgcPercent = ($fGs + $fCs)/$fSeqLineLen*100;
+ $charCountRaw[0][0] += $fAs;
+ $charCountRaw[0][1] += $fTs;
+ $charCountRaw[0][2] += $fGs;
+ $charCountRaw[0][3] += $fCs;
+ $charCountRaw[0][4] += $fNs;
+ my $rAs = $rSeqLine =~ s/A/A/gi;
+ my $rTs = $rSeqLine =~ s/T/T/gi;
+ my $rGs = $rSeqLine =~ s/G/G/gi;
+ my $rCs = $rSeqLine =~ s/C/C/gi;
+ my $rgcPercent = ($rGs + $rCs)/$rSeqLineLen*100;
+ $charCountRaw[1][0] += $rAs;
+ $charCountRaw[1][1] += $rTs;
+ $charCountRaw[1][2] += $rGs;
+ $charCountRaw[1][3] += $rCs;
+ $charCountRaw[1][4] += $rNs;
+ $gcDistribRaw[0][getIndex($fgcPercent,$gcDistribInterval)]++;
+ $gcDistribRaw[1][getIndex($rgcPercent,$gcDistribInterval)]++;
+ # Case 1: both mates are high quality.
+ if($isFReadOfHQ && $isRReadOfHQ) {
+ $totalReadsAfterHQ[0]++;
+ $totalReadsAfterHQ[1]++;
+ $totalBasesAfterHQ[0] += $fSeqLineLen;
+ $totalBasesAfterHQ[1] += $rSeqLineLen;
+ $totalHQBasesAfterHQ[0] += $isFReadOfHQ;
+ $totalHQBasesAfterHQ[1] += $isRReadOfHQ;
+ # Primer/adaptor screening is done only when a library was selected.
+ if(defined($priAdaLib)) {
+ my $isFWOPriAda = isWOPriAda($fSeqLine, 0, 1);
+ my $isRWOPriAda = isWOPriAda($rSeqLine, 1, 1);
+ if($isFWOPriAda && $isRWOPriAda) {
+ $totalReadsFinal[0]++;
+ $totalReadsFinal[1]++;
+ $totalBasesFinal[0] += $fSeqLineLen;
+ $totalBasesFinal[1] += $rSeqLineLen;
+ $totalHQBasesFinal[0] += $isFReadOfHQ;
+ $totalHQBasesFinal[1] += $isRReadOfHQ;
+ # Indices 2 and 3 hold the filtered forward/reverse values.
+ $minLen[2] = $fSeqLineLen if($minLen[2] > $fSeqLineLen);
+ $maxLen[2] = $fSeqLineLen if($maxLen[2] < $fSeqLineLen);
+ $minLen[3] = $rSeqLineLen if($minLen[3] > $rSeqLineLen);
+ $maxLen[3] = $rSeqLineLen if($maxLen[3] < $rSeqLineLen);
+ $totalNs[2] += $fNs;
+ $totalNs[3] += $rNs;
+ if($fNs) {
+ $readsWithN[2]++;
+ }
+ if($rNs) {
+ $readsWithN[3]++;
+ }
+ # Per-position totals and score-range histogram for filtered reads.
+ # Scores are binned in tens; exact non-zero multiples of ten fall
+ # into the lower bin.
+ for(my $x=0; $x<@qualArr; $x++) {
+ my @row = @{$qualArr[$x]};
+ for(my $y=0; $y<@row; $y++) {
+ $positionSpecificBaseCountHQ[$x][$y] += $qualArr[$x][$y];
+ my $ind = int($qualArr[$x][$y]/10);
+ $ind-- if($qualArr[$x][$y]%10 == 0 && $qualArr[$x][$y] != 0);
+ $positionSpecificBaseCountHQWithRanges[$x][$y][$ind]++;
+ }
+ }
+ if(!defined($isOnlyStat)) {
+ $qualDistribFinal[0][getIndex($fAvgQual,$qualDistribInterval)]++;
+ $qualDistribFinal[1][getIndex($rAvgQual,$qualDistribInterval)]++;
+ $gcDistribFinal[0][getIndex($fgcPercent,$gcDistribInterval)]++;
+ $gcDistribFinal[1][getIndex($rgcPercent,$gcDistribInterval)]++;
+ $charCountFinal[0][0] += $fAs;
+ $charCountFinal[0][1] += $fTs;
+ $charCountFinal[0][2] += $fGs;
+ $charCountFinal[0][3] += $fCs;
+ $charCountFinal[0][4] += $fNs;
+ $charCountFinal[1][0] += $rAs;
+ $charCountFinal[1][1] += $rTs;
+ $charCountFinal[1][2] += $rGs;
+ $charCountFinal[1][3] += $rCs;
+ $charCountFinal[1][4] += $rNs;
+ print OF1 @fRead;
+ print OF2 @rRead;
+ }
+ }
+ else {
+ # At least one mate failed the screening: keep the clean mate, if
+ # any, as an unpaired read.
+ if(!defined($isOnlyStat)) {
+ if($isFWOPriAda) {
+ print OFUP @fRead;
+ }
+ elsif($isRWOPriAda) {
+ print OFUP @rRead;
+ }
+ }
+ }
+ }
+ else {
+ # No screening requested: every high-quality pair is final.
+ $totalReadsFinal[0]++;
+ $totalReadsFinal[1]++;
+ $totalBasesFinal[0] += $fSeqLineLen;
+ $totalBasesFinal[1] += $rSeqLineLen;
+ $totalHQBasesFinal[0] += $isFReadOfHQ;
+ $totalHQBasesFinal[1] += $isRReadOfHQ;
+ $minLen[2] = $fSeqLineLen if($minLen[2] > $fSeqLineLen);
+ $maxLen[2] = $fSeqLineLen if($maxLen[2] < $fSeqLineLen);
+ $minLen[3] = $rSeqLineLen if($minLen[3] > $rSeqLineLen);
+ $maxLen[3] = $rSeqLineLen if($maxLen[3] < $rSeqLineLen);
+ $totalNs[2] += $fNs;
+ $totalNs[3] += $rNs;
+ if($fNs) {
+ $readsWithN[2]++;
+ }
+ if($rNs) {
+ $readsWithN[3]++;
+ }
+ for(my $x=0; $x<@qualArr; $x++) {
+ my @row = @{$qualArr[$x]};
+ for(my $y=0; $y<@row; $y++) {
+ $positionSpecificBaseCountHQ[$x][$y] += $qualArr[$x][$y];
+ my $ind = int($qualArr[$x][$y]/10);
+ $ind-- if($qualArr[$x][$y]%10 == 0 && $qualArr[$x][$y] != 0);
+ $positionSpecificBaseCountHQWithRanges[$x][$y][$ind]++;
+ }
+ }
+ if(!defined($isOnlyStat)) {
+ $qualDistribFinal[0][getIndex($fAvgQual,$qualDistribInterval)]++;
+ $qualDistribFinal[1][getIndex($rAvgQual,$qualDistribInterval)]++;
+ $gcDistribFinal[0][getIndex($fgcPercent,$gcDistribInterval)]++;
+ $gcDistribFinal[1][getIndex($rgcPercent,$gcDistribInterval)]++;
+ $charCountFinal[0][0] += $fAs;
+ $charCountFinal[0][1] += $fTs;
+ $charCountFinal[0][2] += $fGs;
+ $charCountFinal[0][3] += $fCs;
+ $charCountFinal[0][4] += $fNs;
+ $charCountFinal[1][0] += $rAs;
+ $charCountFinal[1][1] += $rTs;
+ $charCountFinal[1][2] += $rGs;
+ $charCountFinal[1][3] += $rCs;
+ $charCountFinal[1][4] += $rNs;
+ print OF1 @fRead;
+ print OF2 @rRead;
+ }
+ }
+ }
+ else {
+ # Case 2: at most one mate is high quality. It is written to the
+ # unpaired file if it also passes the (optional) screening; the last
+ # argument 0 is received by isWOPriAda as its $isCountStatOn flag.
+ if(!defined($isOnlyStat)) {
+ if($isFReadOfHQ) {
+ my $isFWOPriAda = 1;
+ $isFWOPriAda = isWOPriAda($fSeqLine, 0, 0) if(defined($priAdaLib));
+ if($isFWOPriAda) {
+ print OFUP @fRead;
+ }
+ }
+ elsif($isRReadOfHQ) {
+ my $isRWOPriAda = 1;
+ $isRWOPriAda = isWOPriAda($rSeqLine, 1, 0) if(defined($priAdaLib));
+ if($isRWOPriAda) {
+ print OFUP @rRead;
+ }
+ }
+ }
+ }
+ $lineCount += 4;
+ if($lineCount >= $nLines) {
+ $isEOF = 1;
+ }
+ # Progress report every 100000 reads.
+ if($lineCount % (100000*4) == 0) {
+ my $tmpP = sprintf "%0.0f", ($lineCount/4/$totalReads[0]*100);
+ print "$indOfAnalysis: Number of reads processed: " . $lineCount/4 . "/$totalReads[0] ($tmpP\%)...\n";
+ }
+ }
+ close(OFUP);
+ close(OF2);
+ close(OF1);
+ close(F2);
+ close(F1);
+ }
+
+ # Reads a single-end FASTQ file one four-line record at a time, updates the
+ # file-level statistics for raw and filtered reads and, unless -onlyStat was
+ # given, writes the reads that pass the filters.
+ #   $_[0] : input file
+ #   $_[1] : output file for reads that pass
+ # Uses the file-level $nLines (number of lines to process) set by the caller,
+ # plus $subVal, $priAdaLib and $isOnlyStat.
+ sub processSingleEndFiles {
+ my $file = $_[0];
+ my $outFile = $_[1];
+ $totalReads[0] = sprintf("%0.0f", $nLines/4);
+
+ # The returned handles are aliased to bareword globs so that they can be
+ # read with <F> and written with "print OF ...".
+ my $fH = openFileGetHandle($file, "r");
+ *F = $fH;
+
+ if(!defined($isOnlyStat)) {
+ my $ofH = openFileGetHandle($outFile, "w");
+ *OF = $ofH;
+ }
+
+ my $isEOF = 1;
+ if($nLines/4 > 0) {
+ $isEOF = 0;
+ }
+
+ my $lineCount = 0;
+
+ while(!$isEOF) {
+ # Record layout: [0] header, [1] sequence, [2] '+' line, [3] qualities.
+ my @fRead = ();
+ for(my $i=0; $i<4; $i++) {
+ $fRead[$i] = <F>;
+ }
+ # Stop when a blank line appears where a record header is expected.
+ last if($fRead[0]=~ /^\n$/);
+ chomp(my $fQualLine = $fRead[3]);
+ chomp(my $fSeqLine = $fRead[1]);
+ my $fNs = getNoOfNs($fSeqLine);
+ $totalNs[0] += $fNs;
+ if($fNs) {
+ $readsWithN[0]++;
+ }
+
+ # isReadOfHQ fills @qualArr ([0][position] = score) and returns the
+ # number of bases at or above the cutoff when the read passes, else 0.
+ my @qualArr = ();
+ my $isFReadOfHQ = isReadOfHQ($fQualLine, 0, \@qualArr);
+ my $fSeqLineLen = length $fSeqLine;
+ # Average quality: mean character code (sum divided by the sequence
+ # length) minus the variant's offset.
+ my @ASCII = unpack("C*", $fQualLine);
+ my $fAvgQual = sprintf "%.0f", (sum(@ASCII)/$fSeqLineLen);
+ $fAvgQual -= $subVal;
+ $qualDistribRaw[0][getIndex($fAvgQual,$qualDistribInterval)]++;
+ # s///g returns the number of substitutions made, so each statement
+ # counts one base; with /i the local sequence copy is also upper-cased.
+ my $fAs = $fSeqLine =~ s/A/A/gi;
+ my $fTs = $fSeqLine =~ s/T/T/gi;
+ my $fGs = $fSeqLine =~ s/G/G/gi;
+ my $fCs = $fSeqLine =~ s/C/C/gi;
+ my $fgcPercent = ($fGs + $fCs)/$fSeqLineLen*100;
+ $charCountRaw[0][0] += $fAs;
+ $charCountRaw[0][1] += $fTs;
+ $charCountRaw[0][2] += $fGs;
+ $charCountRaw[0][3] += $fCs;
+ $charCountRaw[0][4] += $fNs;
+ $gcDistribRaw[0][getIndex($fgcPercent,$gcDistribInterval)]++;
+ if($isFReadOfHQ) {
+ $totalReadsAfterHQ[0]++;
+ $totalBasesAfterHQ[0] += $fSeqLineLen;
+ $totalHQBasesAfterHQ[0] += $isFReadOfHQ;
+ # Primer/adaptor screening is done only when a library was selected.
+ if(defined($priAdaLib)) {
+ if(isWOPriAda($fSeqLine, 0, 1)) {
+ $totalReadsFinal[0]++;
+ $totalBasesFinal[0] += $fSeqLineLen;
+ $totalHQBasesFinal[0] += $isFReadOfHQ;
+ # In single-end mode index 1 holds the filtered-read values.
+ $minLen[1] = $fSeqLineLen if($minLen[1] > $fSeqLineLen);
+ $maxLen[1] = $fSeqLineLen if($maxLen[1] < $fSeqLineLen);
+ $totalNs[1] += $fNs;
+ if($fNs) {
+ $readsWithN[1]++;
+ }
+ # Per-position totals and score-range histogram for filtered reads.
+ # Scores are binned in tens; exact non-zero multiples of ten fall
+ # into the lower bin.
+ for(my $x=0; $x<@qualArr; $x++) {
+ my @row = @{$qualArr[$x]};
+ for(my $y=0; $y<@row; $y++) {
+ $positionSpecificBaseCountHQ[$x][$y] += $qualArr[$x][$y];
+ my $ind = int($qualArr[$x][$y]/10);
+ $ind-- if($qualArr[$x][$y]%10 == 0 && $qualArr[$x][$y] != 0);
+ $positionSpecificBaseCountHQWithRanges[$x][$y][$ind]++;
+ }
+ }
+ if(!defined($isOnlyStat)) {
+ $qualDistribFinal[0][getIndex($fAvgQual,$qualDistribInterval)]++;
+ $gcDistribFinal[0][getIndex($fgcPercent,$gcDistribInterval)]++;
+ $charCountFinal[0][0] += $fAs;
+ $charCountFinal[0][1] += $fTs;
+ $charCountFinal[0][2] += $fGs;
+ $charCountFinal[0][3] += $fCs;
+ $charCountFinal[0][4] += $fNs;
+ print OF @fRead;
+ }
+ }
+ }
+ else {
+ # No screening requested: every high-quality read is final.
+ $totalReadsFinal[0]++;
+ $totalBasesFinal[0] += $fSeqLineLen;
+ $totalHQBasesFinal[0] += $isFReadOfHQ;
+ $minLen[1] = $fSeqLineLen if($minLen[1] > $fSeqLineLen);
+ $maxLen[1] = $fSeqLineLen if($maxLen[1] < $fSeqLineLen);
+ $totalNs[1] += $fNs;
+ if($fNs) {
+ $readsWithN[1]++;
+ }
+ for(my $x=0; $x<@qualArr; $x++) {
+ my @row = @{$qualArr[$x]};
+ for(my $y=0; $y<@row; $y++) {
+ $positionSpecificBaseCountHQ[$x][$y] += $qualArr[$x][$y];
+ my $ind = int($qualArr[$x][$y]/10);
+ $ind-- if($qualArr[$x][$y]%10 == 0 && $qualArr[$x][$y] != 0);
+ $positionSpecificBaseCountHQWithRanges[$x][$y][$ind]++;
+ }
+ }
+ if(!defined($isOnlyStat)) {
+ $qualDistribFinal[0][getIndex($fAvgQual,$qualDistribInterval)]++;
+ $gcDistribFinal[0][getIndex($fgcPercent,$gcDistribInterval)]++;
+ $charCountFinal[0][0] += $fAs;
+ $charCountFinal[0][1] += $fTs;
+ $charCountFinal[0][2] += $fGs;
+ $charCountFinal[0][3] += $fCs;
+ $charCountFinal[0][4] += $fNs;
+ print OF @fRead;
+ }
+ }
+ }
+ else {
+ # Low-quality reads are only counted in the raw statistics above.
+ }
+ $lineCount += 4;
+ if($lineCount >= $nLines) {
+ $isEOF = 1;
+ }
+ # Progress report every 100000 reads.
+ if($lineCount % (100000*4) == 0) {
+ my $tmpP = sprintf "%0.0f", ($lineCount/4/$totalReads[0]*100);
+ print "$indOfAnalysis: Number of reads processed: " . $lineCount/4 . "/$totalReads[0] ($tmpP\%)...\n";
+ }
+ }
+
+ close(OF);
+ close(F);
+ }
+
+ # Checks the layout of a FASTQ file and guesses its quality encoding.
+ #   $_[0] : FASTQ file (plain or .gz)
+ #   $_[1] : true  -> store the detected variant in the file-level $seqFormat
+ #           false -> only warn when the detected variant differs from it
+ # Returns the number of lines read (blank lines included). Calls
+ # prtErrorExit when a record does not start with '@' or its third line does
+ # not start with '+'.
+ sub checkFastQFormat { # Takes FASTQ file as an input and if the format is incorrect it will print error and exit, otherwise it will return the number of lines in the file.
+ my $file = $_[0];
+ my $isVariantIdntfcntOn = $_[1];
+ my $lines = 0;
+ my $fH = openFileGetHandle($file, "r");
+ *F = $fH;
+ my $counter = 0; # Position within the current four-line record (1-4).
+ my $minVal = 1000;
+ my $maxVal = 0;
+ while(my $line = <F>) {
+ $lines++;
+ $counter++;
+ # NOTE(review): a blank line is skipped only after both counters were
+ # advanced, so it still occupies a slot in the record -- confirm intended.
+ next if($line =~ /^\n$/);
+ if($counter == 1 && $line !~ /^\@/) {
+ prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
+ }
+ if($counter == 3 && $line !~ /^\+/) {
+ prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
+ }
+ # Track the range of quality character codes, sampling only the quality
+ # lines found within the first million lines.
+ if($counter == 4 && $lines < 1000000) {
+ chomp $line;
+ my @ASCII = unpack("C*", $line);
+ $minVal = min(min(@ASCII), $minVal);
+ $maxVal = max(max(@ASCII), $maxVal);
+ }
+ if($counter == 4) {
+ $counter = 0;
+ }
+ }
+ close(F);
+ # Map the observed code range to a variant. The branches are tried in this
+ # order and the first match wins; 0 means no range matched.
+ my $tseqFormat = 0;
+ if($minVal >= 33 && $minVal <= 73 && $maxVal >= 33 && $maxVal <= 73) {
+ $tseqFormat = 1;
+ }
+ elsif($minVal >= 66 && $minVal <= 105 && $maxVal >= 66 && $maxVal <= 105) {
+ $tseqFormat = 4; # Illumina 1.5+
+ }
+ elsif($minVal >= 64 && $minVal <= 105 && $maxVal >= 64 && $maxVal <= 105) {
+ $tseqFormat = 3; # Illumina 1.3+
+ }
+ elsif($minVal >= 59 && $minVal <= 105 && $maxVal >= 59 && $maxVal <= 105) {
+ $tseqFormat = 2; # Solexa
+ }
+ elsif($minVal >= 33 && $minVal <= 74 && $maxVal >= 33 && $maxVal <= 74) {
+ $tseqFormat = 5; # Illumina 1.8+
+ }
+ if($isVariantIdntfcntOn) {
+ $seqFormat = $tseqFormat;
+ }
+ else {
+ if($tseqFormat != $seqFormat) {
+ print STDERR "Warning: It seems the specified variant of FASTQ doesn't match the quality values in input FASTQ files.\n";
+ }
+ }
+ return $lines;
+ }
+
+ # Returns the directory part of a path, always ending in "/".
+ #   $_[0] : file path using "/" as separator
+ # A name without any "/" yields "./". A root-level path such as "/file"
+ # yields "/"; this case is now handled explicitly instead of concatenating an
+ # undefined capture.
+ sub getFilePath {
+     my $name = $_[0];
+     # Greedy match: everything before the last "/" (may be empty).
+     my ($dir) = $name =~ /(.*)\//;
+     return defined($dir) ? $dir . "/" : "./";
+ }
+
+ # Returns the last component of a path (the text after the final "/").
+ #   $_[0] : file path using "/" as separator
+ # Returns "" when there is no such component (empty path or a path ending in
+ # "/"); previously a stale $1 from an earlier match could be returned.
+ sub getFileName { # This sub takes a path of a file and returns just its name after separating the path from it.
+     my $path = $_[0];
+     return ($path =~ /([^\/]+)$/) ? $1 : "";
+ }
+
+ # Prints an error message to STDERR and terminates the current process.
+ #   $_[0] : message text
+ # Exits with a non-zero status so that the failure is visible to the caller
+ # (a bare exit reports success).
+ sub prtErrorExit {
+     my $errmsg = $_[0];
+     print STDERR "Error:\t", $errmsg, "\n";
+     exit 1;
+ }
+
+# Decides whether a read is of high quality (HQ) and updates the per-file
+# quality statistics while doing so.
+#   $_[0] - quality string of the read (one ASCII character per base).
+#   $_[1] - 0 for forward reads, 1 for reverse reads.
+#   $_[2] - reference to a two-level array; slot [$_[1]][position] receives
+#           the quality score of every base of this read.
+# A read is HQ when the number of bases scoring at least $cutOffPhScore is not
+# lower than $cutOffReadLen4HQ percent of the read length (rounded).
+# Returns the number of such bases when the read is HQ, otherwise 0.
+sub isReadOfHQ {
+    my ($qualStr, $v0Or1, $arrRef) = @_;
+    my $readLen = length $qualStr;
+    # Keep track of the shortest and longest read seen for this file.
+    if($minLen[$v0Or1] > $readLen) {
+        $minLen[$v0Or1] = $readLen;
+    }
+    if($maxLen[$v0Or1] < $readLen) {
+        $maxLen[$v0Or1] = $readLen;
+    }
+    # Minimum number of good bases this read needs.
+    my $cutOffLen = sprintf("%0.0f", $readLen * $cutOffReadLen4HQ / 100);
+    # ASCII codes shifted by the offset of the FASTQ variant in use.
+    my @scores = map { $_ - $subVal } unpack("C*", $qualStr);
+    my $validBaseCount = 0;
+    for(my $pos = 0; $pos < @scores; $pos++) {
+        my $score = $scores[$pos];
+        $positionSpecificBaseCount[$v0Or1][$pos] += $score;
+        # Bin of ten: 0-10 -> 0, 11-20 -> 1, 21-30 -> 2, ...
+        my $bin = int($score/10);
+        $bin-- if($score != 0 && $score%10 == 0);
+        $positionSpecificBaseCountWithRanges[$v0Or1][$pos][$bin]++;
+        $$arrRef[$v0Or1][$pos] = $score;
+        $validBaseCount++ if($score >= $cutOffPhScore);
+    }
+    $totalBases[$v0Or1] += $readLen;
+    $totalHQBases[$v0Or1] += $validBaseCount;
+    return ($validBaseCount >= $cutOffLen) ? $validBaseCount : 0;
+}
+
+
+# Accumulates per-position quality scores of a whole FASTQ file; meant for the
+# final graph generation only.
+#   $_[0] - FASTQ file to read (opened through openFileGetHandle).
+#   $_[1] - 0 for forward reads, 1 for reverse reads.
+#   $_[2] - reference to a two-level array; slot [$_[1]][position] is
+#           increased by the score of that position for every read.
+sub qualGraph {
+    my ($file, $v0Or1, $arrRef) = @_;
+    my $fH = openFileGetHandle($file, "r");
+    *F = $fH;
+    my $lineNo = 0;
+    while(my $line = <F>) {
+        chomp($line);
+        $lineNo++;
+        # Only every fourth line of a FASTQ record holds the quality values.
+        next if($lineNo % 4 != 0);
+        my @codes = unpack("C*", $line);
+        for my $pos (0 .. $#codes) {
+            $$arrRef[$v0Or1][$pos] += $codes[$pos] - $subVal;
+        }
+    }
+    close(F);
+}
+
+
+
+
+
+
+
+# Tells whether a read is free of primer/adaptor contamination.
+#   $_[0] - read sequence (a trailing newline is removed).
+#   $_[1] - 0 for forward reads, 1 for reverse reads.
+#   $_[2] - when true, the contaminated / clean read counters are updated.
+# The sequences searched for come from the built-in library selected by
+# $priAdaLib, or from @usrDefinedPriAda when $priAdaLib is "u".
+# Returns 0 when a primer/adaptor is found in the read, 1 otherwise.
+sub isWOPriAda {
+    my ($seq, $v0Or1, $isCountStatOn) = @_;
+    chomp($seq);
+
+    my @arrGenomic = qw(
+        GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG
+        ACACTCTTTCCCTACACGACGCTCTTCCGATCT
+        AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
+        CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT
+        ACACTCTTTCCCTACACGACGCTCTTCCGATCT
+    );
+
+    my @arrPE = qw(
+        GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
+        ACACTCTTTCCCTACACGACGCTCTTCCGATCT
+        AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
+        CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
+        ACACTCTTTCCCTACACGACGCTCTTCCGATCT
+        CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
+    );
+
+    my @arrDpnII = qw(
+        GATCGTCGGACTGTAGAACTCTGAAC
+        ACAGGTTCAGAGTTCTACAGTCCGAC
+        CAAGCAGAAGACGGCATACGANN
+        TCGTATGCCGTCTTCTGCTTG
+        CAAGCAGAAGACGGCATACGA
+        AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
+        CGACAGGTTCAGAGTTCTACAGTCCGACGATC
+    );
+
+    my @arrNlaIII = qw(
+        TCGGACTGTAGAACTCTGAAC
+        ACAGGTTCAGAGTTCTACAGTCCGACATG
+        CAAGCAGAAGACGGCATACGANN
+        TCGTATGCCGTCTTCTGCTTG
+        CAAGCAGAAGACGGCATACGA
+        AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
+        CCGACAGGTTCAGAGTTCTACAGTCCGACATG
+    );
+
+    my @arrsmRNA = qw(
+        GTTCAGAGTTCTACAGTCCGACGATC
+        TCGTATGCCGTCTTCTGCTTGT
+        CAAGCAGAAGACGGCATACGA
+        CAAGCAGAAGACGGCATACGA
+        AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
+        CGACAGGTTCAGAGTTCTACAGTCCGACGATC
+    );
+
+    my @arrmulPlex = qw(
+        GATCGGAAGAGCACACGTCT
+        ACACTCTTTCCCTACACGACGCTCTTCCGATCT
+        AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
+        GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
+        ACACTCTTTCCCTACACGACGCTCTTCCGATCT
+        GATCGGAAGAGCACACGTCTGAACTCCAGTCAC
+        GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
+        CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC
+        CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC
+    );
+
+    # Built-in libraries, indexed by the numeric value of $priAdaLib.
+    my @priAdas = (\@arrGenomic, \@arrPE, \@arrDpnII, \@arrNlaIII, \@arrsmRNA, \@arrmulPlex);
+    my @priAdaSeqs = ($priAdaLib eq "u") ? @usrDefinedPriAda : @{$priAdas[$priAdaLib]};
+
+    # Start/end fragments already tested against this read without a hit;
+    # findSeq uses it to skip fragments shared by several sequences.
+    my %checkedPriStr = ();
+
+    foreach my $priAda (@priAdaSeqs) {
+        next unless(findSeq($priAda, $seq, \%checkedPriStr));
+        # First hit is enough: the read is contaminated.
+        $totalValidReadsWithPriAda[$v0Or1]++ if($isCountStatOn);
+        return 0;
+    }
+    $totalValidReadsNoPriAda[$v0Or1]++ if($isCountStatOn);
+    return 1;
+}
+
+# Looks for the start or the end of a primer/adaptor sequence in a read,
+# allowing one substitution and no insertion or deletion.
+#   $_[0] - primer/adaptor sequence.
+#   $_[1] - read sequence to search in.
+#   $_[2] - reference to a hash of fragments that were already searched for in
+#           this read without success; such fragments are not searched again.
+# Only the first and the last $substrlen bases of the primer/adaptor are used.
+# Returns 1 as soon as one of the two fragments matches, otherwise 0.
+sub findSeq {
+    my ($pri, $seq, $hashRef) = @_;
+    my $tailStart = (length $pri) - $substrlen;
+    $tailStart = 0 if($tailStart < 0);
+    my @fragments = (
+        substr($pri, 0, $substrlen),
+        substr($pri, $tailStart, $substrlen),
+    );
+    foreach my $fragment (@fragments) {
+        next if(defined($$hashRef{$fragment}));
+        my @catches = String::Approx::amatch($fragment, ['I0 D0 S1'], $seq);
+        return 1 if(@catches != 0);
+        # Remember the miss so the same fragment is not tried again.
+        $$hashRef{$fragment} = 1;
+    }
+    return 0;
+}
+
+
+# Prints the description of all command line options to the currently
+# selected output handle. The primer/adaptor libraries listed are the entries
+# of @priAdaLibNames, numbered from 1.
+sub prtHelp {
+    # Input options, up to the heading of the library list.
+    print $_ foreach (
+        "\n$0 options:\n\n",
+        "### Input reads (FASTQ) options (Atleast one option is required)\n",
+        " -pe <Forward reads file> <Reverse reads file> <Primer/Adaptor library> <FASTQ variant>\n",
+        " Paired-end read files (FASTQ) with primer/adaptor library and FASTQ variant\n",
+        " User may choose from the provided primer/adaptor library or can give a file containing primer/adaptor sequences, one per line\n",
+        " Multiple libraries can be given using multiple '-pe' options\n",
+        " For eg.: -pe r1.fq r2.fq 3 1 -pe t1.fq t2.fq 2 A\n\n",
+        " -se <Reads file> <Primer/Adaptor library> <FASTQ variant>\n",
+        " Single-end read file (FASTQ) with primer/adaptor library and FASTQ variant\n",
+        " Multiple libraries can be given using multiple '-se' options\n",
+        " For eg.: -se r1.fq 3 2 -se t2.fq 2 2\n\n",
+        " Primer/Adaptor libraries:\n",
+    );
+    # One numbered line per built-in library.
+    for my $i (0 .. $#priAdaLibNames) {
+        my ($c, $lib) = ($i + 1, $priAdaLibNames[$i]);
+        print " $c = $lib\n";
+    }
+    # Remaining library choices, FASTQ variants and all other options.
+    print $_ foreach (
+        " N = Do not filter for Primer/Adaptor\n",
+        " <File> = File for user defined primer/adaptor sequences, one per line\n",
+        "\n",
+        " FASTQ variants:\n",
+        " 1 = Sanger (Phred+33, 33 to 73)\n",
+        " 2 = Solexa (Phred+64, 59 to 104)\n",
+        " 3 = Illumina (1.3+) (Phred+64, 64 to 104)\n",
+        " 4 = Illumina (1.5+) (Phred+64, 66 to 104)\n",
+        " 5 = Illumina (1.8+) (Phred+33, 33 to 74)\n",
+        " A = Automatic detection of FASTQ variant\n",
+        "\n",
+        "### Other options [Optional]\n",
+        " -h | -help\n",
+        " Prints this help\n",
+        "--------------------------------- QC Options ---------------------------------\n",
+        " -l | -cutOffReadLen4HQ <Real number, 0 to 100>\n",
+        " The cut-off value for percentage of read length that should be of given quality\n",
+        " default: 70\n",
+        " -s | -cutOffQualScore <Integer, 0 to 40>\n",
+        " The cut-off value for PHRED quality score for high-quality filtering\n",
+        " default: 20\n",
+        "----------------------------- Processing Options -----------------------------\n",
+        " -p | -processes <Integer>\n",
+        " Number of processes to be used\n",
+        " default: 1\n",
+        " -onlyStat\n",
+        " Outputs only statistics without filtered data output\n",
+        "------------------------------- Output Options -------------------------------\n",
+        " -t | -statOutFmt <Integer>\n",
+        " Output format for statistics\n",
+        " Formats:\n",
+        " 1 = formatted text\n",
+        " 2 = tab delimited\n",
+        " default: 1\n",
+        " -o | -outputFolder <Output folder name/path>\n",
+        " Output will be stored in the given folder\n",
+        " default: By default, output folder (IlluQC_Filtered_files) will be generated where the input files are\n",
+        " -z | -outputDataCompression <Character>\n",
+        " Output format for HQ filtered data\n",
+        " Formats:\n",
+        " t = text FASTQ files\n",
+        " g = gzip compressed files\n",
+        " default: t\n",
+        "\n",
+    );
+}
+
+# Prints the one-line usage summary followed by the full option help.
+sub prtUsage {
+    my $usageLine = "\nUsage: perl $0 <options>\n";
+    print $usageLine;
+    prtHelp();
+}
+
+# Reports a fatal command line error: prints the message in a box on STDERR,
+# prints the usage/help text and terminates the program.
+#   $_[0] - error message text.
+# Exits with a non-zero status so that callers of the script can detect the
+# failure.
+sub prtError {
+    my $msg = $_[0];
+    print STDERR "+======================================================================+\n";
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR "+======================================================================+\n";
+    prtUsage();
+    exit 1;
+}
+
+
+# Takes a sequence and returns how many unknown base calls it contains, i.e.
+# the number of "N" and "." characters (upper-case "N" only).
+sub getNoOfNs {
+    my $seq = $_[0];
+    # tr with an empty replacement list only counts, it does not modify.
+    my $count = ($seq =~ tr/N.//);
+    return $count;
+}
+
+# Maps a value onto the zero-based index of the bin it falls into.
+#   $_[0] - value.
+#   $_[1] - bin width.
+# With a bin width of 1 the value itself, rounded to an integer, is returned.
+# Otherwise the ratio value/width is rounded to two decimals and rounded up to
+# the next whole number, minus one; the result is never negative.
+sub getIndex {
+    my ($up, $down) = @_;
+    my $ratio = $up/$down;
+    if($down == 1) {
+        return sprintf("%0.0f", $up);
+    }
+    my $rounded = sprintf("%0.2f", $ratio);
+    my $index = int($rounded + 0.99) - 1;
+    return ($index < 0) ? 0 : $index;
+}
+
+
+# Draws the base composition bar chart and writes it as a PNG file.
+#   $_[0] - reference to the plot data: [0] holds the x labels, [1] the counts
+#           of the input file and, unless only statistics were requested, [2]
+#           the counts of the filtered file. The first five counts of a series
+#           are reported as A, T, G, C and Non-ATGC.
+#   $_[1] - path of the PNG file to create.
+#   $_[2] - file name used in the title and in the legend.
+#   $_[3] - image width in pixels.
+#   $_[4] - image height in pixels.
+# Below the chart one row of percentages is drawn per series. The filtered
+# series is handled whenever $isOnlyStat is undefined, which is the same test
+# that decides whether it gets a legend entry. Reads $isOnlyStat, $font_spec
+# and $f. Dies when the chart can not be plotted; when the image file can not
+# be created an error is printed on STDERR and nothing is written.
+sub drawBaseComp {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+    my $withFiltered = !defined($isOnlyStat);
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        y_label => 'Count',
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        l_margin => 60,
+        r_margin => 60,
+        b_margin => 50,
+        t_margin => 50,
+        show_values => 1,
+        bar_spacing => 1,
+        values_vertical => 1,
+    ) or warn $mygraph->error;
+
+    if($withFiltered) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+    $mygraph->set_values_font($f, 6);
+
+    my $myImage = $mygraph->plot($dataRef) or die $mygraph->error;
+
+    # Colours for everything drawn on top of the finished chart.
+    my $black = $myImage->colorAllocate(0,0,0);
+    my $lred = $myImage->colorAllocate(255,0,0);
+    my $dgreen = $myImage->colorAllocate(0,127,0);
+    my $dblue = $myImage->colorAllocate(0,0,127);
+
+    # Title, centred over the full width.
+    my $wrapbox = GD::Text::Wrap->new( $myImage,
+        line_space => 4,
+        text => "Base composition for $fileName",
+        color => $dblue,
+    );
+    $wrapbox->set(align => 'center', width => $width);
+    $wrapbox->set_font($f, 11);
+    $wrapbox->draw(0,0);
+
+    # Percentage rows: input file first, filtered file underneath.
+    _drawBaseCompRow($myImage, $$dataRef[1], $width, $height-35, $lred, $black);
+    _drawBaseCompRow($myImage, $$dataRef[2], $width, $height-20, $dgreen, $black) if($withFiltered);
+
+    my $imgFH;
+    if(!open($imgFH, '>', $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myImage->png;
+    close($imgFH);
+}
+
+# Draws one row of "<base> (<percent>%)" labels with a coloured square in
+# front of each label.
+#   $_[0] - GD image to draw on.
+#   $_[1] - reference to the counts of one series.
+#   $_[2] - image width in pixels; the row is laid out around its centre.
+#   $_[3] - y coordinate of the row.
+#   $_[4] - fill colour of the squares.
+#   $_[5] - colour of the text and of the square outlines.
+# Percentages are relative to the sum of all counts of the series; a series
+# that is missing or sums to zero is reported as 0.00% throughout.
+sub _drawBaseCompRow {
+    my ($image, $counts, $width, $rowY, $fillColor, $lineColor) = @_;
+    my @labels = ("A", "T", "G", "C", "Non-ATGC");
+    my @textOffsets = (-220, -130, -40, 50, 140);
+    my @all = defined($counts) ? @$counts : ();
+    my $total = 0;
+    $total += (defined($_) ? $_ : 0) foreach (@all);
+
+    for my $i (0 .. $#labels) {
+        my $count = defined($all[$i]) ? $all[$i] : 0;
+        my $percent = $total ? $count/$total*100 : 0;
+        my $box = GD::Text::Wrap->new( $image,
+            line_space => 4,
+            text => sprintf("%s (%0.2f%%)", $labels[$i], $percent),
+            color => $lineColor,
+        );
+        $box->set(align => 'left', width => 300);
+        $box->set_font($f, 8);
+        $box->draw($width/2+$textOffsets[$i], $rowY);
+    }
+
+    # Each square sits 10 pixels to the left of its label.
+    foreach my $offset (@textOffsets) {
+        my $rectX = $width/2+$offset-10;
+        $image->filledRectangle($rectX,$rowY,$rectX+8,$rowY+8,$fillColor);
+        $image->rectangle($rectX,$rowY,$rectX+8,$rowY+8,$lineColor);
+    }
+}
+
+
+# Draws the GC content distribution as a line chart and writes it as a PNG
+# file.
+#   $_[0] - reference to the plot data (x labels followed by one or two data
+#           series).
+#   $_[1] - path of the PNG file to create.
+#   $_[2] - file name used in the title and in the legend.
+#   $_[3] - image width in pixels.
+#   $_[4] - image height in pixels.
+# Reads $isOnlyStat, $font_spec and $f. Dies when the chart can not be
+# plotted; when the image file can not be created an error is printed on
+# STDERR and nothing is written.
+sub drawGCDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::linespoints->new($width, $height);
+
+    $mygraph->set(
+        x_label => '% GC content',
+        y_label => 'Number of reads',
+        title => "GC content distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        markers => [1],
+        marker_size => 3,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        x_labels_vertical => 1,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName, "a");
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef) or die $mygraph->error;
+
+    my $imgFH;
+    if(!open($imgFH, '>', $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myImage->png;
+    close($imgFH);
+}
+
+
+# Draws the pie chart summarising quality check and filtering and writes it as
+# a PNG file.
+#   $_[0] - reference to the plot data; the first three values of series [1]
+#           are reported as primer/adaptor contaminated reads, high quality
+#           filtered reads and low quality reads.
+#   $_[1] - path of the PNG file to create.
+#   $_[2] - image width in pixels.
+#   $_[3] - image height in pixels.
+# Percentages in the legend are relative to the sum of series [1]; when that
+# sum is zero they are reported as 0.00%. Reads $f. Dies when the chart can
+# not be plotted; when the image file can not be created an error is printed
+# on STDERR and nothing is written.
+sub drawSummaryPie {
+    my ($dataRef, $fileNameWPath, $width, $height) = @_;
+    my $mygraph = GD::Graph::pie->new($width, $height);
+
+    $mygraph->set(
+        title => "Summary of quality check and filtering",
+        axislabelclr => 'black',
+        pie_height => 40,
+
+        l_margin => 15,
+        r_margin => 15,
+        b_margin => 50,
+        start_angle => 45,
+        dclrs => [ qw(lyellow lgreen lred) ],
+        transparent => 0,
+    ) or warn $mygraph->error;
+
+    $mygraph->set_label_font($f, 8);
+    $mygraph->set_value_font(['verdana', 'arial'],14);
+    $mygraph->set_title_font($f, 11);
+
+    my $myImage = $mygraph->plot($dataRef) or die $mygraph->error;
+
+    # Colours for the legend drawn on top of the finished chart.
+    my $black = $myImage->colorAllocate(0,0,0);
+    my $red = $myImage->colorAllocate(255,0,0);
+    my $yellow = $myImage->colorAllocate(255,255,0);
+    my $green = $myImage->colorAllocate(0,255,0);
+
+    my @counts = defined($$dataRef[1]) ? @{$$dataRef[1]} : ();
+    my $sum = 0;
+    $sum += (defined($_) ? $_ : 0) foreach (@counts);
+
+    # Legend rows from top to bottom; rows are 15 pixels apart.
+    my @labels = (
+        "Primer/Adaptor contaminated reads",
+        "High quality filtered reads",
+        "Low quality reads",
+    );
+    my @fills = ($yellow, $green, $red);
+    my @rowY = ($height-45, $height-30, $height-15);
+
+    for my $i (0 .. $#labels) {
+        my $count = defined($counts[$i]) ? $counts[$i] : 0;
+        my $percent = $sum ? $count/$sum*100 : 0;
+        my $wrapbox = GD::Text::Wrap->new( $myImage,
+            line_space => 4,
+            text => sprintf("%s (%0.02f%%)", $labels[$i], $percent),
+            color => $black,
+        );
+        $wrapbox->set(align => 'left', width => 300);
+        $wrapbox->set_font($f, 8);
+        $wrapbox->draw($width/2-100, $rowY[$i]);
+    }
+
+    my $startRectX = $width/2-120;
+    for my $i (0 .. $#labels) {
+        my $startRectY = $rowY[$i];
+        $myImage->filledRectangle($startRectX,$startRectY,$startRectX+8,$startRectY+8,$fills[$i]);
+        $myImage->rectangle($startRectX,$startRectY,$startRectX+8,$startRectY+8,$black);
+    }
+
+    my $imgFH;
+    if(!open($imgFH, '>', $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myImage->png;
+    close($imgFH);
+}
+
+# Draws the distribution of average read quality as a bar chart and writes it
+# as a PNG file.
+#   $_[0] - reference to the plot data (x labels followed by one or two data
+#           series).
+#   $_[1] - path of the PNG file to create.
+#   $_[2] - file name used in the title and in the legend.
+#   $_[3] - image width in pixels.
+#   $_[4] - image height in pixels.
+# Reads $isOnlyStat, $font_spec and $f. Dies when the chart can not be
+# plotted; when the image file can not be created an error is printed on
+# STDERR and nothing is written.
+sub drawQualDist {
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        x_label => 'Average phred quality score',
+        y_label => 'Number of reads',
+        title => "Quality distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        bar_spacing => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef) or die $mygraph->error;
+
+    my $imgFH;
+    if(!open($imgFH, '>', $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myImage->png;
+    close($imgFH);
+}
+
+
+# Draws the average quality score per base position as a line chart and
+# writes it as a PNG file.
+#   $_[0] - reference to the plot data: [0] holds the base positions, the
+#           following entries one data series each.
+#   $_[1] - path of the PNG file to create.
+#   $_[2] - file name used as the title and in the legend.
+# The y axis ends at the largest value found, rounded to a multiple of 5, plus
+# 5; the image size is derived from that limit and from the number of base
+# positions. Reads $isOnlyStat, $font_spec and $f. Dies when the chart can not
+# be plotted. The image file is only opened once the chart exists, so a failed
+# plot does not leave an empty file behind; when the file can not be created
+# an error is printed on STDERR and nothing is written.
+sub drawGraph {
+    my @data = @{$_[0]};
+    my $fileNameWPath = $_[1];
+    my $fileName = $_[2];
+    my $y_min = 0;
+    my $y_max = 0;
+    for(my $i=1; $i<@data; $i++) {
+        $y_max = max($y_max, max(@{$data[$i]}));
+    }
+    $y_max = (sprintf "%0.0f",($y_max/5)) * 5 + 5;
+    my $height = sprintf "%0.0f", $y_max * 300 / 45;
+    my $width = sprintf "%0.0f", scalar @{$data[0]} * 600 / 75;
+    my $mygraph = GD::Graph::linespoints->new($width, $height);
+    $mygraph->set(
+        x_label => 'Base position',
+        y_label => 'Average quality score',
+        title => $fileName,
+        y_min_value => $y_min,
+        y_max_value => $y_max,
+        x_label_skip => 2,
+        y_tick_number => $y_max/5,
+        y_label_skip => 1,
+        markers => [7],
+        marker_size => 3,
+        long_ticks => 1,
+        line_width => 2,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        x_labels_vertical => 1,
+        transparent => 0,
+        r_margin => 10,
+        fgclr => '#dddddd',
+        accentclr => 'yellow',
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+    my $myimage = $mygraph->plot(\@data) or die $mygraph->error;
+
+    my $imgFH;
+    if(!open($imgFH, '>', $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myimage->png;
+    close($imgFH);
+}
+
+# Draws the percentage of reads per base position for the four quality score
+# ranges as a line chart and writes it as a PNG file.
+#   $_[0] - reference to the plot data: [0] holds the base positions, [1] to
+#           [4] the percentages for the ranges 0-10, 11-20, 21-30 and 31-40.
+#   $_[1] - path of the PNG file to create.
+#   $_[2] - file name used in the title.
+# The y axis always runs from 0 to 100. The image is 350 pixels high and at
+# least 700 pixels wide, growing with the number of base positions. Reads
+# $font_spec and $f. Dies when the chart can not be plotted. The image file is
+# only opened once the chart exists, so a failed plot does not leave an empty
+# file behind; when the file can not be created an error is printed on STDERR
+# and nothing is written.
+sub drawRangeGraph {
+    my @data = @{$_[0]};
+    my $fileNameWPath = $_[1];
+    my $fileName = $_[2];
+    my $height = 350;
+    my $y_min = 0;
+    my $y_max = 100; # Values are percentages.
+    my $width = sprintf "%0.0f", scalar @{$data[0]} * 700 / 75;
+    $width = max(700, $width);
+    my $mygraph = GD::Graph::linespoints->new($width, $height);
+    $mygraph->set(
+        x_label => 'Base position',
+        y_label => 'Read count (%)',
+        title => "Read count (%) per base for different quality score ranges for $fileName",
+        y_min_value => $y_min,
+        y_max_value => $y_max,
+        x_label_skip => 2,
+        y_tick_number => $y_max/5,
+        y_label_skip => 1,
+        markers => [7],
+        marker_size => 3,
+        long_ticks => 1,
+        line_width => 2,
+        dclrs => [ qw(lred dgreen lyellow blue) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        x_labels_vertical => 1,
+        transparent => 0,
+        r_margin => 10,
+        fgclr => '#dddddd',
+        accentclr => 'yellow',
+    ) or warn $mygraph->error;
+
+    $mygraph->set_legend( "0-10", "11-20", "21-30", "31-40");
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+    my $myimage = $mygraph->plot(\@data) or die $mygraph->error;
+
+    my $imgFH;
+    if(!open($imgFH, '>', $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myimage->png;
+    close($imgFH);
+}
+
+# Writes the table "Read count (%) per base for different quality score
+# ranges" to the statistics file and, when $isGDMod is set, draws the matching
+# graphs.
+#   $_[0] - file handle of the statistics output.
+# For every input file (and, unless $isOnlyStat is defined, for its filtered
+# counterpart) the counts of the four ranges 0-10, 11-20, 21-30 and 31-40 are
+# turned into percentages per base position. Missing counts are set to 0 in
+# the count arrays themselves. A position without any count in these four
+# ranges - for example one that no filtered read reaches - is reported as
+# 0.00 in every column instead of aborting with a division by zero.
+sub prepareData4RangeGraph {
+    my $STAT = $_[0];
+    print $STAT "Read count (%) per base for different quality score ranges\n\n";
+
+    # Fills missing counts of the four ranges with 0 (in place) and returns
+    # the four percentages formatted with two decimals.
+    my $toPercentages = sub {
+        my ($counts) = @_;
+        $$counts[$_] ||= 0 foreach (0 .. 3);
+        my $total = $$counts[0] + $$counts[1] + $$counts[2] + $$counts[3];
+        return map { sprintf("%0.2f", $total ? $$counts[$_]/$total*100 : 0) } (0 .. 3);
+    };
+
+    my $c = 0;
+    my @rangeGraphData = ();
+    foreach my $arr (@positionSpecificBaseCountWithRanges) {
+        my $arrFiltered = $positionSpecificBaseCountHQWithRanges[$c];
+        print $STAT "\t", getFileName($fileName[$c]);
+        print $STAT "\t\t\t\t\t", getFileName($outFileName[$c]) if(!defined($isOnlyStat));
+        print $STAT "\n";
+        print $STAT "Ranges\t0-10\t11-20\t21-30\t31-40";
+        print $STAT "\t\t0-10\t11-20\t21-30\t31-40" if(!defined($isOnlyStat));
+        print $STAT "\nBase\n";
+        my $basePos = 1;
+        foreach my $valArr (@$arr) {
+            # $valArr aliases the array element, so an empty slot is filled in
+            # the count array itself.
+            $valArr = [] if(!defined($valArr));
+            my @vals = $toPercentages->($valArr);
+            # Input file data sits at even indices, filtered data at odd ones.
+            $rangeGraphData[$c*2][0][$basePos-1] = $basePos;
+            $rangeGraphData[$c*2][$_+1][$basePos-1] = $vals[$_] foreach (0 .. 3);
+            print $STAT "$basePos\t$vals[0]\t$vals[1]\t$vals[2]\t$vals[3]";
+            if(! defined($isOnlyStat)) {
+                my $valArrF = defined($arrFiltered) ? $$arrFiltered[$basePos-1] : undef;
+                $valArrF = [] if(!defined($valArrF));
+                my @valsF = $toPercentages->($valArrF);
+                $rangeGraphData[$c*2+1][0][$basePos-1] = $basePos;
+                $rangeGraphData[$c*2+1][$_+1][$basePos-1] = $valsF[$_] foreach (0 .. 3);
+                print $STAT "\t\t$valsF[0]\t$valsF[1]\t$valsF[2]\t$valsF[3]";
+            }
+            print $STAT "\n";
+            $basePos++;
+        }
+        print $STAT "\n\n";
+        $c++;
+    }
+    print $STAT "\n\n";
+    if($isGDMod) {
+        drawRangeGraph($rangeGraphData[0], $outFolder.getFileName($fileName[0])."_QualRangePerBase.png", getFileName($fileName[0]));
+        drawRangeGraph($rangeGraphData[1], $outFolder.getFileName($outFileName[0])."_QualRangePerBase.png", getFileName($outFileName[0])) if(!defined($isOnlyStat));
+        if($isPairedEnd) {
+            drawRangeGraph($rangeGraphData[2], $outFolder.getFileName($fileName[1])."_QualRangePerBase.png", getFileName($fileName[1]));
+            drawRangeGraph($rangeGraphData[3], $outFolder.getFileName($outFileName[1])."_QualRangePerBase.png", getFileName($outFileName[1])) if(!defined($isOnlyStat));
+        }
+    }
+}
+
+sub printStat {
+ my $STAT = $_[0];
+ my $tmpPer;
+ my $inde = " " x 1;
+ print $STAT "Parameters\n";
+ my @graphData1 = ();
+ my @graphData2 = ();
+ if($isPairedEnd) {
+ printf $STAT "$inde %-30s %s\n", "Library type", "Paired-end";
+ printf $STAT "$inde %-30s %s %s\n", "Input files", $fileName[0], $fileName[1];
+ printf $STAT "$inde %-30s %s\n", "Primer/Adaptor library", defined($priAdaLib)?(($priAdaLib ne "u")?$priAdaLibNames[$priAdaLib]:"User defined ($priAdaFile)"):"NA";
+ printf $STAT "$inde %-30s %s\n", "Cut-off read length for HQ", $cutOffReadLen4HQ."%";
+ printf $STAT "$inde %-30s %s\n", "Cut-off quality score", $cutOffPhScore;
+ printf $STAT "$inde %-30s %s\n", "Only statistics", defined($isOnlyStat)?"On":"Off";
+ printf $STAT "$inde %-30s %s\n", "Number of processes", $noOfProcesses;
+
+ print $STAT "\n\n";
+
+ print $STAT "QC statistics\n";
+ printf $STAT "$inde %-50s %-20s %s\n", "File name", getFileName($fileName[0]), getFileName($fileName[1]);
+ printf $STAT "$inde %-50s %-20d %d\n", "Total number of reads", $totalReads[0], $totalReads[1];
+ printf $STAT "$inde %-50s %-20d %d\n", "Total number of HQ reads", $totalReadsAfterHQ[0], $totalReadsAfterHQ[1];
+ $tmpPer = sprintf "%0.2f", $totalReadsAfterHQ[0]/$totalReads[0]*100;
+ printf $STAT "$inde %-50s %-20s %0.2f%s\n", "Percentage of HQ reads", $tmpPer."%", $totalReadsAfterHQ[1]/$totalReads[1]*100, "%";
+ printf $STAT "$inde %-50s %-20.f %.f\n", "Total number of bases", $totalBases[0], $totalBases[1];
+ printf $STAT "$inde %-50s %-20.f %.f\n", "Total number of bases in HQ reads", $totalBasesAfterHQ[0], $totalBasesAfterHQ[1];
+ printf $STAT "$inde %-50s %-20.f %.f\n", "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ[0], $totalHQBasesAfterHQ[1];
+ $tmpPer = sprintf "%0.2f", $totalHQBasesAfterHQ[0]/$totalBasesAfterHQ[0]*100;
+ printf $STAT "$inde %-50s %-20s %0.2f%s\n", "Percentage of HQ bases in HQ reads", $tmpPer."%" , $totalHQBasesAfterHQ[1]/$totalBasesAfterHQ[1]*100, "%";
+ if(defined($priAdaLib)) {
+ printf $STAT "$inde %-50s %-20d %d\n", "Number of Primer/Adaptor contaminated HQ reads", $totalValidReadsWithPriAda[0], $totalValidReadsWithPriAda[1];
+ }
+ else {
+ printf $STAT "$inde %-50s %-20s %s\n", "Number of Primer/Adaptor contaminated HQ reads", "NA", "NA";
+ }
+ printf $STAT "$inde %-50s %-20d %d\n", "Total number of HQ filtered reads", $totalReadsFinal[0], $totalReadsFinal[1];
+ $tmpPer = sprintf "%0.2f", $totalReadsFinal[0]/$totalReads[0]*100;
+ printf $STAT "$inde %-50s %-20s %0.2f%s\n", "Percentage of HQ filtered reads", $tmpPer."%", $totalReadsFinal[1]/$totalReads[1]*100, "%";
+
+ print $STAT "\n\n";
+
+ print $STAT "Detailed QC statistics\n";
+ my @arr = (
+ ["File name", getFileName($fileName[0]), getFileName($fileName[1]), getFileName($outFileName[0]), getFileName($outFileName[1])],
+ ["Minimum read length", $minLen[0], $minLen[1], $minLen[2], $minLen[3]],
+ ["Maximum read length", $maxLen[0], $maxLen[1], $maxLen[2], $maxLen[3]],
+ ["Average read length", (sprintf "%0.2f", $totalBases[0]/$totalReads[0]), (sprintf "%0.2f", $totalBases[1]/$totalReads[1]), (sprintf "%0.2f", $totalBasesFinal[0]/$totalReadsFinal[0]), (sprintf "%0.2f", $totalBasesFinal[1]/$totalReadsFinal[1])],
+ ["Total number of reads", $totalReads[0], $totalReads[1], $totalReadsFinal[0], $totalReadsFinal[1]],
+ ["Total number of reads with non-ATGC bases", $readsWithN[0], $readsWithN[1], $readsWithN[2], $readsWithN[3]],
+ ["Percentage of reads with non-ATGC bases", (sprintf "%0.2f", $readsWithN[0]/$totalReads[0]*100)."%", (sprintf "%0.2f", $readsWithN[1]/$totalReads[1]*100)."%", (sprintf "%0.2f", $readsWithN[2]/$totalReadsFinal[0]*100)."%", (sprintf "%0.2f", $readsWithN[3]/$totalReadsFinal[1]*100)."%"],
+ ["Total number of bases", $totalBases[0], $totalBases[1], $totalBasesFinal[0], $totalBasesFinal[1]],
+ ["Total number of HQ bases", $totalHQBases[0], $totalHQBases[1], $totalHQBasesFinal[0], $totalHQBasesFinal[1]],
+ ["Percentage of HQ bases", (sprintf "%0.2f", $totalHQBases[0]/$totalBases[0]*100)."%", (sprintf "%0.2f", $totalHQBases[1]/$totalBases[1]*100)."%", (sprintf "%0.2f", $totalHQBasesFinal[0]/$totalBasesFinal[0]*100)."%", (sprintf "%0.2f", $totalHQBasesFinal[1]/$totalBasesFinal[1]*100)."%"],
+ ["Total number of non-ATGC bases", $totalNs[0], $totalNs[1], $totalNs[2], $totalNs[3]],
+ ["Percentage of non-ATGC bases", (sprintf "%0.2f", $totalNs[0]/$totalBases[0]*100)."%", (sprintf "%0.2f", $totalNs[1]/$totalBases[1]*100)."%", (sprintf "%0.2f", $totalNs[2]/$totalBasesFinal[0]*100)."%", (sprintf "%0.2f", $totalNs[3]/$totalBasesFinal[1]*100)."%"],
+ );
+ for(my $i=0; $i<@arr; $i++) {
+ if(!defined($isOnlyStat)) {
+ printf $STAT "$inde %-45s %-20s %-20s %-20s %s\n", $arr[$i][0], $arr[$i][1], $arr[$i][2], $arr[$i][3], $arr[$i][4];
+ }
+ else {
+ printf $STAT "$inde %-45s %-20s %s\n", $arr[$i][0], $arr[$i][1], $arr[$i][2];
+ }
+ }
+
+ print $STAT "\n\n";
+
+ print $STAT "Average quality score at each base position of input reads\n\n";
+ my $c = 0;
+ @graphData1 = ();
+ @graphData2 = ();
+ foreach my $arr (@positionSpecificBaseCount) {
+ print $STAT getFileName($fileName[$c]), "\n";
+ my $basePos = 1;
+ foreach my $val (@$arr) {
+ my $outVal = sprintf "%0.2f", $val/$totalReads[$c];
+ if($c == 0) {
+ $graphData1[0][$basePos-1] = $basePos;
+ $graphData1[1][$basePos-1] = $outVal;
+ }
+ else {
+ $graphData2[0][$basePos-1] = $basePos;
+ $graphData2[1][$basePos-1] = $outVal;
+ }
+ print $STAT $outVal, "\t";
+ $basePos++;
+ }
+ print $STAT "\n\n";
+ $c++;
+ }
+ print $STAT "\n\n";
+ if(!defined($isOnlyStat)) {
+ print $STAT "Average quality score at each base position of filtered reads\n\n";
+ $c = 0;
+ foreach my $arr (@positionSpecificBaseCountHQ) {
+ print $STAT getFileName($outFileName[$c]), "\n";
+ my $basePos = 1;
+ foreach my $val (@$arr) {
+ my $outVal = sprintf "%0.2f", $val/$totalReadsFinal[$c];
+ if($c == 0) {
+ $graphData1[2][$basePos-1] = $outVal;
+ }
+ else {
+ $graphData2[2][$basePos-1] = $outVal;
+ }
+ print $STAT $outVal, "\t";
+ $basePos++;
+ }
+ $c++;
+ print $STAT "\n\n";
+ }
+ }
+ print $STAT "\n\n";
+ prepareData4RangeGraph($STAT);
+ if($isGDMod) {
+ drawGraph(\@graphData1, $outFolder.getFileName($fileName[0])."_avgQual.png", getFileName($fileName[0]));
+ drawGraph(\@graphData2, $outFolder.getFileName($fileName[1])."_avgQual.png", getFileName($fileName[1]));
+ }
+ }
+ else {
+ printf $STAT "$inde %-30s %s\n", "Library type", "Single-end";
+ printf $STAT "$inde %-30s %s\n", "Input file", $fileName[0];
+ printf $STAT "$inde %-30s %s\n", "Primer/Adaptor library", defined($priAdaLib)?(($priAdaLib ne "u")?$priAdaLibNames[$priAdaLib]:"User defined ($priAdaFile)"):"NA";
+ printf $STAT "$inde %-30s %s\n", "Cut-off read length for HQ", $cutOffReadLen4HQ."%";
+ printf $STAT "$inde %-30s %s\n", "Cut-off quality score", $cutOffPhScore;
+ printf $STAT "$inde %-30s %s\n", "Only statistics", defined($isOnlyStat)?"On":"Off";
+ printf $STAT "$inde %-30s %s\n", "Number of processes", $noOfProcesses;
+
+ print $STAT "\n\n";
+
+ print $STAT "QC statistics\n";
+ printf $STAT "$inde %-50s %s\n", "File name", getFileName($fileName[0]);
+ printf $STAT "$inde %-50s %d\n", "Total number of reads", $totalReads[0];
+ printf $STAT "$inde %-50s %d\n", "Total number of HQ reads", $totalReadsAfterHQ[0];
+ $tmpPer = sprintf "%0.2f", $totalReadsAfterHQ[0]/$totalReads[0]*100;
+ printf $STAT "$inde %-50s %s\n", "Percentage of HQ reads", $tmpPer."%";
+ printf $STAT "$inde %-50s %.f\n", "Total number of bases", $totalBases[0];
+ printf $STAT "$inde %-50s %.f\n", "Total number of bases in HQ reads", $totalBasesAfterHQ[0];
+ printf $STAT "$inde %-50s %.f\n", "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ[0];
+ $tmpPer = sprintf "%0.2f", $totalHQBasesAfterHQ[0]/$totalBasesAfterHQ[0]*100;
+ printf $STAT "$inde %-50s %s\n", "Percentage of HQ bases in HQ reads", $tmpPer."%";
+ if(defined($priAdaLib)) {
+ printf $STAT "$inde %-50s %d\n", "Number of Primer/Adaptor contaminated HQ reads", $totalValidReadsWithPriAda[0];
+ }
+ else {
+ printf $STAT "$inde %-50s %s\n", "Number of Primer/Adaptor contaminated HQ reads", "NA";
+ }
+ printf $STAT "$inde %-50s %d\n", "Total number of HQ filtered reads", $totalReadsFinal[0];
+ $tmpPer = sprintf "%0.2f", $totalReadsFinal[0]/$totalReads[0]*100;
+ printf $STAT "$inde %-50s %s\n", "Percentage of HQ filtered reads", $tmpPer."%";
+
+ print $STAT "\n\n";
+
+ print $STAT "Detailed QC statistics\n";
+ my @arr = (
+ ["File name", getFileName($fileName[0]), getFileName($outFileName[0])],
+ ["Minimum read length", $minLen[0], $minLen[1]],
+ ["Maximum read length", $maxLen[0], $maxLen[1]],
+ ["Average read length", (sprintf "%0.2f", $totalBases[0]/$totalReads[0]), (sprintf "%0.2f", $totalBasesFinal[0]/$totalReadsFinal[0])],
+ ["Total number of reads", $totalReads[0], $totalReadsFinal[0]],
+ ["Total number of reads with non-ATGC bases", $readsWithN[0], $readsWithN[1]],
+ ["Percentage of reads with non-ATGC bases", (sprintf "%0.2f", $readsWithN[0]/$totalReads[0]*100)."%", (sprintf "%0.2f", $readsWithN[1]/$totalReadsFinal[0]*100)."%"],
+ ["Total number of bases", $totalBases[0], $totalBasesFinal[0]],
+ ["Total number of HQ bases", $totalHQBases[0], $totalHQBasesFinal[0]],
+ ["Percentage of HQ bases", (sprintf "%0.2f", $totalHQBases[0]/$totalBases[0]*100)."%", (sprintf "%0.2f", $totalHQBasesFinal[0]/$totalBasesFinal[0]*100)."%"],
+ ["Total number of non-ATGC bases", $totalNs[0], $totalNs[1]],
+ ["Percentage of non-ATGC bases", (sprintf "%0.2f", $totalNs[0]/$totalBases[0]*100)."%", (sprintf "%0.2f", $totalNs[1]/$totalBasesFinal[0]*100)."%"],
+ );
+ for(my $i=0; $i<@arr; $i++) {
+ if(!defined($isOnlyStat)) {
+ printf $STAT "$inde %-50s %-20s %s\n", $arr[$i][0], $arr[$i][1], $arr[$i][2];
+ }
+ else {
+ printf $STAT "$inde %-50s %s\n", $arr[$i][0], $arr[$i][1];
+ }
+ }
+
+ print $STAT "\n\n";
+
+ @graphData1 = ();
+ print $STAT "Average quality score at each base position of input reads\n\n";
+ my $c = 0;
+ foreach my $arr (@positionSpecificBaseCount) {
+ print $STAT getFileName($fileName[$c]), "\n";
+ my $basePos = 1;
+ foreach my $val (@$arr) {
+ my $outVal = sprintf "%0.2f", $val/$totalReads[$c];
+ $graphData1[0][$basePos-1] = $basePos;
+ $graphData1[1][$basePos-1] = $outVal;
+ print $STAT $outVal, "\t";
+ $basePos++;
+ }
+ $c++;
+ print $STAT "\n";
+ }
+ print $STAT "\n\n";
+ $c = 0;
+ if(!defined($isOnlyStat)) {
+ print $STAT "Average quality score at each base position of filtered reads\n\n";
+ foreach my $arr (@positionSpecificBaseCountHQ) {
+ print $STAT getFileName($fileName[$c]), "\n";
+ my $basePos = 1;
+ foreach my $val (@$arr) {
+ my $outVal = sprintf "%0.2f", $val/$totalReadsFinal[$c];
+ $graphData1[2][$basePos-1] = $outVal;
+ print $STAT $outVal, "\t";
+ $basePos++;
+ }
+ $c++;
+ print $STAT "\n";
+ }
+ }
+ print $STAT "\n\n";
+ prepareData4RangeGraph($STAT);
+ if($isGDMod) {
+ drawGraph(\@graphData1, $outFolder.getFileName($fileName[0])."_avgQual.png", getFileName($fileName[0]));
+ }
+ }
+}
+
+# Write the QC report of the current analysis to a file handle in
+# tab-delimited form and, when the GD modules are available, draw the
+# average-quality graph of every input file.
+#
+# Argument:
+#   $_[0]  output file handle the report is printed to.
+#
+# The report is built from the file-level statistics variables (@totalReads,
+# @totalBases, @minLen, ...). One column is produced per input file: two for
+# a paired-end library, one for a single-end library. In the four-slot arrays
+# (@minLen, @maxLen, @readsWithN, @totalNs) the slots of the input files come
+# first and are followed by the slots of the filtered output files.
+#
+# Ratios whose denominator is zero (empty input, or no read passing the
+# filters) are reported as 0 instead of aborting with a division-by-zero
+# error. The per-position table of filtered reads is labelled with the
+# output file name for both library types.
+sub printStatTab {
+	my $STAT = $_[0];
+	my $nFiles = $isPairedEnd ? 2 : 1;
+	my @in = (0 .. $nFiles-1);              # slots of the input file(s)
+	my @out = map { $_ + $nFiles } @in;     # slots of the filtered file(s) in the four-slot arrays
+
+	# Division that yields 0 for a zero/undefined denominator.
+	my $ratio = sub {
+		my ($num, $den) = @_;
+		return (defined($den) && $den != 0) ? $num/$den : 0;
+	};
+	# Ratio with two decimals, and the same as a percentage with "%" appended.
+	my $avg = sub { return sprintf("%0.2f", $ratio->(@_)); };
+	my $pct = sub { return sprintf("%0.2f", $ratio->(@_)*100) . "%"; };
+	# One value per input file, each formatted with the given sprintf format.
+	my $fmtCols = sub {
+		my ($fmt, $vals) = @_;
+		return map { sprintf($fmt, $vals->[$_]) } @in;
+	};
+	# Print one report row: leading tab, tab-separated cells, newline.
+	my $row = sub { printf $STAT "\t%s\n", join("\t", @_); };
+
+	print $STAT "Parameters\n";
+	$row->("Library type", $isPairedEnd ? "Paired-end" : "Single-end");
+	$row->($isPairedEnd ? "Input files" : "Input file", @fileName[@in]);
+	$row->("Primer/Adaptor library", defined($priAdaLib)?(($priAdaLib ne "u")?$priAdaLibNames[$priAdaLib]:"User defined ($priAdaFile)"):"NA");
+	$row->("Cut-off read length for HQ", $cutOffReadLen4HQ."%");
+	$row->("Cut-off quality score", $cutOffPhScore);
+	$row->("Only statistics", defined($isOnlyStat)?"On":"Off");
+	$row->("Number of processes", $noOfProcesses);
+
+	print $STAT "\n\n";
+
+	print $STAT "QC statistics\n";
+	$row->("File name", map(getFileName($fileName[$_]), @in));
+	$row->("Total number of reads", $fmtCols->("%d", \@totalReads));
+	$row->("Total number of HQ reads", $fmtCols->("%d", \@totalReadsAfterHQ));
+	$row->("Percentage of HQ reads", map($pct->($totalReadsAfterHQ[$_], $totalReads[$_]), @in));
+	$row->("Total number of bases", $fmtCols->("%.f", \@totalBases));
+	$row->("Total number of bases in HQ reads", $fmtCols->("%.f", \@totalBasesAfterHQ));
+	$row->("Total number of HQ bases in HQ reads", $fmtCols->("%.f", \@totalHQBasesAfterHQ));
+	$row->("Percentage of HQ bases in HQ reads", map($pct->($totalHQBasesAfterHQ[$_], $totalBasesAfterHQ[$_]), @in));
+	if(defined($priAdaLib)) {
+		$row->("Number of Primer/Adaptor contaminated HQ reads", $fmtCols->("%d", \@totalValidReadsWithPriAda));
+	}
+	else {
+		$row->("Number of Primer/Adaptor contaminated HQ reads", ("NA") x $nFiles);
+	}
+	$row->("Total number of HQ filtered reads", $fmtCols->("%d", \@totalReadsFinal));
+	$row->("Percentage of HQ filtered reads", map($pct->($totalReadsFinal[$_], $totalReads[$_]), @in));
+
+	print $STAT "\n\n";
+
+	print $STAT "Detailed QC statistics\n";
+	# Each row: label, one cell per input file, one cell per filtered file.
+	my @arr = (
+		["File name", map(getFileName($fileName[$_]), @in), map(getFileName($outFileName[$_]), @in)],
+		["Minimum read length", @minLen[@in, @out]],
+		["Maximum read length", @maxLen[@in, @out]],
+		["Average read length", map($avg->($totalBases[$_], $totalReads[$_]), @in), map($avg->($totalBasesFinal[$_], $totalReadsFinal[$_]), @in)],
+		["Total number of reads", @totalReads[@in], @totalReadsFinal[@in]],
+		["Total number of reads with non-ATGC bases", @readsWithN[@in, @out]],
+		["Percentage of reads with non-ATGC bases", map($pct->($readsWithN[$_], $totalReads[$_]), @in), map($pct->($readsWithN[$_+$nFiles], $totalReadsFinal[$_]), @in)],
+		["Total number of bases", @totalBases[@in], @totalBasesFinal[@in]],
+		["Total number of HQ bases", @totalHQBases[@in], @totalHQBasesFinal[@in]],
+		["Percentage of HQ bases", map($pct->($totalHQBases[$_], $totalBases[$_]), @in), map($pct->($totalHQBasesFinal[$_], $totalBasesFinal[$_]), @in)],
+		["Total number of non-ATGC bases", @totalNs[@in, @out]],
+		["Percentage of non-ATGC bases", map($pct->($totalNs[$_], $totalBases[$_]), @in), map($pct->($totalNs[$_+$nFiles], $totalBasesFinal[$_]), @in)],
+	);
+	# In "only statistics" mode the filtered-file cells are left out.
+	my $lastCol = defined($isOnlyStat) ? $nFiles : 2*$nFiles;
+	foreach my $cells (@arr) {
+		$row->(@{$cells}[0 .. $lastCol]);
+	}
+
+	print $STAT "\n\n";
+
+	# Graph data per input file: series 0 = base positions, series 1 = average
+	# quality of input reads, series 2 = average quality of filtered reads.
+	my @graphData = ([], []);
+	# The paired-end report separates the per-file tables with a blank line.
+	my $blockEnd = $isPairedEnd ? "\n\n" : "\n";
+	# Print one per-position average-quality table per file and record the
+	# values as the given graph series.
+	my $printAvgQual = sub {
+		my ($counts, $names, $totals, $series) = @_;
+		my $c = 0;
+		foreach my $perPos (@$counts) {
+			print $STAT getFileName($names->[$c]), "\n";
+			my $graph = $graphData[($isPairedEnd && $c) ? 1 : 0];
+			for(my $p=0; $p<@$perPos; $p++) {
+				my $outVal = $avg->($perPos->[$p], $totals->[$c]);
+				$graph->[0][$p] = $p+1 if($series == 1);
+				$graph->[$series][$p] = $outVal;
+				print $STAT $outVal, "\t";
+			}
+			print $STAT $blockEnd;
+			$c++;
+		}
+	};
+
+	print $STAT "Average quality score at each base position of input reads\n\n";
+	$printAvgQual->(\@positionSpecificBaseCount, \@fileName, \@totalReads, 1);
+	print $STAT "\n\n";
+	if(!defined($isOnlyStat)) {
+		print $STAT "Average quality score at each base position of filtered reads\n\n";
+		$printAvgQual->(\@positionSpecificBaseCountHQ, \@outFileName, \@totalReadsFinal, 2);
+	}
+	print $STAT "\n\n";
+	prepareData4RangeGraph($STAT);
+	if($isGDMod) {
+		foreach my $i (@in) {
+			drawGraph($graphData[$i], $outFolder.getFileName($fileName[$i])."_avgQual.png", getFileName($fileName[$i]));
+		}
+	}
+}
diff --git a/QC/IlluQC_PRLL.pl b/QC/IlluQC_PRLL.pl
new file mode 100644
index 0000000..e79a42d
--- /dev/null
+++ b/QC/IlluQC_PRLL.pl
@@ -0,0 +1,2994 @@
+#! /usr/bin/perl
+
+use File::Basename;
+#BEGIN {
+# my ($tmp, $path) = fileparse($0);
+# push ( @INC,"$path/lib");
+# #use lib "$path";
+#}
+use strict;
+use warnings;
+use Getopt::Long;
+use List::Util qw(sum min max);
+use Cwd qw(abs_path);
+use IO::Zlib;
+use FindBin qw($RealBin);
+use lib "$RealBin/lib";
+require "html.pl";
+use threads('yield');
+use File::Path;
+use Thread::Queue;
+my $DataQueue;
+my $ProcessingQueue;
+my $thr;
+
+# String::Approx is mandatory: when it can not be loaded the run is aborted
+# with a non-zero exit status.
+eval {
+	require String::Approx;
+};
+if($@) {
+	print STDERR "Error:\n\tCan not find module 'String::Approx'\n";
+	print STDERR "\tInstall it and try again\n\n";
+	exit 1;
+}
+
+# The GD modules are optional. They are loaded in an eval of their own so
+# that a failure of ANY of them (not only GD::Graph::linespoints) switches
+# graph generation off instead of leaving $isGDMod set.
+my $isGDMod = 1;
+eval {
+	require GD::Graph::linespoints;
+	require GD::Graph::bars;
+	require GD::Graph::pie;
+	require GD::Text::Wrap;
+};
+if($@) {
+	print STDERR "Warning:\n\tCan not find module 'GD::Graph'\n";
+	print STDERR "\tGraphs for statistics will not be produced. \n\t\t\tOR \n\tInstall GD::Graph module and try again.\n\n";
+	$isGDMod = 0;
+}
+
+
+# Stat variables.
+my @totalBases = (0, 0);
+my @totalHQBases = (0, 0); ### This is for HQ reads, does not include NO_VEC stat.
+my @totalBasesAfterHQ = (0, 0);
+my @totalHQBasesAfterHQ = (0, 0);
+my @totalBasesFinal = (0, 0);
+my @totalHQBasesFinal = (0, 0);
+my @totalReadsAfterHQ = (0, 0); ### This is for HQ reads, does not include NO_VEC stat.
+my @totalReads = (0, 0);
+my @totalValidReadsNoPriAda = (0, 0);
+my @totalValidReadsWithPriAda = (0, 0);
+my @minLen = (1000, 1000, 1000, 1000);
+my @maxLen = (0, 0, 0, 0);
+my @positionSpecificBaseCount = ();
+my @positionSpecificBaseCountHQ = (); #### This is for final output reads only... Which may include NO_VEC according to the user input.
+my @positionSpecificBaseCountWithRanges = ();
+my @positionSpecificBaseCountHQWithRanges = ();
+my @totalReadsFinal = ();
+my @fileName = ();
+my @outFileName = ();
+my @readsWithN = (0, 0, 0, 0);
+my @totalNs = (0, 0, 0, 0);
+my @totalTrimmedReads = (0, 0);
+my @priAdaLibNames = ("Genomic DNA/Chip-seq Library", "Paired End DNA Library", "DpnII gene expression Library", "NlaIII gene expression Library", "Small RNA Library", "Multiplexing DNA Library");
+my $nLines = 0;
+my $seqFormat = 0; # 1: Sanger; 2: Solexa; 3: Illumina 1.3+; 4: Illumina 1.5+;
+my $subVal = 0; # 33: Sanger; 64: Illumina
+my @qualDistribRaw = ();
+my @qualDistribFinal = ();
+my $qualDistribInterval = 1;
+my @qualLabel = ();
+my @gcDistribRaw = ();
+my @gcDistribFinal = ();
+my $gcDistribInterval = 5;
+my @gcLabel = ();
+my @baseCountRaw = ();
+my @baseCountFinal = ();
+my @charCountRaw = ();
+my @charCountFinal = ();
+
+my $font_spec = getFilePath($0) . "lib/Fonts/Dustismo_Sans.ttf";
+my $f = getFilePath($0) . "lib/Fonts/LucidaSansDemiBold.ttf";
+
+
+# Child variables.
+my @ctotalBases = (0, 0);
+my @ctotalHQBases = (0, 0); ### This is for HQ reads, does not include NO_VEC stat.
+my @ctotalBasesAfterHQ = (0, 0);
+my @ctotalHQBasesAfterHQ = (0, 0);
+my @ctotalBasesFinal = (0, 0);
+my @ctotalHQBasesFinal = (0, 0);
+my @ctotalReadsAfterHQ = (0, 0); ### This is for HQ reads, does not include NO_VEC stat.
+my @ctotalValidReadsNoPriAda = (0, 0);
+my @ctotalValidReadsWithPriAda = (0, 0);
+my @cminLen = (1000, 1000, 1000, 1000);
+my @cmaxLen = (0, 0, 0, 0);
+my @cpositionSpecificBaseCount = ();
+my @cpositionSpecificBaseCountHQ = ();
+my @cpositionSpecificBaseCountWithRanges = ();
+my @cpositionSpecificBaseCountHQWithRanges = ();
+my @ctotalReadsFinal = ();
+my @creadsWithN = (0, 0, 0, 0);
+my @ctotalNs = (0, 0, 0, 0);
+my @refArr4OutFReads = ();
+my @refArr4OutRReads = ();
+my @cqualDistribRaw = ();
+my @cqualDistribFinal = ();
+my @cqualLabel = ();
+my @cgcDistribRaw = ();
+my @cgcDistribFinal = ();
+my @cgcLabel = ();
+my @cbaseCountRaw = ();
+my @cbaseCountFinal = ();
+my @ccharCountRaw = ();
+my @ccharCountFinal = ();
+
+
+# Misc variables.
+my $isPairedEnd = 0;
+my $outFolder = "";
+my $substrlen = 20; # For removePriAda
+my $mismLim = 1; # For removePriAda
+my $indOfAnalysis = 0;
+my $uniqFolder = "";
+my $isInpGzip = 0;
+
+# Parameter variables. # Filled by GetOptions below unless a line says otherwise.
+my @peFiles = (); # Values given to -pe, $noOfInp4PE per library.
+my @seFiles = (); # Values given to -se, $noOfInp4SE per library.
+my @allFiles = (); # Not an option: set after validation to the -pe entries followed by the -se entries.
+my $noOfInp4PE = 4; # Values per -pe: two read files, Primer/Adaptor library, FASTQ variant.
+my $noOfInp4SE = 3; # Values per -se: one read file, Primer/Adaptor library, FASTQ variant.
+my $isOnlyStat; # Stays undef unless -onlyStat is given; the script tests it with defined().
+my $priAdaLib = ""; # Not an option: reassigned for every input from its Primer/Adaptor library field.
+my $cutOffReadLen4HQ = 70; # -l|cutOffReadLen4HQ; validated after parsing to lie within 0..100.
+my $cutOffPhScore = 20; # -s|cutOffQualScore.
+#my $trimAfterUnknownCall;
+my $noOfProcesses = 1; # -c|cpus.
+my $helpAsked; # True when -h|help is given.
+my $statOutFmt = 1; # 1: Text format; 2: Tab-delimited format.
+my $priAdaFile; # Not an option: path of a user-defined primer/adaptor file, taken from the library field of an input.
+my @usrDefinedPriAda = (); # Not an option: lines of $priAdaFile with all whitespace removed.
+my $outputDataFmt = "t"; # t/T: Text; g/G: Gzip.
+GetOptions(
+ "pe=s{$noOfInp4PE}" => \@peFiles, # Repeat specifier: each -pe takes exactly $noOfInp4PE string values.
+ "se=s{$noOfInp4SE}" => \@seFiles, # Repeat specifier: each -se takes exactly $noOfInp4SE string values.
+ "c|cpus=i" => \$noOfProcesses,
+ "h|help" => \$helpAsked,
+ "l|cutOffReadLen4HQ=f" => \$cutOffReadLen4HQ,
+ "o|outputFolder=s" => \$outFolder,
+ "z|outputDataCompression=s" => \$outputDataFmt,
+ "t|statOutFmt=i" => \$statOutFmt,
+ "onlyStat" => \$isOnlyStat,
+ "s|cutOffQualScore=i" => \$cutOffPhScore,
+ ); # NOTE(review): the return value of GetOptions is not checked here.
+if($helpAsked) {
+ prtUsage();
+ exit; # Help was requested explicitly.
+}
+if(@peFiles == 0 && @seFiles == 0) { # Neither -pe nor -se supplied any input.
+ prtError("No input files are provided");
+}
+# Validating inputs
+# Every -pe / -se group is checked field by field and then re-packed into a
+# single space-separated string ("file1 file2 library variant" for -pe,
+# "file library variant" for -se); these strings replace the raw option
+# values in @peFiles / @seFiles.
+# NOTE(review): the packed string is split on whitespace again further down,
+# so file names containing blanks are not supported.
+my @tempFiles = ();
+prtError("Missing inputs for paired-end files") if((scalar @peFiles)%$noOfInp4PE != 0);
+for(my $i=0; $i<@peFiles; $i+=$noOfInp4PE) {
+	my $str = "$peFiles[$i] $peFiles[$i+1] $peFiles[$i+2] $peFiles[$i+3]";
+	# A field starting with '-' means the next option was swallowed as a value.
+	if($peFiles[$i+2] =~ /^-/) {
+		prtError("Missing inputs for paired-end files: at '-pe $str'");
+	}
+	# A single digit selects a built-in Primer/Adaptor library (1-6).
+	if($peFiles[$i+2] =~ /^\d$/) {
+		if($peFiles[$i+2] < 1 || $peFiles[$i+2] > 6) {
+			prtError("Incorrect option for Primer/Adaptor library: at '-pe $str'");
+		}
+	}
+	if($peFiles[$i+3] =~ /^-/) {
+		prtError("Missing inputs for paired-end files: at '-pe $str'");
+	}
+	# The FASTQ variant is either a number (1-5) or contains 'a'/'A'.
+	if($peFiles[$i+3] !~ /\d/ && $peFiles[$i+3] !~ /a/i) {
+		prtError("Incorrect option for FASTQ variant: at '-pe $str'");
+	}
+	if($peFiles[$i+3] !~ /a/i) {
+		if($peFiles[$i+3] < 1 || $peFiles[$i+3] > 5) {
+			prtError("Incorrect option for FASTQ variant: at '-pe $str'");
+		}
+	}
+	push(@tempFiles, $str);
+}
+@peFiles = @tempFiles;
+@tempFiles = ();
+prtError("Missing inputs for single-end files") if((scalar @seFiles)%$noOfInp4SE != 0);
+for(my $i=0; $i<@seFiles; $i+=$noOfInp4SE) {
+	my $str = "$seFiles[$i] $seFiles[$i+1] $seFiles[$i+2]";
+	if($seFiles[$i+1] =~ /^-/) {
+		prtError("Missing inputs for single-end files: at '-se $str'");
+	}
+	if($seFiles[$i+1] =~ /^\d$/) {
+		if($seFiles[$i+1] < 1 || $seFiles[$i+1] > 6) {
+			prtError("Incorrect option for Primer/Adaptor library: at '-se $str'");
+		}
+	}
+	if($seFiles[$i+2] =~ /^-/) {
+		prtError("Missing inputs for single-end files: at '-se $str'");
+	}
+	if($seFiles[$i+2] !~ /\d/ && $seFiles[$i+2] !~ /a/i) {
+		prtError("Incorrect option for FASTQ variant: at '-se $str'");
+	}
+	if($seFiles[$i+2] !~ /a/i) {
+		if($seFiles[$i+2] < 1 || $seFiles[$i+2] > 5) {
+			prtError("Incorrect option for FASTQ variant: at '-se $str'");
+		}
+	}
+	push(@tempFiles, $str);
+}
+@seFiles = @tempFiles;
+@tempFiles = ();
+if($cutOffReadLen4HQ < 0 || $cutOffReadLen4HQ > 100) {
+	prtError("Incorrect value for -l|cutOffReadLen4HQ option: at '-l $cutOffReadLen4HQ'");
+}
+if($statOutFmt < 1 || $statOutFmt > 2) {
+	prtError("Incorrect value for -statOutFmt: at '-statOutFmt $statOutFmt'");
+}
+# The option is declared as -z|outputDataCompression; report it under that name.
+if($outputDataFmt !~ /^[tg]$/i) {
+	prtError("Incorrect value for -z|outputDataCompression option: at '-z $outputDataFmt'");
+}
+
+#my $pm = new Parallel::ForkManager($noOfProcesses);
+
+# Paired-end libraries are analysed first, then single-end ones.
+@allFiles = (@peFiles, @seFiles);
+my $pid;
+
+foreach my $file (@allFiles) {
+ @totalBases = (0, 0);
+ @totalHQBases = (0, 0);
+ @totalBasesAfterHQ = (0, 0);
+ @totalHQBasesAfterHQ = (0, 0);
+ @totalBasesFinal = (0, 0);
+ @totalHQBasesFinal = (0, 0);
+ @totalReadsAfterHQ = (0, 0);
+ @totalReads = (0, 0);
+ @totalValidReadsNoPriAda = (0, 0);
+ @totalValidReadsWithPriAda = (0, 0);
+ @minLen = (1000, 1000, 1000, 1000);
+ @maxLen = (0, 0, 0, 0);
+ @positionSpecificBaseCount = ();
+ @positionSpecificBaseCountHQ = ();
+ @positionSpecificBaseCountWithRanges = ();
+ @positionSpecificBaseCountHQWithRanges = ();
+ @totalReadsFinal = (0, 0);
+ @readsWithN = (0, 0, 0, 0);
+ @totalNs = (0, 0, 0, 0);
+ @totalTrimmedReads = (0, 0);
+ @fileName = ();
+ @outFileName = ();
+ @qualDistribRaw = ();
+ @qualDistribFinal = ();
+ @qualLabel = ();
+ @gcDistribRaw = ();
+ @gcDistribFinal = ();
+ @gcLabel = ();
+ @baseCountRaw = ();
+ @baseCountFinal = ();
+ @charCountRaw = ();
+ @charCountFinal = ();
+ $priAdaFile = "";
+ @usrDefinedPriAda = ();
+
+ @ctotalBases = (0, 0);
+ @ctotalHQBases = (0, 0); ### This is for HQ reads, does not include NO_VEC stat.
+ @ctotalBasesAfterHQ = (0, 0);
+ @ctotalHQBasesAfterHQ = (0, 0);
+ @ctotalBasesFinal = (0, 0);
+ @ctotalHQBasesFinal = (0, 0);
+ @ctotalReadsAfterHQ = (0, 0); ### This is for HQ reads, does not include NO_VEC stat.
+ @ctotalValidReadsNoPriAda = (0, 0);
+ @ctotalValidReadsWithPriAda = (0, 0);
+ @cminLen = (1000, 1000, 1000, 1000);
+ @cmaxLen = (0, 0, 0, 0);
+ @cpositionSpecificBaseCount = ();
+ @cpositionSpecificBaseCountHQ = ();
+ @cpositionSpecificBaseCountWithRanges = ();
+ @cpositionSpecificBaseCountHQWithRanges = ();
+ @ctotalReadsFinal = (0, 0);
+ @creadsWithN = (0, 0, 0, 0);
+ @ctotalNs = (0, 0, 0, 0);
+ @refArr4OutFReads = ();
+ @refArr4OutRReads = ();
+ @cqualDistribRaw = ();
+ @cqualDistribFinal = ();
+ @cqualLabel = ();
+ @cgcDistribRaw = ();
+ @cgcDistribFinal = ();
+ @cgcLabel = ();
+ @cbaseCountRaw = ();
+ @cbaseCountFinal = ();
+ @ccharCountRaw = ();
+ @ccharCountFinal = ();
+
+
+ $file =~ s/\\([A-Za-z_\.])/\/$1/g; # To remove '\' from the path of windows file
+ $isPairedEnd = 0;
+ my @inpData = split(/\s+/, $file);
+ if($inpData[$#inpData-1] =~ /^n$/i) {
+ undef $priAdaLib;
+ }
+ elsif($inpData[$#inpData-1] =~ /^\d$/) {
+ $priAdaLib = $inpData[$#inpData-1] - 1;
+ }
+ else {
+ $priAdaLib = "u";
+ $priAdaFile = $inpData[$#inpData-1];
+ open(PRIADA, "<$priAdaFile") or die "Can not open the user-defined primer/adapter file: $priAdaFile\n";
+ @usrDefinedPriAda = <PRIADA>;
+ for(my $i=0; $i<=$#usrDefinedPriAda; $i++) {
+ $usrDefinedPriAda[$i] =~ s/\s+//g;
+ }
+ }
+ $seqFormat = $inpData[$#inpData];
+
+ $indOfAnalysis++;
+#$pid = $pm->start and next;
+ print "Analysis has been started for \"$file\": Index: $indOfAnalysis\n";
+ my $statFile = "";
+ my $outFile1;
+ my $outFile2;
+ my $unPaired;
+ my $outFile;
+ if((scalar @inpData) == $noOfInp4PE) {
+ @fileName = ($inpData[0], $inpData[1]);
+ if($fileName[0] =~ /\.gz$/i || $fileName[1] =~ /\.gz$/i) {
+ $isInpGzip = 1;
+ }
+ $outFolder = getFilePath($fileName[0]) . "IlluQC_Filtered_files" if($outFolder eq "");
+ $outFolder .= "/" if($outFolder !~ /\/$/);
+ if(! -e $outFolder) {
+ mkdir($outFolder) or die "Can not create output folder: $outFolder\n";
+ }
+ $outFile1 = $outFolder . getFileName($fileName[0]) . "_filtered";
+ $outFile2 = $outFolder . getFileName($fileName[1]) . "_filtered";
+ $unPaired = getFilePath($outFile1) . getFileName($fileName[0]) . "_" . getFileName($fileName[1]) . "_unPaired_HQReads";
+ $outFile1 .= ".gz" if($outputDataFmt =~ /g/i);
+ $outFile2 .= ".gz" if($outputDataFmt =~ /g/i);
+ $unPaired .= ".gz" if($outputDataFmt =~ /g/i);
+ $statFile = $outFolder . getFileName($fileName[0]) . "_" . getFileName($fileName[1]) . "_stat";
+ do {
+ $uniqFolder = "";
+ for(my $i=0; $i<5; $i++) {
+ $uniqFolder .= int(rand(10));
+ }
+ $uniqFolder = $outFolder . $uniqFolder;
+ }
+ while(-e $uniqFolder);
+ mkdir($uniqFolder) or die "Can not create folder for temporary files\n";
+
+ $DataQueue = Thread::Queue->new();
+ unlink($outFile1) if(-e $outFile1);
+ unlink($outFile2) if(-e $outFile2);
+ unlink($unPaired) if(-e $unPaired);
+ $thr = threads->create(sub {
+ while (my $DataElement = $DataQueue->dequeue()) {
+ $DataElement =~ s/([fru]$)//;
+ my $readType = $1;
+ my $outH;
+ openFileGetHandle($outFile1, "a", \$outH) if($readType eq "f");
+ openFileGetHandle($outFile2, "a", \$outH) if($readType eq "r");
+ openFileGetHandle($unPaired, "a", \$outH) if($readType eq "u");
+ *OOO = $outH;
+ print OOO "$DataElement";
+ close(OOO);
+ }
+ });
+ $ProcessingQueue = Thread::Queue->new();
+
+ $outFileName[0] = $outFile1;
+ $outFileName[1] = $outFile2;
+ if($seqFormat =~ /a/i) {
+ print "$indOfAnalysis: Checking FASTQ format: File $fileName[0]...\n";
+ $nLines = checkFastQFormat($fileName[0], 1);
+ print "$indOfAnalysis: Checking FASTQ format: File $fileName[1]...\n";
+ if($nLines != checkFastQFormat($fileName[1], 1)) {
+ prtErrorExit("Number of reads in paired end files are not same.\n\t\tFiles: $fileName[0], $fileName[1]");
+ }
+ if($seqFormat == 1) {
+ $subVal = 33;
+ print "$indOfAnalysis: Input FASTQ file format: Sanger\n";
+ }
+ if($seqFormat == 2) {
+ $subVal = 64;
+ print "$indOfAnalysis: Input FASTQ file format: Solexa\n";
+ }
+ if($seqFormat == 3) {
+ $subVal = 64;
+ print "$indOfAnalysis: Input FASTQ file format: Illumina 1.3+\n";
+ }
+ if($seqFormat == 4) {
+ $subVal = 64;
+ print "$indOfAnalysis: Input FASTQ file format: Illumina 1.5+\n";
+ }
+ if($seqFormat == 5) {
+ $subVal = 33;
+ print "$indOfAnalysis: Input FASTQ file format: Illumina 1.8+\n";
+ }
+ }
+ else {
+ $nLines = checkFastQFormat($fileName[0], 0);
+ if($nLines != checkFastQFormat($fileName[1], 0)) {
+ prtErrorExit("Number of reads in paired end files are not same.\n\t\tFiles: $fileName[0], $fileName[1]");
+ }
+ if($seqFormat == 1 || $seqFormat == 5) {
+ $subVal = 33;
+ }
+ else {
+ $subVal = 64;
+ }
+ }
+ print "$indOfAnalysis: Processing input files...\n";
+ processPairedEndFiles($fileName[0], $fileName[1], $outFile1, $outFile2, $unPaired);
+ if(!defined($isOnlyStat)) {
+ }
+ $isPairedEnd = 1;
+ }
+ else {
+ $fileName[0] = $inpData[0]; #$arg;
+ if($fileName[0] =~ /\.gz$/i) {
+ $isInpGzip = 1;
+ }
+ $outFolder = getFilePath($fileName[0]) . "IlluQC_Filtered_files" if($outFolder eq "");
+ $outFolder .= "/" if($outFolder !~ /\/$/);
+ if(! -e $outFolder) {
+ if(!defined($isOnlyStat)) {
+ mkdir($outFolder) or die "Can not create output folder: $outFolder\n";
+ }
+ }
+ $outFile = $outFolder . getFileName($fileName[0]) . "_filtered";
+ $outFile .= ".gz" if($outputDataFmt =~ /g/i);
+ do {
+ $uniqFolder = "";
+ for(my $i=0; $i<5; $i++) {
+ $uniqFolder .= int(rand(10));
+ }
+ $uniqFolder = $outFolder . $uniqFolder;
+ }
+ while(-e $uniqFolder);
+ mkdir($uniqFolder) or die "Can not create folder for temporary files\n";
+
+ $DataQueue = Thread::Queue->new();
+ unlink($outFile) if(-e $outFile);
+ $thr = threads->create(sub {
+ while (my $DataElement = $DataQueue->dequeue()) {
+ my $outH;
+ openFileGetHandle($outFile, "a", \$outH);
+ *OOO = $outH;
+ print OOO "$DataElement";
+ close(OOO);
+ }
+ });
+ $ProcessingQueue = Thread::Queue->new();
+
+ $outFileName[0] = $outFile;
+ $statFile = $outFolder . getFileName($fileName[0]) . "_stat";
+ if($seqFormat =~ /a/i) {
+ print "$indOfAnalysis: Checking FASTQ format: File $fileName[0]...\n";
+ $nLines = checkFastQFormat($fileName[0], 1);
+ if($seqFormat == 1) {
+ $subVal = 33;
+ print "$indOfAnalysis: Input FASTQ file format: Sanger\n";
+ }
+ if($seqFormat == 2) {
+ $subVal = 64;
+ print "$indOfAnalysis: Input FASTQ file format: Solexa\n";
+ }
+ if($seqFormat == 3) {
+ $subVal = 64;
+ print "$indOfAnalysis: Input FASTQ file format: Illumina 1.3+\n";
+ }
+ if($seqFormat == 4) {
+ $subVal = 64;
+ print "$indOfAnalysis: Input FASTQ file format: Illumina 1.5+\n";
+ }
+ if($seqFormat == 5) {
+ $subVal = 33;
+ print "$indOfAnalysis: Input FASTQ file format: Illumina 1.8+\n";
+ }
+ }
+ else {
+ $nLines = checkFastQFormat($fileName[0], 0);
+ if($seqFormat == 1 || $seqFormat == 5) {
+ $subVal = 33;
+ }
+ else {
+ $subVal = 64;
+ }
+ }
+ print "$indOfAnalysis: Processing input files...\n";
+ processSingleEndFiles($fileName[0], $outFile);
+ if(!defined($isOnlyStat)) {
+ }
+ $isPairedEnd = 0;
+ }
+ print "$indOfAnalysis: Analysis completed\n";
+
+ print "$indOfAnalysis: Printing Output...\n";
+
+ my $qualDistF1 = getFileName($fileName[0])."_qualDistribution.png";
+ my $qualDistF2 = getFileName($fileName[1])."_qualDistribution.png" if($isPairedEnd);
+ my $sumPieF;
+ $sumPieF = getFileName($fileName[0]). "_summary.png";
+ $sumPieF = getFileName($fileName[0]). "_" . getFileName($fileName[1]) ."_summary.png" if($isPairedEnd);
+ my $gcDistF1 = getFileName($fileName[0])."_gcDistribution.png";
+ my $gcDistF2 = getFileName($fileName[1])."_gcDistribution.png" if($isPairedEnd);
+ my $baseCntF1 = getFileName($fileName[0])."_baseCompostion.png";
+ my $baseCntF2 = getFileName($fileName[1])."_baseCompostion.png" if($isPairedEnd);
+ my $avgQF1 = getFileName($fileName[0]) . "_avgQual.png";
+ my $avgQF2 = getFileName($fileName[1]) . "_avgQual.png" if($isPairedEnd);
+ my $QRangeRawF1 = getFileName($fileName[0]) . "_QualRangePerBase.png";
+ my $QRangeFilteredF1 = getFileName($outFileName[0]) . "_QualRangePerBase.png" if(!defined($isOnlyStat));
+ my $QRangeF1 = "$QRangeRawF1";
+ $QRangeF1 .= ":::$QRangeFilteredF1" if(!defined($isOnlyStat));
+ my $QRangeF2;
+ if($isPairedEnd) {
+ my $QRangeRawF2 = getFileName($fileName[1]) . "_QualRangePerBase.png";
+ my $QRangeFilteredF2 = getFileName($outFileName[1]) . "_QualRangePerBase.png" if(!defined($isOnlyStat));
+ $QRangeF2 = "$QRangeRawF2";
+ $QRangeF2 .= ":::$QRangeFilteredF2" if(!defined($isOnlyStat));
+ }
+
+ my $c=0;
+ foreach my $ref (@qualDistribRaw) {
+ my $str = "";
+ foreach my $val (@{$ref}) {
+ if($c == 0) {
+ $str = "0";
+ $str .= "-$qualDistribInterval" if($qualDistribInterval>1);
+ }
+ else {
+ $str = $qualDistribInterval*$c;
+ $str .= "-" . $qualDistribInterval*($c+1) if($qualDistribInterval>1);
+ }
+ $c++;
+ push(@qualLabel, $str);
+ }
+ last;
+ }
+ my @file1 = (\@qualLabel, $qualDistribRaw[0]);
+ my @file2 = (\@qualLabel, $qualDistribRaw[1]) if($isPairedEnd);
+ if(!$isOnlyStat) {
+ push(@file1, $qualDistribFinal[0]);
+ push(@file2, $qualDistribFinal[1]) if($isPairedEnd);;
+ }
+ if($isGDMod) {
+ drawQualDist(\@file1, $outFolder.$qualDistF1, getFileName($fileName[0]), 650, 350);
+ drawQualDist(\@file2, $outFolder.$qualDistF2, getFileName($fileName[1]), 650, 300) if($isPairedEnd);
+ }
+
+ my $readsWPriAda = $totalReadsAfterHQ[0] - $totalReadsFinal[0]; # For Paired end, different number of contaminated sequences will be filtered in both the files. And we have to report total reads contaminated including both end files.
+ my $readsLowQual = $totalReads[0] - $readsWPriAda - $totalReadsFinal[0];
+
+ @file1 = (["", "", ""], [$readsWPriAda, $totalReadsFinal[0], $readsLowQual]);
+ if($isGDMod) {
+ drawSummaryPie(\@file1, $outFolder.$sumPieF, 500, 350);
+ }
+
+ $c=0;
+ foreach my $ref (@gcDistribRaw) {
+ foreach my $val (@{$ref}) {
+ my $str = "";
+ if($c == 0) {
+ $str = "0-$gcDistribInterval";
+ }
+ else {
+ $str = $gcDistribInterval*$c . "-" . $gcDistribInterval*($c+1);
+ }
+ $c++;
+ push(@gcLabel, $str);
+ }
+ last;
+ }
+
+ @file1 = (\@gcLabel, $gcDistribRaw[0]);
+ @file2 = (\@gcLabel, $gcDistribRaw[1]) if($isPairedEnd);
+ if(!$isOnlyStat) {
+ push(@file1, $gcDistribFinal[0]);
+ push(@file2, $gcDistribFinal[1]) if($isPairedEnd);
+ }
+
+ if($isGDMod) {
+ drawGCDist(\@file1, $outFolder.$gcDistF1, getFileName($fileName[0]), 550, 350);
+ drawGCDist(\@file2, $outFolder.$gcDistF2, getFileName($fileName[1]), 550, 350) if($isPairedEnd);
+ }
+
+ @file1 = (["A", "T", "G", "C", "Non-ATGC"], $charCountRaw[0]);
+ @file2 = (["A", "T", "G", "C", "Non-ATGC"], $charCountRaw[1]) if($isPairedEnd);
+ if(!$isOnlyStat) {
+ @file1 = (["A", "T", "G", "C", "Non-ATGC"], $charCountRaw[0], $charCountFinal[0]);
+ @file2 = (["A", "T", "G", "C", "Non-ATGC"], $charCountRaw[1], $charCountFinal[1]) if($isPairedEnd);
+ }
+ if($isGDMod) {
+ drawBaseComp(\@file1, $outFolder.$baseCntF1, getFileName($fileName[0]), 500, 300);
+ drawBaseComp(\@file2, $outFolder.$baseCntF2, getFileName($fileName[1]), 500, 300) if($isPairedEnd);
+ }
+
+
+
+ open(STAT, ">$statFile") or die "Can not create statistics file $statFile\n";
+ printStat(*STAT) if($statOutFmt == 1);
+ printStatTab(*STAT) if($statOutFmt == 2);
+ close(STAT);
+
+ my $iFol = getFilePath(abs_path($fileName[0]));
+ my $oFol = abs_path($outFolder) . "/";
+ my $inpFs = getFileName($fileName[0]);
+ my $seqFormatName;
+ $inpFs .= ":::::" . getFileName($fileName[1]) if($isPairedEnd);
+ my $htF = $oFol . "output_" . getFileName($fileName[0]);
+ $htF .= "_" . getFileName($fileName[1]) if($isPairedEnd);
+ $htF .= ".html";
+ if($seqFormat == 1) {
+ $seqFormatName = "Sanger";
+ }
+ elsif($seqFormat == 2) {
+ $seqFormatName = "Solexa";
+ }
+ elsif($seqFormat == 3) {
+ $seqFormatName = "Illumina 1.3+";
+ }
+ elsif($seqFormat == 4) {
+ $seqFormatName = "Illumina 1.5+";
+ }
+ my @fileNames4HTML;
+ @fileNames4HTML = ($outFile, $avgQF1, $baseCntF1, $gcDistF1, $qualDistF1, $sumPieF, $QRangeF1);
+ @fileNames4HTML = ($outFile1, $outFile2, $unPaired, $avgQF1, $avgQF2, $baseCntF1, $baseCntF2, $gcDistF1, $gcDistF2, $qualDistF1, $qualDistF2, $sumPieF, $QRangeF1, $QRangeF2) if($isPairedEnd);
+ htmlPrint(getFilePath(abs_path($0)), getFileName($0), $htF, $iFol, $isPairedEnd, $isOnlyStat, $inpFs, $seqFormatName, $statFile, $oFol, \@fileNames4HTML);
+ $DataQueue->enqueue(undef);
+ $thr->join();
+ rmtree($uniqFolder, 0, 0);
+# $pm->finish;
+}
+#$pm->wait_all_children;
+
+# Closing banner: tell the user where the results were written, then stop.
+print "================================================================\n";
+print "Processing has been finished\n";
+if($outFolder ne "") {
+    print "Output files are generated in $outFolder\n";
+}
+else {
+    print "Output files are generated in the folder of input files\n";
+}
+print "================================================================\n";
+
+exit;
+
+# Open $file in the requested mode and hand the handle back through $ref.
+#
+# Arguments:
+#   $file - path to open; a name ending in ".gz" (any case) goes through IO::Zlib
+#   $rOrw - "r" (read), "w" (create/truncate) or "a" (append);
+#           any other value opens nothing, as before
+#   $ref  - reference to the scalar that receives the handle
+#
+# Dies when the file can not be opened.
+sub openFileGetHandle {
+    my ($file, $rOrw, $ref) = @_;
+    # mode letter => [plain open() mode, IO::Zlib mode, failure message prefix]
+    my %modeSpec = (
+        "r" => ["<",  "rb", "Can not open file"],
+        "w" => [">",  "wb", "Can not create file"],
+        "a" => [">>", "ab", "Can not open-append file"],
+    );
+    my $isGzip = ($file =~ /\.gz$/i);
+    # The IO::Zlib object is created before the mode is looked at, so $$ref is
+    # set for a ".gz" name even when the mode letter is not recognised.
+    $$ref = IO::Zlib->new if($isGzip);
+    my $spec = $modeSpec{$rOrw};
+    return if(!defined($spec));
+    my ($plainMode, $gzMode, $errMsg) = @{$spec};
+    if($isGzip) {
+        $$ref->open($file, $gzMode) or die "$errMsg $file";
+    }
+    else {
+        # Three-argument open: the mode is never parsed out of the file name,
+        # so names with leading/trailing blanks or mode characters are safe.
+        open($$ref, $plainMode, $file) or die "$errMsg $file";
+    }
+}
+
+
+# Run the quality-control pass over one pair of FASTQ files and, unless only
+# statistics were requested ($isOnlyStat defined), assemble the filtered output.
+#
+# Arguments (positional):
+#   $file1, $file2       - forward and reverse input FASTQ files
+#   $outFile1, $outFile2 - output files for the filtered forward/reverse reads
+#   $unPaired            - output file for the reads written as unpaired
+#
+# Uses the file-level settings $nLines, $noOfProcesses, $uniqFolder,
+# $isOnlyStat and $indOfAnalysis, and adds every worker's counters to the
+# file-level statistics arrays (@totalBases, @minLen, @qualDistribRaw, ...).
+# Dies when the temporary folder or one of its part files can not be read.
+sub processPairedEndFiles {
+    my ($file1, $file2, $outFile1, $outFile2, $unPaired) = @_;
+    $totalReads[0] = sprintf("%0.0f", $nLines/4);
+    $totalReads[1] = sprintf("%0.0f", $nLines/4);
+
+    if($file1 =~ /\.gz$/i || $file2 =~ /\.gz$/i) {
+        # Compressed input: a reader thread cuts the files into chunk files
+        # while threading4PE() processes the chunks as they are queued.
+        my $readerThr = threads->create('readDivideGzip4PE', $file1, $file2);
+        threading4PE();
+        $readerThr->join;
+        print "$indOfAnalysis: Number of reads processed: " . "$totalReads[0]/$totalReads[0] (100\%)...\n";
+    }
+    else {
+        print "$indOfAnalysis: Number of reads processed: " . "0/$totalReads[0] (0\%)...\n";
+        my $isEOF = ($nLines/4 > 0) ? 0 : 1;
+        my $lineCount = 0;
+        my @thArr = ();
+        my $ttlJobCount = 0;
+        my $noOfSeqPerThread = 100000;
+        my ($fH1, $fH2);
+        openFileGetHandle($file1, "r", \$fH1);
+        openFileGetHandle($file2, "r", \$fH2);
+        while(!$isEOF) {
+            # Start up to $noOfProcesses workers, each on its own batch of reads.
+            my $jobCounter = 0;
+            for(my $j=0; $j<$noOfProcesses && (!$isEOF); $j++) {
+                my @fRead = ();
+                my @rRead = ();
+                $jobCounter++;
+                my $i = 0;
+                while($i < 4*$noOfSeqPerThread && (!$isEOF)) {
+                    $fRead[$i] = <$fH1>;
+                    $rRead[$i] = <$fH2>;
+                    $lineCount += 1;
+                    $isEOF = 1 if($lineCount >= $nLines);
+                    # A blank line in either file keeps the slot, so the next
+                    # line pair overwrites it.
+                    $i++ unless($fRead[$i] =~ /^\n$/ || $rRead[$i] =~ /^\n$/);
+                }
+                $ttlJobCount++;
+                my $id = sprintf "%05s", $ttlJobCount;
+                # Scalar context on purpose: the worker returns one array reference.
+                $thArr[$j] = threads->create('passPESeq', \@fRead, \@rRead, $id);
+                if($lineCount % (100000*4) == 0) {
+                    my $tmpP = sprintf "%0.0f", ($lineCount/4/$totalReads[0]*100);
+                    print "$indOfAnalysis: Number of reads processed: " . $lineCount/4 . "/$totalReads[0] ($tmpP\%)...\n";
+                }
+            }
+            # Collect the workers in start order and fold their counters into the totals.
+            for(my $j=0; $j<$jobCounter; $j++) {
+                my $resRef = $thArr[$j]->join;
+                my @res = @{$resRef};
+                undef $resRef;
+                # Result slots 0..8 hold per-file totals (index 0 forward, 1 reverse).
+                my @pairTotals = (\@totalBases, \@totalHQBases, \@totalBasesAfterHQ, \@totalHQBasesAfterHQ, \@totalBasesFinal, \@totalHQBasesFinal, \@totalReadsAfterHQ, \@totalValidReadsNoPriAda, \@totalValidReadsWithPriAda);
+                for my $slot (0 .. $#pairTotals) {
+                    for my $k (0, 1) {
+                        $pairTotals[$slot][$k] += $res[$slot][$k];
+                    }
+                }
+                for my $k (0 .. 3) {
+                    $minLen[$k] = min($res[9][$k], $minLen[$k]);
+                    $maxLen[$k] = max($res[10][$k], $maxLen[$k]);
+                }
+                # Per-position quality sums (slot 11) and their HQ counterpart (slot 15).
+                my @posSum = @{$res[11]};
+                my @posSumHQ = @{$res[15]};
+                for my $x (0 .. $#posSum) {
+                    my @row = @{$posSum[$x]};
+                    for my $y (0 .. $#row) {
+                        $positionSpecificBaseCount[$x][$y] += $posSum[$x][$y];
+                        if($posSumHQ[$x][$y]) {
+                            $positionSpecificBaseCountHQ[$x][$y] += $posSumHQ[$x][$y];
+                        }
+                    }
+                }
+                # Per-position counts split by quality range (slots 22 and 23).
+                my @posRange = @{$res[22]};
+                my @posRangeHQ = @{$res[23]};
+                for my $x (0 .. $#posRange) {
+                    my @row = @{$posRange[$x]};
+                    for my $y (0 .. $#row) {
+                        my @col = @{$posRange[$x][$y]};
+                        for my $z (0 .. $#col) {
+                            # Missing range counters count as zero.
+                            $positionSpecificBaseCountWithRanges[$x][$y][$z] += ($posRange[$x][$y][$z] || 0);
+                            if($posRangeHQ[$x][$y][$z]) {
+                                $positionSpecificBaseCountHQWithRanges[$x][$y][$z] += $posRangeHQ[$x][$y][$z];
+                            }
+                        }
+                    }
+                }
+                for my $k (0, 1) {
+                    $totalReadsFinal[$k] += $res[12][$k];
+                }
+                for my $k (0 .. 3) {
+                    $readsWithN[$k] += $res[13][$k];
+                    $totalNs[$k] += $res[14][$k];
+                }
+                # Slots 16..21 are the distribution tables, merged by addTwoArrays().
+                my @distribs = (\@qualDistribRaw, \@qualDistribFinal, \@gcDistribRaw, \@gcDistribFinal, \@charCountRaw, \@charCountFinal);
+                for my $d (0 .. $#distribs) {
+                    addTwoArrays($res[16 + $d], $distribs[$d]);
+                }
+                @res = ();
+            }
+        }
+        print "$indOfAnalysis: Number of reads processed: " . "$totalReads[0]/$totalReads[0] (100\%)...\n";
+        close($fH1);
+        close($fH2);
+    }
+    if(!defined($isOnlyStat)) {
+        my ($fHndl, $rHndl, $uHndl);
+        openFileGetHandle($outFile1, "w", \$fHndl);
+        openFileGetHandle($outFile2, "w", \$rHndl);
+        openFileGetHandle($unPaired, "w", \$uHndl);
+        print "$indOfAnalysis: Printing filtered data...\n";
+        # Fail loudly if the temporary folder is gone instead of silently
+        # producing empty output files.
+        opendir(my $dirH, $uniqFolder) or die "Can not open folder of temporary files: $uniqFolder\n";
+        my @partFiles = readdir($dirH);
+        closedir($dirH);
+        @partFiles = sort @partFiles;
+        foreach my $pFile (@partFiles) {
+            next if($pFile =~ /\./);
+            # Choose the destination once per part file, not once per line.
+            my @sinks = ();
+            push(@sinks, $fHndl) if($pFile =~ /fseq[^\n]+out/);
+            push(@sinks, $rHndl) if($pFile =~ /rseq[^\n]+out/);
+            push(@sinks, $uHndl) if($pFile =~ /useq[^\n]+out/);
+            next if(!@sinks);
+            my $npFile = "$uniqFolder/$pFile";
+            open(my $partH, "<", $npFile) or die "Can not open part file\n";
+            while(my $line = <$partH>) {
+                foreach my $sink (@sinks) {
+                    print {$sink} $line;
+                }
+            }
+            close($partH);
+        }
+        close($fHndl);
+        close($rHndl);
+        close($uHndl);
+    }
+}
+
+# Reader-thread body for gzip-compressed paired input.
+#
+# Reads $file1 and $file2 in step, writes them out as plain chunk files
+# "$uniqFolder/part_fseq_<id>" and "$uniqFolder/part_rseq_<id>" of at most
+# 4*100000 lines each, and queues "<fseq chunk>\t<rseq chunk>" on
+# $ProcessingQueue for every chunk. An undef is queued at the end to tell the
+# consumer that no more chunks will follow.
+sub readDivideGzip4PE {
+    my ($file1, $file2) = @_;
+    my $isEOF = ($nLines/4 > 0) ? 0 : 1;
+    my $lineCount = 0;
+    my $chunkCount = 0;
+    my $noOfSeqPerThread = 100000;
+    my ($fH1, $fH2);
+    openFileGetHandle($file1, "r", \$fH1);
+    openFileGetHandle($file2, "r", \$fH2);
+    while(!$isEOF) {
+        for(my $j=0; $j<$noOfProcesses && (!$isEOF); $j++) {
+            $chunkCount++;
+            my $id = sprintf "%05s", $chunkCount;
+            my ($fseqH, $rseqH);
+            openFileGetHandle("$uniqFolder/part_fseq_$id", "w", \$fseqH);
+            openFileGetHandle("$uniqFolder/part_rseq_$id", "w", \$rseqH);
+            for(my $i=0; $i<4*$noOfSeqPerThread && (!$isEOF); $i++) {
+                my $fRead = <$fH1>;
+                my $rRead = <$fH2>;
+                $lineCount += 1;
+                $isEOF = 1 if($lineCount >= $nLines);
+                if(!defined($fRead) || !defined($rRead)) {
+                    # An input ended before $nLines lines were seen: stop reading.
+                    $isEOF = 1;
+                    last;
+                }
+                if($fRead =~ /^\n$/ || $rRead =~ /^\n$/) {
+                    # Drop blank lines instead of copying them into the chunk:
+                    # the consumer cuts each chunk into 4-line records, so a
+                    # copied blank line would shift every record after it. This
+                    # matches the uncompressed path, which overwrites such lines.
+                    $i--;
+                    next;
+                }
+                print $fseqH $fRead;
+                print $rseqH $rRead;
+            }
+            close($fseqH);
+            close($rseqH);
+            $ProcessingQueue->enqueue("$uniqFolder/part_fseq_$id"."\t"."$uniqFolder/part_rseq_$id");
+        }
+    }
+    close($fH1);
+    close($fH2);
+    $ProcessingQueue->enqueue(undef);
+}
+
+# Take the next chunk pair off $ProcessingQueue and start a worker thread on it.
+#
+# Arguments:
+#   $_[0] - reference to the caller's line counter; the number of lines in the
+#           forward chunk is added to it
+#
+# Returns the thread object running passPESeq, or undef once the queue
+# delivers its undef end marker. Dies when a chunk file can not be opened.
+sub fireMyPEJob {
+    my $lineCount = $_[0];
+    my $fileName = $ProcessingQueue->dequeue();
+    return undef if(!defined($fileName));
+    my ($file1, $file2) = split(/\t/, $fileName);
+    # Lexical handles and three-argument open: nothing is shared through
+    # package-wide barewords and the path is never parsed for a mode.
+    open(my $chunkF, "<", $file1) or die "Can't open chunk file containing input reads for processing: $file1\n";
+    my @fReads = <$chunkF>;
+    close($chunkF);
+    $$lineCount += (scalar @fReads);
+    open(my $chunkR, "<", $file2) or die "Can't open chunk file containing input reads for processing: $file2\n";
+    my @rReads = <$chunkR>;
+    close($chunkR);
+    # The chunk id is the run of digits that ends the forward chunk's name.
+    my ($id) = $file1=~/(\d+)$/;
+    # Scalar context on purpose: the worker returns one array reference.
+    my $thRef = threads->create('passPESeq', \@fReads, \@rReads, $id);
+    return $thRef;
+}
+
+# Consumer loop for the gzip paired-end path.
+#
+# Repeatedly starts up to $noOfProcesses workers through fireMyPEJob(), joins
+# them in start order and folds each worker's counters into the file-level
+# statistics arrays. Stops after the round in which fireMyPEJob() reports the
+# end of the queue.
+sub threading4PE {
+    my @thArr;
+    my $done = 0;
+    my $lineCount = 0;
+    while(1) {
+        if($lineCount % (100000*4) == 0) {
+            # $totalReads[0] is 0 for an empty or nearly empty input; without
+            # this guard the first pass ($lineCount == 0) dies with a division
+            # by zero.
+            my $tmpP = 0;
+            $tmpP = sprintf "%0.0f", ($lineCount/4/$totalReads[0]*100) if($totalReads[0] > 0);
+            print "$indOfAnalysis: Number of reads processed: " . $lineCount/4 . "/$totalReads[0] ($tmpP\%)...\n";
+        }
+        my $i;
+        for($i=0; $i<$noOfProcesses; $i++) {
+            my $thRef = fireMyPEJob(\$lineCount);
+            if(!defined($thRef)) {
+                $done = 1;
+                last;
+            }
+            $thArr[$i] = $thRef;
+        }
+        # $i is the number of workers actually started in this round.
+        for(my $j=0; $j<$i; $j++) {
+            my $resRef = $thArr[$j]->join;
+            my @res = @{$resRef};
+            undef $resRef;
+            # Result slots 0..8 hold per-file totals (index 0 forward, 1 reverse).
+            my @pairTotals = (\@totalBases, \@totalHQBases, \@totalBasesAfterHQ, \@totalHQBasesAfterHQ, \@totalBasesFinal, \@totalHQBasesFinal, \@totalReadsAfterHQ, \@totalValidReadsNoPriAda, \@totalValidReadsWithPriAda);
+            for my $slot (0 .. $#pairTotals) {
+                for my $k (0, 1) {
+                    $pairTotals[$slot][$k] += $res[$slot][$k];
+                }
+            }
+            for my $k (0 .. 3) {
+                $minLen[$k] = min($res[9][$k], $minLen[$k]);
+                $maxLen[$k] = max($res[10][$k], $maxLen[$k]);
+            }
+            # Per-position quality sums (slot 11) and their HQ counterpart (slot 15).
+            my @posSum = @{$res[11]};
+            my @posSumHQ = @{$res[15]};
+            for my $x (0 .. $#posSum) {
+                my @row = @{$posSum[$x]};
+                for my $y (0 .. $#row) {
+                    $positionSpecificBaseCount[$x][$y] += $posSum[$x][$y];
+                    if($posSumHQ[$x][$y]) {
+                        $positionSpecificBaseCountHQ[$x][$y] += $posSumHQ[$x][$y];
+                    }
+                }
+            }
+            # Per-position counts split by quality range (slots 22 and 23).
+            my @posRange = @{$res[22]};
+            my @posRangeHQ = @{$res[23]};
+            for my $x (0 .. $#posRange) {
+                my @row = @{$posRange[$x]};
+                for my $y (0 .. $#row) {
+                    my @col = @{$posRange[$x][$y]};
+                    for my $z (0 .. $#col) {
+                        # Missing range counters count as zero.
+                        $positionSpecificBaseCountWithRanges[$x][$y][$z] += ($posRange[$x][$y][$z] || 0);
+                        if($posRangeHQ[$x][$y][$z]) {
+                            $positionSpecificBaseCountHQWithRanges[$x][$y][$z] += $posRangeHQ[$x][$y][$z];
+                        }
+                    }
+                }
+            }
+            for my $k (0, 1) {
+                $totalReadsFinal[$k] += $res[12][$k];
+            }
+            for my $k (0 .. 3) {
+                $readsWithN[$k] += $res[13][$k];
+                $totalNs[$k] += $res[14][$k];
+            }
+            # Slots 16..21 are the distribution tables, merged by addTwoArrays().
+            my @distribs = (\@qualDistribRaw, \@qualDistribFinal, \@gcDistribRaw, \@gcDistribFinal, \@charCountRaw, \@charCountFinal);
+            for my $d (0 .. $#distribs) {
+                addTwoArrays($res[16 + $d], $distribs[$d]);
+            }
+            @res = ();
+        }
+        last if($done);
+    }
+}
+
+sub passPESeq { # Process one batch of paired reads; started via threads->create('passPESeq', ...) by its callers.
+ my($fReadRef, $rReadRef, $id) = @_; # refs to the forward/reverse line arrays, plus the batch id used in the part-file names
+ open(FSEQ, ">$uniqFolder/part_fseq_$id"."_out") or die "Can not open part_fseq_$id"."_out file\n"; # part file for forward reads; written by processPairedEndSeq
+ open(RSEQ, ">$uniqFolder/part_rseq_$id"."_out") or die "Can not open part_rseq_$id"."_out file\n"; # part file for reverse reads
+ open(USEQ, ">$uniqFolder/part_useq_$id"."_out") or die "Can not open part_useq_$id"."_out file\n"; # part file for reads written as unpaired
+ my $seqCounter = scalar @{$fReadRef}; # number of forward lines; fixes the iteration count before the arrays are drained
+ my @fRead = ();
+ my @rRead = ();
+ for(my $i=0; $i<$seqCounter;) { # no increment clause: $i is advanced inside the body
+ push @fRead, shift @{$fReadRef}; # move one line from the input array into the current record
+ push @rRead, shift @{$rReadRef};
+ $i++;
+ if($i%4 == 0) { # four lines collected for each mate: hand the record pair over
+ processPairedEndSeq(\@fRead, \@rRead);
+ @fRead = ();
+ @rRead = ();
+ }
+ } # lines left over when the count is not a multiple of four are never processed
+ my @refArr = (\@ctotalBases, \@ctotalHQBases, \@ctotalBasesAfterHQ, \@ctotalHQBasesAfterHQ, \@ctotalBasesFinal, \@ctotalHQBasesFinal, \@ctotalReadsAfterHQ, \@ctotalValidReadsNoPriAda, \@ctotalValidReadsWithPriAda, \@cminLen, \@cmaxLen, \@cpositionSpecificBaseCount, \@ctotalReadsFinal, \@creadsWithN, \@ctotalNs, \@cpositionSpecificBaseCountHQ, \@cqualDistribRaw, \@cqualDistribFinal, \@cgcDistribRaw, \@cgcDistribFinal, \@ccharCountRaw, \@ccharCountFinal, \@cpositionSpecificBaseCountWithRa [...]
+ close(FSEQ);
+ close(RSEQ);
+ close(USEQ);
+ return \@refArr; # callers read the counters by position (slot 0 = @ctotalBases, slot 9 = @cminLen, ...)
+}
+
+# Evaluate one read pair and update the per-batch counters (@c...).
+#
+# Arguments:
+#   $_[0], $_[1] - references to the four FASTQ lines of the forward and the
+#                  reverse read (sequence at index 1, quality at index 3)
+#
+# A pair is kept when both reads pass isReadOfHQ() and, if $priAdaLib is
+# defined, both pass isWOPriAda(). Unless $isOnlyStat is defined, kept pairs are
+# printed to FSEQ/RSEQ, and a single surviving mate (forward checked first) is
+# printed to USEQ.
+sub processPairedEndSeq {
+    yield;
+    my ($fwdRef, $revRef) = @_;
+    my @fwdRec = @{$fwdRef};
+    my @revRec = @{$revRef};
+    chomp(my $fQual = $fwdRec[3]);
+    chomp(my $rQual = $revRec[3]);
+    chomp(my $fSeq = $fwdRec[1]);
+    chomp(my $rSeq = $revRec[1]);
+
+    # Ambiguous bases in the raw reads.
+    my $fNCount = getNoOfNs($fSeq);
+    my $rNCount = getNoOfNs($rSeq);
+    $ctotalNs[0] += $fNCount;
+    $ctotalNs[1] += $rNCount;
+    $creadsWithN[0]++ if($fNCount);
+    $creadsWithN[1]++ if($rNCount);
+
+    # Quality verdicts; isReadOfHQ() is handed @quals and a slot (0 forward,
+    # 1 reverse), and the per-base qualities are read back from it below.
+    my @quals = ();
+    my $fHQ = isReadOfHQ($fQual, 0, \@quals);
+    my $rHQ = isReadOfHQ($rQual, 1, \@quals);
+    my $fLen = length $fSeq;
+    my $rLen = length $rSeq;
+    my $fAvgQ = sprintf "%.0f", (sum(@{$quals[0]})/$fLen);
+    my $rAvgQ = sprintf "%.0f", (sum(@{$quals[1]})/$rLen);
+    $cqualDistribRaw[0][getIndex($fAvgQ,$qualDistribInterval)]++;
+    $cqualDistribRaw[1][getIndex($rAvgQ,$qualDistribInterval)]++;
+
+    # Base composition in the order A, T, G, C. The substitution both counts
+    # the base and upper-cases it in place, exactly as before.
+    my (@fBase, @rBase);
+    foreach my $nt ("A", "T", "G", "C") {
+        push(@fBase, scalar($fSeq =~ s/$nt/$nt/gi));
+    }
+    my $fGC = ($fBase[2] + $fBase[3])/$fLen*100;
+    push(@fBase, $fNCount);
+    foreach my $col (0 .. 4) {
+        $ccharCountRaw[0][$col] += $fBase[$col];
+    }
+    foreach my $nt ("A", "T", "G", "C") {
+        push(@rBase, scalar($rSeq =~ s/$nt/$nt/gi));
+    }
+    my $rGC = ($rBase[2] + $rBase[3])/$rLen*100;
+    push(@rBase, $rNCount);
+    foreach my $col (0 .. 4) {
+        $ccharCountRaw[1][$col] += $rBase[$col];
+    }
+    $cgcDistribRaw[0][getIndex($fGC,$gcDistribInterval)]++;
+    $cgcDistribRaw[1][getIndex($rGC,$gcDistribInterval)]++;
+
+    if(!($fHQ && $rHQ)) {
+        # At most one mate is of high quality: it can only be kept as unpaired.
+        return if(defined($isOnlyStat));
+        if($fHQ) {
+            my $fClean = 1;
+            $fClean = isWOPriAda($fSeq, 0, 0) if(defined($priAdaLib));
+            print USEQ @fwdRec if($fClean);
+        }
+        elsif($rHQ) {
+            my $rClean = 1;
+            $rClean = isWOPriAda($rSeq, 1, 0) if(defined($priAdaLib));
+            print USEQ @revRec if($rClean);
+        }
+        return;
+    }
+
+    # Both mates are of high quality.
+    $ctotalReadsAfterHQ[0]++;
+    $ctotalReadsAfterHQ[1]++;
+    $ctotalBasesAfterHQ[0] += $fLen;
+    $ctotalBasesAfterHQ[1] += $rLen;
+    $ctotalHQBasesAfterHQ[0] += $fHQ;
+    $ctotalHQBasesAfterHQ[1] += $rHQ;
+
+    # Primer/adaptor screening only happens when a library was supplied.
+    my ($fClean, $rClean) = (1, 1);
+    if(defined($priAdaLib)) {
+        $fClean = isWOPriAda($fSeq, 0, 1);
+        $rClean = isWOPriAda($rSeq, 1, 1);
+    }
+    if(!($fClean && $rClean)) {
+        # Only reachable when $priAdaLib is defined; keep the clean mate, if any.
+        if(!defined($isOnlyStat)) {
+            if($fClean) {
+                print USEQ @fwdRec;
+            }
+            elsif($rClean) {
+                print USEQ @revRec;
+            }
+        }
+        return;
+    }
+
+    # The pair is kept: update the "final" statistics.
+    $ctotalReadsFinal[0]++;
+    $ctotalReadsFinal[1]++;
+    $ctotalBasesFinal[0] += $fLen;
+    $ctotalBasesFinal[1] += $rLen;
+    $ctotalHQBasesFinal[0] += $fHQ;
+    $ctotalHQBasesFinal[1] += $rHQ;
+    $cminLen[2] = $fLen if($cminLen[2] > $fLen);
+    $cmaxLen[2] = $fLen if($cmaxLen[2] < $fLen);
+    $cminLen[3] = $rLen if($cminLen[3] > $rLen);
+    $cmaxLen[3] = $rLen if($cmaxLen[3] < $rLen);
+    $ctotalNs[2] += $fNCount;
+    $ctotalNs[3] += $rNCount;
+    $creadsWithN[2]++ if($fNCount);
+    $creadsWithN[3]++ if($rNCount);
+    foreach my $x (0 .. $#quals) {
+        my @row = @{$quals[$x]};
+        foreach my $y (0 .. $#row) {
+            my $q = $quals[$x][$y];
+            $cpositionSpecificBaseCountHQ[$x][$y] += $q;
+            # Ranges are 0-10, 11-20, ...: a non-zero multiple of ten belongs
+            # to the lower range.
+            my $bin = int($q/10);
+            $bin-- if($q%10 == 0 && $q != 0);
+            $cpositionSpecificBaseCountHQWithRanges[$x][$y][$bin]++;
+        }
+    }
+    return if(defined($isOnlyStat));
+    $cqualDistribFinal[0][getIndex($fAvgQ,$qualDistribInterval)]++;
+    $cqualDistribFinal[1][getIndex($rAvgQ,$qualDistribInterval)]++;
+    $cgcDistribFinal[0][getIndex($fGC,$gcDistribInterval)]++;
+    $cgcDistribFinal[1][getIndex($rGC,$gcDistribInterval)]++;
+    foreach my $col (0 .. 4) {
+        $ccharCountFinal[0][$col] += $fBase[$col];
+    }
+    foreach my $col (0 .. 4) {
+        $ccharCountFinal[1][$col] += $rBase[$col];
+    }
+    print FSEQ @fwdRec;
+    print RSEQ @revRec;
+}
+
+# Run the quality-control pass over one single-end FASTQ file and, unless only
+# statistics were requested ($isOnlyStat defined), assemble the filtered output.
+#
+# Arguments (positional):
+#   $file    - input FASTQ file
+#   $outFile - output file for the filtered reads
+#
+# Uses the file-level settings $nLines, $noOfProcesses, $uniqFolder,
+# $isOnlyStat and $indOfAnalysis, and adds every worker's counters to the
+# file-level statistics arrays (@totalBases, @minLen, @qualDistribRaw, ...).
+# Dies when the temporary folder or one of its part files can not be read.
+sub processSingleEndFiles {
+    my ($file, $outFile) = @_;
+    $totalReads[0] = sprintf("%0.0f", $nLines/4);
+
+    if($file =~ /\.gz$/i) {
+        # Compressed input: a reader thread cuts the file into chunk files
+        # while threading4SE() processes the chunks as they are queued.
+        my $readerThr = threads->create('readDivideGzip4SE', $file);
+        threading4SE();
+        $readerThr->join;
+        print "$indOfAnalysis: Number of reads processed: " . "$totalReads[0]/$totalReads[0] (100\%)...\n";
+    }
+    else {
+        print "$indOfAnalysis: Number of reads processed: " . "0/$totalReads[0] (0\%)...\n";
+        my $isEOF = ($nLines/4 > 0) ? 0 : 1;
+        my $lineCount = 0;
+        my @thArr = ();
+        my $ttlJobCount = 0;
+        my $noOfSeqPerThread = 100000;
+        my $fH;
+        openFileGetHandle($file, "r", \$fH);
+        while(!$isEOF) {
+            # Start up to $noOfProcesses workers, each on its own batch of reads.
+            my $jobCounter = 0;
+            for(my $j=0; $j<$noOfProcesses && (!$isEOF); $j++) {
+                my @fRead = ();
+                $jobCounter++;
+                my $i = 0;
+                while($i < 4*$noOfSeqPerThread && (!$isEOF)) {
+                    $fRead[$i] = <$fH>;
+                    $lineCount += 1;
+                    $isEOF = 1 if($lineCount >= $nLines);
+                    # A blank line keeps the slot, so the next line overwrites it.
+                    $i++ unless($fRead[$i] =~ /^\n$/);
+                }
+                $ttlJobCount++;
+                my $id = sprintf "%05s", $ttlJobCount;
+                # Scalar context on purpose: the worker returns one array reference.
+                $thArr[$j] = threads->create('passSESeq', \@fRead, $id);
+                if($lineCount % (100000*4) == 0) {
+                    my $tmpP = sprintf "%0.0f", ($lineCount/4/$totalReads[0]*100);
+                    print "$indOfAnalysis: Number of reads processed: " . $lineCount/4 . "/$totalReads[0] ($tmpP\%)...\n";
+                }
+            }
+            # Collect the workers in start order and fold their counters into the totals.
+            for(my $j=0; $j<$jobCounter; $j++) {
+                my $resRef = $thArr[$j]->join;
+                my @res = @{$resRef};
+                # Result slots 0..8 hold the totals of the single input file (index 0).
+                my @fileTotals = (\@totalBases, \@totalHQBases, \@totalBasesAfterHQ, \@totalHQBasesAfterHQ, \@totalBasesFinal, \@totalHQBasesFinal, \@totalReadsAfterHQ, \@totalValidReadsNoPriAda, \@totalValidReadsWithPriAda);
+                for my $slot (0 .. $#fileTotals) {
+                    $fileTotals[$slot][0] += $res[$slot][0];
+                }
+                for my $k (0, 1) {
+                    $minLen[$k] = min($res[9][$k], $minLen[$k]);
+                    $maxLen[$k] = max($res[10][$k], $maxLen[$k]);
+                }
+                # Per-position quality sums (slot 11) and their HQ counterpart (slot 15).
+                my @posSum = @{$res[11]};
+                my @posSumHQ = @{$res[15]};
+                for my $x (0 .. $#posSum) {
+                    my @row = @{$posSum[$x]};
+                    for my $y (0 .. $#row) {
+                        $positionSpecificBaseCount[$x][$y] += $posSum[$x][$y];
+                        if($posSumHQ[$x][$y]) {
+                            $positionSpecificBaseCountHQ[$x][$y] += $posSumHQ[$x][$y];
+                        }
+                    }
+                }
+                # Per-position counts split by quality range (slots 22 and 23).
+                my @posRange = @{$res[22]};
+                my @posRangeHQ = @{$res[23]};
+                for my $x (0 .. $#posRange) {
+                    my @row = @{$posRange[$x]};
+                    for my $y (0 .. $#row) {
+                        my @col = @{$posRange[$x][$y]};
+                        for my $z (0 .. $#col) {
+                            # Missing range counters count as zero.
+                            $positionSpecificBaseCountWithRanges[$x][$y][$z] += ($posRange[$x][$y][$z] || 0);
+                            if($posRangeHQ[$x][$y][$z]) {
+                                $positionSpecificBaseCountHQWithRanges[$x][$y][$z] += $posRangeHQ[$x][$y][$z];
+                            }
+                        }
+                    }
+                }
+                $totalReadsFinal[0] += $res[12][0];
+                for my $k (0, 1) {
+                    $readsWithN[$k] += $res[13][$k];
+                    $totalNs[$k] += $res[14][$k];
+                }
+                # Slots 16..21 are the distribution tables, merged by addTwoArrays().
+                my @distribs = (\@qualDistribRaw, \@qualDistribFinal, \@gcDistribRaw, \@gcDistribFinal, \@charCountRaw, \@charCountFinal);
+                for my $d (0 .. $#distribs) {
+                    addTwoArrays($res[16 + $d], $distribs[$d]);
+                }
+                @res = ();
+            }
+        }
+        close($fH);
+        print "$indOfAnalysis: Number of reads processed: " . "$totalReads[0]/$totalReads[0] (100\%)...\n";
+    }
+    if(!defined($isOnlyStat)) {
+        my $fHndl;
+        openFileGetHandle($outFile, "w", \$fHndl);
+        print "$indOfAnalysis: Printing filtered data...\n";
+        # Fail loudly if the temporary folder is gone instead of silently
+        # producing an empty output file.
+        opendir(my $dirH, $uniqFolder) or die "Can not open folder of temporary files: $uniqFolder\n";
+        my @partFiles = readdir($dirH);
+        closedir($dirH);
+        @partFiles = sort @partFiles;
+        foreach my $pFile (@partFiles) {
+            next if($pFile =~ /\./);
+            # Only the workers' "fseq ... out" part files carry filtered reads.
+            next if($pFile !~ /fseq[^\n]+out/);
+            my $npFile = "$uniqFolder/$pFile";
+            open(my $partH, "<", $npFile) or die "Can not open part file\n";
+            while(my $line = <$partH>) {
+                print {$fHndl} $line;
+            }
+            close($partH);
+        }
+        close($fHndl);
+    }
+}
+
+# Reader-thread body for gzip-compressed single-end input.
+#
+# Reads $file, writes it out as plain chunk files "$uniqFolder/part_fseq_<id>"
+# of at most 4*100000 lines each, and queues every chunk's path on
+# $ProcessingQueue. An undef is queued at the end to tell the consumer that no
+# more chunks will follow.
+sub readDivideGzip4SE {
+    my ($file) = @_;
+    my $isEOF = ($nLines/4 > 0) ? 0 : 1;
+    my $lineCount = 0;
+    my $chunkCount = 0;
+    my $noOfSeqPerThread = 100000;
+    my $fH;
+    openFileGetHandle($file, "r", \$fH);
+    while(!$isEOF) {
+        for(my $j=0; $j<$noOfProcesses && (!$isEOF); $j++) {
+            $chunkCount++;
+            my $id = sprintf "%05s", $chunkCount;
+            my $fseqH;
+            openFileGetHandle("$uniqFolder/part_fseq_$id", "w", \$fseqH);
+            for(my $i=0; $i<4*$noOfSeqPerThread && (!$isEOF); $i++) {
+                my $fRead = <$fH>;
+                $lineCount += 1;
+                $isEOF = 1 if($lineCount >= $nLines);
+                if(!defined($fRead)) {
+                    # The input ended before $nLines lines were seen: stop reading.
+                    $isEOF = 1;
+                    last;
+                }
+                if($fRead =~ /^\n$/) {
+                    # Drop blank lines instead of copying them into the chunk:
+                    # the consumer cuts each chunk into 4-line records, so a
+                    # copied blank line would shift every record after it. This
+                    # matches the uncompressed path, which overwrites such lines.
+                    $i--;
+                    next;
+                }
+                print $fseqH $fRead;
+            }
+            close($fseqH);
+            $ProcessingQueue->enqueue("$uniqFolder/part_fseq_$id");
+        }
+    }
+    close($fH);
+    $ProcessingQueue->enqueue(undef);
+}
+
+# Take the next chunk off $ProcessingQueue and start a worker thread on it.
+#
+# Arguments:
+#   $_[0] - reference to the caller's line counter; the number of lines in the
+#           chunk is added to it
+#
+# Returns the thread object running passSESeq, or undef once the queue
+# delivers its undef end marker. Dies when the chunk file can not be opened.
+sub fireMySEJob {
+    my $lineCount = $_[0];
+    my $fileName = $ProcessingQueue->dequeue();
+    return undef if(!defined($fileName));
+    # Lexical handle and three-argument open: nothing is shared through a
+    # package-wide bareword and the path is never parsed for a mode.
+    open(my $chunkH, "<", $fileName) or die "Can't open chunk file containing input reads for processing: $fileName\n";
+    my @reads = <$chunkH>;
+    close($chunkH);
+    $$lineCount += (scalar @reads);
+    # The chunk id is the run of digits that ends the chunk's name.
+    my ($id) = $fileName=~/(\d+)$/;
+    # Scalar context on purpose: the worker returns one array reference.
+    my $thRef = threads->create('passSESeq', \@reads, $id);
+    return $thRef;
+}
+
+sub threading4SE { # Runs the queued chunks through worker threads, up to $noOfProcesses at a time, and merges every worker's statistics into the global totals.
+    # Takes no arguments. Each worker (started by fireMySEJob) returns a reference to
+    # an array of references to its per-thread statistics; the positions used below
+    # follow the order in which passSESeq builds that array.
+    my @thArr;
+    my $done = 0;
+    my $lineCount = 0;
+    while(1) {
+        if($lineCount % (100000*4) == 0) {
+            # With zero total reads the percentage would be a division by zero and
+            # abort the run; report 0 instead.
+            my $tmpP = $totalReads[0] ? sprintf("%0.0f", ($lineCount/4/$totalReads[0]*100)) : 0;
+            print "$indOfAnalysis: Number of reads processed: " . $lineCount/4 . "/$totalReads[0] ($tmpP\%)...\n";
+        }
+        # Start one batch of workers; $started is the number actually created.
+        my $started = 0;
+        while($started < $noOfProcesses) {
+            my $thRef = fireMySEJob(\$lineCount);
+            if(!defined($thRef)) {
+                $done = 1; # End-of-queue marker seen: finish this batch, then stop.
+                last;
+            }
+            $thArr[$started] = $thRef;
+            $started++;
+        }
+        for(my $j=0; $j<$started; $j++) {
+            my $res = $thArr[$j]->join;
+            $totalBases[0] += $res->[0][0];
+            $totalHQBases[0] += $res->[1][0];
+            $totalBasesAfterHQ[0] += $res->[2][0];
+            $totalHQBasesAfterHQ[0] += $res->[3][0];
+            $totalBasesFinal[0] += $res->[4][0];
+            $totalHQBasesFinal[0] += $res->[5][0];
+            $totalReadsAfterHQ[0] += $res->[6][0];
+            $totalValidReadsNoPriAda[0] += $res->[7][0];
+            $totalValidReadsWithPriAda[0] += $res->[8][0];
+            $minLen[0] = min($res->[9][0], $minLen[0]);
+            $minLen[1] = min($res->[9][1], $minLen[1]);
+            $maxLen[0] = max($res->[10][0], $maxLen[0]);
+            $maxLen[1] = max($res->[10][1], $maxLen[1]);
+
+            # Per-position quality sums: all reads (11) and reads kept after filtering (15).
+            my @posSum = @{$res->[11]};
+            my $posSumHQ = $res->[15];
+            for(my $x=0; $x<@posSum; $x++) {
+                my $rowLen = scalar(@{$posSum[$x]});
+                for(my $y=0; $y<$rowLen; $y++) {
+                    $positionSpecificBaseCount[$x][$y] += $posSum[$x][$y];
+                    if($posSumHQ->[$x][$y]) {
+                        $positionSpecificBaseCountHQ[$x][$y] += $posSumHQ->[$x][$y];
+                    }
+                }
+            }
+
+            # Per-position counts per quality range: all reads (22) and kept reads (23).
+            my @posRange = @{$res->[22]};
+            my $posRangeHQ = $res->[23];
+            for(my $x=0; $x<@posRange; $x++) {
+                my $rowLen = scalar(@{$posRange[$x]});
+                for(my $y=0; $y<$rowLen; $y++) {
+                    my $colLen = scalar(@{$posRange[$x][$y]});
+                    for(my $z=0; $z<$colLen; $z++) {
+                        # Ranges never seen by this worker are undef; count them as 0.
+                        $positionSpecificBaseCountWithRanges[$x][$y][$z] += ($posRange[$x][$y][$z] || 0);
+                        if($posRangeHQ->[$x][$y][$z]) {
+                            $positionSpecificBaseCountHQWithRanges[$x][$y][$z] += $posRangeHQ->[$x][$y][$z];
+                        }
+                    }
+                }
+            }
+
+            $totalReadsFinal[0] += $res->[12][0];
+            $readsWithN[0] += $res->[13][0];
+            $readsWithN[1] += $res->[13][1];
+            $totalNs[0] += $res->[14][0];
+            $totalNs[1] += $res->[14][1];
+            addTwoArrays($res->[16], \@qualDistribRaw);
+            addTwoArrays($res->[17], \@qualDistribFinal);
+            addTwoArrays($res->[18], \@gcDistribRaw);
+            addTwoArrays($res->[19], \@gcDistribFinal);
+            addTwoArrays($res->[20], \@charCountRaw);
+            addTwoArrays($res->[21], \@charCountFinal);
+        }
+        last if($done);
+    }
+}
+
+sub passSESeq { # Thread entry point (started by fireMySEJob): feeds the chunk's lines to processSingleEndSeq four at a time and returns a reference to an array of references to the per-thread c* statistics arrays.
+ my($fReadRef, $id) = @_; # $fReadRef: reference to the chunk's lines; $id: chunk number used in the output file name.
+ open(FSEQ, ">$uniqFolder/part_fseq_$id"."_out") or die "Can not open part_fseq_$id"."_out file\n"; # Output of this chunk; processSingleEndSeq prints the accepted reads to FSEQ.
+ my @fTmp = @{$fReadRef};
+ my $seqCounter = scalar @fTmp;
+ my @fRead = ();
+ for(my $i=0; $i<$seqCounter;) {
+ push @fRead, shift @fTmp;
+ $i++;
+ if($i%4 == 0) { # Every fourth line completes one record; lines left over at the end are not processed.
+ processSingleEndSeq(\@fRead);
+ @fRead = ();
+ }
+ }
+ my @refArr = (\@ctotalBases, \@ctotalHQBases, \@ctotalBasesAfterHQ, \@ctotalHQBasesAfterHQ, \@ctotalBasesFinal, \@ctotalHQBasesFinal, \@ctotalReadsAfterHQ, \@ctotalValidReadsNoPriAda, \@ctotalValidReadsWithPriAda, \@cminLen, \@cmaxLen, \@cpositionSpecificBaseCount, \@ctotalReadsFinal, \@creadsWithN, \@ctotalNs, \@cpositionSpecificBaseCountHQ, \@cqualDistribRaw, \@cqualDistribFinal, \@cgcDistribRaw, \@cgcDistribFinal, \@ccharCountRaw, \@ccharCountFinal, \@cpositionSpecificBaseCountWithRa [...]
+ close(FSEQ);
+ return \@refArr; # threading4SE reads the statistics by position in this array.
+}
+
+sub processSingleEndSeq { # Updates the per-thread (c*) statistics for one single-end read and, unless only statistics are wanted, prints a read that passes all filters to FSEQ.
+    # $_[0]: reference to the four lines of one FASTQ record; element 1 is used as the
+    #        sequence line and element 3 as the quality line.
+    # No meaningful return value.
+    yield;
+    my($fReadRef) = @_;
+    my @fRead = @{$fReadRef};
+    chomp(my $fQualLine = $fRead[3]);
+    chomp(my $fSeqLine = $fRead[1]);
+    my $fNs = getNoOfNs($fSeqLine);
+    $ctotalNs[0] += $fNs;
+    $creadsWithN[0]++ if($fNs);
+
+    my @qualArr = ();
+    my $isFReadOfHQ = isReadOfHQ($fQualLine, 0, \@qualArr);
+    my $fSeqLineLen = length $fSeqLine;
+    # An empty sequence line used to abort the whole run with a division by zero;
+    # such a read is now counted with an average quality and GC content of 0.
+    my $fAvgQual = $fSeqLineLen ? sprintf("%.0f", (sum(@{$qualArr[0]})/$fSeqLineLen)) : 0;
+    $cqualDistribRaw[0][getIndex($fAvgQual,$qualDistribInterval)]++;
+    # s///gi returns the number of replacements, i.e. the base count; as a side effect
+    # it upper-cases these bases in $fSeqLine before the primer/adaptor search below.
+    my $fAs = $fSeqLine =~ s/A/A/gi;
+    my $fTs = $fSeqLine =~ s/T/T/gi;
+    my $fGs = $fSeqLine =~ s/G/G/gi;
+    my $fCs = $fSeqLine =~ s/C/C/gi;
+    my $fgcPercent = $fSeqLineLen ? ($fGs + $fCs)/$fSeqLineLen*100 : 0;
+    $ccharCountRaw[0][0] += $fAs;
+    $ccharCountRaw[0][1] += $fTs;
+    $ccharCountRaw[0][2] += $fGs;
+    $ccharCountRaw[0][3] += $fCs;
+    $ccharCountRaw[0][4] += $fNs;
+    $cgcDistribRaw[0][getIndex($fgcPercent,$gcDistribInterval)]++;
+
+    # Low-quality reads only contribute to the raw statistics above.
+    return if(!$isFReadOfHQ);
+    $ctotalReadsAfterHQ[0]++;
+    $ctotalBasesAfterHQ[0] += $fSeqLineLen;
+    $ctotalHQBasesAfterHQ[0] += $isFReadOfHQ;
+
+    # The primer/adaptor check is only made when a library was selected; a
+    # contaminated read is dropped here (isWOPriAda updates its own counters).
+    return if(defined($priAdaLib) && !isWOPriAda($fSeqLine, 0, 1));
+
+    # The read passed every filter: update the "final" statistics.
+    $ctotalReadsFinal[0]++;
+    $ctotalBasesFinal[0] += $fSeqLineLen;
+    $ctotalHQBasesFinal[0] += $isFReadOfHQ;
+    $cminLen[1] = $fSeqLineLen if($cminLen[1] > $fSeqLineLen);
+    $cmaxLen[1] = $fSeqLineLen if($cmaxLen[1] < $fSeqLineLen);
+    $ctotalNs[1] += $fNs;
+    $creadsWithN[1]++ if($fNs);
+    for(my $x=0; $x<@qualArr; $x++) {
+        my @row = @{$qualArr[$x]};
+        for(my $y=0; $y<@row; $y++) {
+            my $score = $qualArr[$x][$y];
+            $cpositionSpecificBaseCountHQ[$x][$y] += $score;
+            # Range index: 0-10 -> 0, 11-20 -> 1, 21-30 -> 2, 31-40 -> 3.
+            my $ind = int($score/10);
+            $ind-- if($score%10 == 0 && $score != 0);
+            # A score below -9 would give a negative (fatal or wrapping) array index.
+            $ind = 0 if($ind < 0);
+            $cpositionSpecificBaseCountHQWithRanges[$x][$y][$ind]++;
+        }
+    }
+    if(!defined($isOnlyStat)) {
+        $cqualDistribFinal[0][getIndex($fAvgQual,$qualDistribInterval)]++;
+        $cgcDistribFinal[0][getIndex($fgcPercent,$gcDistribInterval)]++;
+        $ccharCountFinal[0][0] += $fAs;
+        $ccharCountFinal[0][1] += $fTs;
+        $ccharCountFinal[0][2] += $fGs;
+        $ccharCountFinal[0][3] += $fCs;
+        $ccharCountFinal[0][4] += $fNs;
+        print FSEQ @fRead;
+    }
+    return;
+}
+
+sub addTwoArrays { # Adds the 2-D array referenced by $_[0] element by element into the 2-D array referenced by $_[1].
+    # Only true (non-zero) source values are added. Every destination cell that has a
+    # counterpart in the source is left defined (0 when nothing was accumulated yet).
+    my ($srcRef, $dstRef) = @_;
+    my $i = 0;
+    foreach my $srcRow (@{$srcRef}) {
+        my $c = 0;
+        foreach my $val (@{$srcRow}) {
+            # Scalar element access instead of a one-element array slice.
+            $dstRef->[$i][$c] += $val if($val);
+            $dstRef->[$i][$c] = 0 if(!defined($dstRef->[$i][$c]));
+            $c++;
+        }
+        $i++;
+    }
+}
+
+sub checkFastQFormat { # Takes FASTQ file as an input and if the format is incorrect it will print error and exit, otherwise it will return the number of lines in the file.
+ my $file = $_[0];
+ my $isVariantIdntfcntOn = $_[1]; # When true, the detected variant is stored in $seqFormat; otherwise it is only compared with $seqFormat.
+ my $lines = 0; # Total number of lines read, blank lines included.
+ my $fHt;
+ openFileGetHandle("$file", "r", \$fHt);
+ *FF = $fHt; # Aliases the handle returned by openFileGetHandle to the bareword FF used below.
+ my $counter = 0; # Position (1 to 4) of the current line within its record.
+ my $minVal = 1000;
+ my $maxVal = 0;
+ while(my $line = <FF>) {
+ $lines++;
+ $counter++;
+ next if($line =~ /^\n$/); # NOTE(review): the blank line is skipped after $counter was advanced, so it still occupies one of the four record positions.
+ if($counter == 1 && $line !~ /^\@/) { # First line of a record must start with '@'.
+ prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
+ }
+ if($counter == 3 && $line !~ /^\+/) { # Third line of a record must start with '+'.
+ prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
+ }
+ if($counter == 4 && $lines < 1000000) { # The quality range is sampled from quality lines within the first 1000000 lines only.
+ chomp $line;
+ my @ASCII = unpack("C*", $line);
+ $minVal = min(min(@ASCII), $minVal);
+ $maxVal = max(max(@ASCII), $maxVal);
+ }
+ if($counter == 4) {
+ $counter = 0;
+ }
+ }
+ close(FF);
+ my $tseqFormat = 0; # Stays 0 when the observed range fits none of the variants below.
+ if($minVal >= 33 && $minVal <= 73 && $maxVal >= 33 && $maxVal <= 73) {
+ $tseqFormat = 1; # Sanger
+ }
+ elsif($minVal >= 66 && $minVal <= 105 && $maxVal >= 66 && $maxVal <= 105) {
+ $tseqFormat = 4; # Illumina 1.5+
+ }
+ elsif($minVal >= 64 && $minVal <= 105 && $maxVal >= 64 && $maxVal <= 105) {
+ $tseqFormat = 3; # Illumina 1.3+
+ }
+ elsif($minVal >= 59 && $minVal <= 105 && $maxVal >= 59 && $maxVal <= 105) {
+ $tseqFormat = 2; # Solexa
+ }
+ elsif($minVal >= 33 && $minVal <= 74 && $maxVal >= 33 && $maxVal <= 74) {
+ $tseqFormat = 5; # Illumina 1.8+
+ }
+ if($isVariantIdntfcntOn) {
+ $seqFormat = $tseqFormat;
+ }
+ else {
+ if($tseqFormat != $seqFormat) { # Only a warning: the variant given by the user is kept.
+ print STDERR "Warning: It seems the specified variant of FASTQ doesn't match the quality values in input FASTQ files.\n";
+ }
+ }
+ return $lines;
+}
+
+sub getFilePath { # Returns the directory part of a file path including the trailing "/", or "./" when the path contains no "/".
+    my $name = $_[0];
+    return "./" if($name !~ /\//);
+    # Capture in list context so a failed match (e.g. "/file", where nothing
+    # precedes the only slash) can never hand back an unset or stale $1.
+    my ($dir) = $name =~ /(.+)\//;
+    $dir = "" if(!defined($dir));
+    return $dir . "/";
+}
+
+sub getFileName { # This sub takes a path of a file and returns just its name after separating the path from it.
+    # Returns "" when the path has no name part (for example when it ends in "/").
+    my $path = $_[0];
+    # List-context capture: a failed match yields undef here instead of whatever
+    # $1 still holds from an earlier match in the caller.
+    my ($name) = $path =~ /([^\/]+)$/;
+    return defined($name) ? $name : "";
+}
+
+sub prtErrorExit { # Prints the error message $_[0] to STDERR and terminates the program.
+    my $errmsg = $_[0];
+    print STDERR "Error:\t", $errmsg, "\n";
+    # Exit with a non-zero status so that calling scripts can detect the failure
+    # (a bare "exit" reports success).
+    exit 1;
+}
+
+sub isReadOfHQ { # Criteria for HQ is greater than or equal to 70% of bases have phred score >= 20
+    # The two thresholds are read from $cutOffReadLen4HQ (percent of read length) and
+    # $cutOffPhScore (minimum score).
+    # $_[0]: quality string of the read.
+    # $_[1]: 0 will be for forward reads and 1 for reverse reads.
+    # $_[2]: reference to a 2-D array; the decoded scores are stored in row $_[1].
+    # Also updates the per-thread length, base-count and per-position statistics.
+    # Returns the number of bases that reach the score threshold when the read
+    # qualifies (a true value unless that number is 0), otherwise 0.
+    my ($read, $v0Or1, $arrRef) = @_;
+    my $readLen = length $read;
+    $cminLen[$v0Or1] = $readLen if($cminLen[$v0Or1] > $readLen);
+    $cmaxLen[$v0Or1] = $readLen if($cmaxLen[$v0Or1] < $readLen);
+    my $cutOffLen = sprintf("%0.0f", $readLen * $cutOffReadLen4HQ / 100); # 70% length of read length is calculated.
+    my $validBaseCount = 0;
+    my @ASCII = unpack("C*", $read);
+    my $c = 0;
+    foreach my $val (@ASCII) {
+        $val -= $subVal; # Character code minus the offset in $subVal gives the score.
+        $cpositionSpecificBaseCount[$v0Or1][$c] += $val;
+        # Range index: 0-10 -> 0, 11-20 -> 1, 21-30 -> 2, 31-40 -> 3.
+        my $ind = int($val/10);
+        $ind-- if($val%10 == 0 && $val != 0);
+        # Scores below -9 (e.g. a wrongly specified FASTQ variant) would give a
+        # negative index, which is fatal on an empty array and otherwise counts
+        # from the end of it; put them into the lowest range.
+        $ind = 0 if($ind < 0);
+        $cpositionSpecificBaseCountWithRanges[$v0Or1][$c][$ind]++;
+        $$arrRef[$v0Or1][$c] = $val;
+        $validBaseCount++ if($val >= $cutOffPhScore);
+        $c++;
+    }
+    $ctotalBases[$v0Or1] += $readLen;
+    $ctotalHQBases[$v0Or1] += $validBaseCount;
+    return $validBaseCount if($validBaseCount >= $cutOffLen); # Return true.
+    return 0; # Return false.
+}
+
+
+sub getIndex { # Maps the value $_[0] to a zero-based bin index for bins of width $_[1].
+    # With a width of 1 the value rounded to the nearest integer is returned.
+    # Otherwise the ratio value/width is rounded to two decimals, rounded up to the
+    # next whole number and reduced by one; the result is never below 0.
+    my ($value, $binWidth) = @_;
+    my $ratio = $value/$binWidth;
+    if($binWidth == 1) {
+        return sprintf("%0.0f", $value);
+    }
+    my $bin = int(sprintf("%0.2f", $ratio) + 0.99) - 1;
+    return ($bin < 0) ? 0 : $bin;
+}
+
+
+sub isWOPriAda { # Returns 1 when the read contains none of the selected primer/adaptor sequences, 0 when one of them is found.
+    # $_[0]: read sequence.
+    # $_[1]: index (0 or 1) of the per-thread counters to update.
+    # $_[2]: when true, the "with primer/adaptor" or "no primer/adaptor" counter is
+    #        incremented according to the result.
+    # The library is chosen by $priAdaLib: "u" selects @usrDefinedPriAda, any other
+    # value is used as an index into the built-in libraries listed below.
+    my ($seq, $v0Or1, $isCountStatOn) = @_;
+    chomp($seq);
+
+    my @arrGenomic = (
+        "GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG",
+        "ACACTCTTTCCCTACACGACGCTCTTCCGATCT",
+        "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT",
+        "CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT",
+        "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"
+    );
+
+    my @arrPE = (
+        "GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG",
+        "ACACTCTTTCCCTACACGACGCTCTTCCGATCT",
+        "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT",
+        "CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT",
+        "ACACTCTTTCCCTACACGACGCTCTTCCGATCT",
+        "CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT"
+    );
+
+    my @arrDpnII = (
+        "GATCGTCGGACTGTAGAACTCTGAAC",
+        "ACAGGTTCAGAGTTCTACAGTCCGAC",
+        "CAAGCAGAAGACGGCATACGANN",
+        "TCGTATGCCGTCTTCTGCTTG",
+        "CAAGCAGAAGACGGCATACGA",
+        "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA",
+        "CGACAGGTTCAGAGTTCTACAGTCCGACGATC"
+    );
+
+    my @arrNlaIII = (
+        "TCGGACTGTAGAACTCTGAAC",
+        "ACAGGTTCAGAGTTCTACAGTCCGACATG",
+        "CAAGCAGAAGACGGCATACGANN",
+        "TCGTATGCCGTCTTCTGCTTG",
+        "CAAGCAGAAGACGGCATACGA",
+        "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA",
+        "CCGACAGGTTCAGAGTTCTACAGTCCGACATG"
+    );
+
+    my @arrsmRNA = (
+        "GTTCAGAGTTCTACAGTCCGACGATC",
+        "TCGTATGCCGTCTTCTGCTTGT",
+        "CAAGCAGAAGACGGCATACGA",
+        "CAAGCAGAAGACGGCATACGA",
+        "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA",
+        "CGACAGGTTCAGAGTTCTACAGTCCGACGATC"
+    );
+
+    my @arrmulPlex = (
+        "GATCGGAAGAGCACACGTCT",
+        "ACACTCTTTCCCTACACGACGCTCTTCCGATCT",
+        "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT",
+        "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT",
+        "ACACTCTTTCCCTACACGACGCTCTTCCGATCT",
+        "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC",
+        "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT",
+        "CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC",
+        "CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC"
+    );
+
+    my @priAdas = (\@arrGenomic, \@arrPE, \@arrDpnII, \@arrNlaIII, \@arrsmRNA, \@arrmulPlex);
+    # Start/end substrings already searched in this read without a hit; findSeq
+    # uses it to skip substrings shared by several primer/adaptor sequences.
+    my %checkedPriStr = ();
+
+    # Use the selected library through a reference instead of copying it per read.
+    my $priAdaSeqs = ($priAdaLib eq "u") ? \@usrDefinedPriAda : $priAdas[$priAdaLib];
+
+    my $isMatched = 0;
+    foreach my $priAda (@{$priAdaSeqs}) {
+        if(findSeq($priAda, $seq, \%checkedPriStr)) {
+            $isMatched = 1;
+            last;
+        }
+    }
+
+    if($isMatched) {
+        $ctotalValidReadsWithPriAda[$v0Or1]++ if($isCountStatOn);
+        return 0;
+    }
+    $ctotalValidReadsNoPriAda[$v0Or1]++ if($isCountStatOn);
+    return 1;
+}
+
+sub findSeq { # Searches the read $_[1] for the first and the last $substrlen bases of the primer/adaptor $_[0]; returns 1 on a hit, else 0.
+    # The search is approximate (String::Approx pattern modifiers 'I0 D0 S1').
+    # $_[2]: reference to a hash of substrings that were already searched without a
+    #        hit; those are skipped, and every new miss is recorded in it.
+    my ($pri, $seq, $hashRef) = @_;
+    my $tailStart = (length $pri) - $substrlen;
+    $tailStart = 0 if($tailStart < 0);
+    my @probes = (
+        substr($pri, 0, $substrlen),
+        substr($pri, $tailStart, $substrlen),
+    );
+    foreach my $probe (@probes) {
+        next if(defined($$hashRef{$probe}));
+        my @catches = String::Approx::amatch($probe, ['I0 D0 S1'], $seq);
+        return 1 if(@catches != 0);
+        $$hashRef{$probe} = 1;
+    }
+    return 0;
+}
+
+
+sub prtHelp { # Prints the option help of the program ($0) to standard output.
+ print "\n$0 options:\n\n";
+ print "### Input reads (FASTQ) options (Atleast one option is required)\n";
+ print " -pe <Forward reads file> <Reverse reads file> <Primer/Adaptor library> <FASTQ variant>\n";
+ print " Paired-end read files (FASTQ) with primer/adaptor library and FASTQ variant\n";
+ print " User may choose from the provided primer/adaptor library or can give a file containing primer/adaptor sequences, one per line\n";
+ print " Multiple libraries can be given using multiple '-pe' options\n";
+ print " For eg.: -pe r1.fq r2.fq 3 1 -pe t1.fq t2.fq 2 A\n\n";
+ print " -se <Reads file> <Primer/Adaptor library> <FASTQ variant>\n";
+ print " Single-end read file (FASTQ) with primer/adaptor library and FASTQ variant\n";
+ print " Multiple libraries can be given using multiple '-se' options\n";
+ print " For eg.: -se r1.fq 3 2 -se t2.fq 2 2\n\n";
+ print " Primer/Adaptor libraries:\n";
+ my $c = 1; # The libraries are numbered from 1 in the help text.
+ foreach my $lib (@priAdaLibNames) { # One line per library name in @priAdaLibNames.
+ print " $c = $lib\n";
+ $c++;
+ }
+ print " N = Do not filter for Primer/Adaptor\n";
+ print " <File> = File for user defined primer/adaptor sequences, one per line\n";
+ print "\n";
+ print " FASTQ variants:\n";
+ print " 1 = Sanger (Phred+33, 33 to 73)\n";
+ print " 2 = Solexa (Phred+64, 59 to 104)\n";
+ print " 3 = Illumina (1.3+) (Phred+64, 64 to 104)\n";
+ print " 4 = Illumina (1.5+) (Phred+64, 66 to 104)\n";
+ print " 5 = Illumina (1.8+) (Phred+33, 33 to 74)\n";
+ print " A = Automatic detection of FASTQ variant\n";
+ print "\n";
+ print "### Other options [Optional]\n";
+ print " -h | -help\n";
+ print " Prints this help\n";
+ print "--------------------------------- QC Options ---------------------------------\n";
+ print " -l | -cutOffReadLen4HQ <Real number, 0 to 100>\n";
+ print " The cut-off value for percentage of read length that should be of given quality\n";
+ print " default: 70\n";
+ print " -s | -cutOffQualScore <Integer, 0 to 40>\n";
+ print " The cut-off value for PHRED quality score for high-quality filtering\n";
+ print " default: 20\n";
+ print "----------------------------- Processing Options -----------------------------\n";
+ print " -c | -cpus <Integer>\n";
+ print " Number of CPUs to be used\n";
+ print " default: 1\n";
+ print " -onlyStat\n";
+ print " Outputs only statistics without filtered data output\n";
+ print "------------------------------- Output Options -------------------------------\n";
+ print " -t | -statOutFmt <Integer>\n";
+ print " Output format for statistics\n";
+ print " Formats:\n";
+ print " 1 = formatted text\n";
+ print " 2 = tab delimited\n";
+ print " default: 1\n";
+ print " -o | -outputFolder <Output folder name/path>\n";
+ print " Output will be stored in the given folder\n";
+ print " default: By default, output folder (IlluQC_Filtered_files) will be generated where the input files are\n";
+ print " -z | -outputDataCompression <Character>\n";
+ print " Output format for HQ filtered data\n";
+ print " Formats:\n";
+ print " t = text FASTQ files\n";
+ print " g = gzip compressed files\n";
+ print " default: t\n";
+ print "\n";
+}
+
+sub prtUsage { # Prints the one-line usage of the program followed by the full option help.
+    my $usageLine = "\nUsage: perl $0 <options>\n";
+    print $usageLine;
+    prtHelp();
+}
+
+sub prtError { # Prints the message $_[0] in a box on STDERR, then the usage/help text, and terminates the program.
+    my $msg = $_[0];
+    print STDERR "+======================================================================+\n";
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR "+======================================================================+\n";
+    prtUsage();
+    # Exit with a non-zero status so that calling scripts can detect the failure
+    # (a bare "exit" reports success).
+    exit 1;
+}
+
+
+sub getNoOfNs { # This takes sequence and returns the number of N/. (unknown base call).
+    my $seq = $_[0];
+    # tr/// with an empty replacement list only counts the listed characters
+    # (upper-case "N" and "."); the sequence itself is left unchanged.
+    my $count = ($seq =~ tr/N.//);
+    return $count;
+}
+
+sub getYMax { # Returns the smallest positive multiple of 1000 that is strictly greater than $_[0].
+    # Undefined values and values below 1000 give 1000.
+    my $maxVal = $_[0];
+    return 1000 if(!defined($maxVal) || $maxVal < 1000);
+    # Closed form of the former step-by-step search; the statement after that
+    # search could never be reached and has been removed.
+    return (int($maxVal/1000) + 1) * 1000;
+}
+
+
+sub drawBaseComp { # Draws the base composition bar chart for $fileName and writes it as a PNG image to $fileNameWPath.
+    # $_[0]: reference to the chart data passed to GD::Graph; row 1 holds the counts
+    #        shown in the first legend line, row 2 the counts of the second line,
+    #        which is drawn only when $isOnlyStat is false (A, T, G, C, Non-ATGC).
+    # $_[1]: path of the image file to create.
+    # $_[2]: name used in the title and the legend.
+    # $_[3], $_[4]: width and height of the image.
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        y_label => 'Count',
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        l_margin => 60,
+        r_margin => 60,
+        b_margin => 50,
+        t_margin => 50,
+        show_values => 1,
+        bar_spacing => 1,
+        values_vertical => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+    $mygraph->set_values_font($f, 6);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    my $black = $myImage->colorAllocate(0,0,0); # To set the color for the next time printing on the image.
+    my $lred = $myImage->colorAllocate(255,0,0);
+    my $dgreen = $myImage->colorAllocate(0,127,0);
+    my $dblue = $myImage->colorAllocate(0,0,127);
+
+    my $wrapbox = GD::Text::Wrap->new( $myImage,
+        line_space => 4,
+        text => "Base composition for $fileName",
+        color => $dblue,
+    );
+
+    $wrapbox->set(align => 'center', width => $width);
+    $wrapbox->set_font($f, 11);
+    $wrapbox->draw(0,0);
+
+    # One legend line below the chart: for each base a percentage label and, 10
+    # pixels to its left, a colour swatch with a black outline.
+    my @baseLabels = ("A", "T", "G", "C", "Non-ATGC");
+    my @labelOffsets = (-220, -130, -40, 50, 140); # Label x positions relative to the image centre.
+    my $drawLegendRow = sub {
+        my ($counts, $rowY, $swatchColor) = @_;
+        my $total = sum(@{$counts});
+        for(my $k=0; $k<@baseLabels; $k++) {
+            # An empty or all-zero row used to die with a division by zero.
+            my $percent = $total ? $counts->[$k]/$total*100 : 0;
+            my $label = GD::Text::Wrap->new( $myImage,
+                line_space => 4,
+                text => "$baseLabels[$k] (" . (sprintf "%0.2f", $percent) . "\%)",
+                color => $black,
+            );
+            $label->set(align => 'left', width => 300);
+            $label->set_font($f, 8);
+            $label->draw($width/2+$labelOffsets[$k],$rowY);
+        }
+        for(my $k=0; $k<@baseLabels; $k++) {
+            my $startRectX = $width/2+$labelOffsets[$k]-10;
+            $myImage->filledRectangle($startRectX,$rowY,$startRectX+8,$rowY+8,$swatchColor);
+            $myImage->rectangle($startRectX,$rowY,$startRectX+8,$rowY+8,$black);
+        }
+    };
+
+    $drawLegendRow->($$dataRef[1], $height-35, $lred);
+    if(!$isOnlyStat) {
+        $drawLegendRow->($$dataRef[2], $height-20, $dgreen);
+    }
+
+    # Do not write through a handle that could not be opened.
+    my $imgFH;
+    if(!open($imgFH, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myImage->png;
+    close($imgFH);
+}
+
+
+sub drawGCDist { # Draws the GC content distribution (lines with points) for $fileName and writes it as a PNG image to $fileNameWPath.
+    # $_[0]: reference to the chart data passed to GD::Graph.
+    # $_[1]: path of the image file to create.
+    # $_[2]: name used in the title and the legend.
+    # $_[3], $_[4]: width and height of the image.
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+    my $mygraph = GD::Graph::linespoints->new($width, $height);
+
+    $mygraph->set(
+        x_label => '% GC content',
+        y_label => 'Number of reads',
+        title => "GC content distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        markers => [1],
+        marker_size => 3,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        x_labels_vertical => 1,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        # Same single-entry legend as the other charts use in statistics-only
+        # mode (the second entry used to be the placeholder "a").
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    # Do not write through a handle that could not be opened.
+    my $imgFH;
+    if(!open($imgFH, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myImage->png;
+    close($imgFH);
+}
+
+
+sub drawSummaryPie { # Draws the pie chart summarising quality check and filtering and writes it as a PNG image to $fileNameWPath.
+    # $_[0]: reference to the chart data passed to GD::Graph; row 1 holds, in this
+    #        order, the values labelled as primer/adaptor contaminated reads, high
+    #        quality filtered reads and low quality reads.
+    # $_[1]: path of the image file to create.
+    # $_[2], $_[3]: width and height of the image.
+    my ($dataRef, $fileNameWPath, $width, $height) = @_;
+    my $mygraph = GD::Graph::pie->new($width, $height);
+
+    $mygraph->set(
+        title => "Summary of quality check and filtering",
+        axislabelclr => 'black',
+        pie_height => 40,
+
+        l_margin => 15,
+        r_margin => 15,
+        b_margin => 50,
+        start_angle => 45,
+        dclrs => [ qw(lyellow lgreen lred) ],
+        transparent => 0,
+    ) or warn $mygraph->error;
+
+    $mygraph->set_label_font($f, 8);
+    $mygraph->set_value_font(['verdana', 'arial'],14);
+    $mygraph->set_title_font($f, 11);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    my $black = $myImage->colorAllocate(0,0,0); # To set the color for the next time printing on the image.
+    my $red = $myImage->colorAllocate(255,0,0);
+    my $yellow = $myImage->colorAllocate(255,255,0);
+    my $green = $myImage->colorAllocate(0,255,0);
+
+    my $sum = sum(@{$$dataRef[1]});
+
+    # Legend below the pie: one line per category, 15 pixels apart, each with a
+    # percentage label and a colour swatch 20 pixels to the left of the label.
+    my @legendRows = (
+        ["Primer/Adaptor contaminated reads", $yellow],
+        ["High quality filtered reads", $green],
+        ["Low quality reads", $red],
+    );
+    for(my $k=0; $k<@legendRows; $k++) {
+        my ($rowLabel, $rowColor) = @{$legendRows[$k]};
+        # A zero total used to die with a division by zero.
+        my $percent = $sum ? $$dataRef[1][$k]/$sum*100 : 0;
+        my $wrapbox = GD::Text::Wrap->new( $myImage,
+            line_space => 4,
+            text => $rowLabel . (sprintf " (%0.02f", $percent) . "\%)",
+            color => $black,
+        );
+        $wrapbox->set(align => 'left', width => 300);
+        $wrapbox->set_font($f, 8);
+        $wrapbox->draw($width/2-100,$height-45+15*$k);
+    }
+    for(my $k=0; $k<@legendRows; $k++) {
+        my $startRectX = $width/2-120;
+        my $startRectY = $height-45+15*$k;
+        $myImage->filledRectangle($startRectX,$startRectY,$startRectX+8,$startRectY+8,$legendRows[$k][1]);
+        $myImage->rectangle($startRectX,$startRectY,$startRectX+8,$startRectY+8,$black);
+    }
+
+    # Do not write through a handle that could not be opened.
+    my $imgFH;
+    if(!open($imgFH, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myImage->png;
+    close($imgFH);
+}
+
+
+sub drawQualDist { # Draws the bar chart of the average quality score distribution for $fileName and writes it as a PNG image to $fileNameWPath.
+    # $_[0]: reference to the chart data passed to GD::Graph.
+    # $_[1]: path of the image file to create.
+    # $_[2]: name used in the title and the legend.
+    # $_[3], $_[4]: width and height of the image.
+    my ($dataRef, $fileNameWPath, $fileName, $width, $height) = @_;
+
+    my $mygraph = GD::Graph::bars->new($width, $height);
+
+    $mygraph->set(
+        x_label => 'Average phred quality score',
+        y_label => 'Number of reads',
+        title => "Quality distribution for $fileName",
+        y_min_value => 0,
+        box_axis => 0,
+        line_width => 3,
+        transparent => 0,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        long_ticks => 1,
+        fgclr => '#dddddd',
+        bar_spacing => 1,
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+
+    my $myImage = $mygraph->plot($dataRef);
+
+    # Do not write through a handle that could not be opened.
+    my $imgFH;
+    if(!open($imgFH, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myImage->png;
+    close($imgFH);
+}
+
+
+sub drawGraph { # Draws the per-base average quality score chart for $fileName and writes it as a PNG image to $fileNameWPath.
+    # $_[0]: reference to the chart data passed to GD::Graph; row 0 is used for the
+    #        image width (one entry per base position), rows 1.. for the y range.
+    # $_[1]: path of the image file to create.
+    # $_[2]: name used as the title and in the legend.
+    # Dies with the GD::Graph error message when the chart cannot be plotted.
+    my @data = @{$_[0]};
+    my $fileNameWPath = $_[1];
+    my $fileName = $_[2];
+    my $y_min = 0;
+    my $y_max = 0;
+    for(my $i=1; $i<@data; $i++) {
+        $y_max = max($y_max, max(@{$data[$i]}));
+    }
+    # Round the largest value to a multiple of 5 and add one more step of 5.
+    $y_max = (sprintf "%0.0f",($y_max/5)) * 5 + 5;
+    # The image size scales with the y range and with the number of base positions.
+    my $height = sprintf "%0.0f", $y_max * 300 / 45;
+    my $width = sprintf "%0.0f", scalar(@{$data[0]}) * 600 / 75;
+    my $mygraph = GD::Graph::linespoints->new($width, $height);
+    $mygraph->set(
+        x_label => 'Base position',
+        y_label => 'Average quality score',
+        title => $fileName,
+        y_min_value => $y_min,
+        y_max_value => $y_max,
+        x_label_skip => 2,
+        y_tick_number => $y_max/5,
+        y_label_skip => 1,
+        markers => [7],
+        marker_size => 3,
+        long_ticks => 1,
+        line_width => 2,
+        dclrs => [ qw(lred dgreen) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        x_labels_vertical => 1,
+        transparent => 0,
+        r_margin => 10,
+        fgclr => '#dddddd',
+        accentclr => 'yellow',
+    ) or warn $mygraph->error;
+
+    if(!defined($isOnlyStat)) {
+        $mygraph->set_legend( $fileName, $fileName."_filtered");
+    }
+    else {
+        $mygraph->set_legend( $fileName);
+    }
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+    my $myimage = $mygraph->plot(\@data) or die $mygraph->error;
+
+    # The file is created only after plotting succeeded, so a failed plot no longer
+    # leaves an empty image file behind; a failed open is not written through.
+    my $imgFH;
+    if(!open($imgFH, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myimage->png;
+    close($imgFH);
+}
+
+sub drawRangeGraph { # Draws the chart of read count (%) per base for the four quality score ranges and writes it as a PNG image to $fileNameWPath.
+    # $_[0]: reference to the chart data passed to GD::Graph; row 0 is used for the
+    #        image width (one entry per base position).
+    # $_[1]: path of the image file to create.
+    # $_[2]: name used in the title.
+    # Dies with the GD::Graph error message when the chart cannot be plotted.
+    my @data = @{$_[0]};
+    my $fileNameWPath = $_[1];
+    my $fileName = $_[2];
+    my $height = 350;
+    my $y_min = 0;
+    # Percent scale. The former search for the largest data value was dead code:
+    # its result was always overwritten with 100.
+    my $y_max = 100;
+    # The width scales with the number of base positions but is at least 700.
+    my $width = sprintf "%0.0f", scalar(@{$data[0]}) * 700 / 75;
+    $width = max(700, $width);
+    my $mygraph = GD::Graph::linespoints->new($width, $height);
+    $mygraph->set(
+        x_label => 'Base position',
+        y_label => 'Read count (%)',
+        title => "Read count (%) per base for different quality score ranges for $fileName",
+        y_min_value => $y_min,
+        y_max_value => $y_max,
+        x_label_skip => 2,
+        y_tick_number => $y_max/5,
+        y_label_skip => 1,
+        markers => [7],
+        marker_size => 3,
+        long_ticks => 1,
+        line_width => 2,
+        dclrs => [ qw(lred dgreen lyellow blue) ],
+        legend_placement => 'BR',
+        x_label_position => 1/2,
+        x_labels_vertical => 1,
+        transparent => 0,
+        r_margin => 10,
+        fgclr => '#dddddd',
+        accentclr => 'yellow',
+    ) or warn $mygraph->error;
+
+    $mygraph->set_legend( "0-10", "11-20", "21-30", "31-40");
+
+    $mygraph->set_y_label_font($font_spec, 12);
+    $mygraph->set_x_label_font($font_spec, 12);
+    $mygraph->set_y_axis_font($font_spec, 10);
+    $mygraph->set_x_axis_font($font_spec, 8);
+    $mygraph->set_title_font($f, 11);
+    $mygraph->set_legend_font($f, 8);
+    my $myimage = $mygraph->plot(\@data) or die $mygraph->error;
+
+    # The file is created only after plotting succeeded, so a failed plot no longer
+    # leaves an empty image file behind; a failed open is not written through.
+    my $imgFH;
+    if(!open($imgFH, ">", $fileNameWPath)) {
+        print STDERR "Error:\n\tCan not create image file: $fileNameWPath\n";
+        return;
+    }
+    binmode $imgFH;
+    print $imgFH $myimage->png;
+    close($imgFH);
+}
+
+# Write the "Read count (%) per base for different quality score ranges"
+# section to the statistics handle and, when GD is available, draw the
+# matching range graph for every input (and filtered) file.
+#
+# Argument:
+#   $_[0]  output handle of the statistics file
+#
+# Works on data defined outside this sub: the per-position range counts in
+# @positionSpecificBaseCountWithRanges (input reads) and
+# @positionSpecificBaseCountHQWithRanges (filtered reads), the file names in
+# @fileName / @outFileName, and $isOnlyStat, $isGDMod, $isPairedEnd and
+# $outFolder.
+sub prepareData4RangeGraph {
+    my $STAT = $_[0];
+
+    # Turn the four range counts of one base position into "%0.2f" percentage
+    # strings. Missing counts are stored as 0 in the given array. A position
+    # without any counted base (e.g. no filtered read is that long) yields
+    # 0.00 for every range instead of dying with a division by zero.
+    my $toPercentages = sub {
+        my ($counts) = @_;
+        my $total = 0;
+        for my $range (0 .. 3) {
+            $counts->[$range] ||= 0;
+            $total += $counts->[$range];
+        }
+        return map { sprintf "%0.2f", $total ? $counts->[$_] / $total * 100 : 0 } 0 .. 3;
+    };
+
+    print $STAT "Read count (%) per base for different quality score ranges\n\n";
+    my $c = 0;
+    # Graph data per file: index $c*2 is the input file, $c*2+1 its filtered
+    # counterpart; each entry is [positions, range1, range2, range3, range4].
+    my @rangeGraphData = ();
+    foreach my $arr (@positionSpecificBaseCountWithRanges) {
+        my $arrFiltered = $positionSpecificBaseCountHQWithRanges[$c];
+        print $STAT "\t", getFileName($fileName[$c]);
+        print $STAT "\t\t\t\t\t", getFileName($outFileName[$c]) if(!defined($isOnlyStat));
+        print $STAT "\n";
+        print $STAT "Ranges\t0-10\t11-20\t21-30\t31-40";
+        print $STAT "\t\t0-10\t11-20\t21-30\t31-40" if(!defined($isOnlyStat));
+        print $STAT "\nBase\n";
+        my $basePos = 1;
+        foreach my $valArr (@$arr) {
+            my @pct = $toPercentages->($valArr);
+            $rangeGraphData[$c*2][0][$basePos-1] = $basePos;
+            $rangeGraphData[$c*2][$_+1][$basePos-1] = $pct[$_] for 0 .. 3;
+            print $STAT join("\t", $basePos, @pct);
+            if(!defined($isOnlyStat)) {
+                my @pctF = $toPercentages->($arrFiltered->[$basePos-1]);
+                $rangeGraphData[$c*2+1][0][$basePos-1] = $basePos;
+                $rangeGraphData[$c*2+1][$_+1][$basePos-1] = $pctF[$_] for 0 .. 3;
+                print $STAT "\t\t", join("\t", @pctF);
+            }
+            print $STAT "\n";
+            $basePos++;
+        }
+        print $STAT "\n\n";
+        $c++;
+    }
+    print $STAT "\n\n";
+    if($isGDMod) {
+        drawRangeGraph($rangeGraphData[0], $outFolder.getFileName($fileName[0])."_QualRangePerBase.png", getFileName($fileName[0]));
+        drawRangeGraph($rangeGraphData[1], $outFolder.getFileName($outFileName[0])."_QualRangePerBase.png", getFileName($outFileName[0])) if(!defined($isOnlyStat));
+        if($isPairedEnd) {
+            drawRangeGraph($rangeGraphData[2], $outFolder.getFileName($fileName[1])."_QualRangePerBase.png", getFileName($fileName[1]));
+            drawRangeGraph($rangeGraphData[3], $outFolder.getFileName($outFileName[1])."_QualRangePerBase.png", getFileName($outFileName[1])) if(!defined($isOnlyStat));
+        }
+    }
+}
+
+# Write the column-aligned QC report to a statistics handle and, when GD is
+# available, draw the average-quality graph(s).
+#
+# Argument:
+#   $_[0]  output handle of the statistics file
+#
+# Sections, in order: "Parameters", "QC statistics", "Detailed QC
+# statistics", the average quality score per base position of the input
+# reads and (unless only statistics were requested) of the filtered reads,
+# and finally the per-range read counts written by prepareData4RangeGraph().
+# All counters, file names and option values are read from variables defined
+# outside this sub.
+sub printStat {
+    my $STAT = $_[0];
+    my $inde = " " x 1;
+    my @graphData1 = ();
+    my @graphData2 = ();
+
+    # Divide, but yield 0 when the denominator is zero or undefined (e.g. no
+    # read survived filtering), so the report is completed instead of dying
+    # with "Illegal division by zero".
+    my $ratio = sub {
+        my ($numerator, $denominator) = @_;
+        return $denominator ? $numerator / $denominator : 0;
+    };
+    # Same, expressed as a percentage with two decimals.
+    my $percent = sub {
+        return sprintf "%0.2f", $ratio->($_[0], $_[1]) * 100;
+    };
+
+    my $priAdaLabel = defined($priAdaLib)?(($priAdaLib ne "u")?$priAdaLibNames[$priAdaLib]:"User defined ($priAdaFile)"):"NA";
+
+    print $STAT "Parameters\n";
+    if($isPairedEnd) {
+        printf $STAT "$inde %-30s %s\n", "Library type", "Paired-end";
+        printf $STAT "$inde %-30s %s %s\n", "Input files", $fileName[0], $fileName[1];
+        printf $STAT "$inde %-30s %s\n", "Primer/Adaptor library", $priAdaLabel;
+        printf $STAT "$inde %-30s %s\n", "Cut-off read length for HQ", $cutOffReadLen4HQ."%";
+        printf $STAT "$inde %-30s %s\n", "Cut-off quality score", $cutOffPhScore;
+        printf $STAT "$inde %-30s %s\n", "Only statistics", defined($isOnlyStat)?"On":"Off";
+        printf $STAT "$inde %-30s %s\n", "Number of CPUs", $noOfProcesses;
+
+        print $STAT "\n\n";
+
+        print $STAT "QC statistics\n";
+        printf $STAT "$inde %-50s %-20s %s\n", "File name", getFileName($fileName[0]), getFileName($fileName[1]);
+        printf $STAT "$inde %-50s %-20d %d\n", "Total number of reads", $totalReads[0], $totalReads[1];
+        printf $STAT "$inde %-50s %-20d %d\n", "Total number of HQ reads", $totalReadsAfterHQ[0], $totalReadsAfterHQ[1];
+        printf $STAT "$inde %-50s %-20s %0.2f%s\n", "Percentage of HQ reads", $percent->($totalReadsAfterHQ[0], $totalReads[0])."%", $ratio->($totalReadsAfterHQ[1], $totalReads[1])*100, "%";
+        printf $STAT "$inde %-50s %-20.f %.f\n", "Total number of bases", $totalBases[0], $totalBases[1];
+        printf $STAT "$inde %-50s %-20.f %.f\n", "Total number of bases in HQ reads", $totalBasesAfterHQ[0], $totalBasesAfterHQ[1];
+        printf $STAT "$inde %-50s %-20.f %.f\n", "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ[0], $totalHQBasesAfterHQ[1];
+        printf $STAT "$inde %-50s %-20s %0.2f%s\n", "Percentage of HQ bases in HQ reads", $percent->($totalHQBasesAfterHQ[0], $totalBasesAfterHQ[0])."%", $ratio->($totalHQBasesAfterHQ[1], $totalBasesAfterHQ[1])*100, "%";
+        if(defined($priAdaLib)) {
+            printf $STAT "$inde %-50s %-20d %d\n", "Number of Primer/Adaptor contaminated HQ reads", $totalValidReadsWithPriAda[0], $totalValidReadsWithPriAda[1];
+        }
+        else {
+            printf $STAT "$inde %-50s %-20s %s\n", "Number of Primer/Adaptor contaminated HQ reads", "NA", "NA";
+        }
+        printf $STAT "$inde %-50s %-20d %d\n", "Total number of HQ filtered reads", $totalReadsFinal[0], $totalReadsFinal[1];
+        printf $STAT "$inde %-50s %-20s %0.2f%s\n", "Percentage of HQ filtered reads", $percent->($totalReadsFinal[0], $totalReads[0])."%", $ratio->($totalReadsFinal[1], $totalReads[1])*100, "%";
+
+        print $STAT "\n\n";
+
+        print $STAT "Detailed QC statistics\n";
+        # Row layout: label, input file 1, input file 2, filtered file 1,
+        # filtered file 2.
+        my @arr = (
+            ["File name", getFileName($fileName[0]), getFileName($fileName[1]), getFileName($outFileName[0]), getFileName($outFileName[1])],
+            ["Minimum read length", $minLen[0], $minLen[1], $minLen[2], $minLen[3]],
+            ["Maximum read length", $maxLen[0], $maxLen[1], $maxLen[2], $maxLen[3]],
+            ["Average read length", (sprintf "%0.2f", $ratio->($totalBases[0], $totalReads[0])), (sprintf "%0.2f", $ratio->($totalBases[1], $totalReads[1])), (sprintf "%0.2f", $ratio->($totalBasesFinal[0], $totalReadsFinal[0])), (sprintf "%0.2f", $ratio->($totalBasesFinal[1], $totalReadsFinal[1]))],
+            ["Total number of reads", $totalReads[0], $totalReads[1], $totalReadsFinal[0], $totalReadsFinal[1]],
+            ["Total number of reads with non-ATGC bases", $readsWithN[0], $readsWithN[1], $readsWithN[2], $readsWithN[3]],
+            ["Percentage of reads with non-ATGC bases", $percent->($readsWithN[0], $totalReads[0])."%", $percent->($readsWithN[1], $totalReads[1])."%", $percent->($readsWithN[2], $totalReadsFinal[0])."%", $percent->($readsWithN[3], $totalReadsFinal[1])."%"],
+            ["Total number of bases", $totalBases[0], $totalBases[1], $totalBasesFinal[0], $totalBasesFinal[1]],
+            ["Total number of HQ bases", $totalHQBases[0], $totalHQBases[1], $totalHQBasesFinal[0], $totalHQBasesFinal[1]],
+            ["Percentage of HQ bases", $percent->($totalHQBases[0], $totalBases[0])."%", $percent->($totalHQBases[1], $totalBases[1])."%", $percent->($totalHQBasesFinal[0], $totalBasesFinal[0])."%", $percent->($totalHQBasesFinal[1], $totalBasesFinal[1])."%"],
+            ["Total number of non-ATGC bases", $totalNs[0], $totalNs[1], $totalNs[2], $totalNs[3]],
+            ["Percentage of non-ATGC bases", $percent->($totalNs[0], $totalBases[0])."%", $percent->($totalNs[1], $totalBases[1])."%", $percent->($totalNs[2], $totalBasesFinal[0])."%", $percent->($totalNs[3], $totalBasesFinal[1])."%"],
+        );
+        foreach my $row (@arr) {
+            if(!defined($isOnlyStat)) {
+                printf $STAT "$inde %-45s %-20s %-20s %-20s %s\n", @{$row}[0 .. 4];
+            }
+            else {
+                # Only statistics: the filtered-file columns are left out.
+                printf $STAT "$inde %-45s %-20s %s\n", @{$row}[0 .. 2];
+            }
+        }
+
+        print $STAT "\n\n";
+
+        # Graph rows: 0 = base position, 1 = input reads, 2 = filtered reads.
+        print $STAT "Average quality score at each base position of input reads\n\n";
+        my $c = 0;
+        foreach my $arr (@positionSpecificBaseCount) {
+            my $graph = ($c == 0) ? \@graphData1 : \@graphData2;
+            print $STAT getFileName($fileName[$c]), "\n";
+            my $basePos = 1;
+            foreach my $val (@$arr) {
+                my $outVal = sprintf "%0.2f", $ratio->($val, $totalReads[$c]);
+                $graph->[0][$basePos-1] = $basePos;
+                $graph->[1][$basePos-1] = $outVal;
+                print $STAT $outVal, "\t";
+                $basePos++;
+            }
+            print $STAT "\n\n";
+            $c++;
+        }
+        print $STAT "\n\n";
+        if(!defined($isOnlyStat)) {
+            print $STAT "Average quality score at each base position of filtered reads\n\n";
+            $c = 0;
+            foreach my $arr (@positionSpecificBaseCountHQ) {
+                my $graph = ($c == 0) ? \@graphData1 : \@graphData2;
+                print $STAT getFileName($outFileName[$c]), "\n";
+                my $basePos = 1;
+                foreach my $val (@$arr) {
+                    my $outVal = sprintf "%0.2f", $ratio->($val, $totalReadsFinal[$c]);
+                    $graph->[2][$basePos-1] = $outVal;
+                    print $STAT $outVal, "\t";
+                    $basePos++;
+                }
+                $c++;
+                print $STAT "\n\n";
+            }
+        }
+        print $STAT "\n\n";
+        prepareData4RangeGraph($STAT);
+        if($isGDMod) {
+            drawGraph(\@graphData1, $outFolder.getFileName($fileName[0])."_avgQual.png", getFileName($fileName[0]));
+            drawGraph(\@graphData2, $outFolder.getFileName($fileName[1])."_avgQual.png", getFileName($fileName[1]));
+        }
+    }
+    else {
+        printf $STAT "$inde %-30s %s\n", "Library type", "Single-end";
+        printf $STAT "$inde %-30s %s\n", "Input file", $fileName[0];
+        printf $STAT "$inde %-30s %s\n", "Primer/Adaptor library", $priAdaLabel;
+        printf $STAT "$inde %-30s %s\n", "Cut-off read length for HQ", $cutOffReadLen4HQ."%";
+        printf $STAT "$inde %-30s %s\n", "Cut-off quality score", $cutOffPhScore;
+        printf $STAT "$inde %-30s %s\n", "Only statistics", defined($isOnlyStat)?"On":"Off";
+        printf $STAT "$inde %-30s %s\n", "Number of CPUs", $noOfProcesses;
+
+        print $STAT "\n\n";
+
+        print $STAT "QC statistics\n";
+        printf $STAT "$inde %-50s %s\n", "File name", getFileName($fileName[0]);
+        printf $STAT "$inde %-50s %d\n", "Total number of reads", $totalReads[0];
+        printf $STAT "$inde %-50s %d\n", "Total number of HQ reads", $totalReadsAfterHQ[0];
+        printf $STAT "$inde %-50s %s\n", "Percentage of HQ reads", $percent->($totalReadsAfterHQ[0], $totalReads[0])."%";
+        printf $STAT "$inde %-50s %.f\n", "Total number of bases", $totalBases[0];
+        printf $STAT "$inde %-50s %.f\n", "Total number of bases in HQ reads", $totalBasesAfterHQ[0];
+        printf $STAT "$inde %-50s %.f\n", "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ[0];
+        printf $STAT "$inde %-50s %s\n", "Percentage of HQ bases in HQ reads", $percent->($totalHQBasesAfterHQ[0], $totalBasesAfterHQ[0])."%";
+        if(defined($priAdaLib)) {
+            printf $STAT "$inde %-50s %d\n", "Number of Primer/Adaptor contaminated HQ reads", $totalValidReadsWithPriAda[0];
+        }
+        else {
+            printf $STAT "$inde %-50s %s\n", "Number of Primer/Adaptor contaminated HQ reads", "NA";
+        }
+        printf $STAT "$inde %-50s %d\n", "Total number of HQ filtered reads", $totalReadsFinal[0];
+        printf $STAT "$inde %-50s %s\n", "Percentage of HQ filtered reads", $percent->($totalReadsFinal[0], $totalReads[0])."%";
+
+        print $STAT "\n\n";
+
+        print $STAT "Detailed QC statistics\n";
+        # Row layout: label, input file, filtered file.
+        my @arr = (
+            ["File name", getFileName($fileName[0]), getFileName($outFileName[0])],
+            ["Minimum read length", $minLen[0], $minLen[1]],
+            ["Maximum read length", $maxLen[0], $maxLen[1]],
+            ["Average read length", (sprintf "%0.2f", $ratio->($totalBases[0], $totalReads[0])), (sprintf "%0.2f", $ratio->($totalBasesFinal[0], $totalReadsFinal[0]))],
+            ["Total number of reads", $totalReads[0], $totalReadsFinal[0]],
+            ["Total number of reads with non-ATGC bases", $readsWithN[0], $readsWithN[1]],
+            ["Percentage of reads with non-ATGC bases", $percent->($readsWithN[0], $totalReads[0])."%", $percent->($readsWithN[1], $totalReadsFinal[0])."%"],
+            ["Total number of bases", $totalBases[0], $totalBasesFinal[0]],
+            ["Total number of HQ bases", $totalHQBases[0], $totalHQBasesFinal[0]],
+            ["Percentage of HQ bases", $percent->($totalHQBases[0], $totalBases[0])."%", $percent->($totalHQBasesFinal[0], $totalBasesFinal[0])."%"],
+            ["Total number of non-ATGC bases", $totalNs[0], $totalNs[1]],
+            ["Percentage of non-ATGC bases", $percent->($totalNs[0], $totalBases[0])."%", $percent->($totalNs[1], $totalBasesFinal[0])."%"],
+        );
+        foreach my $row (@arr) {
+            if(!defined($isOnlyStat)) {
+                printf $STAT "$inde %-50s %-20s %s\n", @{$row}[0 .. 2];
+            }
+            else {
+                # Only statistics: the filtered-file column is left out.
+                printf $STAT "$inde %-50s %s\n", @{$row}[0 .. 1];
+            }
+        }
+
+        print $STAT "\n\n";
+
+        # Graph rows: 0 = base position, 1 = input reads, 2 = filtered reads.
+        print $STAT "Average quality score at each base position of input reads\n\n";
+        my $c = 0;
+        foreach my $arr (@positionSpecificBaseCount) {
+            print $STAT getFileName($fileName[$c]), "\n";
+            my $basePos = 1;
+            foreach my $val (@$arr) {
+                my $outVal = sprintf "%0.2f", $ratio->($val, $totalReads[$c]);
+                $graphData1[0][$basePos-1] = $basePos;
+                $graphData1[1][$basePos-1] = $outVal;
+                print $STAT $outVal, "\t";
+                $basePos++;
+            }
+            $c++;
+            print $STAT "\n";
+        }
+        print $STAT "\n\n";
+        $c = 0;
+        if(!defined($isOnlyStat)) {
+            print $STAT "Average quality score at each base position of filtered reads\n\n";
+            foreach my $arr (@positionSpecificBaseCountHQ) {
+                # Label the values with the filtered (output) file they
+                # describe, as the paired-end branch does.
+                print $STAT getFileName($outFileName[$c]), "\n";
+                my $basePos = 1;
+                foreach my $val (@$arr) {
+                    my $outVal = sprintf "%0.2f", $ratio->($val, $totalReadsFinal[$c]);
+                    $graphData1[2][$basePos-1] = $outVal;
+                    print $STAT $outVal, "\t";
+                    $basePos++;
+                }
+                $c++;
+                print $STAT "\n";
+            }
+        }
+        print $STAT "\n\n";
+        prepareData4RangeGraph($STAT);
+        if($isGDMod) {
+            drawGraph(\@graphData1, $outFolder.getFileName($fileName[0])."_avgQual.png", getFileName($fileName[0]));
+        }
+    }
+}
+
+# Write the tab-delimited QC report to a statistics handle and, when GD is
+# available, draw the average-quality graph(s).
+#
+# Argument:
+#   $_[0]  output handle of the statistics file
+#
+# Sections, in order: "Parameters", "QC statistics", "Detailed QC
+# statistics", the average quality score per base position of the input
+# reads and (unless only statistics were requested) of the filtered reads,
+# and finally the per-range read counts written by prepareData4RangeGraph().
+# All counters, file names and option values are read from variables defined
+# outside this sub.
+sub printStatTab {
+    my $STAT = $_[0];
+    my @graphData1 = ();
+    my @graphData2 = ();
+
+    # Divide, but yield 0 when the denominator is zero or undefined (e.g. no
+    # read survived filtering), so the report is completed instead of dying
+    # with "Illegal division by zero".
+    my $ratio = sub {
+        my ($numerator, $denominator) = @_;
+        return $denominator ? $numerator / $denominator : 0;
+    };
+    # Same, expressed as a percentage with two decimals.
+    my $percent = sub {
+        return sprintf "%0.2f", $ratio->($_[0], $_[1]) * 100;
+    };
+
+    my $priAdaLabel = defined($priAdaLib)?(($priAdaLib ne "u")?$priAdaLibNames[$priAdaLib]:"User defined ($priAdaFile)"):"NA";
+
+    print $STAT "Parameters\n";
+    if($isPairedEnd) {
+        printf $STAT "\t%s\t%s\n", "Library type", "Paired-end";
+        printf $STAT "\t%s\t%s\t%s\n", "Input files", $fileName[0], $fileName[1];
+        printf $STAT "\t%s\t%s\n", "Primer/Adaptor library", $priAdaLabel;
+        printf $STAT "\t%s\t%s\n", "Cut-off read length for HQ", $cutOffReadLen4HQ."%";
+        printf $STAT "\t%s\t%s\n", "Cut-off quality score", $cutOffPhScore;
+        printf $STAT "\t%s\t%s\n", "Only statistics", defined($isOnlyStat)?"On":"Off";
+        printf $STAT "\t%s\t%s\n", "Number of CPUs", $noOfProcesses;
+
+        print $STAT "\n\n";
+
+        print $STAT "QC statistics\n";
+        printf $STAT "\t%s\t%s\t%s\n", "File name", getFileName($fileName[0]), getFileName($fileName[1]);
+        printf $STAT "\t%s\t%d\t%d\n", "Total number of reads", $totalReads[0], $totalReads[1];
+        printf $STAT "\t%s\t%d\t%d\n", "Total number of HQ reads", $totalReadsAfterHQ[0], $totalReadsAfterHQ[1];
+        printf $STAT "\t%s\t%s\t%0.2f%s\n", "Percentage of HQ reads", $percent->($totalReadsAfterHQ[0], $totalReads[0])."%", $ratio->($totalReadsAfterHQ[1], $totalReads[1])*100, "%";
+        printf $STAT "\t%s\t%.f\t%.f\n", "Total number of bases", $totalBases[0], $totalBases[1];
+        printf $STAT "\t%s\t%.f\t%.f\n", "Total number of bases in HQ reads", $totalBasesAfterHQ[0], $totalBasesAfterHQ[1];
+        printf $STAT "\t%s\t%.f\t%.f\n", "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ[0], $totalHQBasesAfterHQ[1];
+        printf $STAT "\t%s\t%s\t%0.2f%s\n", "Percentage of HQ bases in HQ reads", $percent->($totalHQBasesAfterHQ[0], $totalBasesAfterHQ[0])."%", $ratio->($totalHQBasesAfterHQ[1], $totalBasesAfterHQ[1])*100, "%";
+        if(defined($priAdaLib)) {
+            printf $STAT "\t%s\t%d\t%d\n", "Number of Primer/Adaptor contaminated HQ reads", $totalValidReadsWithPriAda[0], $totalValidReadsWithPriAda[1];
+        }
+        else {
+            printf $STAT "\t%s\t%s\t%s\n", "Number of Primer/Adaptor contaminated HQ reads", "NA", "NA";
+        }
+        printf $STAT "\t%s\t%d\t%d\n", "Total number of HQ filtered reads", $totalReadsFinal[0], $totalReadsFinal[1];
+        printf $STAT "\t%s\t%s\t%0.2f%s\n", "Percentage of HQ filtered reads", $percent->($totalReadsFinal[0], $totalReads[0])."%", $ratio->($totalReadsFinal[1], $totalReads[1])*100, "%";
+
+        print $STAT "\n\n";
+
+        print $STAT "Detailed QC statistics\n";
+        # Row layout: label, input file 1, input file 2, filtered file 1,
+        # filtered file 2.
+        my @arr = (
+            ["File name", getFileName($fileName[0]), getFileName($fileName[1]), getFileName($outFileName[0]), getFileName($outFileName[1])],
+            ["Minimum read length", $minLen[0], $minLen[1], $minLen[2], $minLen[3]],
+            ["Maximum read length", $maxLen[0], $maxLen[1], $maxLen[2], $maxLen[3]],
+            ["Average read length", (sprintf "%0.2f", $ratio->($totalBases[0], $totalReads[0])), (sprintf "%0.2f", $ratio->($totalBases[1], $totalReads[1])), (sprintf "%0.2f", $ratio->($totalBasesFinal[0], $totalReadsFinal[0])), (sprintf "%0.2f", $ratio->($totalBasesFinal[1], $totalReadsFinal[1]))],
+            ["Total number of reads", $totalReads[0], $totalReads[1], $totalReadsFinal[0], $totalReadsFinal[1]],
+            ["Total number of reads with non-ATGC bases", $readsWithN[0], $readsWithN[1], $readsWithN[2], $readsWithN[3]],
+            ["Percentage of reads with non-ATGC bases", $percent->($readsWithN[0], $totalReads[0])."%", $percent->($readsWithN[1], $totalReads[1])."%", $percent->($readsWithN[2], $totalReadsFinal[0])."%", $percent->($readsWithN[3], $totalReadsFinal[1])."%"],
+            ["Total number of bases", $totalBases[0], $totalBases[1], $totalBasesFinal[0], $totalBasesFinal[1]],
+            ["Total number of HQ bases", $totalHQBases[0], $totalHQBases[1], $totalHQBasesFinal[0], $totalHQBasesFinal[1]],
+            ["Percentage of HQ bases", $percent->($totalHQBases[0], $totalBases[0])."%", $percent->($totalHQBases[1], $totalBases[1])."%", $percent->($totalHQBasesFinal[0], $totalBasesFinal[0])."%", $percent->($totalHQBasesFinal[1], $totalBasesFinal[1])."%"],
+            ["Total number of non-ATGC bases", $totalNs[0], $totalNs[1], $totalNs[2], $totalNs[3]],
+            ["Percentage of non-ATGC bases", $percent->($totalNs[0], $totalBases[0])."%", $percent->($totalNs[1], $totalBases[1])."%", $percent->($totalNs[2], $totalBasesFinal[0])."%", $percent->($totalNs[3], $totalBasesFinal[1])."%"],
+        );
+        foreach my $row (@arr) {
+            if(!defined($isOnlyStat)) {
+                printf $STAT "\t%s\t%s\t%s\t%s\t%s\n", @{$row}[0 .. 4];
+            }
+            else {
+                # Only statistics: the filtered-file columns are left out.
+                printf $STAT "\t%s\t%s\t%s\n", @{$row}[0 .. 2];
+            }
+        }
+
+        print $STAT "\n\n";
+
+        # Graph rows: 0 = base position, 1 = input reads, 2 = filtered reads.
+        print $STAT "Average quality score at each base position of input reads\n\n";
+        my $c = 0;
+        foreach my $arr (@positionSpecificBaseCount) {
+            my $graph = ($c == 0) ? \@graphData1 : \@graphData2;
+            print $STAT getFileName($fileName[$c]), "\n";
+            my $basePos = 1;
+            foreach my $val (@$arr) {
+                my $outVal = sprintf "%0.2f", $ratio->($val, $totalReads[$c]);
+                $graph->[0][$basePos-1] = $basePos;
+                $graph->[1][$basePos-1] = $outVal;
+                print $STAT $outVal, "\t";
+                $basePos++;
+            }
+            print $STAT "\n\n";
+            $c++;
+        }
+        print $STAT "\n\n";
+        if(!defined($isOnlyStat)) {
+            print $STAT "Average quality score at each base position of filtered reads\n\n";
+            $c = 0;
+            foreach my $arr (@positionSpecificBaseCountHQ) {
+                my $graph = ($c == 0) ? \@graphData1 : \@graphData2;
+                print $STAT getFileName($outFileName[$c]), "\n";
+                my $basePos = 1;
+                foreach my $val (@$arr) {
+                    my $outVal = sprintf "%0.2f", $ratio->($val, $totalReadsFinal[$c]);
+                    $graph->[2][$basePos-1] = $outVal;
+                    print $STAT $outVal, "\t";
+                    $basePos++;
+                }
+                $c++;
+                print $STAT "\n\n";
+            }
+        }
+        print $STAT "\n\n";
+        prepareData4RangeGraph($STAT);
+        if($isGDMod) {
+            drawGraph(\@graphData1, $outFolder.getFileName($fileName[0])."_avgQual.png", getFileName($fileName[0]));
+            drawGraph(\@graphData2, $outFolder.getFileName($fileName[1])."_avgQual.png", getFileName($fileName[1]));
+        }
+    }
+    else {
+        printf $STAT "\t%s\t%s\n", "Library type", "Single-end";
+        printf $STAT "\t%s\t%s\n", "Input file", $fileName[0];
+        printf $STAT "\t%s\t%s\n", "Primer/Adaptor library", $priAdaLabel;
+        printf $STAT "\t%s\t%s\n", "Cut-off read length for HQ", $cutOffReadLen4HQ."%";
+        printf $STAT "\t%s\t%s\n", "Cut-off quality score", $cutOffPhScore;
+        printf $STAT "\t%s\t%s\n", "Only statistics", defined($isOnlyStat)?"On":"Off";
+        printf $STAT "\t%s\t%s\n", "Number of CPUs", $noOfProcesses;
+
+        print $STAT "\n\n";
+
+        print $STAT "QC statistics\n";
+        printf $STAT "\t%s\t%s\n", "File name", getFileName($fileName[0]);
+        printf $STAT "\t%s\t%d\n", "Total number of reads", $totalReads[0];
+        printf $STAT "\t%s\t%d\n", "Total number of HQ reads", $totalReadsAfterHQ[0];
+        printf $STAT "\t%s\t%s\n", "Percentage of HQ reads", $percent->($totalReadsAfterHQ[0], $totalReads[0])."%";
+        printf $STAT "\t%s\t%.f\n", "Total number of bases", $totalBases[0];
+        printf $STAT "\t%s\t%.f\n", "Total number of bases in HQ reads", $totalBasesAfterHQ[0];
+        printf $STAT "\t%s\t%.f\n", "Total number of HQ bases in HQ reads", $totalHQBasesAfterHQ[0];
+        printf $STAT "\t%s\t%s\n", "Percentage of HQ bases in HQ reads", $percent->($totalHQBasesAfterHQ[0], $totalBasesAfterHQ[0])."%";
+        if(defined($priAdaLib)) {
+            printf $STAT "\t%s\t%d\n", "Number of Primer/Adaptor contaminated HQ reads", $totalValidReadsWithPriAda[0];
+        }
+        else {
+            printf $STAT "\t%s\t%s\n", "Number of Primer/Adaptor contaminated HQ reads", "NA";
+        }
+        printf $STAT "\t%s\t%d\n", "Total number of HQ filtered reads", $totalReadsFinal[0];
+        printf $STAT "\t%s\t%s\n", "Percentage of HQ filtered reads", $percent->($totalReadsFinal[0], $totalReads[0])."%";
+
+        print $STAT "\n\n";
+
+        print $STAT "Detailed QC statistics\n";
+        # Row layout: label, input file, filtered file.
+        my @arr = (
+            ["File name", getFileName($fileName[0]), getFileName($outFileName[0])],
+            ["Minimum read length", $minLen[0], $minLen[1]],
+            ["Maximum read length", $maxLen[0], $maxLen[1]],
+            ["Average read length", (sprintf "%0.2f", $ratio->($totalBases[0], $totalReads[0])), (sprintf "%0.2f", $ratio->($totalBasesFinal[0], $totalReadsFinal[0]))],
+            ["Total number of reads", $totalReads[0], $totalReadsFinal[0]],
+            ["Total number of reads with non-ATGC bases", $readsWithN[0], $readsWithN[1]],
+            ["Percentage of reads with non-ATGC bases", $percent->($readsWithN[0], $totalReads[0])."%", $percent->($readsWithN[1], $totalReadsFinal[0])."%"],
+            ["Total number of bases", $totalBases[0], $totalBasesFinal[0]],
+            ["Total number of HQ bases", $totalHQBases[0], $totalHQBasesFinal[0]],
+            ["Percentage of HQ bases", $percent->($totalHQBases[0], $totalBases[0])."%", $percent->($totalHQBasesFinal[0], $totalBasesFinal[0])."%"],
+            ["Total number of non-ATGC bases", $totalNs[0], $totalNs[1]],
+            ["Percentage of non-ATGC bases", $percent->($totalNs[0], $totalBases[0])."%", $percent->($totalNs[1], $totalBasesFinal[0])."%"],
+        );
+        foreach my $row (@arr) {
+            if(!defined($isOnlyStat)) {
+                printf $STAT "\t%s\t%s\t%s\n", @{$row}[0 .. 2];
+            }
+            else {
+                # Only statistics: the filtered-file column is left out.
+                printf $STAT "\t%s\t%s\n", @{$row}[0 .. 1];
+            }
+        }
+
+        print $STAT "\n\n";
+
+        # Graph rows: 0 = base position, 1 = input reads, 2 = filtered reads.
+        print $STAT "Average quality score at each base position of input reads\n\n";
+        my $c = 0;
+        foreach my $arr (@positionSpecificBaseCount) {
+            print $STAT getFileName($fileName[$c]), "\n";
+            my $basePos = 1;
+            foreach my $val (@$arr) {
+                my $outVal = sprintf "%0.2f", $ratio->($val, $totalReads[$c]);
+                $graphData1[0][$basePos-1] = $basePos;
+                $graphData1[1][$basePos-1] = $outVal;
+                print $STAT $outVal, "\t";
+                $basePos++;
+            }
+            $c++;
+            print $STAT "\n";
+        }
+        print $STAT "\n\n";
+        $c = 0;
+        if(!defined($isOnlyStat)) {
+            print $STAT "Average quality score at each base position of filtered reads\n\n";
+            foreach my $arr (@positionSpecificBaseCountHQ) {
+                # Label the values with the filtered (output) file they
+                # describe, as the paired-end branch does.
+                print $STAT getFileName($outFileName[$c]), "\n";
+                my $basePos = 1;
+                foreach my $val (@$arr) {
+                    my $outVal = sprintf "%0.2f", $ratio->($val, $totalReadsFinal[$c]);
+                    $graphData1[2][$basePos-1] = $outVal;
+                    print $STAT $outVal, "\t";
+                    $basePos++;
+                }
+                $c++;
+                print $STAT "\n";
+            }
+        }
+        print $STAT "\n\n";
+        prepareData4RangeGraph($STAT);
+        if($isGDMod) {
+            drawGraph(\@graphData1, $outFolder.getFileName($fileName[0])."_avgQual.png", getFileName($fileName[0]));
+        }
+    }
+}
diff --git a/QC/lib/454PEhtml.pl b/QC/lib/454PEhtml.pl
new file mode 100644
index 0000000..9debd75
--- /dev/null
+++ b/QC/lib/454PEhtml.pl
@@ -0,0 +1,384 @@
+sub htmlPrint{
+ my ($progPath, $prog, $htF, $iFol, $isOnlyStat, $inpFs, $statFile, $oFol, $fileNames4HTML) = @_;
+ my $imgPath = $progPath . "lib/imgs";
+ my $cssPath = $progPath . "lib";
+ my $analMsg1 = ($isOnlyStat)?"":"and filtering";
+ my ($file1, $file2) = split(":::::", $inpFs);
+ open(SF, "<$statFile") or print "Can not open statistics file: $statFile\n";
+ my @statFData = <SF>;
+ close(SF);
+ my $statFileOnlyName = getFileName($statFile);
+ my ($t, $priAdaLib, $linker, $isHPTOn, $HPLen, $isLenFOn, $minLen, $cutLen, $cutQual, $nCPUs, $onlySOnOff);
+ my $ind = 2;
+ ($t, $t, $priAdaLib) = split(/ {2,}|\t/, $statFData[$ind]); $ind++;
+ ($t, $t, $linker) = split(/ {2,}|\t/, $statFData[$ind]); $ind++;
+ ($t, $t, $isHPTOn) = split(/ {2,}|\t/, $statFData[$ind]); $ind++;
+ if($isHPTOn =~ /Off/) {
+ $HPLen = "NA";
+ }
+ else {
+ ($t, $t, $HPLen) = split(/ {2,}|\t/, $statFData[$ind]); $ind++;
+ }
+ ($t, $t, $isLenFOn) = split(/ {2,}|\t/, $statFData[$ind]); $ind++;
+ if($isLenFOn =~ /Off/) {
+ $minLen = "NA";
+ }
+ else {
+ ($t, $t, $minLen) = split(/ {2,}|\t/, $statFData[$ind]); $ind++;
+ }
+ ($t, $t, $cutLen) = split(/ {2,}|\t/, $statFData[$ind]); $ind++;
+ ($t, $t, $cutQual) = split(/ {2,}|\t/, $statFData[$ind]); $ind++; $ind++;
+ ($t, $t, $nCPUs) = split(/ {2,}|\t/, $statFData[$ind]);
+ $onlySOnOff = ($isOnlyStat)?"On":"Off";
+ my ($outSeqFile, $outQualFile, $lenDistF1, $baseCntF1, $gcDistF1, $qualDistF1, $sumPieFPE, $sumPieF);
+ ($outSeqFile, $outQualFile, $lenDistF1, $baseCntF1, $gcDistF1, $qualDistF1, $sumPieFPE, $sumPieF) = @$fileNames4HTML;
+ #($outFile1, $outFile2, $unPaired, $avgQF1, $avgQF2, $baseCntF1, $baseCntF2, $gcDistF1, $gcDistF2, $qualDistF1, $qualDistF2, $sumPieF) = @$fileNames4HTML if($isPairedEnd);
+ $outSeqFile = getFileName($outSeqFile);
+ $outQualFile = getFileName($outQualFile);
+ #$unPaired = getFileName($unPaired) if($isPairedEnd);
+ my $inpFilesMsg;
+ $inpFilesMsg = "input file, $file1(A),";
+ #$inpFilesMsg = "both input files, $file1(A) and $file2(B)," if($isPairedEnd);
+ my $b4A4Msg;
+ $b4A4Msg = "before and after QC";
+ $b4A4Msg = "before QC" if($isOnlyStat);
+ #### Getting current time
+ my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
+ my @weekDays = qw(Sun Mon Tue Wed Thu Fri Sat Sun);
+ my ($second, $minute, $hour, $dayOfMonth, $month, $yearOffset, $dayOfWeek, $dayOfYear, $daylightSavings) = localtime();
+ my $year = 1900 + $yearOffset;
+ #my $theTime = "$hour:$minute:$second, $weekDays[$dayOfWeek] $months[$month] $dayOfMonth, $year";
+ my $theTime = "$weekDays[$dayOfWeek] $months[$month] $dayOfMonth, $year";
+ open(O,">$htF") or die "Can not create HTML file: $htF\n";
+ ##### Get a toolkit version
+ open(V, $progPath."lib/version");
+ my $version = <V>;
+ close(V);
+
+print O <<EOF;
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+ <html>
+ <head>
+ <title>NGS QC Toolkit</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+
+
+ <style>
+ BODY {
+ margin-top: 0;
+ background-repeat: repeat;
+ font-family: Arial, Helvetica, sans-serif;
+ font-size: 5px;
+ margin-bottom: 0;
+ }
+ .cnt {
+ font-family: Verdana, Arial, Helvetica, sans-serif;
+ font-size: 12px;
+ line-height: 20px;
+ padding: 5px 20px 0px;
+ }
+ Table .exp {
+ font-family: Verdana, Arial, Helvetica, sans-serif;
+ font-size: 13px;
+ line-height: 20px;
+ }
+ TD .padding {
+ padding: 0px 20px;
+ }
+ .head1 {
+ font-size: 18px;
+ font-weight: bold;
+ padding: 5px 0px;
+ }
+ .head2 {
+ font-size: 14px;
+ font-weight: bold;
+ padding: 5px 0px;
+ }
+ .head3 {
+ font-size: 12px;
+ font-weight: bold;
+ }
+ A {
+ text-decoration: none;
+ color: #0000FF;
+ }
+ .tblBg TABLE TD {
+ background-color: #EEEEEE;
+ }
+ .tblBg2 TABLE TD {
+ background-color: #E1E1E1;
+ }
+ </style>
+
+ </head>
+
+ <body bgcolor="#bbbbbb">
+ <table align="center" width="900" cellspacing="0" cellpadding="0" bgcolor="#ffffff" border="0">
+
+ <tr>
+ <td width="17" rowspan="6"> </td>
+ <td height="150" bgcolor="#E1E1E1"><center><a href="http://www.nipgr.res.in/ngsqctoolkit.html"><b><font style="font-size: 50px;">NGS QC T</font><font style="font-size: 40px;">OOLKIT</font></b></center></a>
+ </td>
+ <td width="17" rowspan="6"></td>
+ </tr>
+ <tr>
+ <td valign="top" class="tblBg">
+ <div class="cnt">
+ <table class="cnt" width="100%" border="0">
+ <tr>
+ <td class="head1">Results of quality control (QC) using $prog v$version <font style="font-size:10px">($theTime)</font></td>
+ </tr>
+ <tr>
+ <td class="head2">Input files and parameters:</td>
+ </tr>
+ <tr><td class="tblBg2">
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td>Analysis type</td><td>Quality check $analMsg1 of 454 paired-end sequencing data</td>
+ </tr>
+ <tr>
+ <td>Input file directory</td><td><a href="file://$iFol" target="_blank">$iFol</a></td>
+ </tr>
+ <tr>
+ <td>Input sequence file</td><td>$file1</td>
+ </tr>
+ <tr>
+ <td>Input quality file</td><td>$file2</td>
+ </tr>
+ <tr>
+ <td>Input file format</td><td>454 format (FASTA and QUAL)</td>
+ </tr>
+ <tr>
+ <td>Primer/Adaptor library</td><td>$priAdaLib</td>
+ </tr>
+ <tr>
+ <td>Linker sequence</td><td>$linker</td>
+ </tr>
+ <tr>
+ <td>Homopolymer trimming</td><td>$isHPTOn</td>
+ </tr>
+ <tr>
+ <td>Length of the homopolymer to be removed</td><td>$HPLen</td>
+ </tr>
+ <tr>
+ <td>Length filter</td><td>$isLenFOn</td>
+ </tr>
+ <tr>
+ <td>Cut-off for minimum read length</td><td>$minLen</td>
+ </tr>
+ <tr>
+ <td>Cut-off read length for HQ</td><td>$cutLen</td>
+ </tr>
+ <tr>
+ <td>Cut-off quality score</td><td>$cutQual</td>
+ </tr>
+ <tr>
+ <td>Only statistics</td><td>$onlySOnOff</td>
+ </tr>
+ <tr>
+EOF
+print O "
+ <td>Number of ". (($prog=~/_PRLL/)?"CPUs":"processes") ."</td><td>$nCPUs</td>
+\n";
+print O <<EOF;
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head2">Output files:</td>
+ </tr>
+ <tr><td class="tblBg2">
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td>Output folder</td><td><a href="file://$oFol" target="_blank">$oFol</a></td>
+ </tr>
+ <tr>
+ <td>QC statistics</td><td><a href="$statFileOnlyName" target="_blank">$statFileOnlyName</a></td>
+ </tr>
+EOF
+if(!$isOnlyStat) {
+print O <<EOF;
+ <tr>
+ <td>High quality filtered sequence file</td><td>$outSeqFile</td>
+ </tr>
+ <tr>
+ <td>High quality filtered quality file</td><td>$outQualFile</td>
+ </tr>
+EOF
+}
+print O <<EOF;
+ <tr>
+ <td>Length distribution</td><td><a href="$lenDistF1" target="_blank">$lenDistF1</a></td>
+ </tr>
+ <tr>
+ <td>Base composition</td><td><a href="$baseCntF1" target="_blank">$baseCntF1</a></td>
+ </tr>
+ <tr>
+ <td>GC content distribution</td><td><a href="$gcDistF1" target="_blank">$gcDistF1</a></td>
+ </tr>
+ <tr>
+ <td>Quality distribution</td><td><a href="$qualDistF1" target="_blank">$qualDistF1</a></td>
+ </tr>
+ <tr>
+ <td>Summary of QC (Paired reads)</td><td><a href="$sumPieFPE" target="_blank">$sumPieFPE</a></td>
+ </tr>
+ <tr>
+ <td>Summary of QC (Unpaired reads)</td><td><a href="$sumPieF" target="_blank">$sumPieF</a></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head2" style="background-color: #ffffff;"> </td>
+ </tr>
+ <tr>
+ <td class="head2">Results of QC</td>
+ </tr>
+EOF
+my $flag = 0;
+for(my $i=0; $i<@statFData; $i++) {
+ my $line = $statFData[$i];
+ if($line =~ /^QC statistics/) {
+ $flag = 1;
+ print O "<tr><td class=\"head3\">QC statistics</td></tr>\n<tr><td class=\"tblBg2\"><table width=\"100%\" border=\"0\" class=\"cnt\">\n";
+ next;
+ }
+ if($line =~ /^Detailed QC statistics/) {
+ $flag = 2;
+ print O "</table></td></tr>\n<tr><td class=\"head3\">Detailed QC statistics</td></tr>\n<tr><td class=\"tblBg2\"><table width=\"100%\" border=\"0\" class=\"cnt\">\n";
+ next;
+ }
+ chomp($line);
+ if($flag != 0 && $line ne "") {
+ if($line =~ /^\* /) {
+ print O "<tr><td><font size=\"1\">$line<br>";
+ $i++;
+ print O " $statFData[$i]</font></td></tr>\n";
+ next;
+ }
+ if($line =~ /^\-\-\-\-\-/) {
+ print O "<tr><td colspan=\"3\">$line-----------------------------------</td></tr>\n";
+ next;
+ }
+ my @clms = split(/ {2,}|\t/, $line);
+ shift(@clms);
+ my $bold = 0;
+ $bold = 1 if($line =~ /QC analysis of [^\n]+\:/);
+ print O "<tr><td colspan=\"3\"> </td></tr>\n" if($bold);
+ print O "<tr>";
+ foreach my $f (@clms) {
+ print O "<td>$f</td>" if(!$bold);
+ print O "<td><b>$f</b></td>" if($bold);
+ }
+ print O "</tr>\n";
+ }
+}
+print O "</table></td></tr>\n";
+print O <<EOF;
+ <tr>
+ <td class="head3">Summary of QC (Paired reads)</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following pie chart shows the summary of QC of Paired reads depicting percentage of high quality, trimmed (homopolymer and/or contamination) and trashed (low quality and/or short) paired read, and unpaired reads (one of the paired reads which passed QC).</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$sumPieFPE" border="1"><br><b>(A)</b></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head3">Summary of QC (Unpaired reads)</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following pie chart shows the summary of QC of Unpaired reads depicting percentage of high quality, trimmed (homopolymer and/or contamination) and trashed (low quality and/or short) reads.</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$sumPieF" border="1"><br><b>(A)</b></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head3">Length distribution</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following graph(s) show number of reads for sequence length range for $inpFilesMsg $b4A4Msg.</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$lenDistF1" border="1"><br><b>(A)</b>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head3">GC content distribution</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following graph(s) show number of reads for distinct average GC content (%) ranges for $inpFilesMsg $b4A4Msg.</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$gcDistF1" border="1"><br><b>(A)</b>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head3">Quality distribution</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following graph(s) show number of reads for different average PHRED quality scores for $inpFilesMsg $b4A4Msg.</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$qualDistF1" border="1"><br><b>(A)</b>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head3">Base composition</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following graph(s) show base composition (count) for $inpFilesMsg $b4A4Msg with percentage of bases at the bottom.</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$baseCntF1" border="1"><br><b>(A)</b>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ </table>
+ </div>
+ </td>
+ </tr>
+ <tr><td style="font-size: 11px;" align="center" valign="middle" height="25" background="$imgPath/btmLine.png" bgcolor="#EEEEEE">For Questions and Suggestions, contact <a href="mailto:mjain\@nipgr.res.in">Mukesh Jain (mjain\@nipgr.res.in)</a>; <a href="mailto:ravipatel\@nipgr.res.in">Ravi Patel (ravipatel\@nipgr.res.in)</a></td></tr>
+
+ <!--- <tr><td colspan="3"><img src="$imgPath/down.png"></td></tr> --->
+ </table>
+ </body>
+ </html>
+EOF
+
+close(O);
+}
+1;
diff --git a/QC/lib/454html.pl b/QC/lib/454html.pl
new file mode 100644
index 0000000..7bf2f3d
--- /dev/null
+++ b/QC/lib/454html.pl
@@ -0,0 +1,347 @@
+# htmlPrint - write the HTML report for a QC run on (single-end) 454 data.
+#
+# Positional arguments:
+#   $progPath       - toolkit path prefix; "lib/imgs" and "lib/version" are
+#                     appended to it verbatim, so it must end with a separator
+#   $prog           - program name shown in the title; a name matching /_PRLL/
+#                     labels the process count as "CPUs" instead of "processes"
+#   $htF            - path of the HTML file to create
+#   $iFol           - input folder (shown and linked in the report)
+#   $isOnlyStat     - true when only statistics were produced (no filtering)
+#   $inpFs          - "sequence-file:::::quality-file"
+#   $statFile       - statistics text file; its parameter lines (from the third
+#                     line on) and its "QC statistics" / "Detailed QC statistics"
+#                     sections are copied into the report
+#   $oFol           - output folder (shown and linked in the report)
+#   $fileNames4HTML - array ref holding, in order: filtered sequence file,
+#                     filtered quality file, length-distribution, base-composition,
+#                     GC-distribution, quality-distribution and summary-pie images
+#
+# Dies when the HTML file cannot be created or written. A missing statistics
+# or version file is not fatal: the affected fields are simply left empty.
+# Relies on getFileName(), which is defined by the calling script.
+sub htmlPrint{
+    my ($progPath, $prog, $htF, $iFol, $isOnlyStat, $inpFs, $statFile, $oFol, $fileNames4HTML) = @_;
+    my $imgPath = $progPath . "lib/imgs";
+    my $analMsg1 = ($isOnlyStat)?"":"and filtering";
+    my ($file1, $file2) = split(":::::", $inpFs);
+
+    # Best effort: report the problem and carry on with an empty statistics set
+    # instead of reading from a handle that was never opened.
+    my @statFData;
+    if(open(my $statFH, '<', $statFile)) {
+        @statFData = <$statFH>;
+        close($statFH);
+    }
+    else {
+        print "Can not open statistics file: $statFile\n";
+    }
+    my $statFileOnlyName = getFileName($statFile);
+
+    # Third column of a "label <sep> label <sep> value" parameter line, without
+    # its trailing newline; empty string when the line or the column is absent.
+    my $paramValue = sub {
+        my ($lineNo) = @_;
+        my $statLine = $statFData[$lineNo];
+        return "" if(!defined($statLine));
+        chomp($statLine);
+        my $value = (split(/ {2,}|\t/, $statLine))[2];
+        return defined($value) ? $value : "";
+    };
+
+    # The parameter lines are positional; the homopolymer-length and
+    # minimum-length lines are only consumed when the respective option is on.
+    my $ind = 2;
+    my $priAdaLib = $paramValue->($ind); $ind++;
+    my $isHPTOn = $paramValue->($ind); $ind++;
+    my $HPLen = "NA";
+    if($isHPTOn !~ /Off/) {
+        $HPLen = $paramValue->($ind); $ind++;
+    }
+    my $isLenFOn = $paramValue->($ind); $ind++;
+    my $minLen = "NA";
+    if($isLenFOn !~ /Off/) {
+        $minLen = $paramValue->($ind); $ind++;
+    }
+    my $cutLen = $paramValue->($ind); $ind++;
+    my $cutQual = $paramValue->($ind); $ind++;
+    $ind++;    # one line between the quality cut-off and the process count is skipped
+    my $nCPUs = $paramValue->($ind);
+    my $onlySOnOff = ($isOnlyStat)?"On":"Off";
+
+    my ($outSeqFile, $outQualFile, $lenDistF1, $baseCntF1, $gcDistF1, $qualDistF1, $sumPieF) = @$fileNames4HTML;
+    $outSeqFile = getFileName($outSeqFile);
+    $outQualFile = getFileName($outQualFile);
+    my $inpFilesMsg = "input file, $file1(A),";
+    my $b4A4Msg = ($isOnlyStat)?"before QC":"before and after QC";
+
+    #### Getting current date
+    my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
+    my @weekDays = qw(Sun Mon Tue Wed Thu Fri Sat);
+    my ($dayOfMonth, $month, $yearOffset, $dayOfWeek) = (localtime())[3 .. 6];
+    my $year = 1900 + $yearOffset;
+    my $theTime = "$weekDays[$dayOfWeek] $months[$month] $dayOfMonth, $year";
+
+    open(my $htmlFH, '>', $htF) or die "Can not create HTML file: $htF\n";
+
+    # Toolkit version: first line of lib/version, without its newline.
+    my $version = "";
+    if(open(my $verFH, '<', $progPath."lib/version")) {
+        $version = <$verFH>;
+        close($verFH);
+        $version = "" if(!defined($version));
+        chomp($version);
+    }
+
+print $htmlFH <<EOF;
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+ <html>
+ <head>
+ <title>NGS QC Toolkit</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+
+
+ <style>
+ BODY {
+ margin-top: 0;
+ background-repeat: repeat;
+ font-family: Arial, Helvetica, sans-serif;
+ font-size: 5px;
+ margin-bottom: 0;
+ }
+ .cnt {
+ font-family: Verdana, Arial, Helvetica, sans-serif;
+ font-size: 12px;
+ line-height: 20px;
+ padding: 5px 20px 0px;
+ }
+ Table .exp {
+ font-family: Verdana, Arial, Helvetica, sans-serif;
+ font-size: 13px;
+ line-height: 20px;
+ }
+ TD .padding {
+ padding: 0px 20px;
+ }
+ .head1 {
+ font-size: 18px;
+ font-weight: bold;
+ padding: 5px 0px;
+ }
+ .head2 {
+ font-size: 14px;
+ font-weight: bold;
+ padding: 5px 0px;
+ }
+ .head3 {
+ font-size: 12px;
+ font-weight: bold;
+ }
+ A {
+ text-decoration: none;
+ color: #0000FF;
+ }
+ .tblBg TABLE TD {
+ background-color: #EEEEEE;
+ }
+ .tblBg2 TABLE TD {
+ background-color: #E1E1E1;
+ }
+ </style>
+
+ </head>
+
+ <body bgcolor="#bbbbbb">
+ <table align="center" width="900" cellspacing="0" cellpadding="0" bgcolor="#ffffff" border="0">
+
+ <tr>
+ <td width="17" rowspan="6"> </td>
+ <td height="150" bgcolor="#E1E1E1"><center><a href="http://www.nipgr.res.in/ngsqctoolkit.html"><b><font style="font-size: 50px;">NGS QC T</font><font style="font-size: 40px;">OOLKIT</font></b></a></center>
+ </td>
+ <td width="17" rowspan="6"></td>
+ </tr>
+ <tr>
+ <td valign="top" class="tblBg">
+ <div class="cnt">
+ <table class="cnt" width="100%" border="0">
+ <tr>
+ <td class="head1">Results of quality control (QC) using $prog v$version <font style="font-size:10px">($theTime)</font></td>
+ </tr>
+ <tr>
+ <td class="head2">Input files and parameters:</td>
+ </tr>
+ <tr><td class="tblBg2">
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td>Analysis type</td><td>Quality check $analMsg1 of 454 data</td>
+ </tr>
+ <tr>
+ <td>Input file directory</td><td><a href="file://$iFol" target="_blank">$iFol</a></td>
+ </tr>
+ <tr>
+ <td>Input sequence file</td><td>$file1</td>
+ </tr>
+ <tr>
+ <td>Input quality file</td><td>$file2</td>
+ </tr>
+ <tr>
+ <td>Input file format</td><td>454 format (FASTA and QUAL)</td>
+ </tr>
+ <tr>
+ <td>Primer/Adaptor library</td><td>$priAdaLib</td>
+ </tr>
+ <tr>
+ <td>Homopolymer trimming</td><td>$isHPTOn</td>
+ </tr>
+ <tr>
+ <td>Length of the homopolymer to be removed</td><td>$HPLen</td>
+ </tr>
+ <tr>
+ <td>Length filter</td><td>$isLenFOn</td>
+ </tr>
+ <tr>
+ <td>Cut-off for minimum read length</td><td>$minLen</td>
+ </tr>
+ <tr>
+ <td>Cut-off read length for HQ</td><td>$cutLen</td>
+ </tr>
+ <tr>
+ <td>Cut-off quality score</td><td>$cutQual</td>
+ </tr>
+ <tr>
+ <td>Only statistics</td><td>$onlySOnOff</td>
+ </tr>
+ <tr>
+EOF
+print $htmlFH "
+ <td>Number of ". (($prog=~/_PRLL/)?"CPUs":"processes") ."</td><td>$nCPUs</td>
+\n";
+print $htmlFH <<EOF;
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head2">Output files:</td>
+ </tr>
+ <tr><td class="tblBg2">
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td>Output folder</td><td><a href="file://$oFol" target="_blank">$oFol</a></td>
+ </tr>
+ <tr>
+ <td>QC statistics</td><td><a href="$statFileOnlyName" target="_blank">$statFileOnlyName</a></td>
+ </tr>
+EOF
+    # The filtered output files only exist when filtering was actually run.
+    if(!$isOnlyStat) {
+print $htmlFH <<EOF;
+ <tr>
+ <td>High quality filtered sequence file</td><td>$outSeqFile</td>
+ </tr>
+ <tr>
+ <td>High quality filtered quality file</td><td>$outQualFile</td>
+ </tr>
+EOF
+    }
+print $htmlFH <<EOF;
+ <tr>
+ <td>Length distribution</td><td><a href="$lenDistF1" target="_blank">$lenDistF1</a></td>
+ </tr>
+ <tr>
+ <td>Base composition</td><td><a href="$baseCntF1" target="_blank">$baseCntF1</a></td>
+ </tr>
+ <tr>
+ <td>GC content distribution</td><td><a href="$gcDistF1" target="_blank">$gcDistF1</a></td>
+ </tr>
+ <tr>
+ <td>Quality distribution</td><td><a href="$qualDistF1" target="_blank">$qualDistF1</a></td>
+ </tr>
+ <tr>
+ <td>Summary of QC</td><td><a href="$sumPieF" target="_blank">$sumPieF</a></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head2" style="background-color: #ffffff;"> </td>
+ </tr>
+ <tr>
+ <td class="head2">Results of QC</td>
+ </tr>
+EOF
+    # Copy the two statistics sections of the stat file into HTML tables.
+    # $flag stays 0 until the first section heading is seen, so the parameter
+    # lines at the top of the file are not repeated here.
+    my $flag = 0;
+    foreach my $statLine (@statFData) {
+        my $line = $statLine;
+        if($line =~ /^QC statistics/) {
+            $flag = 1;
+            print $htmlFH "<tr><td class=\"head3\">QC statistics</td></tr>\n<tr><td class=\"tblBg2\"><table width=\"100%\" border=\"0\" class=\"cnt\">\n";
+            next;
+        }
+        if($line =~ /^Detailed QC statistics/) {
+            $flag = 2;
+            print $htmlFH "</table></td></tr>\n<tr><td class=\"head3\">Detailed QC statistics</td></tr>\n<tr><td class=\"tblBg2\"><table width=\"100%\" border=\"0\" class=\"cnt\">\n";
+            next;
+        }
+        chomp($line);
+        if($flag != 0 && $line ne "") {
+            my @clms = split(/ {2,}|\t/, $line);
+            shift(@clms);    # drop the text before the first separator
+            print $htmlFH "<tr>";
+            foreach my $f (@clms) {
+                print $htmlFH "<td>$f</td>";
+            }
+            print $htmlFH "</tr>\n";
+        }
+    }
+    print $htmlFH "</table></td></tr>\n";
+print $htmlFH <<EOF;
+ <tr>
+ <td class="head3">Summary of QC</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following pie chart shows the summary of QC depicting percentage of high quality, trimmed (homopolymer and/or contamination) and trashed (low quality and/or short) reads.</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$sumPieF" border="1"><br><b>(A)</b></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head3">Length distribution</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following graph(s) show number of reads for sequence length range for $inpFilesMsg $b4A4Msg.</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$lenDistF1" border="1"><br><b>(A)</b></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head3">GC content distribution</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following graph(s) show number of reads for distinct average GC content (%) ranges for $inpFilesMsg $b4A4Msg.</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$gcDistF1" border="1"><br><b>(A)</b></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head3">Quality distribution</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following graph(s) show number of reads for different average PHRED quality scores for $inpFilesMsg $b4A4Msg.</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$qualDistF1" border="1"><br><b>(A)</b></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="head3">Base composition</td>
+ </tr>
+ <tr>
+ <td>
+ <table width="100%" border="0" class="cnt">
+ <tr>
+ <td><div align="justify">Following graph(s) show base composition (count) for $inpFilesMsg $b4A4Msg with percentage of bases at the bottom.</div></td>
+ </tr>
+ <tr>
+ <td align="center"><img src="$baseCntF1" border="1"><br><b>(A)</b></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ </table>
+ </div>
+ </td>
+ </tr>
+ <tr><td style="font-size: 11px;" align="center" valign="middle" height="25" background="$imgPath/btmLine.png" bgcolor="#EEEEEE">For Questions and Suggestions, contact <a href="mailto:mjain\@nipgr.res.in">Mukesh Jain (mjain\@nipgr.res.in)</a>; <a href="mailto:ravipatel\@nipgr.res.in">Ravi Patel (ravipatel\@nipgr.res.in)</a></td></tr>
+
+ <!--- <tr><td colspan="3"><img src="$imgPath/down.png"></td></tr> --->
+ </table>
+ </body>
+ </html>
+EOF
+
+    # Buffered write errors only surface at close time.
+    close($htmlFH) or die "Can not write HTML file: $htF\n";
+}
+1;
diff --git a/QC/lib/Fonts/Dustismo_Sans.ttf b/QC/lib/Fonts/Dustismo_Sans.ttf
new file mode 100644
index 0000000..473573d
Binary files /dev/null and b/QC/lib/Fonts/Dustismo_Sans.ttf differ
diff --git a/QC/lib/Fonts/LucidaSansDemiBold.ttf b/QC/lib/Fonts/LucidaSansDemiBold.ttf
new file mode 100644
index 0000000..a15910e
Binary files /dev/null and b/QC/lib/Fonts/LucidaSansDemiBold.ttf differ
diff --git a/QC/lib/Parallel/Changes b/QC/lib/Parallel/Changes
new file mode 100644
index 0000000..10018b1
--- /dev/null
+++ b/QC/lib/Parallel/Changes
@@ -0,0 +1,36 @@
+Revision history for Perl extension Parallel::ForkManager.
+
+0.7.5 Wed Dec 25 23:54:46 CET 2002
+ - Documentation fixes
+ - Fix bug if you specify max_procs = 0
+
+0.7.4 Thu Jul 4 23:00:46 CEST 2002
+ - on_wait callback now runs from the wait_all_children method
+ - run_on_wait can run a task periodically, not only once.
+
+0.7.3 Wed Oct 24 01:25:36 CEST 2001
+ - minor bugfix on calling the "on_finish" callback
+
+0.7.2 Mon May 14 15:38:55 CEST 2001
+ - win32 port
+ - fix for the broken wait_one_child
+
+0.7.1 Thu Apr 26 13:28:30 CEST 2001
+ - various semantical and grammar fixes in the documentation
+ - on_finish now get the exit signal also
+ - on_start now get the process-identification also
+ - described limitations in the doc
+
+0.7 Wed Apr 4 12:52:53 CEST 2001
+ - callback code tested, exit status return (Chuck, dLux)
+ - added parallel_get.pl, a parallel webget example (dLux)
+ - added callbacks.pl, a callback example (Chuck, dLux)
+ - documentation updates (Chuck, dLux)
+
+0.6 Thu Nov 30 11:56:15 CET 2000
+ - documentation tweak fixes by Noah Robin
+ - warning elimination fixes
+
+0.5 Wed Oct 18 16:24:59 2000
+ - original version; created by h2xs 1.19
+
diff --git a/QC/lib/Parallel/ForkManager.pm b/QC/lib/Parallel/ForkManager.pm
new file mode 100644
index 0000000..0de0d20
--- /dev/null
+++ b/QC/lib/Parallel/ForkManager.pm
@@ -0,0 +1,416 @@
+=head1 NAME
+
+Parallel::ForkManager - A simple parallel processing fork manager
+
+=head1 SYNOPSIS
+
+ use Parallel::ForkManager;
+
+ $pm = new Parallel::ForkManager($MAX_PROCESSES);
+
+ foreach $data (@all_data) {
+ # Forks and returns the pid for the child:
+ my $pid = $pm->start and next;
+
+ ... do some work with $data in the child process ...
+
+ $pm->finish; # Terminates the child process
+ }
+
+=head1 DESCRIPTION
+
+This module is intended for use in operations that can be done in parallel
+where the number of processes to be forked off should be limited. Typical
+use is a downloader which will be retrieving hundreds/thousands of files.
+
+The code for a downloader would look something like this:
+
+ use LWP::Simple;
+ use Parallel::ForkManager;
+
+ ...
+
+ @links=(
+ ["http://www.foo.bar/rulez.data","rulez_data.txt"],
+ ["http://new.host/more_data.doc","more_data.doc"],
+ ...
+ );
+
+ ...
+
+ # Max 30 processes for parallel download
+ my $pm = new Parallel::ForkManager(30);
+
+ foreach my $linkarray (@links) {
+ $pm->start and next; # do the fork
+
+ my ($link,$fn) = @$linkarray;
+ warn "Cannot get $fn from $link"
+ if getstore($link,$fn) != RC_OK;
+
+ $pm->finish; # do the exit in the child process
+ }
+ $pm->wait_all_children;
+
+First you need to instantiate the ForkManager with the "new" constructor.
+You must specify the maximum number of processes to be created. If you
+specify 0, then NO fork will be done; this is good for debugging purposes.
+
+Next, use $pm->start to do the fork. $pm->start returns 0 for the child process,
+and the child pid for the parent process (see also L<perlfunc(1p)/fork()>).
+The "and next" skips the internal loop in the parent process. NOTE:
+$pm->start dies if the fork fails.
+
+$pm->finish terminates the child process (assuming a fork was done in the
+"start").
+
+NOTE: You cannot use $pm->start if you are already in the child process.
+If you want to manage another set of subprocesses in the child process,
+you must instantiate another Parallel::ForkManager object!
+
+=head1 METHODS
+
+=over 5
+
+=item new $processes
+
+Instantiate a new Parallel::ForkManager object. You must specify the maximum
+number of children to fork off. If you specify 0 (zero), then no children
+will be forked. This is intended for debugging purposes.
+
+=item start [ $process_identifier ]
+
+This method does the fork. It returns the pid of the child process for
+the parent, and 0 for the child process. If the $processes parameter
+for the constructor is 0 then, assuming you're in the child process,
+$pm->start simply returns 0.
+
+An optional $process_identifier can be provided to this method... It is used by
+the "run_on_finish" callback (see CALLBACKS) for identifying the finished
+process.
+
+=item finish [ $exit_code ]
+
+Closes the child process by exiting and accepts an optional exit code
+(default exit code is 0) which can be retrieved in the parent via callback.
+If you use the program in debug mode ($processes == 0), this method doesn't
+do anything.
+
+=item set_max_procs $processes
+
+Allows you to set a new maximum number of children to maintain. Returns
+the previous setting.
+
+=item wait_all_children
+
+You can call this method to wait for all the processes which have been
+forked. This is a blocking wait.
+
+=back
+
+=head1 CALLBACKS
+
+You can define callbacks in the code, which are called on events like starting
+a process or upon finish.
+
+The callbacks can be defined with the following methods:
+
+=over 4
+
+=item run_on_finish $code [, $pid ]
+
+You can define a subroutine which is called when a child is terminated. It is
+called in the parent process.
+
+The parameters of the $code are the following:
+
+ - pid of the process, which is terminated
+ - exit code of the program
+ - identification of the process (if provided in the "start" method)
+ - exit signal (0-127: signal name)
+ - core dump (1 if there was core dump at exit)
+
+=item run_on_start $code
+
+You can define a subroutine which is called when a child is started. It is called
+after the successful startup of a child in the parent process.
+
+The parameters of the $code are the following:
+
+ - pid of the process which has been started
+ - identification of the process (if provided in the "start" method)
+
+=item run_on_wait $code, [$period]
+
+You can define a subroutine which is called when the child process needs to wait
+for the startup. If $period is not defined, then one call is done per
+child. If $period is defined, then $code is called periodically and the
+module waits for $period seconds between the two calls. Note, $period can also
+be a fractional number. The exact "$period seconds" is not guaranteed,
+signals can shorten and the process scheduler can make it longer (on busy
+systems).
+
+The $code is also called in the "start" and the "wait_all_children" methods.
+
+No parameters are passed to the $code on the call.
+
+=back
+
+=head1 EXAMPLE
+
+=head2 Parallel get
+
+This small example can be used to get URLs in parallel.
+
+ use Parallel::ForkManager;
+ use LWP::Simple;
+ my $pm=new Parallel::ForkManager(10);
+ for my $link (@ARGV) {
+ $pm->start and next;
+ my ($fn)= $link =~ /^.*\/(.*?)$/;
+ if (!$fn) {
+ warn "Cannot determine filename from $link\n";
+ } else {
+ $0.=" ".$fn;
+ print "Getting $fn from $link\n";
+ my $rc=getstore($link,$fn);
+ print "$link downloaded. response code: $rc\n";
+ };
+ $pm->finish;
+ };
+
+=head2 Callbacks
+
+Example of a program using callbacks to get child exit codes:
+
+ use strict;
+ use Parallel::ForkManager;
+
+ my $max_procs = 5;
+ my @names = qw( Fred Jim Lily Steve Jessica Bob Dave Christine Rico Sara );
+ # hash to resolve PID's back to child specific information
+
+ my $pm = new Parallel::ForkManager($max_procs);
+
+ # Setup a callback for when a child finishes up so we can
+ # get it's exit code
+ $pm->run_on_finish(
+ sub { my ($pid, $exit_code, $ident) = @_;
+ print "** $ident just got out of the pool ".
+ "with PID $pid and exit code: $exit_code\n";
+ }
+ );
+
+ $pm->run_on_start(
+ sub { my ($pid,$ident)=@_;
+ print "** $ident started, pid: $pid\n";
+ }
+ );
+
+ $pm->run_on_wait(
+ sub {
+ print "** Have to wait for one children ...\n"
+ },
+ 0.5
+ );
+
+ foreach my $child ( 0 .. $#names ) {
+ my $pid = $pm->start($names[$child]) and next;
+
+ # This code is the child process
+ print "This is $names[$child], Child number $child\n";
+ sleep ( 2 * $child );
+ print "$names[$child], Child $child is about to get out...\n";
+ sleep 1;
+ $pm->finish($child); # pass an exit code to finish
+ }
+
+ print "Waiting for Children...\n";
+ $pm->wait_all_children;
+ print "Everybody is out of the pool!\n";
+
+=head1 BUGS AND LIMITATIONS
+
+Do not use Parallel::ForkManager in an environment, where other child
+processes can affect the run of the main program, so using this module
+is not recommended in an environment where fork() / wait() is already used.
+
+If you want to use more than one copies of the Parallel::ForkManager, then
+you have to make sure that all children processes are terminated, before you
+use the second object in the main program.
+
+You are free to use a new copy of Parallel::ForkManager in the child
+processes, although I don't think it makes sense.
+
+=head1 COPYRIGHT
+
+Copyright (c) 2000 Szabó, Balázs (dLux)
+
+All rights reserved. This program is free software; you can redistribute it
+and/or modify it under the same terms as Perl itself.
+
+=head1 AUTHOR
+
+ dLux (Szabó, Balázs) <dlux at kapu.hu>
+
+=head1 CREDITS
+
+ Noah Robin <sitz at onastick.net> (documentation tweaks)
+ Chuck Hirstius <chirstius at megapathdsl.net> (callback exit status, example)
+ Grant Hopwood <hopwoodg at valero.com> (win32 port)
+ Mark Southern <mark_southern at merck.com> (bugfix)
+
+=cut
+
+package Parallel::ForkManager;
+use POSIX ":sys_wait_h";
+use strict;
+use vars qw($VERSION);
+$VERSION='0.7.5';
+
+# Constructor. $max_procs is the maximum number of children kept running at
+# once; 0 means "never fork". May be invoked on a class name or on an
+# existing object, in which case the new object is blessed into its class.
+sub new {
+    my ($invocant, $max_procs) = @_;
+    my $class = ref($invocant) || $invocant;
+    my %state = (
+        max_proc  => $max_procs,
+        processes => {},    # pid => identification of each running child
+        in_child  => 0,     # set to 1 inside a forked child
+    );
+    return bless(\%state, $class);
+}
+
+# Fork one child, first waiting until a slot is free.
+#
+# Returns the child's pid in the parent and 0 in the child. When max_proc is
+# 0 no fork happens: the current process is registered under its own pid and
+# 0 is returned, so the caller runs the "child" code inline.
+# Dies when called from inside a child or when fork() fails.
+sub start {
+    my ($self, $ident) = @_;
+    die "Cannot start another process while you are in the child process"
+        if $self->{in_child};
+
+    # Pool full: run the on_wait hook and reap one child per iteration. The
+    # reap is non-blocking when a wait period is set, because on_wait has
+    # already slept for that period.
+    while ($self->{max_proc} && scalar(keys %{ $self->{processes} }) >= $self->{max_proc}) {
+        $self->on_wait;
+        $self->wait_one_child(defined $self->{on_wait_period} ? WNOHANG() : undef);
+    }
+
+    # Collect any other children that have already exited.
+    $self->wait_children;
+
+    if (!$self->{max_proc}) {
+        $self->{processes}->{$$} = $ident;
+        $self->on_start($$, $ident);
+        return 0; # Simulating the child which returns 0
+    }
+
+    my $pid = fork();
+    die "Cannot fork: $!" if !defined $pid;
+    if ($pid) {
+        # Parent: remember the child and fire the start callback.
+        $self->{processes}->{$pid} = $ident;
+        $self->on_start($pid, $ident);
+    }
+    else {
+        $self->{in_child} = 1;
+    }
+    return $pid;
+}
+
+# End the child started by start(). Inside a forked child this exits with
+# $exit_code (default 0). In no-fork mode (max_proc == 0) it fires the
+# on_finish callback for the current pid and unregisters it. Returns 0 when
+# it does not exit.
+sub finish {
+    my ($self, $exit_code) = @_;
+    exit($exit_code || 0) if $self->{in_child};
+
+    if ($self->{max_proc} == 0) {
+        my $ident = $self->{processes}->{$$};
+        $self->on_finish($$, $exit_code, $ident, 0, 0);
+        delete $self->{processes}->{$$};
+    }
+    return 0;
+}
+
+# Reap, without blocking, every tracked child that has already exited.
+# Does nothing when no children are being tracked.
+sub wait_children {
+    my ($self) = @_;
+    return if !keys %{ $self->{processes} };
+    my $reaped;
+    do {
+        $reaped = $self->wait_one_child(WNOHANG());
+    } while $reaped > 0 || $reaped < -1; # AS 5.6/Win32 returns negative PIDs
+};
+
+*wait_childs = *wait_children; # compatibility
+
+# Wait for one tracked child to exit and fire its on_finish callback.
+# $flags is handed to waitpid (0, i.e. blocking, when omitted). Exited
+# processes that are not in the pool are ignored and the wait is repeated.
+# Returns the pid reaped, or waitpid's 0 / -1 when there was nothing to reap.
+sub wait_one_child {
+    my ($self, $flags) = @_;
+    $flags ||= 0;
+    my $pid;
+    while (1) {
+        $pid = $self->_waitpid(-1, $flags);
+        last if $pid == 0 || $pid == -1; # AS 5.6/Win32 returns negative PIDs
+        next if !exists $self->{processes}->{$pid};    # not one of ours
+        my $ident = delete $self->{processes}->{$pid};
+        # $? decoded as: exit code, terminating signal, core-dump flag.
+        $self->on_finish($pid, $? >> 8, $ident, $? & 0x7f, $? & 0x80 ? 1 : 0);
+        last;
+    }
+    return $pid;
+};
+
+# Block until every tracked child has been reaped, running the on_wait hook
+# before each reap attempt.
+sub wait_all_children {
+    my ($self) = @_;
+    while (keys %{ $self->{processes} }) {
+        $self->on_wait;
+        # Non-blocking when a period is configured: on_wait already slept.
+        my $flags = defined $self->{on_wait_period} ? WNOHANG() : undef;
+        $self->wait_one_child($flags);
+    };
+}
+
+*wait_all_childs = *wait_all_children; # compatibility;
+
+# Register $code to run in the parent when a child terminates. With $pid the
+# callback applies to that child only; without it, it is stored under key 0
+# and serves as the default for all children.
+sub run_on_finish {
+    my ($self, $code, $pid) = @_;
+    my $slot = $pid || 0;
+    return $self->{on_finish}->{$slot} = $code;
+}
+
+# Invoke the finish callback registered for $pid, falling back to the default
+# one (key 0). The callback receives $pid followed by @par (exit code,
+# identification, signal, core-dump flag as passed by the caller).
+# Returns 0 when no callback is registered, else the callback's return value.
+sub on_finish {
+    my ($s, $pid, @par) = @_;
+    my $code = $s->{on_finish}->{$pid} || $s->{on_finish}->{0} or return 0;
+    $code->($pid, @par);
+};
+
+# Register $code as the hook run whenever the manager has to wait for a
+# child. $period (seconds, may be undef) is stored alongside it and is used
+# by on_wait as the sleep time after each call.
+sub run_on_wait {
+    my ($self, $code, $period) = @_;
+    $self->{on_wait} = $code;
+    return $self->{on_wait_period} = $period;
+}
+
+# Run the on_wait hook, if one is registered, then sleep for the configured
+# period (when there is one).
+sub on_wait {
+    my ($self) = @_;
+    if (ref($self->{on_wait}) eq 'CODE') {
+        $self->{on_wait}->();
+        if (defined $self->{on_wait_period}) {
+            # Install a no-op SIGCHLD handler for the duration of the sleep
+            # unless the program already has one.
+            local $SIG{CHLD} = sub { } if !defined $SIG{CHLD};
+            select(undef, undef, undef, $self->{on_wait_period});
+        };
+    };
+};
+
+# Register $code as the callback fired in the parent after a child has been
+# started.
+sub run_on_start {
+    my ($self, $code) = @_;
+    return $self->{on_start} = $code;
+}
+
+# Fire the start callback, if one is registered, passing @par through
+# unchanged (start() passes the child's pid and its identification).
+sub on_start {
+    my ($s, @par) = @_;
+    $s->{on_start}->(@par) if ref($s->{on_start}) eq 'CODE';
+};
+
+# Change the maximum number of children kept running at once.
+# Returns the previous limit, as documented in the POD for this method
+# (the old code returned the new value instead).
+sub set_max_procs {
+    my ($s, $mp) = @_;
+    my $previous = $s->{max_proc};
+    $s->{max_proc} = $mp;
+    return $previous;
+}
+
+# OS dependent code follows...
+
+# Default implementation: a plain waitpid() call. Invoked as a method, so the
+# first argument (the object) is ignored.
+sub _waitpid {
+    my (undef, $pid, $flags) = @_;
+    return waitpid($pid, $flags);
+}
+
+# Replacement for _waitpid on Win32. On ActiveState Perl 5.6/Win32 build 625,
+# waitpid(-1, &WNOHANG) always blocks unless a real PID is given, so a
+# non-blocking wait is emulated by polling every pid in the pool in turn.
+# Returns -1 when the pool is empty, the first non-zero waitpid result
+# otherwise, or the last (zero) result when no child has exited yet.
+sub _NT_waitpid {
+    my ($self, $pid, $flags) = @_;
+
+    # Blocking (or otherwise flagged) waits behave normally.
+    return waitpid($pid, $flags) if $flags != WNOHANG();
+
+    my @pool = keys %{ $self->{processes} };
+    # Simulate -1 (no processes awaiting cleanup.)
+    return -1 if !scalar(@pool);
+
+    my $result;
+    foreach my $candidate (@pool) {
+        $result = waitpid($candidate, $flags);
+        return $result if $result != 0; # AS 5.6/Win32 returns negative PIDs.
+    }
+    return $result;
+}
+
+# On Win32, swap in the polling waitpid emulation. Warnings are switched off
+# for the duration of the block while _waitpid is replaced.
+{
+    local $^W = 0;
+    *_waitpid = \&_NT_waitpid if $^O eq 'NT' || $^O eq 'MSWin32';
+}
+
+1;
diff --git a/QC/lib/Parallel/ForkManager/callback.pl b/QC/lib/Parallel/ForkManager/callback.pl
new file mode 100755
index 0000000..56a52e2
--- /dev/null
+++ b/QC/lib/Parallel/ForkManager/callback.pl
@@ -0,0 +1,48 @@
+#!/usr/bin/perl -w
+# Example: run ten named children, at most five at a time, and report their
+# start, wait and exit events through Parallel::ForkManager callbacks.
+use lib '.';
+use strict;
+use Parallel::ForkManager;
+
+my $max_procs = 5;
+my @names = qw( Fred Jim Lily Steve Jessica Bob Dave Christine Rico Sara );
+
+# Direct method call instead of the ambiguous indirect-object "new Class(...)".
+my $pm = Parallel::ForkManager->new($max_procs);
+
+# Setup a callback for when a child finishes up so we can
+# get its exit code
+$pm->run_on_finish(
+    sub {
+        my ($pid, $exit_code, $ident) = @_;
+        print "** $ident just got out of the pool ".
+            "with PID $pid and exit code: $exit_code\n";
+    }
+);
+
+# Called in the parent right after each child has been forked.
+$pm->run_on_start(
+    sub {
+        my ($pid, $ident) = @_;
+        print "** $ident started, pid: $pid\n";
+    }
+);
+
+# Called every 0.5 seconds while the manager is waiting for a free slot.
+$pm->run_on_wait(
+    sub {
+        print "** Have to wait for one children ...\n"
+    },
+    0.5,
+);
+
+foreach my $child ( 0 .. $#names ) {
+    # The name is the identification handed to the callbacks.
+    my $pid = $pm->start($names[$child]) and next;
+
+    # This code is the child process
+    print "This is $names[$child], Child number $child\n";
+    sleep ( 2 * $child );
+    print "$names[$child], Child $child is about to get out...\n";
+    sleep 1;
+    $pm->finish($child); # pass an exit code to finish
+}
+
+print "Waiting for Children...\n";
+$pm->wait_all_children;
+print "Everybody is out of the pool!\n";
+
diff --git a/QC/lib/Parallel/ForkManager/parallel_get.pl b/QC/lib/Parallel/ForkManager/parallel_get.pl
new file mode 100755
index 0000000..5c6f3cd
--- /dev/null
+++ b/QC/lib/Parallel/ForkManager/parallel_get.pl
@@ -0,0 +1,17 @@
+#!/usr/bin/perl -w
+# Example: fetch every URL given on the command line, at most 10 at a time.
+# Each URL is saved under the part of the link that follows its last "/".
+use strict;
+use Parallel::ForkManager;
+use LWP::Simple;
+
+my $pm = Parallel::ForkManager->new(10);
+for my $link (@ARGV) {
+    $pm->start and next;    # parent: go on to the next URL
+
+    my ($fn) = $link =~ /^.*\/(.*?)$/;
+    if (!$fn) {
+        # $fn is empty in this branch, so report the link that failed to parse.
+        warn "Cannot determine filename from $link\n";
+    }
+    else {
+        $0 .= " " . $fn;    # show the file being fetched in the process title
+        print "Getting $fn from $link\n";
+        my $rc = getstore($link, $fn);
+        print "$link downloaded. response code: $rc\n";
+    }
+    $pm->finish;
+}
+# Do not let the parent exit while downloads are still running.
+$pm->wait_all_children;
diff --git a/QC/lib/Parallel/MANIFEST b/QC/lib/Parallel/MANIFEST
new file mode 100644
index 0000000..fae0a05
--- /dev/null
+++ b/QC/lib/Parallel/MANIFEST
@@ -0,0 +1,8 @@
+TODO
+Changes
+ForkManager.pm
+MANIFEST
+Makefile.PL
+test.pl
+ForkManager/parallel_get.pl
+ForkManager/callback.pl
diff --git a/QC/lib/Parallel/Makefile.PL b/QC/lib/Parallel/Makefile.PL
new file mode 100644
index 0000000..b18ea52
--- /dev/null
+++ b/QC/lib/Parallel/Makefile.PL
@@ -0,0 +1,7 @@
+use ExtUtils::MakeMaker;
+
+# Generate the Makefile for the bundled module. The options accepted by
+# WriteMakefile are described in the ExtUtils::MakeMaker documentation.
+my %build_options = (
+    NAME         => 'Parallel::ForkManager',
+    VERSION_FROM => 'ForkManager.pm', # finds $VERSION
+);
+WriteMakefile(%build_options);
diff --git a/QC/lib/Parallel/TODO b/QC/lib/Parallel/TODO
new file mode 100644
index 0000000..301cd3b
--- /dev/null
+++ b/QC/lib/Parallel/TODO
@@ -0,0 +1,2 @@
+- Test, test, test
+
diff --git a/QC/lib/Parallel/test.pl b/QC/lib/Parallel/test.pl
new file mode 100644
index 0000000..9634ece
--- /dev/null
+++ b/QC/lib/Parallel/test.pl
@@ -0,0 +1,20 @@
+# Before `make install' is performed this script should be runnable with
+# `make test'. After `make install' it should work as `perl test.pl'
+
+######################### We start with some black magic to print on failure.
+
+# Change 1..1 below to 1..last_test_to_print .
+# (It may become useful if the test is moved to ./t subdirectory.)
+
+# Unbuffer STDOUT and announce the test plan (a single test) at compile time.
+BEGIN { $| = 1; print "1..1\n"; }
+# If loading the module aborts the script, $loaded is never set and the
+# END block reports the failure.
+END {print "not ok 1\n" unless $loaded;}
+use Parallel::ForkManager;
+$loaded = 1;
+print "ok 1\n";
+
+######################### End of black magic.
+
+# Insert your test code below (better if it prints "ok 13"
+# (correspondingly "not ok 13") depending on the success of chunk 13
+# of the test code):
+
diff --git a/QC/lib/html.pl b/QC/lib/html.pl
new file mode 100644
index 0000000..9066ae9
--- /dev/null
+++ b/QC/lib/html.pl
@@ -0,0 +1,470 @@
+# htmlPrint -- write the HTML report of a QC run.
+#
+# Positional parameters:
+#   $progPath       path prefix of the toolkit; "lib/version" and "lib/imgs"
+#                   are appended to it directly
+#   $prog           program name shown in the title (a name matching /_PRLL/
+#                   labels the count as "CPUs" instead of "processes")
+#   $htF            HTML file to create
+#   $iFol           input folder (linked from the report)
+#   $isPairedEnd    true for paired-end data
+#   $isOnlyStat     true when only statistics were computed (no filtering)
+#   $inpFs          input file names joined by ":::::"
+#   $seqFormatName  FASTQ variant name
+#   $statFile       statistics text file; its header lines and the
+#                   "QC statistics" / "Detailed QC statistics" sections are
+#                   copied into the report
+#   $oFol           output folder (linked from the report)
+#   $fileNames4HTML array ref of output/graph file names; 7 entries for
+#                   single-end data, 14 for paired-end data (order as unpacked
+#                   below). The "quality range" entries hold the raw and
+#                   filtered graph names joined by ":::".
+#
+# Dies if the HTML file cannot be created. A missing statistics or version
+# file is tolerated: the corresponding fields are simply left empty.
+# Relies on getFileName(), which must be provided by the calling script.
+sub htmlPrint{
+    my ($progPath, $prog, $htF, $iFol, $isPairedEnd, $isOnlyStat, $inpFs, $seqFormatName, $statFile, $oFol, $fileNames4HTML) = @_;
+    my $imgPath = $progPath . "lib/imgs";
+    my $analMsg1 = ($isOnlyStat)?"":"and filtering";
+    my $analMsg2 = ($isPairedEnd)?"paired end data":"single end data";
+    my ($file1, $file2) = split(":::::", $inpFs);
+
+    # Read the statistics file (best effort: report the problem and go on).
+    my @statFData;
+    if(open(my $statFh, "<", $statFile)) {
+        @statFData = <$statFh>;
+        close($statFh);
+    }
+    else {
+        print "Can not open statistics file: $statFile\n";
+    }
+    my $statFileOnlyName = getFileName($statFile);
+
+    # Third column of the given (0-based) line of the statistics file;
+    # columns are separated by a tab or by two or more spaces.
+    my $statField = sub {
+        my $ln = $statFData[$_[0]];
+        return "" if(!defined($ln));
+        chomp($ln);
+        my @fields = split(/ {2,}|\t/, $ln);
+        return defined($fields[2]) ? $fields[2] : "";
+    };
+    my $priAdaLib = $statField->(3);
+    my $cutLen = $statField->(4);
+    my $cutQual = $statField->(5);
+    my $nCPUs = $statField->(7);
+    my $onlySOnOff = ($isOnlyStat)?"On":"Off";
+
+    my ($outFile1, $outFile2, $unPaired, $avgQF1, $avgQF2, $avgQRangeF1, $avgQRangeF2, $baseCntF1, $baseCntF2, $gcDistF1, $gcDistF2, $qualDistF1, $qualDistF2, $sumPieF);
+    if($isPairedEnd) {
+        ($outFile1, $outFile2, $unPaired, $avgQF1, $avgQF2, $baseCntF1, $baseCntF2, $gcDistF1, $gcDistF2, $qualDistF1, $qualDistF2, $sumPieF, $avgQRangeF1, $avgQRangeF2) = @$fileNames4HTML;
+        $outFile2 = getFileName($outFile2);
+        $unPaired = getFileName($unPaired);
+    }
+    else {
+        ($outFile1, $avgQF1, $baseCntF1, $gcDistF1, $qualDistF1, $sumPieF, $avgQRangeF1) = @$fileNames4HTML;
+    }
+    $outFile1 = getFileName($outFile1);
+    my ($avgQRangeRF1, $avgQRangeFF1) = split(":::", $avgQRangeF1);
+    my ($avgQRangeRF2, $avgQRangeFF2);
+    ($avgQRangeRF2, $avgQRangeFF2) = split(":::", $avgQRangeF2) if(defined($avgQRangeF2));
+
+    my $inpFilesMsg = "input file, $file1(A),";
+    $inpFilesMsg = "both input files, $file1(A) and $file2(B)," if($isPairedEnd);
+    my $b4A4Msg = ($isOnlyStat)?"before QC":"before and after QC";
+
+    #### Getting current date
+    my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
+    my @weekDays = qw(Sun Mon Tue Wed Thu Fri Sat);
+    my ($dayOfMonth, $month, $yearOffset, $dayOfWeek) = (localtime())[3, 4, 5, 6];
+    my $year = 1900 + $yearOffset;
+    my $theTime = "$weekDays[$dayOfWeek] $months[$month] $dayOfMonth, $year";
+
+    # Toolkit version (first line of lib/version, without the newline).
+    my $version = "";
+    if(open(my $verFh, "<", $progPath."lib/version")) {
+        my $verLine = <$verFh>;
+        close($verFh);
+        if(defined($verLine)) {
+            chomp($verLine);
+            $version = $verLine;
+        }
+    }
+
+    open(my $out, ">", $htF) or die "Can not create HTML file: $htF\n";
+
+    # One "label | value" row of a parameter/output table.
+    my $textRow = sub {
+        my ($label, $value) = @_;
+        print $out qq{<tr>\n<td>$label</td><td>$value</td>\n</tr>\n};
+    };
+    # One "label | link" row; the link text defaults to the target itself.
+    my $linkRow = sub {
+        my ($label, $href, $text) = @_;
+        $text = $href if(!defined($text));
+        print $out qq{<tr>\n<td>$label</td><td><a href="$href" target="_blank">$text</a></td>\n</tr>\n};
+    };
+    # The images of one input file followed by its "(A)"/"(B)" caption.
+    my $panel = sub {
+        my ($label, @imgs) = @_;
+        return join("<br><br>", map { qq{<img src="$_" border="1">} } @imgs) . "<br><b>($label)</b>";
+    };
+    # A titled section: description row plus one centred cell holding all
+    # panels. The cell is always closed, for single-end data as well.
+    my $graphSection = sub {
+        my ($title, $desc, @panels) = @_;
+        print $out qq{<tr>\n<td class="head3">$title</td>\n</tr>\n<tr>\n<td>\n<table width="100%" border="0" class="cnt">\n<tr>\n<td><div align="justify">$desc</div></td>\n</tr>\n<tr>\n<td align="center">},
+            join("<br><br>", @panels),
+            qq{</td>\n</tr>\n</table>\n</td>\n</tr>\n};
+    };
+    # Panels (A) and, for paired-end data, (B) built from per-file image lists.
+    my $panels = sub {
+        my ($imgsA, $imgsB) = @_;
+        my @cells = ($panel->("A", @$imgsA));
+        push(@cells, $panel->("B", @$imgsB)) if($isPairedEnd);
+        return @cells;
+    };
+
+    print $out <<EOF;
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<head>
+<title>NGS QC Toolkit</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<style>
+BODY {
+ margin-top: 0;
+ background-repeat: repeat;
+ font-family: Arial, Helvetica, sans-serif;
+ font-size: 5px;
+ margin-bottom: 0;
+}
+.cnt {
+ font-family: Verdana, Arial, Helvetica, sans-serif;
+ font-size: 12px;
+ line-height: 20px;
+ padding: 5px 20px 0px;
+}
+Table .exp {
+ font-family: Verdana, Arial, Helvetica, sans-serif;
+ font-size: 13px;
+ line-height: 20px;
+}
+TD .padding {
+ padding: 0px 20px;
+}
+.head1 {
+ font-size: 18px;
+ font-weight: bold;
+ padding: 5px 0px;
+}
+.head2 {
+ font-size: 14px;
+ font-weight: bold;
+ padding: 5px 0px;
+}
+.head3 {
+ font-size: 12px;
+ font-weight: bold;
+}
+A {
+ text-decoration: none;
+ color: #0000FF;
+}
+.tblBg TABLE TD {
+ background-color: #EEEEEE;
+}
+.tblBg2 TABLE TD {
+ background-color: #E1E1E1;
+}
+</style>
+</head>
+
+<body bgcolor="#bbbbbb">
+<table align="center" width="900" cellspacing="0" cellpadding="0" bgcolor="#ffffff" border="0">
+<tr>
+<td width="17" rowspan="6"> </td>
+<td height="150" bgcolor="#E1E1E1"><center><a href="http://www.nipgr.res.in/ngsqctoolkit.html"><b><font style="font-size: 50px;">NGS QC T</font><font style="font-size: 40px;">OOLKIT</font></b></a></center>
+</td>
+<td width="17" rowspan="6"></td>
+</tr>
+<tr>
+<td valign="top" class="tblBg">
+<div class="cnt">
+<table class="cnt" width="100%" border="0">
+<tr>
+<td class="head1">Results of quality control (QC) using $prog v$version <font style="font-size:10px">($theTime)</font></td>
+</tr>
+<tr>
+<td class="head2">Input files and parameters:</td>
+</tr>
+<tr><td class="tblBg2">
+<table width="100%" border="0" class="cnt">
+EOF
+
+    #### Input files and parameters
+    $textRow->("Analysis type", "Quality check $analMsg1 of $analMsg2");
+    $linkRow->("Input file directory", "file://$iFol", $iFol);
+    $textRow->("Input file 1", $file1);
+    $textRow->("Input file 2", $file2) if($isPairedEnd);
+    $textRow->("Input file format", "FASTQ ($seqFormatName variant)");
+    $textRow->("Primer/Adaptor library", $priAdaLib);
+    $textRow->("Cut-off read length for HQ", $cutLen);
+    $textRow->("Cut-off quality score", $cutQual);
+    $textRow->("Only statistics", $onlySOnOff);
+    $textRow->("Number of " . (($prog=~/_PRLL/)?"CPUs":"processes"), $nCPUs);
+
+    print $out <<EOF;
+</table>
+</td>
+</tr>
+<tr>
+<td class="head2">Output files:</td>
+</tr>
+<tr><td class="tblBg2">
+<table width="100%" border="0" class="cnt">
+EOF
+
+    #### Output files
+    $linkRow->("Output folder", "file://$oFol", $oFol);
+    $linkRow->("QC statistics", $statFileOnlyName);
+    if(!$isOnlyStat) {
+        $textRow->("High quality filtered file 1", $outFile1);
+        if($isPairedEnd) {
+            $textRow->("High quality filtered file 2", $outFile2);
+            $textRow->("High quality filtered un-paired data", $unPaired);
+        }
+    }
+    $linkRow->("Per base average quality score for file 1", $avgQF1);
+    $linkRow->("Per base average quality score for file 2", $avgQF2) if($isPairedEnd);
+    $linkRow->("Read count (%) per base for different quality ranges for file 1 (Raw)", $avgQRangeRF1);
+    $linkRow->("Read count (%) per base for different quality ranges for file 1 (Filtered)", $avgQRangeFF1) if(!$isOnlyStat);
+    if($isPairedEnd) {
+        $linkRow->("Read count (%) per base for different quality ranges for file 2 (Raw)", $avgQRangeRF2);
+        $linkRow->("Read count (%) per base for different quality ranges for file 2 (Filtered)", $avgQRangeFF2) if(!$isOnlyStat);
+    }
+    $linkRow->("Base composition for file 1", $baseCntF1);
+    $linkRow->("Base composition for file 2", $baseCntF2) if($isPairedEnd);
+    $linkRow->("GC content distribution for file 1", $gcDistF1);
+    $linkRow->("GC content distribution for file 2", $gcDistF2) if($isPairedEnd);
+    $linkRow->("Quality distribution for file 1", $qualDistF1);
+    $linkRow->("Quality distribution for file 2", $qualDistF2) if($isPairedEnd);
+    $linkRow->("Summary of QC", $sumPieF);
+
+    print $out <<EOF;
+</table>
+</td>
+</tr>
+<tr>
+<td class="head2" style="background-color: #ffffff;"> </td>
+</tr>
+<tr>
+<td class="head2">Results of QC</td>
+</tr>
+EOF
+
+    #### Statistics tables copied from the statistics file
+    my $copyRows = 0;     # true while inside a section whose rows are copied
+    my $tableOpen = 0;    # true while a statistics table is still open
+    foreach my $statLine (@statFData) {
+        my $line = $statLine;
+        if($line =~ /^((?:Detailed )?QC statistics)/) {
+            my $heading = $1;
+            print $out "</table></td></tr>\n" if($tableOpen);
+            print $out "<tr><td class=\"head3\">$heading</td></tr>\n<tr><td class=\"tblBg2\"><table width=\"100%\" border=\"0\" class=\"cnt\">\n";
+            $tableOpen = 1;
+            $copyRows = 1;
+            next;
+        }
+        if($line =~ /^Average quality score/) {
+            $copyRows = 0;
+            next;
+        }
+        chomp($line);
+        next if(!$copyRows || $line eq "");
+        my @clms = split(/ {2,}|\t/, $line);
+        shift(@clms);    # first column is not shown in the report
+        print $out "<tr>", (map { "<td>$_</td>" } @clms), "</tr>\n";
+    }
+    # Close the last table only if one was actually opened.
+    print $out "</table></td></tr>\n" if($tableOpen);
+
+    #### Graphs
+    $graphSection->("Summary of QC",
+        "Following pie chart shows the summary of QC depicting percentage of high quality, low quality and contaminated reads.",
+        $panel->("A", $sumPieF));
+    $graphSection->("Per base average quality scores",
+        "Following graph(s) show per base average PHRED quality scores for $inpFilesMsg $b4A4Msg.",
+        $panels->([$avgQF1], [$avgQF2]));
+    # The filtered-data graphs exist only when filtering was performed.
+    my @rangeImgs1 = ($avgQRangeRF1);
+    my @rangeImgs2 = ($avgQRangeRF2);
+    if(!$isOnlyStat) {
+        push(@rangeImgs1, $avgQRangeFF1);
+        push(@rangeImgs2, $avgQRangeFF2);
+    }
+    $graphSection->("Read count (%) per base for different quality score ranges",
+        "Following graph(s) show per base read count (%) for different quality score ranges for $inpFilesMsg $b4A4Msg.",
+        $panels->(\@rangeImgs1, \@rangeImgs2));
+    $graphSection->("GC content distribution",
+        "Following graph(s) show number of reads for distinct average GC content (%) ranges for $inpFilesMsg $b4A4Msg.",
+        $panels->([$gcDistF1], [$gcDistF2]));
+    $graphSection->("Quality distribution",
+        "Following graph(s) show number of reads for different average PHRED quality scores for $inpFilesMsg $b4A4Msg.",
+        $panels->([$qualDistF1], [$qualDistF2]));
+    $graphSection->("Base composition",
+        "Following graph(s) show base composition (count) for $inpFilesMsg $b4A4Msg with percentage of bases at the bottom.",
+        $panels->([$baseCntF1], [$baseCntF2]));
+
+    print $out <<EOF;
+</table>
+</div>
+</td>
+</tr>
+<tr><td style="font-size: 11px;" align="center" valign="middle" height="25" background="$imgPath/btmLine.png" bgcolor="#EEEEEE">For Questions and Suggestions, contact <a href="mailto:mjain\@nipgr.res.in">Mukesh Jain (mjain\@nipgr.res.in)</a>; <a href="mailto:ravipatel\@nipgr.res.in">Ravi Patel (ravipatel\@nipgr.res.in)</a></td></tr>
+
+<!--- <tr><td colspan="3"><img src="$imgPath/down.png"></td></tr> --->
+</table>
+</body>
+</html>
+EOF
+
+    close($out) or die "Can not write HTML file: $htF\n";
+}
+1;
diff --git a/QC/lib/main.css b/QC/lib/main.css
new file mode 100644
index 0000000..0ef4151
--- /dev/null
+++ b/QC/lib/main.css
@@ -0,0 +1,46 @@
+/* CSS Document */
+.left {
+ background: url(imgs/left.png) repeat-y left bottom;
+}
+.right {
+ background: url(imgs/right.png) repeat-y left bottom;
+}
+BODY {
+ margin-top: 0;
+ background-repeat: repeat;
+ font-family: Arial, Helvetica, sans-serif;
+ font-size: 5px;
+ margin-bottom: 0;
+}
+.cnt {
+ font-family: Verdana, Arial, Helvetica, sans-serif;
+ font-size: 12px;
+ line-height: 20px;
+ padding: 5px 20px 0px;
+}
+Table .exp {
+ font-family: Verdana, Arial, Helvetica, sans-serif;
+ font-size: 13px;
+ line-height: 20px;
+}
+TD .padding {
+ padding: 0px 20px;
+}
+.head1 {
+ font-size: 18px;
+ font-weight: bold;
+ padding: 5px 0px;
+}
+.head2 {
+ font-size: 14px;
+ font-weight: bold;
+ padding: 5px 0px;
+}
+.head3 {
+ font-size: 12px;
+ font-weight: bold;
+}
+A {
+ text-decoration: none;
+ color: #0000FF;
+}
\ No newline at end of file
diff --git a/QC/lib/version b/QC/lib/version
new file mode 100644
index 0000000..0bee604
--- /dev/null
+++ b/QC/lib/version
@@ -0,0 +1 @@
+2.3.3
diff --git a/QC/lib/version~ b/QC/lib/version~
new file mode 100644
index 0000000..530cdd9
--- /dev/null
+++ b/QC/lib/version~
@@ -0,0 +1 @@
+2.2.4
diff --git a/Statistics/AvgQuality.pl b/Statistics/AvgQuality.pl
new file mode 100644
index 0000000..7d27432
--- /dev/null
+++ b/Statistics/AvgQuality.pl
@@ -0,0 +1,108 @@
+#! /usr/bin/perl
+
+# Computes the average quality score of every record of a FASTA-style
+# quality file (space separated numbers) and the average over all records.
+
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+use Getopt::Long;
+use File::Basename;
+
+# Parameter variables
+my $file;
+my $helpAsked;
+my $outFile = "";
+
+GetOptions(
+    "i=s" => \$file,
+    "h|help" => \$helpAsked,
+    "o|outputFile=s" => \$outFile,
+    );
+if(defined($helpAsked)) {
+    prtUsage();
+    exit;
+}
+if(!defined($file)) {
+    prtError("No input files are provided");
+}
+
+$outFile = $file . "_qual_stat" if($outFile eq "");
+
+# Three-argument open: the file names come straight from the command line.
+open(I, "<", $file) or die "Can not open file: $file\n";
+open(O, ">", $outFile) or die "Can not open file: $outFile\n";
+
+my $fastaSeqId = "";
+my $fastaSeq = "";
+my $seqCount = 0;    # number of records reported (updated by prtQuality)
+my $ttlQual = 0;     # sum of the per-record averages (updated by prtQuality)
+
+while(my $line = <I>) {
+    chomp $line;
+    if($line =~ /^>/) {
+        # A new header ends the previous record: report it first.
+        prtQuality($fastaSeqId, $fastaSeq) if($fastaSeq ne "");
+        ($fastaSeqId = $line) =~ s/^>//;
+        $fastaSeq = "";
+    }
+    else {
+        # Quality values may span several lines; keep them space separated.
+        $fastaSeq .= $line . " ";
+    }
+}
+prtQuality($fastaSeqId, $fastaSeq) if($fastaSeq ne "");
+
+if($seqCount > 0) {
+    printf O "Final quality average: %0.2f\n", $ttlQual/$seqCount;
+}
+else {
+    # Avoid dividing by zero when the input holds no quality record.
+    print STDERR "No quality records found in file: $file\n";
+}
+close(O) or die "Can not write file: $outFile\n";
+close(I);
+
+exit;
+
+
+# Writes "<id><TAB><average quality>" for one record to the output handle O
+# and adds the record to the running totals $seqCount and $ttlQual.
+#   $_[0]  record id
+#   $_[1]  whitespace separated quality values of the record
+# Records without any quality value are ignored.
+sub prtQuality {
+    my ($id, $qualStr) = @_;
+    # split(' ') skips leading whitespace, so no empty field is counted.
+    my @qVal = split(' ', $qualStr);
+    return if(@qVal == 0);
+    $seqCount++;
+    my $qual = sprintf "%0.2f", sum(@qVal)/(scalar @qVal);
+    $ttlQual += $qual;
+    print O "$id\t$qual\n";
+}
+
+
+# Prints the option summary of this script to STDOUT.
+sub prtHelp {
+    print <<"END_HELP";
+
+$0 options:
+
+### Input quality (FASTA) (Required)
+ -i <Quality file>
+ Quality file in FASTA format
+
+### Other options [Optional]
+ -h | -help
+ Prints this help
+ -o | -outputFile <Output file name>
+ Output will be stored in the given file
+ default: By default, quality statistics file will be stored where the input file is
+
+END_HELP
+}
+
+# Prints a boxed error message to STDERR, followed by the usage text, and
+# terminates the script with a non-zero exit status.
+#   $_[0]  error message
+sub prtError {
+    my ($msg) = @_;
+    print STDERR "+======================================================================+\n";
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR "+======================================================================+\n";
+    prtUsage();
+    exit(1);    # an error must not look like success to the calling shell
+}
+
+# Prints the invocation synopsis followed by the detailed option help.
+sub prtUsage {
+    printf "\nUsage: perl %s <options>\n", $0;
+    prtHelp();
+}
diff --git a/Statistics/N50Stat.pl b/Statistics/N50Stat.pl
new file mode 100644
index 0000000..cb5e836
--- /dev/null
+++ b/Statistics/N50Stat.pl
@@ -0,0 +1,172 @@
+#! /usr/bin/perl
+
+# Reports length statistics (count, min, max, mean, median, N25..N95) and
+# base composition of the sequences of a FASTA file.
+
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+use Getopt::Long;
+use File::Basename;
+
+# Base counters over all sequences (updated by baseCount)
+my $As = 0;
+my $Ts = 0;
+my $Gs = 0;
+my $Cs = 0;
+my $Ns = 0;
+
+# Parameter variables
+my $file;
+my $helpAsked;
+my $outFile = "";
+
+GetOptions(
+    "i=s" => \$file,
+    "h|help" => \$helpAsked,
+    "o|outputFile=s" => \$outFile,
+    );
+if(defined($helpAsked)) {
+    prtUsage();
+    exit;
+}
+if(!defined($file)) {
+    prtError("No input files are provided");
+}
+
+$outFile = $file . "_n50_stat" if($outFile eq "");
+
+open(I, "<", $file) or die "Can not open file: $file\n";
+
+my @len = ();        # length of every non-empty sequence
+my $fastaSeq = "";
+
+while(my $line = <I>) {
+    chomp $line;
+    if($line =~ /^>/) {
+        # A header line ends the previous sequence.
+        if($fastaSeq ne "") {
+            push(@len, length $fastaSeq);
+            baseCount($fastaSeq);
+        }
+        $fastaSeq = "";
+    }
+    else {
+        $fastaSeq .= $line;
+    }
+}
+if($fastaSeq ne "") {
+    push(@len, length $fastaSeq);
+    baseCount($fastaSeq);
+}
+close(I);
+
+# Every statistic below divides by the number of sequences or bases.
+if(@len == 0) {
+    die "No sequences found in file: $file\n";
+}
+
+# The output file is created only once the input is known to be usable.
+open(O, ">", $outFile) or die "Can not open file: $outFile\n";
+
+my $totalReads = scalar @len;
+my $bases = sum(@len);
+my $avgReadLen = sprintf "%0.2f", $bases/$totalReads;
+
+printf O "%-25s %d\n", "Total sequences", $totalReads;
+printf O "%-25s %d\n", "Total bases", $bases;
+printf O "%-25s %d\n", "Min sequence length", min(@len);
+printf O "%-25s %d\n", "Max sequence length", max(@len);
+printf O "%-25s %0.2f\n", "Average sequence length", $avgReadLen;
+printf O "%-25s %0.2f\n", "Median sequence length", calcMedian(@len);
+foreach my $pct (25, 50, 75, 90, 95) {
+    printf O "%-25s %d\n", "N$pct length", calcN50(\@len, $pct);
+}
+foreach my $stat (["As", $As], ["Ts", $Ts], ["Gs", $Gs], ["Cs", $Cs],
+                  ["(A + T)s", $As+$Ts], ["(G + C)s", $Gs+$Cs], ["Ns", $Ns]) {
+    my ($label, $count) = @$stat;
+    printf O "%-25s %0.2f %s\n", $label, $count/$bases*100, "%";
+}
+close(O) or die "Can not write file: $outFile\n";
+
+print "N50 Statistics file: $outFile\n";
+
+exit;
+
+# Returns the Nx length of a set of sequence lengths: walking the lengths
+# from longest to shortest, the first length at which the running total
+# reaches x percent of the sum of all lengths.
+#   $_[0]  array ref of lengths
+#   $_[1]  percentage x (e.g. 50 for N50)
+# Returns 0 when the list is empty.
+sub calcN50 {
+    my ($lenRef, $pct) = @_;
+    my @sorted = sort { $b <=> $a } @$lenRef;
+    my $total = sum(@sorted);
+    my $running = 0;
+    foreach my $len (@sorted) {
+        $running += $len;
+        return $len if($running >= $total*$pct/100);
+    }
+    return 0;
+}
+
+# Returns the median of the numbers passed as arguments: the middle value
+# of the sorted list, or the mean of the two middle values when the list
+# has an even number of elements.
+sub calcMedian {
+    my @sorted = sort { $a <=> $b } @_;
+    my $count = scalar @sorted;
+    my $mid = int($count / 2);
+    return $sorted[$mid] if($count % 2 != 0);
+    return ($sorted[$mid - 1] + $sorted[$mid]) / 2;
+}
+
+# Adds the base composition of one sequence to the global counters $As,
+# $Ts, $Gs, $Cs and $Ns. Counting is case-insensitive; every character
+# that is not A, T, G or C is counted in $Ns.
+#   $_[0]  sequence string
+sub baseCount {
+    my ($seq) = @_;
+    # tr/// with an empty replacement list counts without changing $seq.
+    my $tAs = ($seq =~ tr/Aa//);
+    my $tTs = ($seq =~ tr/Tt//);
+    my $tGs = ($seq =~ tr/Gg//);
+    my $tCs = ($seq =~ tr/Cc//);
+    $Ns += (length $seq) - $tAs - $tTs - $tGs - $tCs;
+    $As += $tAs;
+    $Ts += $tTs;
+    $Gs += $tGs;
+    $Cs += $tCs;
+}
+
+# Prints the option summary of this script to STDOUT.
+sub prtHelp {
+    print <<"END_HELP";
+
+$0 options:
+
+### Input reads/sequences (FASTA) (Required)
+ -i <Read/Sequence file>
+ Read/Sequence in fasta format
+
+### Other options [Optional]
+ -h | -help
+ Prints this help
+ -o | -outputFile <Output file name>
+ Output will be stored in the given file
+ default: By default, N50 statistics file will be stored where the input file is
+
+END_HELP
+}
+
+# Prints a boxed error message to STDERR, followed by the usage text, and
+# terminates the script with a non-zero exit status.
+#   $_[0]  error message
+sub prtError {
+    my ($msg) = @_;
+    print STDERR "+======================================================================+\n";
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR "+======================================================================+\n";
+    prtUsage();
+    exit(1);    # an error must not look like success to the calling shell
+}
+
+# Prints the invocation synopsis followed by the detailed option help.
+sub prtUsage {
+    printf "\nUsage: perl %s <options>\n", $0;
+    prtHelp();
+}
diff --git a/Trimming/AmbiguityFiltering.pl b/Trimming/AmbiguityFiltering.pl
new file mode 100644
index 0000000..9d5358f
--- /dev/null
+++ b/Trimming/AmbiguityFiltering.pl
@@ -0,0 +1,405 @@
+#! /usr/bin/perl
+
+# Filters reads/sequences by their ambiguous (non-ATGC) base content, or
+# trims ambiguous bases from the read ends, optionally followed by a
+# length filter. Accepts single-end FASTQ/FASTA or paired-end FASTQ data.
+
+use strict;
+use warnings;
+use Getopt::Long;
+use File::Basename;
+use List::Util qw(sum min max);
+
+# Parameter variables
+my $file;
+my $file2;
+my $helpAsked;
+my $numNBases = -1;
+my $perNBases = -1;
+my $end5Trim;
+my $end3Trim;
+my $lenCutOff = -1;
+my $outFile = "";
+my $processingFlag = 0; # 1: N count filter, 2: N percent filter, 3: End N trim
+
+GetOptions(
+    "i=s" => \$file,
+    "irev=s" => \$file2,
+    "h|help" => \$helpAsked,
+    "c|countN=i" => \$numNBases,
+    "o|outputFile=s" => \$outFile,
+    "p|percentN=i" => \$perNBases,
+    "t5|trim5EndN" => \$end5Trim,
+    "t3|trim3EndN" => \$end3Trim,
+    "n|lenCutOff=i" => \$lenCutOff,
+    );
+if(defined($helpAsked)) {
+    prtUsage();
+    exit;
+}
+if(!defined($file)) {
+    prtError("No input files are provided");
+}
+
+### Validating input filtering options
+if($numNBases == -1 && $perNBases == -1 && !defined($end5Trim) && !defined($end3Trim) && $lenCutOff == -1) {
+    print "Filtering or trimming parameters are not set.\nNothing to do.\nExiting...\n";
+    exit;
+}
+# Only one mode is applied per run; -c wins over -p, which wins over trimming.
+if($numNBases != -1) {
+    $processingFlag = 1;
+    print "Filtering out reads containing ambiguous base count > $numNBases\n";
+}
+elsif($perNBases != -1) {
+    $processingFlag = 2;
+    print "Filtering out reads containing ambiguous base percentage > $perNBases%\n";
+}
+else {
+    $processingFlag = 3;
+    print "Trimming ambiguous bases from end(s) of reads followed by length filtering (< $lenCutOff bp)\n";
+}
+
+# Detect the input format from the first line: FASTQ records start with "@".
+# An empty file has no first line and is treated as (empty) FASTA.
+open(I, "<", $file) or die "Can not open file $file\n";
+my $tmpLine = <I>;
+close(I);
+my $isFastQ = (defined($tmpLine) && $tmpLine =~ /^@/) ? 1 : 0;
+
+if($file2) {
+    if(!$isFastQ) {
+        print STDERR "Error:::\n\tPaired-end sequencing data need to be in FASTQ format\n";
+        exit(1);
+    }
+    # For paired-end data the output names are always derived from the
+    # input names (-o is not used in this mode).
+    $outFile = $file . "_trimmed";
+    my $outFile2 = $file2 . "_trimmed";
+
+    print "Input read/sequence format: FASTQ (Paired-end)\n";
+    print "Checking FASTQ format: File $file...\n";
+    my $nLines = checkFastQFile($file, 1);
+
+    print "Checking FASTQ format: File $file2...\n";
+    my $nLines2 = checkFastQFile($file2, 1);
+
+    if($nLines != $nLines2) {
+        prtErrorExit("Number of reads in paired-end data files are not same.\n\t\tFiles: $file, $file2");
+    }
+
+    # Output files are created only after both inputs passed validation.
+    open(I1, "<", $file) or die "Can not open file $file\n";
+    open(I2, "<", $file2) or die "Can not open file $file2\n";
+    open(O1, ">", $outFile) or die "Can not create file $outFile\n";
+    open(O2, ">", $outFile2) or die "Can not create file $outFile2\n";
+
+    my @rec1 = ();    # lines of the current record of file 1
+    my @rec2 = ();    # lines of the current record of file 2
+    while(my $line = <I1>) {
+        my $line2 = <I2>;
+        chomp $line;
+        chomp $line2;
+        push(@rec1, $line);
+        push(@rec2, $line2);
+        next if(@rec1 < 4);    # a FASTQ record has four lines
+
+        my ($currId1, $currSeq, $currId2, $currQual) = @rec1;
+        my ($curr2Id1, $currSeq2, $curr2Id2, $currQual2) = @rec2;
+        @rec1 = ();
+        @rec2 = ();
+
+        # A pair is kept only when both mates pass the selected filter.
+        my $keep = 0;
+        if($processingFlag == 1) {
+            $keep = (getNCount($currSeq) <= $numNBases && getNCount($currSeq2) <= $numNBases);
+        }
+        elsif($processingFlag == 2) {
+            $keep = (getNPercent($currSeq) <= $perNBases && getNPercent($currSeq2) <= $perNBases);
+        }
+        elsif($processingFlag == 3) {
+            # trimNsAndLenFilter returns -1 for a read that is to be dropped.
+            ($currSeq, $currQual) = trimNsAndLenFilter($currSeq, $currQual);
+            ($currSeq2, $currQual2) = trimNsAndLenFilter($currSeq2, $currQual2);
+            $keep = ($currSeq ne "-1" && $currSeq2 ne "-1");
+        }
+        if($keep && (length($currSeq) >= $lenCutOff) && (length($currSeq2) >= $lenCutOff)) {
+            print O1 "$currId1\n$currSeq\n$currId2\n$currQual\n";
+            print O2 "$curr2Id1\n$currSeq2\n$curr2Id2\n$currQual2\n";
+        }
+    }
+    close(O2) or die "Can not write file $outFile2\n";
+    close(O1) or die "Can not write file $outFile\n";
+    close(I2);
+    close(I1);
+    print "Filtered files are generated: $outFile $outFile2\n";
+}
+else {
+    $outFile = $file . "_trimmed" if($outFile eq "");
+
+    if($isFastQ) {
+        print "Input read/sequence format: FASTQ\n";
+        print "Checking FASTQ variant: File $file...\n";
+        checkFastQFile($file, 1);
+    }
+    else {
+        print "Input read/sequence format: FASTA\n";
+    }
+
+    open(I, "<", $file) or die "Can not open file $file\n";
+    open(O, ">", $outFile) or die "Can not create file $outFile\n";
+
+    if($isFastQ) {
+        my @rec = ();    # lines of the current record
+        while(my $line = <I>) {
+            chomp $line;
+            push(@rec, $line);
+            next if(@rec < 4);    # a FASTQ record has four lines
+
+            my ($currId1, $currSeq, $currId2, $currQual) = @rec;
+            @rec = ();
+
+            my $keep = 0;
+            if($processingFlag == 1) {
+                $keep = (getNCount($currSeq) <= $numNBases);
+            }
+            elsif($processingFlag == 2) {
+                $keep = (getNPercent($currSeq) <= $perNBases);
+            }
+            elsif($processingFlag == 3) {
+                # trimNsAndLenFilter returns -1 for a read that is to be dropped.
+                ($currSeq, $currQual) = trimNsAndLenFilter($currSeq, $currQual);
+                $keep = ($currSeq ne "-1");
+            }
+            if($keep && length($currSeq) >= $lenCutOff) {
+                print O "$currId1\n$currSeq\n$currId2\n$currQual\n";
+            }
+        }
+    }
+    else {
+        my $fastaSeqId = "";
+        my $fastaSeq = "";
+
+        while(my $line = <I>) {
+            chomp $line;
+            if($line =~ /^>/) {
+                # A header line ends the previous sequence: process it
+                # under its own id before remembering the new one.
+                processFastaSeq($fastaSeqId, $fastaSeq) if($fastaSeq ne "");
+                $fastaSeqId = $line;
+                $fastaSeq = "";
+            }
+            else {
+                $fastaSeq .= $line;
+            }
+        }
+        processFastaSeq($fastaSeqId, $fastaSeq) if($fastaSeq ne "");
+    }
+    close(O) or die "Can not write file $outFile\n";
+    close(I);
+    print "Filtered file is generated: $outFile\n";
+}
+
+# Applies the selected filter ($processingFlag) to one FASTA sequence and,
+# if the sequence passes it and the length cut-off, writes the record to
+# the output handle O with the sequence wrapped by formatSeq.
+#   $_[0]  header line of the sequence
+#   $_[1]  sequence (joined onto a single line)
+sub processFastaSeq {
+    my ($seqId, $seq) = @_;
+    my $passed;
+    if($processingFlag == 1) {
+        $passed = (getNCount($seq) <= $numNBases);
+    }
+    elsif($processingFlag == 2) {
+        $passed = (getNPercent($seq) <= $perNBases);
+    }
+    elsif($processingFlag == 3) {
+        # The trimmed sequence replaces the original; "-1" marks a drop.
+        ($seq) = trimNsAndLenFilter($seq);
+        $passed = ($seq ne "-1");
+    }
+    else {
+        return;
+    }
+    return if(!$passed);
+    return if(length($seq) < $lenCutOff);
+    print O "$seqId\n", formatSeq($seq), "\n";
+}
+
+# Wraps a sequence into lines of at most 60 characters.
+#   $_[0]  sequence string
+# Returns the wrapped sequence without a trailing newline.
+sub formatSeq {
+    my ($seq) = @_;
+    my @chunks = ($seq =~ /.{1,60}/gs);
+    return join("\n", @chunks);
+}
+
+# Prints the option summary of this script to STDOUT.
+sub prtHelp {
+    print <<"END_HELP";
+
+$0 options:
+
+### Input reads/sequences (FASTQ/FASTA) (Required)
+ -i <Forward read/sequence file>
+ File containing reads/sequences in either FASTQ or FASTA format
+
+### Input reads/sequences (FASTQ) [Optional]
+ -irev <Reverse read/sequence file of paired-end data>
+ File containing reverse reads/sequences of paired-end data in FASTQ format
+
+### Other options [Optional]
+ -h | -help
+ Prints this help
+--------------------------------- Trimming Options ---------------------------------
+ -c | -countN <Integer>
+ Maximum number of allowed ambiguous bases
+ default: 0
+ -p | -percentN <Integer>
+ Maximum percentage of allowed ambiguous bases
+ default: 0
+ -t5 | -trim5EndN
+ Trim ambiguous bases from 5' end of the sequence
+ default: off
+ -t3 | -trim3EndN
+ Trim ambiguous bases from 3' end of the sequence
+ default: off
+ -n | -lenCutOff <Integer>
+ Sequence length cut-off
+ Sequences shorter than given length will be discarded
+ default: -1 (i.e. length filtering is OFF)
+ NOTE: filtering can be performed using any one of (-c), (-p) and (-t5 and/or -t3) switches at a time
+--------------------------------- Output Options ---------------------------------
+ -o | -outputFile <Output file name>
+ Output will be stored in the given file
+ default: By default, output file will be stored where the input file is
+
+END_HELP
+}
+
+# Prints a boxed error message to STDERR, followed by the usage text, and
+# terminates the script with a non-zero exit status.
+#   $_[0]  error message
+sub prtError {
+    my ($msg) = @_;
+    print STDERR "+======================================================================+\n";
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR "+======================================================================+\n";
+    prtUsage();
+    exit(1);    # an error must not look like success to the calling shell
+}
+
+# Prints an error message to STDERR and terminates the script with a
+# non-zero exit status (no usage text is shown).
+#   $_[0]  error message
+sub prtErrorExit {
+    my ($errmsg) = @_;
+    print STDERR "Error:\t", $errmsg, "\n";
+    exit(1);    # an error must not look like success to the calling shell
+}
+
+# Prints the invocation synopsis followed by the detailed option help.
+sub prtUsage {
+    printf "\nUsage: perl %s <options>\n", $0;
+    prtHelp();
+}
+
+# Checks the record structure of a FASTQ file: in every group of four
+# lines the first must start with "@" and the third with "+". Empty lines
+# are counted but not checked. On a violation an error is printed through
+# prtErrorExit and the script ends.
+#   $_[0]  FASTQ file name
+# Returns the number of lines in the file. Dies if the file cannot be opened.
+sub checkFastQFile {
+    my $file = $_[0];
+    my $lines = 0;
+    # Three-argument open with a lexical handle: the name comes from the
+    # command line and must not be interpreted as an open mode.
+    open(my $fqFh, "<", $file) or die "Can not open file $file\n";
+    my $counter = 0;    # position of the current line within its record (1..4)
+    while(my $line = <$fqFh>) {
+        $lines++;
+        $counter++;
+        next if($line =~ /^\n$/);
+        if($counter == 1 && $line !~ /^\@/) {
+            prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
+        }
+        if($counter == 3 && $line !~ /^\+/) {
+            prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
+        }
+        if($counter == 4) {
+            $counter = 0;
+        }
+    }
+    close($fqFh);
+    return $lines;
+}
+
+# Returns the number of ambiguous characters in a sequence, i.e. every
+# character other than A, T, G or C (case-insensitive).
+#   $_[0]  sequence string
+sub getNCount {
+    my ($seq) = @_;
+    # Assigning the list of matches to an empty list yields their count.
+    my $ambiguous = () = $seq =~ /[^ATGC]/ig;
+    return $ambiguous;
+}
+
+# Returns the percentage (0-100) of ambiguous characters in a sequence,
+# i.e. characters other than A, T, G or C (case-insensitive).
+#   $_[0]  sequence string
+# An empty sequence yields 0 instead of a division by zero.
+sub getNPercent {
+    my ($seq) = @_;
+    my $len = length $seq;
+    return 0 if($len == 0);
+    my $c = 0;
+    while($seq =~ /[^ATGC]/ig){$c++;}
+    return $c/$len*100;
+}
+
+# Trims ambiguous bases (anything other than A, T, G, C in either case)
+# from the 3' end when $end3Trim is set and from the 5' end when $end5Trim
+# is set, removing the matching quality characters as well, then applies
+# the length cut-off $lenCutOff.
+#   $_[0]  sequence
+#   $_[1]  quality string of the same length (optional; omitted for FASTA)
+# Returns ($seq, $qual) when a quality string was given, ($seq) otherwise,
+# or -1 when nothing is left or the result is shorter than $lenCutOff.
+sub trimNsAndLenFilter {
+    my ($seq, $qual) = @_;
+    my $hasQual = (defined($qual) && $qual ne "") ? 1 : 0;
+    # /i keeps the trimming consistent with getNCount and getNPercent,
+    # which also accept lowercase bases.
+    if($end3Trim && $seq =~ s/([^ATGC]+)$//i) {
+        if($hasQual) {
+            my $nCount = length $1;
+            $nCount = length($qual) if($nCount > length($qual));
+            substr($qual, -$nCount, $nCount, "");
+        }
+    }
+    if($end5Trim && $seq =~ s/^([^ATGC]+)//i) {
+        if($hasQual) {
+            my $nCount = length $1;
+            $nCount = length($qual) if($nCount > length($qual));
+            substr($qual, 0, $nCount, "");
+        }
+    }
+    return -1 if(!defined($seq) || $seq eq "");
+    return -1 if((length $seq) < $lenCutOff);
+    return ($seq, $qual) if($hasQual);
+    return ($seq);
+}
diff --git a/Trimming/HomopolymerTrimming.pl b/Trimming/HomopolymerTrimming.pl
new file mode 100644
index 0000000..4d4b2fe
--- /dev/null
+++ b/Trimming/HomopolymerTrimming.pl
@@ -0,0 +1,233 @@
+#! /usr/bin/perl
+
+use strict;
+use warnings;
+use Getopt::Long;
+use File::Basename;
+
+# Command-line parameters and their defaults.
+my @files;
+my $helpAsked;
+my $homoPolyLen = 8;
+my $outFolder = "";
+my $minReadLen = 100;
+
+# -i accepts one or two values: the sequence file and, optionally, the
+# quality file.
+GetOptions(
+ "i=s{1,2}" => \@files,
+ "h|help" => \$helpAsked,
+ "l|minReadLen=i" => \$minReadLen,
+ "n|homoPolyLen=i" => \$homoPolyLen,
+ "o|outputFolder=s" => \$outFolder,
+ );
+if(defined($helpAsked)) {
+ prtUsage();
+ exit;
+}
+if(@files == 0) {
+ prtError("No input files are provided");
+}
+
+# File-level state shared with processSeq(): the record currently being
+# accumulated, the id of the record to be written, and the run counters.
+my $seqFile = $files[0];
+my $qualFile;
+$qualFile = $files[1] if(scalar(@files) == 2);
+my $outSeqFile;
+my $outQualFile;
+my $prevFastaSeqId = "";
+my $fastaSeqId = "";
+my $fastaSeq = "";
+my $qualSeqId = "";
+my $qualSeq = "";
+my $seqCount = 0;
+my $trimCount = 0;
+my $trashCount = 0;
+
+ # Output goes to the folder given with -o, or next to the sequence file;
+ # the folder is created when it does not exist.
+ my ($seqFileName, $filePath) = fileparse($seqFile);
+ # NOTE(review): "my ... if(...)" is a conditional declaration, which
+ # perlsyn describes as undefined behaviour - consider declaring the
+ # variable first and assigning it conditionally.
+ my ($qualFileName) = fileparse($qualFile) if(scalar(@files) == 2);
+ $outFolder = $filePath if($outFolder eq "");
+ $outFolder .= "/" if($outFolder !~ /\/$/);
+ if(! -e $outFolder) {
+ mkdir($outFolder) or die "Can not create output folder: $outFolder\n";
+ }
+ $outSeqFile = $outFolder . $seqFileName . "_trimmed";
+ $outQualFile = $outFolder . $qualFileName . "_trimmed" if(scalar(@files) == 2);
+
+ # Handles: I/Q are the sequence/quality inputs, OI/OQ the matching
+ # outputs. The quality handles are opened only when two files were given.
+ open(I, "<$seqFile") or die "Can not open file: $seqFile\n";
+ open(Q, "<$qualFile") or die "Can not open file: $qualFile\n" if(scalar(@files) == 2);
+ open(OI, ">$outSeqFile") or die "Can not open file: $outSeqFile\n";
+ open(OQ, ">$outQualFile") or die "Can not open file: $outQualFile\n" if(scalar(@files) == 2);
+
+if(scalar(@files) == 2) {
+ # Sequence and quality files are read in lock-step, one line from each
+ # per iteration.
+ # NOTE(review): this assumes both files have the same number of lines per
+ # record; <Q> is not checked for end-of-file before chomp.
+ while(my $line = <I>) {
+ chomp $line;
+ my $qualLine = <Q>;
+ chomp($qualLine);
+ if($line =~ /^>/) {
+ # A header line starts a new record: verify both files agree on
+ # the id, then flush the record collected so far.
+ $seqCount++;
+ $prevFastaSeqId = $fastaSeqId;
+ $fastaSeqId = $line;
+ $qualSeqId = $qualLine;
+ if($fastaSeqId ne $qualSeqId) {
+ print STDERR "Error: Read Id doesn't match in sequence and quality file for read number $seqCount in sequence file.\n";
+ exit(-1);
+ }
+ if($fastaSeq ne "") {
+ processSeq();
+ }
+ $fastaSeq = "";
+ $qualSeq = "";
+ }
+ else {
+ # Body line: append bases as-is and quality values separated by
+ # a single space.
+ $fastaSeq .= $line;
+ $qualSeq .= $qualLine . " ";
+ }
+ }
+ # Flush the last record, which has no following header to trigger it.
+ if($fastaSeq ne "") {
+ $prevFastaSeqId = $fastaSeqId;
+ processSeq();
+ }
+}
+else {
+ # Sequence-only mode: same record accumulation without a quality file.
+ while(my $line = <I>) {
+ chomp $line;
+ if($line =~ /^>/) {
+ $seqCount++;
+ $prevFastaSeqId = $fastaSeqId;
+ $fastaSeqId = $line;
+ if($fastaSeq ne "") {
+ processSeq();
+ }
+ $fastaSeq = "";
+ }
+ else {
+ $fastaSeq .= $line;
+ }
+ }
+ # Flush the last record.
+ if($fastaSeq ne "") {
+ $prevFastaSeqId = $fastaSeqId;
+ processSeq();
+ }
+}
+
+# Summary of the run; the script ends here, before the sub definitions.
+print "Number of reads/sequences trashed with length < $minReadLen: $trashCount\n";
+print "Number of reads/sequences trimmed containing homopolymer: $trimCount\n";
+print "Trimmed read/sequence file: $outSeqFile\n";
+print "Trimmed quality file: $outQualFile\n" if(scalar(@files) == 2);
+exit;
+
+
+# Filter, trim and write the record currently held in the file-level
+# variables $fastaSeq / $qualSeq under the id $prevFastaSeqId.
+#
+# A record shorter than $minReadLen, before or after homopolymer trimming,
+# is counted in $trashCount and not written. A record shortened by
+# hasPolyChar() is counted in $trimCount. Output goes to OI and, when a
+# quality file was supplied, to OQ.
+sub processSeq {
+    my $withQual = (scalar(@files) == 2);
+
+    # Length filter on the untouched read.
+    if(length($fastaSeq) < $minReadLen) {
+        $trashCount++;
+        return;
+    }
+
+    # Homopolymer trimming (skipped when the length is 0); the quality
+    # values are cut to the new sequence length.
+    if($homoPolyLen != 0 && hasPolyChar(\$fastaSeq)) {
+        $trimCount++;
+        if($withQual) {
+            $qualSeq = trimQualSeq($qualSeq, length $fastaSeq);
+        }
+    }
+
+    # Length filter on the trimmed read.
+    if(length($fastaSeq) < $minReadLen) {
+        $trashCount++;
+        return;
+    }
+
+    print OI "$prevFastaSeqId\n", formatSeq($fastaSeq), "\n";
+    if($withQual) {
+        print OQ "$prevFastaSeqId\n", formatQualSeq($qualSeq), "\n";
+    }
+}
+
+# Cut a sequence at its homopolymer runs.
+#
+# Arguments: $_[0] - reference to the sequence string (modified in place).
+# For each of A, T, G and C in that order (case-insensitive), a run of at
+# least $homoPolyLen identical bases is removed together with everything
+# after it.
+# Returns:   1 if anything was removed, otherwise 0.
+sub hasPolyChar {
+    my ($seqRef) = @_;
+    my $trimmed = 0;
+    foreach my $base (qw(A T G C)) {
+        my $runAndTail = qr/((?:$base){$homoPolyLen,}).*/i;
+        if($$seqRef =~ s/$runAndTail//) {
+            $trimmed = 1;
+        }
+    }
+    return $trimmed;
+}
+
+# Keep only the first $seqLen quality values of a whitespace-separated
+# quality string.
+#
+# Arguments: $_[0] - quality values separated by whitespace
+#            $_[1] - number of values to keep
+# Returns:   the kept values joined by single spaces, with no trailing
+#            whitespace. If fewer than $seqLen values exist, all of them
+#            are returned.
+#
+# The values are split into a list rather than matched with a counted
+# regex: an unchecked match would leave $1 unset or stale when it fails
+# (too few values, or a value with more than two digits), and a counted
+# quantifier has an upper limit that long sequences can exceed.
+sub trimQualSeq {
+    my ($qualSeq, $seqLen) = @_;
+    my @values = split(" ", $qualSeq);
+    $#values = $seqLen - 1 if(@values > $seqLen);
+    return join(" ", @values);
+}
+
+# Wrap a sequence into lines of at most 60 characters.
+#
+# Arguments: $_[0] - the sequence string.
+# Returns:   the chunks joined by "\n", with no newline after the last one.
+sub formatSeq {
+    my ($seq) = @_;
+    my $width = 60;
+    # A global match in list context yields the consecutive chunks; /s lets
+    # "." match any character, as a plain substr() slice would.
+    my @rows = ($seq =~ /(.{1,$width})/gs);
+    return join("\n", @rows);
+}
+
+# Wrap whitespace-separated quality values into lines of at most 60 values.
+#
+# Arguments: $_[0] - quality values separated by whitespace.
+# Returns:   the values separated by single spaces within a line and by
+#            "\n" between lines, with no trailing whitespace.
+sub formatQualSeq {
+    my ($qualSeq) = @_;
+    my $perLine = 60;
+    my @pending = split(" ", $qualSeq);
+    my @rows;
+    # Take the values off the front, one output line at a time.
+    while(@pending) {
+        push(@rows, join(" ", splice(@pending, 0, $perLine)));
+    }
+    return join("\n", @rows);
+}
+
+# Print the option reference of this script to STDOUT.
+sub prtHelp {
+    my @helpLines = (
+        "\n$0 options:\n\n",
+        "### Input reads/sequences (FASTA format; .fna and .qual files) (Required)\n",
+        " -i <Read/Sequence file> [Quality file (optional)]\n",
+        " Read/Sequence and quality file in FASTA format\n",
+        "\n",
+        "### Other options [Optional]\n",
+        " -h | -help\n",
+        " Prints this help\n",
+        " -l | -minReadLen <Integer>\n",
+        " Minimum length of a read/sequence to be retained in output\n",
+        " default: 100\n",
+        " -n | -homoPolyLen <Integer>\n",
+        " Minimum length of the homopolymer to be trimmed\n",
+        " For eg.: -n 8, will trim the right end of read/sequence from the homopolymer of at least 8 bases long\n",
+        " Note:- use -n 0 to skip homopolymer trimming (for only length filtering)\n",
+        " default: 8\n",
+        " -o | -outputFolder <Output folder name/path>\n",
+        " Output will be stored in the given folder\n",
+        " default: By default, files will be stored where the input files are\n",
+        "\n",
+    );
+    print @helpLines;
+}
+
+# Print a boxed error message to STDERR, show the usage text and terminate.
+#
+# Arguments: $_[0] - the message text.
+# Never returns. The exit status is non-zero so that callers can detect the
+# failure, in line with the exit(-1) this script uses for a read-id mismatch.
+sub prtError {
+    my ($msg) = @_;
+    my $border = "+======================================================================+\n";
+    print STDERR $border;
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR $border;
+    prtUsage();
+    exit(1);
+}
+
+# Print the usage banner followed by the option help from prtHelp().
+sub prtUsage {
+    print "\n", "Usage: perl $0 <options>", "\n";
+    prtHelp();
+}
diff --git a/Trimming/TrimmingReads.pl b/Trimming/TrimmingReads.pl
new file mode 100644
index 0000000..98071c6
--- /dev/null
+++ b/Trimming/TrimmingReads.pl
@@ -0,0 +1,446 @@
+#! /usr/bin/perl
+
+use strict;
+use warnings;
+use Getopt::Long;
+use File::Basename;
+use List::Util qw(sum min max);
+
+# FASTQ variant code, set by checkFastQFormat() and mapped to a quality
+# offset by getSubVal().
+my $seqFormat = "a"; # 1: Sanger; 2: Solexa; 3: Illumina 1.3+; 4: Illumina 1.5+; 5: Illumina 1.8+
+# Quality offsets returned by getSubVal() for the forward and reverse file.
+my $subVal;
+my $subVal2;
+
+# Command-line parameters and their defaults.
+my $file;
+my $file2;
+my $helpAsked;
+my $rTrimBases = 0;
+my $lTrimBases = 0;
+my $qCutOff = 0;
+my $lenCutOff = -1;
+my $outFile = "";
+# Stays 1 for quality-based trimming; set to 0 when -l or -r is non-zero.
+my $isQualTrimming = 1;
+
+GetOptions(
+ "i=s" => \$file,
+ "irev=s" => \$file2,
+ "h|help" => \$helpAsked,
+ "l|leftTrimBases=i" => \$lTrimBases,
+ "o|outputFile=s" => \$outFile,
+ "r|rightTrimBases=i" => \$rTrimBases,
+ "q|qualCutOff=i" => \$qCutOff,
+ "n|lenCutOff=i" => \$lenCutOff,
+ );
+if(defined($helpAsked)) {
+ prtUsage();
+ exit;
+}
+if(!defined($file)) {
+ prtError("No input files are provided");
+}
+
+# Main processing. With -irev both files are handled as paired-end FASTQ;
+# otherwise the single input is handled as FASTQ or FASTA, decided by
+# whether its first line starts with '@'.
+if($file2) {
+ # NOTE(review): $outFile is assigned unconditionally here, so a value
+ # given with -o is ignored in paired-end mode.
+ $outFile = $file . "_trimmed";
+ my $outFile2 = $file2 . "_trimmed";
+ open(I1, "$file") or die "Can not open file $file\n";
+ open(I2, "$file2") or die "Can not open file $file2\n";
+ open(O1, ">$outFile") or die "Can not create file $outFile\n";
+ open(O2, ">$outFile2") or die "Can not create file $outFile2\n";
+ # Peek at the first line to detect the format; I1 is reopened below.
+ my $tmpLine = <I1>;
+ close(I1);
+ if($tmpLine =~ /^@/) {
+ print "Input read/sequence format: FASTQ (Paired-end)\n";
+ if($lTrimBases == 0 && $rTrimBases == 0 && $qCutOff == 0 && $lenCutOff == -1) {
+ # Nothing requested: remove the output files created above.
+ print "Trimming parameters are not set.\nNothing to do.\nExiting...\n";
+ unlink($outFile);
+ unlink($outFile2);
+ exit;
+ }
+ # checkFastQFormat() validates the file and stores the detected
+ # variant in $seqFormat, which getSubVal() turns into a quality offset.
+ print "Checking FASTQ variant: File $file...\n";
+ my $nLines = checkFastQFormat($file, 1);
+ $subVal = getSubVal($seqFormat);
+
+ print "Checking FASTQ variant: File $file2...\n";
+ my $nLines2 = checkFastQFormat($file2, 1);
+ $subVal2 = getSubVal($seqFormat);
+
+ if($nLines != $nLines2) {
+ prtErrorExit("Number of reads in paired-end data files are not same.\n\t\tFiles: $file, $file2");
+ }
+
+ if($subVal != $subVal2) {
+ prtErrorExit("FASTQ variant of paired-end data files are not same.\n\t\tFiles: $file, $file2");
+ }
+
+ # Fixed-length trimming (-l/-r) takes precedence over quality trimming.
+ if($lTrimBases != 0 || $rTrimBases != 0) {
+ print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end";
+ $isQualTrimming = 0;
+ }
+ else {
+ print "Trimming based on PHRED quality score (< $qCutOff)";
+ }
+ print " followed by length filtering (< $lenCutOff bp)\n";
+
+ open(I1, "$file") or die "Can not open file $file\n";
+ # $c is the position (1..4) of the current line within its record.
+ my $c = 0;
+ my $currId1 = "";
+ my $currId2 = "";
+ my $currSeq = "";
+ my $currQual = "";
+ my $curr2Id1 = "";
+ my $curr2Id2 = "";
+ my $currSeq2 = "";
+ my $currQual2 = "";
+
+
+ # Both files are read in lock-step, one line from each per iteration.
+ while(my $line = <I1>) {
+ my $line2 = <I2>;
+ chomp $line;
+ chomp $line2;
+ $c++;
+ if($c == 5) {
+ $c = 1;
+ }
+ if($c == 1) {
+ $currId1 = $line;
+ $curr2Id1 = $line2;
+ }
+ if($c == 3) {
+ $currId2 = $line;
+ $curr2Id2 = $line2;
+ }
+ if($isQualTrimming == 0) {
+ # Fixed-length trimming: the same trimSeq() cut is applied to
+ # the sequence line and to the quality line.
+ if($c == 2) {
+ $currSeq = trimSeq($line);
+ $currSeq2 = trimSeq($line2);
+ }
+ elsif($c == 4) {
+ $currQual = trimSeq($line);
+ $currQual2 = trimSeq($line2);
+ # A pair is written only when both mates pass the length filter.
+ print O1 "$currId1\n$currSeq\n$currId2\n$currQual\n" if((length $currSeq >= $lenCutOff) && (length $currSeq2 >= $lenCutOff));
+ print O2 "$curr2Id1\n$currSeq2\n$curr2Id2\n$currQual2\n" if((length $currSeq >= $lenCutOff) && (length $currSeq2 >= $lenCutOff));
+ }
+ }
+ else {
+ # Quality trimming: the quality line decides the new length and
+ # the stored sequence is cut to the same number of characters.
+ if($c == 4) {
+ $currQual = trimSeq4Qual($line);
+ $currQual2 = trimSeq4Qual($line2);
+ # The pair is skipped when either trimmed quality is undefined.
+ if(defined($currQual) && defined($currQual2)) {
+ my $len = length $currQual;
+ my $len2 = length $currQual2;
+ $currSeq =~ /^(.{$len})/;
+ $currSeq = $1;
+ $currSeq2 =~ /^(.{$len2})/;
+ $currSeq2 = $1;
+ print O1 "$currId1\n$currSeq\n$currId2\n$currQual\n" if((length $currSeq >= $lenCutOff) && (length $currSeq2 >= $lenCutOff));
+ print O2 "$curr2Id1\n$currSeq2\n$curr2Id2\n$currQual2\n" if((length $currSeq >= $lenCutOff) && (length $currSeq2 >= $lenCutOff));
+ }
+ }
+ elsif($c == 2) {
+ $currSeq = $line;
+ $currSeq2 = $line2;
+ }
+ }
+ }
+ print "Trimmed files are generated: $outFile $outFile2\n";
+ }
+ else {
+ # NOTE(review): the empty output files opened above are left behind on
+ # this path.
+ print "Error:::\n\tPaired-end sequeneing data need to be in FASTQ format\n";
+ exit;
+ }
+ close(O2);
+ close(O1);
+ close(I2);
+ close(I1);
+}
+else {
+ # Single-file mode; the output name defaults to "<input>_trimmed".
+ $outFile = $file . "_trimmed" if($outFile eq "");
+
+ open(I, "$file") or die "Can not open file $file\n";
+ open(O, ">$outFile") or die "Can not create file $outFile\n";
+ # Peek at the first line to detect the format; I is reopened below.
+ my $tmpLine = <I>;
+ close(I);
+ if($tmpLine =~ /^@/) {
+ print "Input read/sequence format: FASTQ\n";
+ if($lTrimBases == 0 && $rTrimBases == 0 && $qCutOff == 0 && $lenCutOff == -1) {
+ print "Trimming parameters are not set.\nNothing to do.\nExiting...\n";
+ unlink($outFile);
+ exit;
+ }
+ print "Checking FASTQ variant: File $file...\n";
+ my $nLines = checkFastQFormat($file, 1);
+ $subVal = getSubVal($seqFormat);
+
+
+ # Fixed-length trimming (-l/-r) takes precedence over quality trimming.
+ if($lTrimBases != 0 || $rTrimBases != 0) {
+ print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end";
+ $isQualTrimming = 0;
+ }
+ else {
+ print "Trimming based on PHRED quality score (< $qCutOff)";
+ }
+ print " followed by length filtering (< $lenCutOff bp)\n";
+
+ open(I, "$file") or die "Can not open file $file\n";
+ # $c is the position (1..4) of the current line within its record.
+ my $c = 0;
+ my $currId1 = "";
+ my $currId2 = "";
+ my $currSeq = "";
+ my $currQual = "";
+
+
+ while(my $line = <I>) {
+ chomp $line;
+ $c++;
+ if($c == 5) {
+ $c = 1;
+ }
+ if($c == 1) {
+ $currId1 = $line;
+ }
+ if($c == 3) {
+ $currId2 = $line;
+ }
+ if($isQualTrimming == 0) {
+ # Fixed-length trimming of sequence and quality lines.
+ if($c == 2) {
+ $currSeq = trimSeq($line);
+ }
+ elsif($c == 4) {
+ $currQual = trimSeq($line);
+ print O "$currId1\n$currSeq\n$currId2\n$currQual\n" if(length $currSeq >= $lenCutOff);
+ }
+ }
+ else {
+ # Quality trimming: cut the sequence to the trimmed quality length;
+ # the read is skipped when the trimmed quality is undefined.
+ if($c == 4) {
+ $currQual = trimSeq4Qual($line);
+ if(defined($currQual)) {
+ my $len = length $currQual;
+ $currSeq =~ /^(.{$len})/;
+ $currSeq = $1;
+ print O "$currId1\n$currSeq\n$currId2\n$currQual\n" if(length $currSeq >= $lenCutOff);
+ }
+ }
+ elsif($c == 2) {
+ $currSeq = $line;
+ }
+ }
+ }
+ print "Trimmed file is generated: $outFile\n";
+ }
+ else {
+ # Any other first line is treated as FASTA; only fixed-length trimming
+ # and length filtering apply.
+ print "Input read/sequence format: FASTA\n";
+ if($qCutOff != 0) {
+ print "Warning: Quality trimming can not be performed for FASTA files\n";
+ $qCutOff = 0;
+ }
+ if($lTrimBases == 0 && $rTrimBases == 0 && $lenCutOff == -1) {
+ print "Trimming parameters are not set.\nNothing to do.\nExiting...\n";
+ unlink($outFile);
+ exit;
+ }
+ print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end followed by length filtering (< $lenCutOff bp)\n";
+ open(I, "$file") or die "Can not open file $file\n";
+ my $prevFastaSeqId = "";
+ my $fastaSeqId = "";
+ my $fastaSeq = "";
+
+ # Sequence lines are concatenated until the next header; the finished
+ # record is then trimmed, re-wrapped by formatSeq() and written.
+ while(my $line = <I>) {
+ chomp $line;
+ if($line =~ /^>/) {
+ $prevFastaSeqId = $fastaSeqId;
+ $fastaSeqId = $line;
+ if($fastaSeq ne "") {
+ my $outSeq = trimSeq($fastaSeq);
+ print O "$prevFastaSeqId\n", formatSeq($outSeq), "\n" if(length $outSeq >= $lenCutOff);
+ }
+ $fastaSeq = "";
+ }
+ else {
+ $fastaSeq .= $line;
+ }
+ }
+ # Flush the last record, which has no following header to trigger it.
+ if($fastaSeq ne "") {
+ $prevFastaSeqId = $fastaSeqId;
+ my $outSeq = trimSeq($fastaSeq);
+ print O "$prevFastaSeqId\n", formatSeq($outSeq), "\n" if(length $outSeq >= $lenCutOff);
+ }
+ print "Trimmed read/sequence file is generated: $outFile\n";
+ }
+ close(O);
+ close(I);
+}
+
+
+# Remove a fixed number of characters from both ends of a string.
+#
+# Arguments: $_[0] - sequence or quality string.
+# Reads the file-level settings $lTrimBases (left) and $rTrimBases (right);
+# negative values are treated as 0.
+# Returns:   the remaining middle part, or "" when the input is not longer
+#            than the total amount to be trimmed.
+#
+# substr() replaces an unchecked regex match here: when that match failed,
+# the function returned whatever $1 held at the time (unset, or the capture
+# of an earlier match in the caller).
+sub trimSeq {
+    my ($seq) = @_;
+    my $left = $lTrimBases > 0 ? $lTrimBases : 0;
+    my $right = $rTrimBases > 0 ? $rTrimBases : 0;
+    my $keep = length($seq) - $left - $right;
+    return "" if($keep < 1);
+    return substr($seq, $left, $keep);
+}
+
+# Trim low-quality positions from the right end of a FASTQ quality string.
+#
+# Arguments: $_[0] - the quality string (one character per base).
+# Reads the file-level settings $subVal (offset subtracted from each
+# character code) and $qCutOff (minimum score to keep).
+# Returns:   the quality string without its trailing run of positions whose
+#            score is below $qCutOff, or undef when no position survives
+#            (callers test the result with defined()).
+#
+# The undef result is returned explicitly. It used to depend on a failed
+# regex match leaving $1 undefined, which breaks as soon as $1 holds the
+# capture of an earlier match.
+sub trimSeq4Qual {
+    my ($qual) = @_;
+    my @ASCII = unpack("C*", $qual);
+    my $trimCount = 0;
+    # Walk from the right end until the first position that meets the cut-off.
+    for(my $i = $#ASCII; $i >= 0; $i--) {
+        last if($ASCII[$i] - $subVal >= $qCutOff);
+        $trimCount++;
+    }
+    my $keep = length($qual) - $trimCount;
+    return if($keep < 1);
+    return substr($qual, 0, $keep);
+}
+
+# Break a sequence into lines of at most 60 characters.
+#
+# Arguments: $_[0] - the sequence string.
+# Returns:   the pieces joined by "\n", without a trailing newline.
+sub formatSeq {
+    my ($seq) = @_;
+    my $lineLen = 60;
+    # "(a60)*" slices the string into consecutive pieces of up to 60
+    # characters, returned verbatim.
+    my @pieces = unpack("(a$lineLen)*", $seq);
+    return join("\n", @pieces);
+}
+
+# Print the option reference of this script to STDOUT.
+sub prtHelp {
+    my @helpLines = (
+        "\n$0 options:\n\n",
+        "### Input reads/sequences (FASTQ/FASTA) (Required)\n",
+        " -i <Forward read/sequence file>\n",
+        " File containing reads/sequences in either FASTQ or FASTA format\n",
+        "\n",
+        "### Input reads/sequences (FASTQ) [Optional]\n",
+        " -irev <Reverse read/sequence file of paired-end data>\n",
+        " File containing reverse reads/sequences of paired-end data in FASTQ format\n",
+        "\n",
+        "### Other options [Optional]\n",
+        " -h | -help\n",
+        " Prints this help\n",
+        "--------------------------------- Trimming Options ---------------------------------\n",
+        " -l | -leftTrimBases <Integer>\n",
+        " Number of bases to be trimmed from left end (5' end)\n",
+        " default: 0\n",
+        " -r | -rightTrimBases <Integer>\n",
+        " Number of bases to be trimmed from right end (3' end)\n",
+        " default: 0\n",
+        " -q | -qualCutOff <Integer> (Only for FASTQ files)\n",
+        " Cut-off PHRED quality score for trimming reads from right end (3' end)\n",
+        " For eg.: -q 20, will trim bases having PHRED quality score less than 20 at 3' end of the read\n",
+        " Note: Quality trimming can be performed only if -l and -r are not used\n",
+        " default: 0 (i.e. quality trimming is OFF)\n",
+        " -n | -lenCutOff <Integer>\n",
+        " Read length cut-off\n",
+        " Reads shorter than given length will be discarded\n",
+        " default: -1 (i.e. length filtering is OFF)\n",
+        "--------------------------------- Output Options ---------------------------------\n",
+        " -o | -outputFile <Output file name>\n",
+        " Output will be stored in the given file\n",
+        " default: By default, output file will be stored where the input file is\n",
+        "\n",
+    );
+    print @helpLines;
+}
+
+# Print a boxed error message to STDERR, show the usage text and terminate.
+#
+# Arguments: $_[0] - the message text.
+# Never returns. The exit status is non-zero so that shells and pipelines
+# invoking the script can tell that the run failed (a bare "exit" would
+# report success).
+sub prtError {
+    my ($msg) = @_;
+    my $border = "+======================================================================+\n";
+    print STDERR $border;
+    printf STDERR "|%-70s|\n", " Error:";
+    printf STDERR "|%-70s|\n", " $msg";
+    print STDERR $border;
+    prtUsage();
+    exit(1);
+}
+
+# Print an error message to STDERR and terminate the script.
+#
+# Arguments: $_[0] - the message text.
+# Never returns. The exit status is non-zero so that a failed run is not
+# reported to the calling shell as a success.
+sub prtErrorExit {
+    my ($errmsg) = @_;
+    print STDERR "Error:\t", $errmsg, "\n";
+    exit(1);
+}
+
+# Print the usage banner to STDOUT and then the option help (prtHelp()).
+sub prtUsage {
+    my $usage = "\nUsage: perl $0 <options>\n";
+    print $usage;
+    prtHelp();
+}
+
+# Validate the four-line record layout of a FASTQ file and classify its
+# quality encoding.
+#
+# Arguments: $_[0] - path of the FASTQ file
+#            $_[1] - true:  store the detected variant code in the
+#                           file-level $seqFormat
+#                    false: only warn on STDERR when the detected code
+#                           differs from $seqFormat
+# Returns:   the total number of lines read (blank lines included).
+# A record whose first line does not start with '@' or whose third line
+# does not start with '+' is reported through prtErrorExit().
+# Dies if the file cannot be opened.
+sub checkFastQFormat {
+    my ($file, $isVariantIdntfcntOn) = @_;
+    my $lines = 0;
+    # Three-argument open with a lexical handle: the file name is never
+    # interpreted as an open mode, and the handle is private to this sub.
+    open(my $fh, '<', $file) or die "Can not open file $file\n";
+    my $counter = 0;    # position (1..4) of the current line inside its record
+    my $minVal = 1000;
+    my $maxVal = 0;
+    while(my $line = <$fh>) {
+        $lines++;
+        $counter++;
+        # Blank lines are exempt from the checks and from quality sampling ...
+        if($line !~ /^\n$/) {
+            if(($counter == 1 && $line !~ /^\@/) || ($counter == 3 && $line !~ /^\+/)) {
+                prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
+            }
+            # Quality character codes are sampled from the first million lines only.
+            if($counter == 4 && $lines < 1000000) {
+                chomp $line;
+                my @ASCII = unpack("C*", $line);
+                $minVal = min(min(@ASCII), $minVal);
+                $maxVal = max(max(@ASCII), $maxVal);
+            }
+        }
+        # ... but they still occupy a slot in the record. The reset must also
+        # happen for a blank fourth line (zero-length read); otherwise the
+        # counter runs past 4 and neither validation nor sampling sees any
+        # later record.
+        $counter = 0 if($counter == 4);
+    }
+    close($fh);
+    # Classify by the observed range of character codes. The branches are
+    # tested in this order and the first match wins; 0 means no range matched.
+    my $tseqFormat = 0;
+    if($minVal >= 33 && $minVal <= 73 && $maxVal >= 33 && $maxVal <= 73) {
+        $tseqFormat = 1;
+    }
+    elsif($minVal >= 66 && $minVal <= 105 && $maxVal >= 66 && $maxVal <= 105) {
+        $tseqFormat = 4; # Illumina 1.5+
+    }
+    elsif($minVal >= 64 && $minVal <= 105 && $maxVal >= 64 && $maxVal <= 105) {
+        $tseqFormat = 3; # Illumina 1.3+
+    }
+    elsif($minVal >= 59 && $minVal <= 105 && $maxVal >= 59 && $maxVal <= 105) {
+        $tseqFormat = 2; # Solexa
+    }
+    elsif($minVal >= 33 && $minVal <= 74 && $maxVal >= 33 && $maxVal <= 74) {
+        $tseqFormat = 5; # Illumina 1.8+
+    }
+    if($isVariantIdntfcntOn) {
+        $seqFormat = $tseqFormat;
+    }
+    else {
+        if($tseqFormat != $seqFormat) {
+            print STDERR "Warning: It seems the specified variant of FASTQ doesn't match the quality values in input FASTQ files.\n";
+        }
+    }
+    return $lines;
+}
+
+# Map a FASTQ variant code to its quality offset and announce the variant
+# on STDOUT.
+#
+# Arguments: $_[0] - variant code (1 Sanger, 2 Solexa, 3 Illumina 1.3+,
+#                    4 Illumina 1.5+, 5 Illumina 1.8+).
+# Returns:   33 for codes 1 and 5, 64 for codes 2 to 4, and 0 (with no
+#            output) for any other code.
+sub getSubVal {
+    my ($seqFormat) = @_;
+    # One row per variant: [ code, offset, name printed to the user ]
+    my @variants = (
+        [1, 33, "Sanger"],
+        [2, 64, "Solexa"],
+        [3, 64, "Illumina 1.3+"],
+        [4, 64, "Illumina 1.5+"],
+        [5, 33, "Illumina 1.8+"],
+    );
+    my $subVal = 0;
+    foreach my $variant (@variants) {
+        my ($code, $offset, $label) = @$variant;
+        next if($seqFormat != $code);
+        $subVal = $offset;
+        print "Input FASTQ file format: $label\n";
+    }
+    return $subVal;
+}
\ No newline at end of file
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 3faae99..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-ngsqctoolkit (2.3.3-1) UNRELEASED; urgency=low
-
- * Initial release (Closes: #<bug>)
-
- -- Andreas Tille <tille at debian.org> Fri, 14 Nov 2014 16:21:56 +0100
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 6847097..0000000
--- a/debian/control
+++ /dev/null
@@ -1,26 +0,0 @@
-Source: ngsqctoolkit
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: science
-Priority: optional
-Build-Depends: debhelper (>= 9)
-Standards-Version: 3.9.6
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/ngsqctoolkit/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/ngsqctoolkit/trunk/
-Homepage: http://59.163.192.90:8080/ngsqctoolkit/
-
-Package: ngsqctoolkit
-Architecture: any
-Depends: ${shlibs:Depends},
- ${misc:Depends},
- libgd-graph-perl,
- libgd-text-perl,
- libstring-approx-perl
-Description: toolkit for the quality control of next generation sequencing data
- NGS QC Toolkit: A toolkit for the quality control (QC) of next
- generation sequencing (NGS) data. The toolkit comprises of user-friendly
- stand alone tools for quality control of the sequence data generated
- using Illumina and Roche 454 platforms with detailed results in the form
- of tables and graphs, and filtering of high-quality sequence data. It
- also includes few other tools, which are helpful in NGS data quality
- control and analysis.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index cb158be..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,12 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: NGS QC Toolkit
-Upstream-Contact: Mukesh Jain <mjain at nipgr.ac.in> (mjainanid at gmail.com)
-Source: http://59.163.192.90:8080/ngsqctoolkit/
-
-Files: *
-Copyright: © 2010-2014 Mukesh Jain
-License: to_be_clarified
-
-Files: debian/*
-Copyright: © 2014 maintainername <maintainer at e.mail>
-License: <license>
diff --git a/debian/docs b/debian/docs
deleted file mode 100644
index 53c1ee8..0000000
--- a/debian/docs
+++ /dev/null
@@ -1 +0,0 @@
-NGSQCToolkitv2.3.3_manual.pdf
diff --git a/debian/install b/debian/install
deleted file mode 100644
index 21531d9..0000000
--- a/debian/install
+++ /dev/null
@@ -1,2 +0,0 @@
-QC/lib/* usr/share/perl5/ngsqctoolkit
-*/*.pl usr/share/ngsqctoolkit
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index fc9ccde..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/make -f
-
-# DH_VERBOSE := 1
-
-DEBPKGNAME := $(shell dpkg-parsechangelog | awk '/^Source:/ {print $$2}')
-
-%:
- dh $@
-
-override_dh_fixperms:
- dh_fixperms
- chmod +x debian/$(DEBPKGNAME)/usr/share/ngsqctoolkit/*.pl
-
-override_dh_link:
- mkdir -p debian/$(DEBPKGNAME)/usr/bin
- for pl in debian/$(DEBPKGNAME)/usr/share/ngsqctoolkit/*.pl ; do \
- ln -s ../share/ngsqctoolkit/`basename $${pl}` debian/$(DEBPKGNAME)/usr/bin/`basename $${pl} .pl` ; \
- done
-
-#get-orig-source:
-# . debian/get-orig-source
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
deleted file mode 100644
index 684d8ff..0000000
--- a/debian/upstream/metadata
+++ /dev/null
@@ -1,12 +0,0 @@
-Reference:
- Author: Ravi K. Patel and Mukesh Jain
- Title: "NGS QC Toolkit: A Toolkit for Quality Control of Next Generation Sequencing Data"
- Journal: PLoS One
- Year: 2012
- Volume: 7
- Number: 2
- Pages: e30619
- DOI: 10.1371/journal.pone.0030619
- PMID: 22312429
- URL: http://www.plosone.org/article/info:doi/10.1371/journal.pone.0030619
- eprint: http://www.plosone.org/article/fetchObject.action?uri=info%3Adoi%2F10.1371%2Fjournal.pone.0030619&representation=PDF
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index d428d33..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,7 +0,0 @@
-version=4
-
-#http://nipgr.res.in/ngsqctoolkit.html .*/NGSQCToolkit_v([\d.]+)\.zip
-opts=repack,compress=xz \
- http://14.139.61.3:8080/ngsqctoolkit/ .*/NGSQCToolkit_v([\d.]+)\.zip
-
-# http://59.163.192.90:8080/ngsqctoolkit/ .*download.pl\?toolkit=NGSQCToolkit_v([\d.]+)\.zip
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ngsqctoolkit.git
More information about the debian-med-commit
mailing list