[med-svn] [surankco] 01/03: Imported Upstream version 0.0.r5+dfsg
Andreas Tille
tille at debian.org
Thu Mar 24 13:58:49 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository surankco.
commit 8076ab1a04a14f23ba8688e2aa5dc039213d10dc
Author: Andreas Tille <tille at debian.org>
Date: Thu Mar 24 14:48:36 2016 +0100
Imported Upstream version 0.0.r5+dfsg
---
license.txt | 24 +
r/covcurv.R | 167 ++++++
r/coverage.R | 47 ++
r/expofits.R | 134 +++++
r/feature.R | 164 ++++++
r/functions.R | 35 ++
r/import.R | 81 +++
r/parameter.R | 577 +++++++++++++++++++++
r/pslMatchFilter | 27 +
r/rf.R | 103 ++++
r/scores.R | 88 ++++
r/voting.R | 42 ++
readme.txt | 354 +++++++++++++
src/de/rki/ng4/surankco/data/AceContig.java | 220 ++++++++
src/de/rki/ng4/surankco/data/AceRead.java | 182 +++++++
src/de/rki/ng4/surankco/data/Assembly.java | 30 ++
src/de/rki/ng4/surankco/data/AssemblyAce.java | 162 ++++++
src/de/rki/ng4/surankco/data/AssemblySam.java | 174 +++++++
src/de/rki/ng4/surankco/data/Contig.java | 100 ++++
src/de/rki/ng4/surankco/data/FastaContig.java | 24 +
src/de/rki/ng4/surankco/data/FastqReads.java | 109 ++++
src/de/rki/ng4/surankco/data/Read.java | 91 ++++
src/de/rki/ng4/surankco/data/Reads.java | 109 ++++
src/de/rki/ng4/surankco/data/SamContig.java | 43 ++
src/de/rki/ng4/surankco/feature/Coverage.java | 166 ++++++
src/de/rki/ng4/surankco/feature/Kmer.java | 279 ++++++++++
src/de/rki/ng4/surankco/feature/Main.java | 257 +++++++++
src/de/rki/ng4/surankco/feature/Several.java | 287 ++++++++++
src/de/rki/ng4/surankco/files/Ace.java | 175 +++++++
src/de/rki/ng4/surankco/files/ExportR.java | 131 +++++
src/de/rki/ng4/surankco/files/Fasta.java | 52 ++
src/de/rki/ng4/surankco/files/Fastq.java | 217 ++++++++
src/de/rki/ng4/surankco/files/Qual.java | 61 +++
src/de/rki/ng4/surankco/files/Sam.java | 72 +++
src/de/rki/ng4/surankco/scoring/Main.java | 32 ++
src/de/rki/ng4/surankco/scoring/MetricsPSL.java | 349 +++++++++++++
src/de/rki/ng4/surankco/scoring/PSLHit.java | 139 +++++
.../rki/ng4/surankco/scoring/PSLPrettyParser.java | 93 ++++
.../scoring/scores/NormedContigLength1.java | 29 ++
.../scoring/scores/NormedContigLength2.java | 28 +
src/de/rki/ng4/surankco/scoring/scores/Score.java | 15 +
src/de/rki/ng4/surankco/tools/Statistic.java | 111 ++++
surankco-feature | 111 ++++
surankco-prediction | 78 +++
surankco-score | 151 ++++++
surankco-training | 93 ++++
46 files changed, 6013 insertions(+)
diff --git a/license.txt b/license.txt
new file mode 100755
index 0000000..205b3db
--- /dev/null
+++ b/license.txt
@@ -0,0 +1,24 @@
+Copyright (c) 2014,
+Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * The name of the author may not be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL Mathias Kuhring BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/r/covcurv.R b/r/covcurv.R
new file mode 100755
index 0000000..7899081
--- /dev/null
+++ b/r/covcurv.R
@@ -0,0 +1,167 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+
+# Calculate 25% and 50% drops in the contig coverage
+# Input:
+# coverage List of basewise coverage per contig
+# readlengthsmean List of ReadLengthMean per contig
+calcFeatureCoverageDrops <- function(coverage, readlengthsmean){
+
+ drop.result <- data.frame(CoverageDrop25Count=vector(mode="numeric", length(coverage)),
+ CoverageDrop25Max=vector(mode="numeric", length(coverage)),
+ CoverageDrop50Count=vector(mode="numeric", length(coverage)),
+ CoverageDrop50Max=vector(mode="numeric", length(coverage)))
+
+ drops <- lapply(1:length(coverage), function(i){ coverageCurve(coverage[[i]],readlengthsmean[i], 0.75) })
+ # if no contig has drops, 'apply' returns 'numeric(0)' instead of a list with lenght = # contigs
+ # so length > 0 has to be tested
+ if (length(drops)>0){
+ for (i in 1:length(drops)){
+ drop.result$CoverageDrop25Count[i] = dim(drops[[i]])[1]
+ if (drop.result$CoverageDrop25Count[i] > 0)
+ drop.result$CoverageDrop25Max[i] = max(c(drops[[i]][,4]-drops[[i]][,2],drops[[i]][,6]-drops[[i]][,2]))
+ }
+ }
+
+ drops <- lapply(1:length(coverage), function(i){ coverageCurve(coverage[[i]],readlengthsmean[i], 0.50) })
+ if (length(drops)>0){
+ for (i in 1:length(drops)){
+ drop.result$CoverageDrop50Count[i] = dim(drops[[i]])[1]
+ if (drop.result$CoverageDrop50Count[i] > 0)
+ drop.result$CoverageDrop50Max[i] = max(c(drops[[i]][,4]-drops[[i]][,2],drops[[i]][,6]-drops[[i]][,2]))
+ }
+ }
+
+ return(drop.result)
+}
+
+
+dupdel <- function(data){
+ l = length(data)
+ for (i in l:2)
+ if (data[i] == data[i-1]){
+ data <- data[-i]
+ }
+ return(data)
+}
+
+window <- function(data, w, i){
+ return (data[(max(1, i-w)):(min(i+w, length(data)))])
+}
+
+left <- function(data, w, i){
+ return (data[(max(1, i-w)):(i-1)])
+}
+
+right <- function(data, w, i){
+ return (data[(i+1):(min(i+w, length(data)))])
+}
+
+leftMax <- function(data, w, i){
+ d <- left(data, w, i)
+ y <- max(d)
+ idx <- which(d==y)
+ x <- tail(idx,1) + (i-w-1)
+ return(c(x,y))
+}
+
+rightMax <- function(data, w, i){
+ d <- right(data, w, i)
+ y <- max(d)
+ idx <- which(d==y)
+ x <- head(idx,1) + i
+ return(c(x,y))
+}
+
+size <- function(cl, rl){
+ if (rl <= 0.1 * cl)
+ return (rl)
+ else
+ return (cl * 0.1)
+}
+
+mySmooth <- function(data, w){
+ l <- length(data)
+ output <- vector(mode="numeric", length=l)
+ for (i in 1:l){
+ output[i] = mean(window(data, w, i))
+ }
+ return(output)
+}
+
+covDrop <- function(data, w, p){
+ l <- length(data)
+ maxl <- vector(mode="numeric")
+ maxr <- vector(mode="numeric")
+ min <- vector(mode="numeric")
+ # find minima
+ for (i in 2:(l-1)){
+ left <- leftMax(data, w, i)
+ right <- rightMax(data, w, i)
+ if ((data[i]/left[2]<=p) && (data[i]/right[2]<=p)){
+ maxl <- rbind(maxl, left)
+ maxr <- rbind(maxr, right)
+ min <- rbind(min, c(i,data[i]))
+ }
+ }
+ # merge minima
+ maxlf <- vector(mode="numeric")
+ maxrf <- vector(mode="numeric")
+ minf <- vector(mode="numeric")
+ if (length(dim(min))>0){
+ group <- 1
+ if (dim(min)[1] > 1){
+ for (i in 2:dim(min)[1]){
+ if (maxl[i,1] < min[tail(group,1),1]){
+ group <- c(group,i)
+ }
+ else{
+ maxlf <- rbind(maxlf, c(mean(maxl[group,1][maxl[group,2]==max(maxl[group,2])]), max(maxl[group,2])))
+ maxrf <- rbind(maxrf, c(mean(maxr[group,1][maxr[group,2]==max(maxr[group,2])]), max(maxr[group,2])))
+ minf <- rbind(minf, c(mean(min[group,1][min[group,2]==min(min[group,2])]), min(min[group,2])))
+ group <- i
+ }
+ }
+ }
+ maxlf <- rbind(maxlf, c(mean(maxl[group,1][maxl[group,2]==max(maxl[group,2])]), max(maxl[group,2])))
+ maxrf <- rbind(maxrf, c(mean(maxr[group,1][maxr[group,2]==max(maxr[group,2])]), max(maxr[group,2])))
+ minf <- rbind(minf, c(mean(min[group,1][min[group,2]==min(min[group,2])]), min(min[group,2])))
+ }
+ return(cbind(minf,maxlf,maxrf))
+}
+
+
+covcurv <- function(input, num, p){
+ cov <- unlist(input$Coverage[num])
+ x <- 1:length(cov)
+ r <- input$ReadLengthsMean[num]
+ w <- size(length(cov),r)
+ sm <- mySmooth(cov,w)
+ title <- paste('contig', num, ', drop', (1-p)*100,'%',collapse=' ')
+ plot(cov,col="grey", main=title, xlab='contig', ylab='coverage')
+ lines(sm, col='red', lwd=2)
+ test <- covDrop(sm, 2*w, p)
+ if (length(test) > 0){
+ points(test[,1],test[,2],col='blue',lwd=2, cex=2)
+ points(test[,3],test[,4],col='purple',lwd=2, cex=2)
+ points(test[,5],test[,6],col='purple',lwd=2, cex=2)
+ }
+}
+
+coverageCurve <- function(cov, read, p){
+ w <- size(length(cov),read)
+ sm <- mySmooth(cov,w)
+ drops <- covDrop(sm, 2*w, p)
+ return(drops)
+}
+
+mypdf <- function(input, max){
+ pdf(height=5)
+ for (i in 1:max){
+ covcurv(input,i,0.75)
+ covcurv(input,i,0.50)
+ }
+ dev.off()
+}
\ No newline at end of file
diff --git a/r/coverage.R b/r/coverage.R
new file mode 100755
index 0000000..0f61871
--- /dev/null
+++ b/r/coverage.R
@@ -0,0 +1,47 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+
+# BaseConfirmation for the ends
+endConfirmation <- function(coverage, corecoverage, endSize){
+
+ num <- length(coverage)
+ leftconf <- vector(mode="list", length=num)
+ rightconf <- vector(mode="list", length=num)
+ for (i in 1:num){
+ leftc <- getEnd(coverage[[i]], endSize[i], "left")
+ leftcc <- getEnd(corecoverage[[i]], endSize[i], "left")
+ leftconf[[i]] <- apply(cbind(leftcc,leftc), 1, mybinom)
+
+ rightc <- getEnd(coverage[[i]], endSize[i], "right")
+ rightcc <- getEnd(corecoverage[[i]], endSize[i], "right")
+ rightconf[[i]] <- apply(cbind(rightcc,rightc), 1, mybinom)
+ }
+
+ leftright <- data.frame(sapply(leftconf, mean), sapply(leftconf, sd),
+ sapply(leftconf, median), sapply(leftconf, mad),
+ sapply(leftconf, min), sapply(leftconf, max),
+ sapply(rightconf, mean), sapply(rightconf, sd),
+ sapply(rightconf, median), sapply(rightconf, mad),
+ sapply(rightconf, min), sapply(rightconf, max))
+
+ ordered <- t(apply(leftright, 1, orderMinMax))
+ colnames(ordered) <- c('BaseConfirmationMinEndMean', 'BaseConfirmationMinEndSD',
+ 'BaseConfirmationMinEndMedian', 'BaseConfirmationMinEndMad',
+ 'BaseConfirmationMinEndMin', 'BaseConfirmationMinEndMax',
+ 'BaseConfirmationMaxEndMean', 'BaseConfirmationMaxEndSD',
+ 'BaseConfirmationMaxEndMedian', 'BaseConfirmationMaxEndMad',
+ 'BaseConfirmationMaxEndMin', 'BaseConfirmationMaxEndMax')
+ return(ordered)
+}
+
+orderMinMax <- function(line){
+ if (line[1] <= line[7]) return(line)
+ else return(cbind(line[7:12],line[1:6]))
+}
+
+getEnd <- function(sequence, endsize, side){
+ if (side=='right') sequence <- rev(sequence)
+ sequence[1:min(endsize,length(sequence))]
+}
\ No newline at end of file
diff --git a/r/expofits.R b/r/expofits.R
new file mode 100755
index 0000000..05d80d9
--- /dev/null
+++ b/r/expofits.R
@@ -0,0 +1,134 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+#
+getExpoClassThresholds <- function(training, quantile=0.25){
+
+ training$NormedMatchCount1 <- 1 - training$NormedMatchCount1
+ training$NormedMatchCount2 <- 1 - training$NormedMatchCount2
+ training$NormedContigLength1 <- 1 - training$NormedContigLength1
+
+ firstMetric <- which(colnames(training)=="NormedMatchCount1")
+ lastMetric <- ncol(training)
+
+ quant <- vector(mode="numeric")
+
+ quantiles <- rep(quantile, length(firstMetric:lastMetric))
+ names(quantiles) <- names(training)[firstMetric:lastMetric]
+
+ for (i in firstMetric:lastMetric){
+ fit <- fitExpDist(training[[i]], plot=F, main=names(training)[i])
+ quant <- append( quant, expPosByQuantile(fit, quantile=quantiles[names(training)[i]], plot=F) )
+ }
+
+ names(quant) <- names(training)[firstMetric:lastMetric]
+
+ quant["NormedMatchCount1"] <- 1 - quant["NormedMatchCount1"]
+ quant["NormedMatchCount2"] <- 1 - quant["NormedMatchCount2"]
+ quant["NormedContigLength1"] <- 1 - quant["NormedContigLength1"]
+
+ return(quant)
+}
+
+
+#
+expoFit <- function(scores, pdf=TRUE,
+ filename=paste("expofit_",
+ format(Sys.time(), "%Y-%m-%d_%H-%M-%S"),
+ ".pdf", sep="")){
+
+ fM <- which(colnames(scores)=="NormedMatchCount1")
+ lM <- ncol(scores)
+ metrics <- scores[ ,fM:lM]
+
+ metrics$NormedMatchCount1 <- 1 - metrics$NormedMatchCount1
+ metrics$NormedMatchCount2 <- 1 - metrics$NormedMatchCount2
+ metrics$NormedContigLength1 <- 1 - metrics$NormedContigLength1
+
+ if (pdf){
+ pdf(file=filename, width=5.875, height=5.875, onefile=TRUE)
+ }
+
+ par.default <- par()
+ par.mar <- c(3, 3, 2, 1)
+ par.mgp <- c(1.5, 0.5, 0)
+ par(mfrow=c(4,2),mar=par.mar, mgp=par.mgp)
+
+ fM <- which(colnames(metrics)=="NormedMatchCount1")
+ lM <- ncol(metrics)
+
+ quantiles <- c(0.50, 0.50, 0.50, 0.50, 0.50, 0.25, 0.33, 0.50, 0.33, 0.25, 0.25, 0.25)
+ quantiles <- rep(0.25, length(fM:lM))
+ names(quantiles) <- names(metrics)[fM:lM]
+
+ for (name in names(metrics)[fM:lM]){
+ fit <- fitExpDist(metrics[[name]], plot=T, main=name)
+ quant <- expPosByQuantile(fit, quantile=quantiles[name], metrics[[name]], plot=T)
+ }
+
+# par(par.default)
+
+ if (pdf) garbage <- dev.off()
+}
+
+
+# fit an exponential distribution to a histogram of metric
+fitExpDist <- function(metric, plot=FALSE, logIt=FALSE, ...){
+
+ # use logarithmic scale (+1 shift cause of possible zeros)
+ if (logIt) metric <- log(metric +1)
+
+ # fit exponential distribution
+ require(MASS)
+ fit <- fitdistr(metric, densfun="exponential")
+
+ # plot it
+ if (plot){
+ breaks <- getBreaks(metric)
+
+ h <- hist(metric, breaks=breaks, freq=F, plot=plot, ...)
+ x <- h$mids
+ y <- h$density
+
+ lines(x, dexp(x, rate=fit$estimate), col="red")
+ }
+
+ return(fit)
+}
+
+expPosByQuantile <- function(fit, quantile=0.50, metric, plot=FALSE, expIt=FALSE, ...){
+
+ # calc quantiles from exponential fit
+ quant <- qexp(quantile, rate=fit$estimate)
+# quant <- qnorm(quantile, mean=fit$estimate["mean"], sd=fit$estimate["sd"])
+
+ # plot abline at quant positions
+ if (plot){
+ y <- diff(par("usr")[3:4])*0.6 + par("usr")[3]
+ for (i in 1:length(quantile)){
+ abline(v=quant[i], col="red", ...)
+ lab <- paste(100*quantile[i], "% quantile", "\n# <=", ":", sum(metric<=quant[i]), "\n# >", ":", sum(metric>quant[i]), sep="")
+ text(x=quant[i],y=y, pos=4, labels=lab, col="black")
+ }
+ }
+
+ # reverse logarithmic scale (-1 to reverse +1 shift)
+ if (expIt) quants <- exp(quant) -1
+
+ return(quant)
+}
+
+
+getBreaks <- function(metric){
+
+ if (diff(range(metric)) <= 1){ # if normed metric
+ breaks=seq(from=0.00, to=1.00, by=0.01)
+ } else if (all(floor(metric) == metric, na.rm = TRUE)) { # if integer scale
+ breaks=(min(metric)-0.5):(max(metric)+0.5)
+ } else { # other scales, e.g. log scale
+ breaks=100
+ }
+
+ return(breaks)
+}
\ No newline at end of file
diff --git a/r/feature.R b/r/feature.R
new file mode 100755
index 0000000..4095bfa
--- /dev/null
+++ b/r/feature.R
@@ -0,0 +1,164 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# imports the tmp features from the java routine
+importFeatures <- function(assembly.files, quality.files,
+ features.files, coverage.files, ccoverage.files){
+
+ rawdata <- vector('list', length(features.files))
+ if (DEBUGGING){ print("feature import") }
+ for (i in 1:length(features.files)){
+ if (DEBUGGING){ print(as.character(features.files[i])) }
+
+ rawdata[[i]] <- cbind(Assembly=assembly.files[i],
+ ReadQuality=quality.files[i],
+ read.table(features.files[i], header=TRUE,
+ colClasses=get.colClasses(features.files[i])))
+ rawdata[[i]]$Coverage <- read.coverage(coverage.files[i])
+ rawdata[[i]]$CoreCoverage <- read.coverage(ccoverage.files[i])
+ }
+ if (DEBUGGING){ print("import done") }
+ return(rawdata)
+}
+
+
+# Attach an expected genome size from egs to every data.frame in feature.list
+attachEGS <- function(feature.list, egs){
+ if (length(egs)==1){
+ egs <- rep(egs, length(feature.list))
+ }
+ for (i in 1:length(feature.list)){
+ attr(feature.list[[i]], "egs") <- egs[i]
+ }
+ return(feature.list)
+}
+
+
+# Extract further features, check if single or multi core should be used
+extractFeatures <- function(rawdata, cores=1){
+ if (DEBUGGING){ cat("feature extraction... ") }
+
+ if (cores>1) results <- mcFeatures(rawdata, cores)
+ else results <- scFeatures(rawdata)
+
+ if (DEBUGGING){ cat("done") }
+ return(results)
+}
+
+
+# Singlecore feature extraction
+scFeatures <- function(rawdata){
+ results <- lapply(rawdata, function(x){ return(features(x)) })
+ return(results)
+}
+
+
+# Multicore feature extraction
+mcFeatures <- function(rawdata, cores){
+# require("parallel")
+ results <- mclapply(rawdata, function(x){ return(features(x)) }, mc.cores=cores)
+ return(results)
+}
+
+
+# Step by step feature extraction
+features <- function(result){
+
+ # Convert ContigID from factor to characters
+ result$ContigID <- as.character(result$ContigID)
+
+ # GenomeSize & GenomeRelation
+ egs = attr(result, "egs", exact=TRUE)
+ if (DEBUGGING) print(paste("egs:", egs))
+ if (egs == 0){
+ result <- insert(result, sum(result$Length), 'EstimatedGenomeSize', 'ReadCount')
+ }
+ else{
+ result <- insert(result, egs, 'EstimatedGenomeSize', 'ReadCount')
+ }
+ result <- insert(result, result$Length/result$EstimatedGenomeSize, 'GenomeRelation', 'EstimatedGenomeSize')
+
+ # N50 & N50Relation
+ result <- insert(result, N50(result$Length), 'N50', 'GenomeRelation')
+ result <- insert(result, result$Length/result$N50, 'N50Relation', 'N50')
+
+ # ReadLengthQuotients
+ result <- insert(result, result$ReadLengthsMean/result$ReadLengthsPaddedMean, 'ReadLengthsQuotientMean', 'ReadLengthsPaddedMax')
+ result <- insert(result, result$ReadLengthsMedian/result$ReadLengthsPaddedMedian, 'ReadLengthsQuotientMedian', 'ReadLengthsQuotientMean')
+ result <- insert(result, result$ReadLengthsMin/result$ReadLengthsPaddedMin, 'ReadLengthsQuotientMin', 'ReadLengthsQuotientMedian')
+ result <- insert(result, result$ReadLengthsMax/result$ReadLengthsPaddedMax, 'ReadLengthsQuotientMax', 'ReadLengthsQuotientMin')
+
+ # ReadLengthQuotions Old: had to remove ReadLengthsQuotientSD and ReadLengthsQuotientMAD
+ # They can become NAs if the ReadLengthsPaddesSD/MAD are all 0 (division by zero -> NA).
+ # This occures only if the PaddedReadLength of all reads of a contig is exactly the same (i.e. no gaps at all),
+ # which is extremely unlikely in a real assembly. However, we observed data were this occures.
+# result <- insert(result, result$ReadLengthsMean/result$ReadLengthsPaddedMean, 'ReadLengthsQuotientMean', 'ReadLengthsPaddedMax')
+# result <- insert(result, result$ReadLengthsSD/result$ReadLengthsPaddedSD, 'ReadLengthsQuotientSD', 'ReadLengthsQuotientMean')
+# result <- insert(result, result$ReadLengthsMedian/result$ReadLengthsPaddedMedian, 'ReadLengthsQuotientMedian', 'ReadLengthsQuotientSD')
+# result <- insert(result, result$ReadLengthsMAD/result$ReadLengthsPaddedMAD, 'ReadLengthsQuotientMAD', 'ReadLengthsQuotientMedian')
+# result <- insert(result, result$ReadLengthsMin/result$ReadLengthsPaddedMin, 'ReadLengthsQuotientMin', 'ReadLengthsQuotientMAD')
+# result <- insert(result, result$ReadLengthsMax/result$ReadLengthsPaddedMax, 'ReadLengthsQuotientMax', 'ReadLengthsQuotientMin')
+
+ # CoverageComparison
+ result <- insert(result, result$CoverageGlobalMean/mean(result$CoverageGlobalMean),
+ 'CoverageGlobalMeanComparison', 'CoverageMaxEndMax')
+ result <- insert(result, result$CoverageMinEndMean/mean(result$CoverageMinEndMean),
+ 'CoverageMinEndMeanComparison', 'CoverageGlobalMeanComparison')
+ result <- insert(result, result$CoverageMaxEndMean/mean(result$CoverageMaxEndMean),
+ 'CoverageMaxEndMeanComparison', 'CoverageMinEndMeanComparison')
+
+ # BaseConfirmation
+ conf <- apply(cbind(result$CoreCoverage, result$Coverage), 1, multibinom)
+ result <- insert(result, sapply(conf, mean), 'BaseConfirmationGlobalMean', 'CoreCoverageMaxEndMax')
+ result <- insert(result, sapply(conf, sd), 'BaseConfirmationGlobalSD', 'BaseConfirmationGlobalMean')
+ result <- insert(result, sapply(conf, median), 'BaseConfirmationGlobalMedian', 'BaseConfirmationGlobalSD')
+ result <- insert(result, sapply(conf, mad), 'BaseConfirmationGlobalMAD', 'BaseConfirmationGlobalMedian')
+ result <- insert(result, sapply(conf, min), 'BaseConfirmationGlobalMin', 'BaseConfirmationGlobalMAD')
+ result <- insert(result, sapply(conf, max), 'BaseConfirmationGlobalMax', 'BaseConfirmationGlobalMin')
+
+ pos <- which(names(result)=='BaseConfirmationGlobalMax')
+ endconf <- endConfirmation(result$Coverage, result$CoreCoverage, result$ReadLengthsMean)
+ result <- cbind(result[1:pos], endconf, result[(pos+1):length(result)])
+
+ # CoverageDrop
+ result <- cbind(result, calcFeatureCoverageDrops(result$Coverage, result$ReadLengthsMean))
+
+ # remove Coverega and CoreCoverage
+ result$Coverage <- NULL
+ result$CoreCoverage <- NULL
+
+ return(result)
+}
+
+
+export <- function(results){
+ for (i in 1:length(results)){
+ filename <- paste("output", as.character(i), ".csv", sep="")
+ write.table(results[[i]],file=filename,sep=",",row.names=F)
+ }
+}
+
+featureReduction <- function(){
+ featCor <- cor(training[,3:(length(training)-1)],training[,3:(length(training)-1)])
+ index <- is.na(featCor)
+ featCor2 <- replace(featCor, index, 0)
+ # corrgram(featCor2, type='cor')
+
+ i=1
+ while (i<=dim(featCor2)[1]){
+ idx <- abs(featCor2[i,])<0.20
+ idx[i] <- TRUE
+ featCor2 <- featCor2[idx,idx]
+ i <- i+1
+ }
+
+ featNames <- rownames(featCor2)
+ featNames <- c(names(training)[1:2], featNames, names(training)[length(training)])
+ return(featNames)
+
+ featCor <- cor(training[,featNames],training[,featNames])
+ index <- is.na(featCor)
+ featCor2 <- replace(featCor, index, 0)
+ corrgram(featCor2, type='cor')
+}
\ No newline at end of file
diff --git a/r/functions.R b/r/functions.R
new file mode 100755
index 0000000..fea100c
--- /dev/null
+++ b/r/functions.R
@@ -0,0 +1,35 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# calculation of N50
+N50 <- function(contigLengths){
+ median(rep(contigLengths,contigLengths))
+}
+
+
+baseconf <- function(corecoverage, coverage, prob){
+ output <- vector(mode="numeric", length=length(coverage))
+ for (i in 1:length(coverage)){
+ output[i] <- binom.test(corecoverage[i], coverage[i], prob, "greater")$p.value
+ }
+ return(output)
+}
+
+mybinom <- function(covpair){
+ if (covpair[2]<covpair[1]) covpair[2] <- covpair[1]
+ return(binom.test(covpair[1], covpair[2], 0.98, "greater")$p.value)
+}
+
+multibinom <- function(covpairs){
+ return(apply(cbind(unlist(covpairs[1]),unlist(covpairs[2])), 1, mybinom))
+}
+
+# insert vector "input" with colname "name"
+# into data.frame "data" after column "after"
+insert <- function(data, input, name, after){
+ pos <- which(names(data)==after)
+ output <- cbind(data[1:pos], input, data[(pos+1):length(data)])
+ colnames(output)[(pos+1)] <- name
+ return(output)
+}
\ No newline at end of file
diff --git a/r/import.R b/r/import.R
new file mode 100755
index 0000000..47861b5
--- /dev/null
+++ b/r/import.R
@@ -0,0 +1,81 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# Import of a quality or coverage file in fasta like style but single quality
+# values are separated by spaces
+# Code inspired by the function "read.fasta" from the R package "seqinr"
+#
+# Coverage example:
+# >Contig00001
+# 31 31 31 37 35 37 37 37 39 39 39 32 35 10 34 36 38 36 25 31 32 37 33 32 36 36
+#
+read.coverage <- function(file){
+
+ if (file.exists(file)){
+ if (DEBUGGING){ cat(paste("read file:", file, "\n")) }
+
+ lines <- readLines(file)
+
+ ind <- which(substr(lines, 1L, 1L) == ">")
+ nseq <- length(ind)
+ if (nseq == 0) {
+ stop("no line starting with a > character found")
+ }
+ start <- ind + 1
+ end <- ind - 1
+ end <- c(end[-1], length(lines))
+
+ coverage <- lapply(seq_len(nseq), function(i) {
+ as.integer(do.call(c, strsplit(lines[start[i]:end[i]], " "))) })
+
+ titles <- lapply(seq_len(nseq), function(i) {
+ firstword <- strsplit(lines[ind[i]], " ")[[1]][1]
+ substr(firstword, 2, nchar(firstword)) })
+
+ names(coverage) <- titles
+
+# workaround for zero coverage positions:
+# pseude count, increase all by 1
+ coverage <- lapply(coverage, "+", 1)
+
+ return(coverage)
+
+ } else {
+ cat(paste("can not read file:", file, "\n"))
+ }
+}
+
+
+# import of files with SuRankCo features, as exported from the feature module
+readSurankcoFeatures <- function(files){
+ features <- vector(mode="list", length=length(files))
+ for (i in 1:length(files)){
+ features[[i]] <- read.table(file=files[i], sep="\t", dec = ".", header=TRUE,
+ colClasses=get.colClasses(files[i], 3))
+ }
+ return(features)
+}
+
+
+# import of files with SuRankCo scores, as exported from the score module
+readSurankcoScores <- function(files){
+ scores <- vector(mode="list", length=length(files))
+ for (i in 1:length(files)){
+ scores[[i]] <- read.table(file=files[i], sep="\t", dec = ".", header=TRUE,
+ colClasses=get.colClasses(files[i], 3))
+ }
+ return(scores)
+}
+
+
+# predefine the column classes for read.table imports
+# the first char.num columns will be character and the rest numeric
+get.colClasses <- function(filename, char.num=1){
+ feature.names <- scan(filename, what="character", nlines=1, quiet=TRUE)
+ colClasses <- c(rep("character", char.num),
+ rep("numeric", length(feature.names)-char.num))
+ names(colClasses) <- feature.names
+ if (DEBUGGING){ print(colClasses) }
+ return(colClasses)
+}
\ No newline at end of file
diff --git a/r/parameter.R b/r/parameter.R
new file mode 100755
index 0000000..f0c114a
--- /dev/null
+++ b/r/parameter.R
@@ -0,0 +1,577 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# Function for parameter parsing, file checks and preparation,
+# package loading and error handling
+
+
+loadPackages <- function(names, quietly=TRUE){
+ # Tries to require R packages/libraries and stop executions if failing
+ #
+ # Args:
+ # names: vector with package names
+ #
+ # Return:
+ # Nothing, but will stop execution if packages are missing
+
+ check <- vector(mode="logical", length=length(names))
+
+ for (i in 1:length(names)){
+ suppressMessages(check[i] <- require(names[i],
+ quietly=quietly,
+ character.only=TRUE))
+ }
+
+ if (!all(check)){
+ complainAndStop("missing R-package(s)", names[!check])
+ }
+}
+
+
+# surankco-feature argument parser
+parseSurankcoFeature <- function(args=commandArgs(trailingOnly = TRUE)){
+ # Parses parameters for the surankco feature module
+ #
+ # Args:
+ # args: commandline parameters, default = commandArgs(trailingOnly = TRUE)
+ #
+ # Returns:
+ # List of parameter values as provided by parse_args (optparse package)
+
+ parser.options <- list(
+ # Input parameter
+ make_option(opt_str=c("-a", "--assemblies"),
+ action="store",
+ help="Indicate a list of assembly files (comma separated), correct
+ suffixes are mandatory (e.g. \".ace\")"),
+
+ make_option(opt_str=c("-d", "--directory"),
+ action="store",
+ help="Indicate a directory containing assembly files with indicated
+ format (default: ace, see parameter -f)"),
+
+ make_option(opt_str=c("-f", "--assembly.format"),
+ action="store",
+ default="ace",
+ help="Indicate assembly/contig format (resp. suffix),
+ either \"ace\" (default) or \"contigs.fasta\""),
+
+ make_option(opt_str=c("-r", "--read.quality.format"),
+ action="store",
+ default="no value",
+ help="Indicate the read quality format: qual, qua or fastq for ACE
+ (default=\"qual\") resp. sam or bam for contigs.fasta (default=\"sam\")"),
+
+ make_option(opt_str=c("-q", "--fastq.version"),
+ action="store",
+ default="illumina18",
+ help="Indicate the fastq version: auto, sanger, solexa, illumina13,
+ illumina15, illumina18 (default). Only needed for ACE assemblies."),
+
+ # Read name regex
+ make_option(opt_str=c("-s", "--split.regex"),
+ action="store",
+ #default=".[0-9]*-",
+ default="nosplit",
+ type="character",
+ help="Indicate a regular expression to cutoff read names (e.g. if
+ modified by the assembler). Only needed for ACE assemblies.
+ Note, if a backslash \"\\\" is needed use \"\\\\\\\\\"!"),
+
+ # make_option(opt_str=c("-n", "--name.regex"),
+ # action="store",
+ # help="Indicate a regular expression to match read names
+ # (e.g. if modified by the assembler)"),
+
+ # Performance parameter
+ make_option(opt_str=c("-t", "--threads"),
+ action="store",
+ default=1,
+ type="integer",
+ help="Indicate a number of cores or threads to use. Might speed up
+ some parallelized operations (default: 1)"),
+
+ make_option(opt_str=c("-m", "--memory"),
+ action="store",
+ default=32,
+ type="double",
+ help="Indicate the maximum memory usage (in Gb) of Javas virtual
+ machine (default: 32). Try to increase if big data sets report
+ heap space problems."),
+
+ # Feature parameter
+ make_option(opt_str=c("-k", "--kmer.features"),
+ action="store_true",
+ default=FALSE,
+ help="Indicates whether k-mer features should be computed
+ (experimental, very long runtime) or not (default)"),
+
+ make_option(opt_str=c("-g", "--expected.genome.size"),
+ action="store",
+ default="0",
+ type="character",
+ help="Indicate a list of expected genome sizes (comma separated) or
+ one value for all assemblies. Default is 0, which will estimate
+ the genome sizes as sum of contig lengths."),
+
+ make_option(opt_str=c("-c", "--contig.size.filter"),
+ action="store",
+ default="0",
+ type="integer",
+ help="Indicate a minimum contig size. Default: 0")
+ )
+
+ myparser <- OptionParser(usage="usage: surankco-feature [options]",
+ option_list=parser.options,
+ prog = NULL, description = "", epilogue = "")
+
+ parameters <- parse_args(object = myparser,
+ args = args,
+ print_help_and_exit = TRUE,
+ positional_arguments = FALSE)
+
+
+ # check obligatory parameter
+ checkObligatoryParameters(parameters, need.xor=c("assemblies", "directory"))
+
+ # check assembly.format
+ if (!(parameters$assembly.format %in% c("ace", "contigs.fasta"))){
+ complainAndStop(paste("invalid \"assembly.format\" (resp. suffix)",
+ "(expecting ace or contigs.fasta)"),
+ parameters$assembly.format)
+ }
+
+ # check read.quality.format
+ if (parameters$assembly.format == "ace"){
+ if (parameters$read.quality.format == "no value") parameters$read.quality.format <- "qual"
+ else if (!(parameters$read.quality.format %in% c("qual", "qua", "fastq"))){
+ complainAndStop(paste("invalid \"read.quality.format\"",
+ "(expecting qual, qua, fastq)"),
+ parameters$read.quality.format)
+ }
+ }
+ if (parameters$assembly.format == "contigs.fasta"){
+ if (parameters$read.quality.format == "no value") parameters$read.quality.format <- "sam"
+ else if (!(parameters$read.quality.format %in% c("sam", "bam"))){
+ complainAndStop(paste("invalid \"read.quality.format\"",
+ "(expecting sam or bam)"),
+ parameters$read.quality.format)
+ }
+ }
+
+ # check files
+ parameters$files <- prepareFiles(parameters, "assemblies",
+ parameters$assembly.format,
+ parameters$read.quality.format)
+
+ # parameter checks
+ # fastq.version
+ if (!(parameters$fastq.version %in% c("auto", "sanger", "solexa",
+ "illumina13", "illumina15",
+ "illumina18"))){
+ complainAndStop(paste("invalid \"fastq.version\"",
+ "(expecting auto, sanger, solexa, illumina13, ",
+ "illumina15 or illumina18)"),
+ parameters$fastq.version)
+ }
+
+ # threads
+ if (is.na(parameters$threads) || parameters$threads < 1){
+ complainAndStop(paste("invalid thread number",
+ "(expecting a positiv integer)"),
+ parameters$threads)
+ }
+ parameters$threads <- min(parameters$threads, nrow(parameters$files))
+
+ # memory
+ if (is.na(parameters$memory) || parameters$memory <= 0){
+ complainAndStop(paste("invalid memory value ",
+ "(expecting a positiv number)"),
+ parameters$memory)
+ }
+
+ # expected.genome.size
+ splits <- as.numeric(strsplit(parameters$expected.genome.size, ",")[[1]])
+ if (any(is.na(splits)) || any(floor(splits)!=splits) || any(splits < 0)){
+ complainAndStop(paste("invalid values of \"expected.genome.size\"",
+ "(expecting positive integers only)"), splits)
+ }
+ if (length(parameters$expected.genome.size) > 1){
+ if (nrow(parameters$files) != length(parameters$expected.genome.size)){
+ complainAndStop(paste("number of expected.genome.size values and assembly",
+ "files not matching"),
+ paste(length(parameters$expected.genome.size), "vs",
+ nrow(parameters$files)))
+ }
+ }
+ parameters$expected.genome.size <- splits
+
+ # contig.size.filter
+ if (is.na(parameters$contig.size.filter) || parameters$contig.size.filter < 0){
+ complainAndStop(paste("invalid minimum contig size",
+ "(expecting a positiv integer >= 0)"),
+ parameters$contig.size.filter)
+ }
+
+ return(parameters)
+}
+
+
+# surankco-score argument parser
+parseSurankcoScore <- function(args = commandArgs(trailingOnly = TRUE)){
+ # Parses parameters for the surankco score module
+ #
+ # Args:
+ # args: commandline parameters, default = commandArgs(trailingOnly = TRUE)
+ #
+ # Returns:
+ # List of parameter values as provided by parse_args (optparse package)
+
+ parser.options <- list(
+ # Input parameter
+ make_option(opt_str=c("-a", "--assemblies"),
+ action="store",
+ help="Indicate a list of assembly files (comma separated), correct
+ suffixes are mandatory (default \"*.contigs.fasta\")"),
+
+ make_option(opt_str=c("-d", "--directory"),
+ action="store",
+ help="Indicate a directory containing assembly files with indicated
+ format (e.g. \"*.contigs.fasta\")"),
+
+ make_option(opt_str=c("-f", "--assembly.suffix"),
+ action="store",
+ default="contigs.fasta",
+ help="Indicate assembly format/suffix, default=\"contigs.fasta\""),
+
+ make_option(opt_str=c("-r", "--reference.suffix"),
+ action="store",
+ default="ref.fasta",
+ help="Indicate the reference format/suffix, default=\"ref.fasta\""),
+
+ make_option(opt_str=c("-p", "--pdf.histograms"),
+ action="store",
+ default="surankco_score_histograms.pdf",
+ help="Indicate a name for the score histogram pdf
+ (default = \"surankco_score_histograms.pdf\")"),
+
+ make_option(opt_str=c("-m", "--memory"),
+ action="store",
+ default=32,
+ type="integer",
+ help="Indicate the maximum memory usage (in Gb) of Javas virtual
+ machine (default: 32). Try to increase if big data sets report
+ heap space problems.")
+ )
+
+ myparser <- OptionParser(usage="usage: surankco-score [options]",
+ option_list=parser.options,
+ prog = NULL, description = "", epilogue = "")
+
+ parameters <- parse_args(object = myparser,
+ args = args,
+ print_help_and_exit = TRUE,
+ positional_arguments = FALSE)
+
+
+ # check obligatory parameter
+ checkObligatoryParameters(parameters, need.xor=c("assemblies", "directory"))
+
+ # check files
+ parameters$files <- prepareFiles(parameters, "assemblies",
+ parameters$assembly.suffix,
+ parameters$reference.suffix)
+
+ # parameter checks
+ # pdf.histograms
+ if (!file.exists(dirname(parameters$pdf.histograms))){
+ complainAndStop("can not find path for \"pdf.histograms\"",
+ parameters$pdf.histograms)
+ }
+
+ # memory
+ if (is.na(parameters$memory) || parameters$memory <= 0){
+ complainAndStop(paste("invalid memory value ",
+ "(expecting a positiv number)"),
+ parameters$memory)
+ }
+
+ return(parameters)
+}
+
+
+# surankco-training argument parser
+parseSurankcoTraining <- function(args = commandArgs(trailingOnly = TRUE)){
+ # Parses parameters for the surankco training module
+ #
+ # Args:
+ # args: commandline parameters, default = commandArgs(trailingOnly = TRUE)
+ #
+ # Returns:
+ # List of parameter values as provided by parse_args (optparse package)
+
+ parser.options <- list(
+ # Input parameter
+ make_option(opt_str=c("-f", "--features"),
+ action="store",
+ help="Indicate a list of surankco feature files (comma separated),
+ correct suffixes are mandatory (\"*.features.txt\")"),
+
+ make_option(opt_str=c("-d", "--directory"),
+ action="store",
+ help="Indicate a directory containing surankco feature files
+ (\"*.features.txt\")"),
+
+ make_option(opt_str=c("-o", "--output.filename"),
+ action="store",
+ default="surankco_rfs.RData",
+ help="Indicate a name for the export of the random forest classifiers
+ (default = \"surankco_rfs.RData\")"),
+
+ make_option(opt_str=c("-e", "--exponential.quantile"),
+ action="store",
+ default=0.25,
+ help="Indicate an exponential distribution quantile to divide scores
+ (default: 0.25)"),
+
+ make_option(opt_str=c("-m", "--manual.thresholds"),
+ action="store",
+ help="Indicate manual thresholds instead of quantiles, one per score
+ each (comma separated) in the same order as in the score file")
+ )
+
+ myparser <- OptionParser(usage="usage: surankco-training [options]",
+ option_list=parser.options,
+ prog = NULL, description = "", epilogue = "")
+
+ parameters <- parse_args(object = myparser,
+ args = args,
+ print_help_and_exit = TRUE,
+ positional_arguments = FALSE)
+
+
+ # check (obligatory) parameters
+ checkObligatoryParameters(parameters, need.xor=c("features", "directory"))
+
+ # check files
+ parameters$features.suffix <- "features.txt"
+ parameters$scores.suffix <- "scores.txt"
+ parameters$files <- prepareFiles(parameters, "features",
+ parameters$features.suffix,
+ parameters$scores.suffix)
+
+ # parameter checks
+ # output.filename
+ if (!file.exists(dirname(parameters$output.filename))){
+ complainAndStop("can not find path for \"output.filename\"",
+ parameters$output.filename)
+ }
+
+ # exponential.quantile
+ if (is.na(parameters$exponential.quantile) ||
+ parameters$exponential.quantile <= 0 ||
+ parameters$exponential.quantile >= 1){
+ complainAndStop("\"exponential.quantile\" out of range (0,1)",
+ parameters$exponential.quantile)
+ }
+
+ # manual.thresholds
+ if ("manual.thresholds" %in% names(parameters)){
+ splits <- as.numeric(strsplit(parameters$manual.thresholds, ",")[[1]])
+ in.range <- 0 <= splits[-c(5,7)] & splits[-c(5,7)] <= 1
+ in.range <- append(in.range, 0 <= splits[c(5,7)] & splits[c(5,7)] <= 100)
+ if (length(splits) != 8){
+ complainAndStop("wrong number of \"manual.thresholds\" (expecting 8)", splits)
+ }
+ if (is.na(splits) || !all(in.range)){
+ complainAndStop(paste0("out of range \"manual.thresholds\" (expecting ",
+ "[0,1] resp. [0,100] for threshold 5 and 7)"), splits)
+ }
+ parameters$manual.thresholds <- splits
+ }
+
+ return(parameters)
+}
+
+
+# surankco-prediction argument parser
+parseSurankcoPrediction <- function(args = commandArgs(trailingOnly = TRUE)){
+ # Parses parameters for the surankco prediction module
+ #
+ # Args:
+ # args: commandline parameters, default = commandArgs(trailingOnly = TRUE)
+ #
+ # Returns:
+ # List of parameter values as provided by parse_args (optparse package)
+
+ parser.options <- list(
+ # Input parameter
+ make_option(opt_str=c("-f", "--features"),
+ action="store",
+ help="Indicate one surankco feature file, a correct suffixe is
+ mandatory (\"*.features.txt\")"),
+
+ # make_option(opt_str=c("-d", "--directory"),
+ # action="store",
+ # help="Indicate a directory containing surankco feature files
+ # (\"*.features.txt\")"),
+
+ make_option(opt_str=c("-r", "--random.forests"),
+ action="store",
+ help="Indicate a surankco random forests file (\"*.RData\")"),
+
+ make_option(opt_str=c("-o", "--output.filename"),
+ action="store",
+ default="surankco_results.txt",
+ help="Indicate a name for the final results file")
+ )
+
+ myparser <- OptionParser(usage="usage: surankco-prediction [options]",
+ option_list=parser.options,
+ prog = NULL, description = "", epilogue = "")
+
+ parameters <- parse_args(object = myparser,
+ args = args,
+ print_help_and_exit = TRUE,
+ positional_arguments = FALSE)
+
+
+ # check obligatory parameter
+ checkObligatoryParameters(parameters, need.all=c("features", "random.forests"))
+
+ # check files
+ parameters$features.suffix <- "features.txt"
+ parameters$rf.suffix <- "RData"
+ parameters$files.features <- prepareFiles(parameters, "features", parameters$features.suffix)
+ parameters$files.rf <- prepareFiles(parameters, "random.forests", parameters$rf.suffix)
+ parameters$files.features <- parameters$files.features$features.txt
+ parameters$files.rf <- parameters$files.rf$RData
+
+ # parameter checks
+ # output.filename
+ if (!file.exists(dirname(parameters$output.filename))){
+ complainAndStop("can not find path for \"output.filename\"",
+ parameters$output.filename)
+ }
+
+ return(parameters)
+}
+
+
+checkObligatoryParameters <- function(values, need.all=NULL, need.xor=NULL){
+ # Checks whether all obligatory parameters were indicated
+ #
+ # Args:
+ # values: list of parameter values as provided
+ # by parse_args (optparse package)
+ # need.all: vector of obligatory parameters
+ # need.xor: vector of parameters where only one is needed (exclusive)
+ #
+ # Returns:
+ # Nothing, but will stop execution if obligatory parameters are missing
+
+ if (!is.null(need.all)){
+ if (!all(test <- need.all %in% names(values))){
+ complainAndStop("missing parameter(s)", need.all[!test])
+ }
+ }
+
+ if (!is.null(need.xor)){
+ if (sum(test <- need.xor %in% names(values)) > 1){
+ complainAndStop("conflicting parameters", need.xor[test])
+ }
+ if (sum(test <- need.xor %in% names(values)) < 1){
+ complainAndStop("missing parameters (either)", need.xor)
+ }
+ }
+}
+
+
+prepareFiles <- function(parameters, file.parameter,
+ first.suffix, additional.suffixes=NULL){
+ # prepares a list of files from either a directory or a filelist
+
+ if ("directory" %in% names(parameters)){
+ filelist <- paste(sep="", parameters$directory, "/",
+ list.files(path=parameters$directory,
+ pattern=paste(first.suffix, "$", sep="")))
+ }
+ else{
+ filelist <- strsplit(parameters[[file.parameter]], split=",", fixed=TRUE)[[1]]
+ }
+
+ files <- checkFiles(filelist, first.suffix, additional.suffixes)
+
+ return(files)
+}
+
+
+checkFiles <- function(filelist, first.suffix, additional.suffixes=NULL){
+ # Checks if files have the expected suffix and if files exists,
+ # includes files with same prefix but different suffix
+ #
+ # Args:
+ # filelist:
+ # first.suffix:
+ # additional.suffixes:
+ #
+ # Returns:
+ # A data.frame containing filename columns per suffix or
+ # will stop execution if files are missing
+
+ suffix <- paste(".", first.suffix, "$", sep="")
+
+ # check first.suffix
+ missing.suffix <- !grepl(pattern=suffix, x=filelist)
+ if (any(missing.suffix)){
+ complainAndStop(paste("missing suffix -", first.suffix,
+ "- for file(s)", sep=""),
+ filelist[missing.suffix],
+ item.start=":\n\t",
+ item.sep="\n\t")
+ }
+
+ prefix <- sub(pattern=suffix, replacement="", x=filelist)
+ files <- data.frame(filelist, stringsAsFactors=FALSE)
+ colnames(files) <- first.suffix
+ rownames(files) <- prefix
+
+ # build filenames from additional.suffixes
+ if (!is.null(additional.suffixes)){
+ for (add.suf in additional.suffixes){
+ files[add.suf] <- paste(prefix, add.suf, sep=".")
+ }
+ }
+
+ # check file existence
+ missing.files <- !file.exists(unlist(files))
+ if (any(missing.files)){
+ complainAndStop("missing file(s)",
+ unlist(files)[missing.files],
+ item.start=":\n\t",
+ item.sep="\n\t")
+ }
+
+ return(files)
+}
+
+
+complainAndStop <- function(message, items=NULL,
+ item.start=": ", item.sep=", ", item.end=""){
+ # Prints an error message and stops the programm using stop()
+ #
+ # Args:
+ # message: error message to print
+ # items: list of items to add to the message (e.g. missing files)
+ # item.start: separator between message and items (e.g. ": ")
+ # item.sep: separator for the item list (e.g. ", ", "\n\t")
+
+ if (is.null(items)){
+ stop(message, call.=FALSE)
+ }
+ else{
+ items <- paste(items, collapse=item.sep)
+ stop(paste(message, items, sep=item.start), call.=FALSE)
+ }
+}
\ No newline at end of file
diff --git a/r/pslMatchFilter b/r/pslMatchFilter
new file mode 100755
index 0000000..2aa38d8
--- /dev/null
+++ b/r/pslMatchFilter
@@ -0,0 +1,27 @@
+#!/usr/bin/Rscript
+
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# R script to filter a BLAT psl file for maximum matching hits
+args <- commandArgs(trailingOnly = TRUE)
+
+# prepare psl column names
+pslnames <- "match mismatch repmatch Ns Qgapcount Qgapbases Tgapcount Tgapbases strand Qname Qsize Qstart Qend Tname Tsize Tstart Tend blockcount blockSizes qStarts tStarts"
+pslnames <- strsplit(pslnames, split="[ \t]+")[[1]]
+
+# import the psl file
+filename <- args[1]
+pslheader <- readLines(filename, n=5)
+psldata <- read.table(filename, sep="\t", skip=5)
+colnames(psldata) <- pslnames
+
+# sort psl data by query name and match count, keep entry with maximum match per query
+sorted <- psldata[order(psldata$Qname, -psldata$match), ]
+uniques <- sorted[!duplicated(sorted$Qname), ]
+
+# export the filter psl data
+filename <- args[2]
+write(pslheader, filename)
+write.table(uniques, filename, append=T, quote=F, sep="\t", row.names=F, col.names=F)
diff --git a/r/rf.R b/r/rf.R
new file mode 100755
index 0000000..f2e235d
--- /dev/null
+++ b/r/rf.R
@@ -0,0 +1,103 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# functions handling the training and prediction of contig qualities with random forest
+#
+# author: Mathias Kuhring
+
+
+# random forest training of contig qualities
+# returns a list of randomForest objects, one per target column
+rfTraining <- function(input, targets, balance=FALSE){
+ if (DEBUGGING){ print('forest training') }
+
+ rfList <- vector("list", ncol(targets))
+ names(rfList) <- names(targets)
+ library("randomForest")
+ for (i in 1:ncol(targets)){
+ if (DEBUGGING){ print(i) }
+
+ # balancing
+ index <- 1:dim(input)[1]
+ if (balance){
+ if (DEBUGGING){
+ print("balancing...")
+ print(paste("size before:", length(index)))
+ }
+ index <- balancingUp(training[,i])
+ if (DEBUGGING){ print(paste("size after:", length(index))) }
+ }
+
+# vars <- 3:ncol(input)
+
+ # training
+ rfList[[i]] <- randomForest(x=input[index, ], y=targets[index,i],
+ importance=TRUE, localImp=TRUE, ntree=100)
+ }
+
+ if (DEBUGGING){ print('training done') }
+ return(rfList)
+}
+
+
+# random forest prediction of contig qualities
+# returns a list of predictions per metric
+scorePrediction <- function(rfs, input, type){
+ if (DEBUGGING){ print("score prediction") }
+
+ predictions <- vector("list", length(rfs))
+ names(predictions) <- names(rfs)
+
+# vars <- 3:ncol(input)
+
+ require("randomForest")
+ for (i in 1:length(rfs)){
+ # prediction
+ predictions[[i]] <- predict(rfs[[i]], input, type=type)
+ }
+
+ if (DEBUGGING){ print("prediction done") }
+ return(predictions)
+}
+
+
+selectContigs <- function(features, scores){
+
+ number <- length(features)
+ combination <- vector(mode="list", number)
+
+ for (i in 1:number){
+ features[[i]]$Assembly <- NULL
+ features[[i]]$ReadQuality <- NULL
+ scores[[i]]$Assembly <- NULL
+ scores[[i]]$Reference <- NULL
+ combination[[i]] <- merge(features[[i]], scores[[i]], by="ContigID")
+ }
+
+ return( do.call(rbind, combination) )
+}
+
+dataFilter <- function(data){
+ if (DEBUGGING){ print('data filtering') }
+ check <- vector('logical',nrow(data))
+ for (i in 1:ncol(data)){
+ check <- check | is.infinite(data[,i])
+ check <- check | is.nan(data[,i])
+ check <- check | is.na(data[,i])
+ }
+
+ # (Temporary) Remove Contigs with MatchCount bigger than Length (shouldn't be)
+ if ("NormedMatchCount1" %in% names(data)){
+ check <- check | data$NormedMatchCount1 > 1
+ }
+
+ # # Remove MIRA Contigs with repeats
+ # check <- check | grepl("*rep*", data$ContigID)
+
+ data <- data[!check,]
+ if (sum(check)>0) cat(paste("warning: removed", sum(check), " inconsistent contigs\n"))
+
+ if (DEBUGGING){ print('filtering done') }
+ return(data)
+}
\ No newline at end of file
diff --git a/r/scores.R b/r/scores.R
new file mode 100755
index 0000000..25e98fb
--- /dev/null
+++ b/r/scores.R
@@ -0,0 +1,88 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# import of the scores from the java routine
+importScores <- function(score.files, assembly.files, reference.files){
+ scores <- vector('list', length(score.files))
+ if (DEBUGGING){ print("metrics import") }
+ for (i in 1:length(score.files)){
+ if (DEBUGGING){ print(as.character(score.files[i])) }
+ scores[[i]] <- cbind(Assembly=assembly.files[i],
+ Reference=reference.files[i],
+ read.table(score.files[i], header=TRUE,
+ colClasses=get.colClasses(score.files[i])))
+ }
+ if (DEBUGGING){ print("import done") }
+ return(scores)
+}
+
+
+# removes contig duplicates with same ID (e.g. different local matches)
+# returns best contig per ID depending on filter metric values (highest MatchCount per default)
+bestDuplicate <- function(metrics, filter="MatchCount", highest=TRUE){
+ if (highest){ mod <- -1 } else{ mod <- 1 }
+ sorted <- metrics[order(metrics$ContigID, mod * metrics[filter]), ]
+ uniques <- sorted[!duplicated(sorted$ContigID),]
+ return(uniques)
+}
+
+
+# removes redundant and correlating scores
+correctScores <- function(metrics){
+ for (i in 1:length(metrics)){
+ metrics[[i]]$Sample <- NULL
+
+ # unnormalized scores are not further used
+ metrics[[i]]$MatchCount <- NULL
+ metrics[[i]]$ErrorCount <- NULL
+ metrics[[i]]$ErrorSubtraction <- NULL
+
+ # following scores are directly correlated and yield no further information
+ metrics[[i]]$NormedErrorCount2 <- NULL
+ metrics[[i]]$NormedErrorSubtraction <- NULL
+ metrics[[i]]$MaxEndContError <- NULL
+
+ metrics[[i]]$NormedContigLength2 <- NULL
+ }
+ return(metrics)
+}
+
+
+# separates the scores into two classes each based on the manual thresholds
+manualClasses <- function(scores, thresholds){
+ return(regToClassByThresholds(scores, thresholds))
+}
+
+
+# separates the scores into two classes each based on exponential fittings
+expoClasses <- function(scores, quantile){
+
+ thresholds <- getExpoClassThresholds(scores, quantile)
+ scores <- regToClassByThresholds(scores, thresholds)
+
+ return(scores)
+}
+
+
+# separates data into classes by given thresholds
+regToClassByThresholds <- function(scores, thresholds){
+ firstMetric <- which(colnames(scores)=="NormedMatchCount1")
+ lastMetric <- ncol(scores)
+
+ for (i in firstMetric:lastMetric){
+
+ if (any(colnames(scores)[i] ==
+ c("NormedMatchCount1", "NormedMatchCount2",
+ "NormedContigLength1", "VitorScore")))
+ {
+ scores[i] <- as.factor(scores[i] >= thresholds[i])
+ }
+ else
+ {
+ scores[i] <- as.factor(scores[i] <= thresholds[i])
+ }
+ }
+
+ return(scores)
+}
\ No newline at end of file
diff --git a/r/voting.R b/r/voting.R
new file mode 100755
index 0000000..94d96cb
--- /dev/null
+++ b/r/voting.R
@@ -0,0 +1,42 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+
+# metric vote, by default TRUE when half of metrics/scores TRUE, but adjustable with min
+voting <- function(scoreList, min=(length(scoreList)/2)){
+ pro <- rep(0, length(scoreList[[1]]))
+ contra <- rep(0, length(scoreList[[1]]))
+
+ for (i in 1:length(scoreList)){
+ pro <- pro + as.numeric(as.logical(scoreList[[i]]))
+ contra <- contra + as.numeric(!as.logical(scoreList[[i]]))
+ }
+
+ choise <- (pro >= min)
+ return(choise)
+}
+
+
+# returns the sum of TRUE votes
+votingSum <- function(scoreList){
+ pro <- rep(0, length(scoreList[[1]]))
+
+ for (i in 1:length(scoreList)){
+ pro <- pro + as.numeric(as.logical(scoreList[[i]]))
+ }
+
+ return(pro)
+}
+
+
+# returns the sum of weighted TRUE votes
+weightedVotingSum <- function(scoreList, weightList){
+ pro <- rep(0, length(scoreList[[1]]))
+
+ for (i in 1:length(scoreList)){
+ pro <- pro + as.numeric(as.logical(scoreList[[i]])) * weightList[[i]][,"TRUE"]
+ }
+
+ return(pro)
+}
\ No newline at end of file
diff --git a/readme.txt b/readme.txt
new file mode 100755
index 0000000..2812b08
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,354 @@
+SuRankCo: Supervised Ranking of Contigs in de novo Assemblies
+
+Author: Mathias Kuhring
+Contact: KuhringM at rki.de
+
+
+UPDATE
+------
+Jun2015:
+* Minor changes to enable BAM support, too.
+
+Feb2015:
+* Added support for FASTA/SAM assemblies in addition to ACE/FASTQ(QUAL).
+ NOTE: features of FASTA/SAM assemblies will not include BaseCount,
+ BaseSeqmentCount and ContigQualities, yet.
+
+
+INTRODUCTION
+------------
+
+SuRankCo is a machine learning based software to score and rank contigs from de
+novo assemblies of next generation sequencing data. It trains with alignments of
+contigs with known reference genomes and predicts scores and ranking for contigs
+which have no related reference genome yet.
+
+For more details about SuRankCo and its functioning, please see
+
+ "SuRankCo: Supervised Ranking of Contigs in de novo Assemblies"
+ Mathias Kuhring, Piotr Wojtek Dabrowski, Andreas Nitsche and Bernhard Y. Renard
+ (Submitted manuscript)
+
+PLEASE NOTE, it is recommended to read the paper and this readme.txt file before
+using SuRankCo.
+
+
+INSTRUCTIONS
+------------
+SuRankCo consists of four modules:
+- surankco-feature: feature generation from contigs (ACE or FASTA) and
+ corresponding reads (QUAL, QUA, FASTQ resp. SAM or BAM)
+- surankco-score: alignment of unpadded(!) contigs (FASTA format) and
+ reference genomes (FASTA format) and score calculation
+- surankco-training: training of random forest using contig features (from
+ surankco-feature) and contig scores (surankco-score)
+- surankco-prediction: prediction of scores and ranking of contigs using
+ trained random forests (from surankco-training) and
+ contig features (from surankco-feature)
+
+Different combinations of these modules allow for two workflows, training and
+prediction. For the training, use surankco-feature and surankco-score with the
+same contigs followed by surankco-training. For the prediction, use
+surankco-feature with new contigs followed by surankco-prediction.
+
+
+SYSTEM REQUIREMENTS
+-------------------
+The following software and libraries are required to run SuRankCo:
+- Java 7
+ (Source: http://www.java.com)
+- GNU R 3 including Rscript
+ (Source: http://www.r-project.org/)
+- additional GNU R Packages: optparse, MASS, randomForest
+ (Please refer to the R manuals on how to install packages)
+- Blat and accompanying pslPretty
+ (Source: http://hgdownload.cse.ucsc.edu/admin/exe/)
+
+All required executables (java, Rscript, blat and pslPretty) have to be
+available in the PATH environment variable. Please refer to the operating
+systems manual on how to set up the PATH variable, if manual setup is necessary.
+
+NOTE: For Mac/OSX the R version 3.0.3 is recommended. The necessary package
+randomForest is not yet compatible to the latest version 3.1
+(please refer to the troubleshooting section for more details).
+
+
+USER INTERFACE & EXECUTION
+--------------------------
+SuRankCo is a command-line based software with four executables equivalent to
+the four modules. Use the -h flag for more parameter details. surankco-feature,
+surankco-score and surankco-training can process the contigs of several
+assemblies at once. Thus a list of applicable files or a directory can be
+indicated. The directory will be scanned for all files that have an applicable
+suffix. Input file examples are provided below.
+
+surankco-feature
+ - Expects pairs of either ACE and FASTQ/QUAL/QUA or FASTA and SAM/BAM files.
+ You provide either a list of ACE or FASTA files or a directory with such.
+ NOTE: features of FASTA/SAM/BAM assemblies will not include BaseCount,
+ BaseSeqmentCount and ContigQualities, yet.
+ - The expected suffix for the contig FASTAs is ".contigs.fasta" to prevent
+ confusion with reference genome fasta files.
+ - FASTQ/QUAL/QUA or SAM/BAM files are automatically detected and expected to
+ share the same prefix such as their corresponding ACE or FASTA files, resp.
+ - For ACE: QUAL (default), QUA or FASTQ format can be selected per parameter as well
+ as the FASTQ version (default: Illumina1.8+).
+ - The FASTQ version is alternatively auto detectable, but the identification is
+ not (!) 100% reliable and thus uncertainties will stop the process and
+ suggestions for manual settings will be provided.
+ - Some de novo assembler may extend the read names with extra information. A
+ Java compatible regular expression can be indicated to cut of the extensions
+ to enable matching between ACE and QUAL/FASTQ reads. E.g., a Newbler read
+ "myread.000089727.26-146.fm1165.to1077" may by trimmed to the original name
+ "myread.000089727" by using the regex "\\\\.\\\\d+-".
+ NOTE, if a backslash "\" is needed use "\\\\"!
+ - Some feature generation steps can be executed in parallel if more than one
+ assembly (resp. ACE or FASTA file) is processed. Thus, a number of threads
+ can be indicated.
+ - The memory used by the Java virtual machine can be adjusted, e.g. if the
+ default (32 GB) is too high for the system or too low for big assemblies.
+ - The feature "Genome Relation" requires an expected genome size. It can be
+ indicated or roughly estimated as the sum of contig lengths.
+
+ -> The output is a set of TXT files (one per input ACE or FASTA file)
+ containing the calculated features in a table like format. The files are
+ named with same paths and prefixes as the corresponding assembly files and
+ the suffix ".features.txt".
+
+surankco-score
+ - Expects a list of contig FASTA files or a directory with FASTA files.
+ - The expected suffix for the contig FASTAs is ".contigs.fasta" (variable).
+ - Reference genome FASTAs are automatically detected and expected to share the
+ same prefix as corresponding contig FASTAs followed by the suffix
+ ".ref.fasta" (variable).
+ - The memory used by the Java virtual machine can be adjusted, e.g. if the
+ default (32 GB) is too high for the system or too low for big assemblies.
+
+ -> The output is a set of TXT files (one per input contig FASTA file)
+ containing the calculated scores in a table like format. The files are named
+ with same paths and prefixes as the corresponding FASTA files and the suffix
+ ".scores.txt". Additionally, a PDF file with histograms per score calculated
+ over all processed contigs is exported to support the class separation
+ threshold selection in surankco-training. The default filename
+ "surankco_score_histograms.pdf" can be customized.
+ (Please refer to the indicated paper for details about the thresholds)
+
+surankco-training
+ - Expects a list of SuRankCo feature files or a directory with feature files.
+ - The expected suffix for the feature files is ".features.txt".
+ - Corresponding SuRankCo score files are automatically detected and expected to
+ share the same prefix as the corresponding feature files followed by the
+ suffix ".scores.txt".
+ - For the class separation, manual thresholds per score or a quantile for
+ automatic threshold selection with exponential fittings can be indicated.
+ Manual thresholds need range [0,1], except for threshold 5 and 7
+ (MaxRegionError and MaxEndErrorCount) which need range [0,100]. Most scores
+ get separated into <= or > of the threshold, except for NormedMatchCount1,
+ NormedMatchCount2 and NormedContigLength1 which use >= or <.
+ The exponential quantile needs the range [0,1] and uses <= or > in all cases,
+ since NormedMatchCount1, NormedMatchCount2 and NormedContigLength1 have to be
+ reversed anyway to enable exponential fittings.
+
+ -> The output is an R data file containing the trained random forests.
+ The default file name is "surankco_rfs.RData" but can be changed.
+
+surankco-prediction
+ - Expects one SuRankCo feature file with suffix ".features.txt".
+ - Expects a R data file with a random forest trained by surankco-training and
+ the suffix ".RData".
+
+ -> The output is a table formatted text file (named "surankco_results.txt" per
+ default, but can be changed), sorted by decreasing scores.
+
+ The file contains four columns:
+ "Assembly" : the filename of the corresponding assembly file
+ "ReadQuality" : the filename of the corresponding read quality file
+ "ContigID" : the contig id as in the corresponding assembly and
+ feature file
+ "SurankcoContigScore" : the final SuRankCo contig score
+
+
+INPUT FILE & EXECUTION EXAMPLES
+-------------------------------
+The following file lists illustrate the required file combinations and suffixes.
+
+Training
+surankco-feature:
+ assembly1.ace, assembly2.ace, assemblyXYZ.ace
+ assembly1.qual, assembly2.qual, assemblyXYZ.qual
+surankco-score:
+ assembly1.contigs.fasta, assembly2.contigs.fasta, assemblyXYZ.contigs.fasta
+ assembly1.ref.fasta, assembly2.ref.fasta, assemblyXYZ.ref.fasta
+surankco-training:
+ assembly1.features.txt, assembly2.features.txt, assemblyXYZ.features.txt
+ assembly1.scores.txt, assembly2.scores.txt, assemblyXYZ.scores.txt
+
+Prediction
+surankco-feature:
+ assembly_new.ace
+ assembly_new.qual
+surankco-prediction:
+ assembly_new.txt
+ surankco_rfs.RData
+
+The following calls show example parameter uses (short flags and long flags).
+The example calls assume all files to be in the execution directory (-a) or in
+the indicated directory (-d). However, full pathnames can be indicated.
+Note, file list and directory parameters are mutually exclusive but indicating
+one of them is mandatory. Other parameters are optional due to default values.
+manual.thresholds has higher priority than exponential.quantile when indicated.
+
+surankco-feature
+ -a assembly1.ace,assembly2.ace,assemblyXYZ.ace
+ -d /home/user/mydata/
+ -r qual
+ -q illumina18
+ -s ////.////d+-
+ -t 4
+ -m 32
+ -k
+ -g 1234567,2000000,111111
+
+ --assemblies=assembly1.ace,assembly2.ace,assemblyXYZ.ace
+ --directory=/home/user/mydata/
+ --read.quality.format=qual
+ --fastq.version=illumina18
+ --split.regex=////.////d+-
+ --threads=4
+ --memory=32
+ --kmer.features
+ --expected.genome.size=1234567,2000000,111111
+
+surankco-score
+ -a assembly1.contigs.fasta,assembly2.contigs.fasta,assemblyXYZ.contigs.fasta
+ -d /home/user/mydata/
+ -f contigs.fasta
+ -r ref.fasta
+ -p /home/user/mydata/surankco_score_histograms.pdf
+ -m 32
+
+ --assemblies=assembly1.contigs.fasta,assembly2.contigs.fasta,assemblyXYZ.contigs.fasta
+ --directory=/home/user/mydata/
+ --assembly.suffix=contigs.fasta
+ --reference.suffix=ref.fasta
+ --pdf.histograms=/home/user/mydata/surankco_score_histograms.pdf
+ --memory=32
+
+surankco-training
+ -f assembly1.features.txt,assembly2.features.txt,assemblyXYZ.features.txt
+ -d /home/user/mydata/
+ -o /home/user/mydata/mydata_rfs.RData
+ -e 0.5
+ -m 0.95,0.95,0.05,0.05,5,0.05,5,0.95
+
+ --features=assembly1.features.txt,assembly2.features.txt,assemblyXYZ.features.txt
+ --directory=/home/user/mydata/
+ --output.filename=/home/user/mydata/mydata_rfs.RData
+ --exponential.quantile=0.5
+ --manual.thresholds=0.95,0.95,0.05,0.05,5,0.05,5,0.95
+
+surankco-prediction
+ -f assembly_new.txt
+ -r surankco_rfs.RData
+ -o my_results.txt
+
+ --features=assembly_new.txt
+ --random.forests=surankco_rfs.RData
+ --output.filename=my_results.txt
+
+
+TROUBLESHOOTING
+---------------
+P1: [OSX] Error loading package "randomForest" in R 3.1
+ Error in dyn.load(file, DLLpath = DLLpath, ...) :
+ kann shared object '/Library/Frameworks/R.framework/Versions/3.1/Resources/
+ library/randomForest/libs/randomForest.so' nicht laden:
+ dlopen(/Library/Frameworks/R.framework/Versions/3.1/Resources/library/
+ randomForest/libs/randomForest.so, 6): Library not loaded: /Library/
+ Frameworks/R.framework/Versions/3.0/Resources/lib/libgfortran.2.dylib
+ Referenced from: /Library/Frameworks/R.framework/Versions/3.1/Resources/
+ library/randomForest/libs/randomForest.so
+ Reason: image not found
+ Fehler: Laden von Paket oder Namensraum für ‘randomForest’ fehlgeschlagen
+
+ randomForest tries to load external libraries of older R Version 3.0.*.
+ However, providing them by installing R 3.0.* in parallel leads to
+ incompatibilities (see next problem P2). Therefore, using R 3.0.3 solely is
+ recommended for OSX.
+
+P2: [OSX] Memory error running suranco-training with R 3.1
+ *** caught segfault ***
+ address 0x18, cause 'memory not mapped'
+
+ R 3.1 doesn't provide the required versions of external libraries for the
+ randomForest package (see previous problem P1). Providing the libraries of
+ older R Versions leads to incompatibilities with R 3.1 and seems to produce a
+ segfault. Therefore, using R 3.0.3 solely is recommended for OSX.
+
+P3: Permission denied when executing surankco, e.g.:
+ -bash: /home/kuhringm/workspace/surankco/surankco-score: Permission denied
+
+ The surankco scripts might have lost their execution permission. In a terminal,
+ go to the suranko directory and run "chmod +x surankco-feature surankco-score
+ surankco-training surankco-prediction".
+
+P4: suranco-score terminates with following Permission denied error:
+ sh: 1: /home/kuhringm/workspace/surankco/r/pslMatchFilter: Permission denied
+ Error: pslMatchFilter not successfully executed: 126
+ Execution halted
+
+ pslMatchFilter needs execution permissions. In a terminal, go to the suranko
+ directory and run "chmod +x r/pslMatchFilter".
+
+P4: surankco-score prints dexp/NaN warning messages:
+ Warning messages:
+ 1: In dexp(x, estimate, log = TRUE) : NaNs produced
+ 2: In dexp(x, rate = fit$estimate) : NaNs produced
+
+ Some scores don't have enough variance to fit an exponential distribution. This
+ is not critical in surankco-score, but might be a problem for the training
+ (see problem P6).
+
+P5: surankco-training prints dexp/NaN warning messages:
+ Warning messages:
+ 1: In dexp(x, estimate, log = TRUE) : NaNs produced
+
+ Some scores don't have enough variance to fit an exponential distribution.
+ The training will not be possible (see problem P6).
+
+P6: surankco-training terminates with following training error
+ Error in randomForest.default(x = input[index, ], y = targets[index, i], :
+ Need at least two classes to do classification.
+ Calls: rfTraining -> randomForest -> randomForest.default
+ Execution halted
+
+ Some scores don't provide two classes. Either the variance of these scores is
+ indeed to low or the thresholds for class separation are poorly chosen. The
+ variance might be increased by providing more contigs for the training.
+
+
+--------------------------------------------------------------------------------
+Copyright (c) 2014,
+Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * The name of the author may not be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL Mathias Kuhring BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/src/de/rki/ng4/surankco/data/AceContig.java b/src/de/rki/ng4/surankco/data/AceContig.java
new file mode 100755
index 0000000..196aec6
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/AceContig.java
@@ -0,0 +1,220 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.data;
+
+import java.util.Vector;
+
+public class AceContig{
+
+ private int[][] nucleotides;
+
+ private int lineCO;
+ private int lineBQ;
+
+ private String header;
+
+ private String id;
+ private int baseCount;
+ private int readCount;
+ private int baseSeqmentCount;
+ private String orientation;
+
+ private String consensusSequence;
+
+ private Vector<Integer> qualityValues;
+
+ private Vector<AceRead> reads;
+
+ public AceContig(String header){
+ // CO 1 30502 510 273 U
+ this.header = header;
+
+ String[] splits = header.split(" ");
+
+ id = splits[1];
+ baseCount = Integer.parseInt(splits[2]);
+ readCount = Integer.parseInt(splits[3]);
+ baseSeqmentCount = Integer.parseInt(splits[4]);
+ orientation = splits[5];
+
+ qualityValues = new Vector<Integer>();
+ reads = new Vector<AceRead>();
+ }
+
+ public void addRead(AceRead read){
+ reads.add(read);
+ }
+
+
+ /**
+ * @return the nucleotides
+ */
+ public int[][] getNucleotides() {
+ return nucleotides;
+ }
+
+ /**
+ * @param nucleotides the nucleotides to set
+ */
+ public void setNucleotides(int[][] nucleotides) {
+ this.nucleotides = nucleotides;
+ }
+
+ /**
+ * @return the lineCO
+ */
+ public int getLineCO() {
+ return lineCO;
+ }
+
+ /**
+ * @param lineCO the lineCO to set
+ */
+ public void setLineCO(int lineCO) {
+ this.lineCO = lineCO;
+ }
+
+ /**
+ * @return the lineBQ
+ */
+ public int getLineBQ() {
+ return lineBQ;
+ }
+
+ /**
+ * @param lineBQ the lineBQ to set
+ */
+ public void setLineBQ(int lineBQ) {
+ this.lineBQ = lineBQ;
+ }
+
+ /**
+ * @return the header
+ */
+ public String getHeader() {
+ return header;
+ }
+
+ /**
+ * @param header the header to set
+ */
+ public void setHeader(String header) {
+ this.header = header;
+ }
+
+ /**
+ * @return the id
+ */
+ public String getId() {
+ return id;
+ }
+
+ /**
+ * @param id the id to set
+ */
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ /**
+ * @return the baseCount
+ */
+ public int getBaseCount() {
+ return baseCount;
+ }
+
+ /**
+ * @param baseCount the baseCount to set
+ */
+ public void setBaseCount(int baseCount) {
+ this.baseCount = baseCount;
+ }
+
+ /**
+ * @return the readCount
+ */
+ public int getReadCount() {
+ return readCount;
+ }
+
+ /**
+ * @param readCount the readCount to set
+ */
+ public void setReadCount(int readCount) {
+ this.readCount = readCount;
+ }
+
+ /**
+ * @return the baseSeqmentCount
+ */
+ public int getBaseSeqmentCount() {
+ return baseSeqmentCount;
+ }
+
+ /**
+ * @param baseSeqmentCount the baseSeqmentCount to set
+ */
+ public void setBaseSeqmentCount(int baseSeqmentCount) {
+ this.baseSeqmentCount = baseSeqmentCount;
+ }
+
+ /**
+ * @return the orientation
+ */
+ public String getOrientation() {
+ return orientation;
+ }
+
+ /**
+ * @param orientation the orientation to set
+ */
+ public void setOrientation(String orientation) {
+ this.orientation = orientation;
+ }
+
+ /**
+ * @return the consensusSequence
+ */
+ public String getConsensusSequence() {
+ return consensusSequence;
+ }
+
+ /**
+ * @param consensusSequence the consensusSequence to set
+ */
+ public void setConsensusSequence(String consensusSequence) {
+ this.consensusSequence = consensusSequence;
+ }
+
+ /**
+ * @return the qualityValues
+ */
+ public Vector<Integer> getQualityValues() {
+ return qualityValues;
+ }
+
+ /**
+ * @param qualityValues the qualityValues to set
+ */
+ public void setQualityValues(Vector<Integer> qualityValues) {
+ this.qualityValues = qualityValues;
+ }
+
+ /**
+ * @return the reads
+ */
+ public Vector<AceRead> getReads() {
+ return reads;
+ }
+
+ /**
+ * @param reads the reads to set
+ */
+ public void setReads(Vector<AceRead> reads) {
+ this.reads = reads;
+ }
+
+}
diff --git a/src/de/rki/ng4/surankco/data/AceRead.java b/src/de/rki/ng4/surankco/data/AceRead.java
new file mode 100755
index 0000000..53d6595
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/AceRead.java
@@ -0,0 +1,182 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.data;
+
+public class AceRead {
+
+ private int lineAF;
+ private int lineRD;
+ private int lineQA;
+
+ // AF
+ private String id;
+ private char orientation;
+ private int offset;
+
+ // RD
+ private int paddedLength;
+
+ private String paddedSequence;
+
+ // QA
+ private int alignClippingStart;
+ private int alignClippingEnd;
+
+ public AceRead(){
+ }
+
+ public void addAF(String line){
+ String[] splits = line.split(" ");
+ id = splits[1];
+ orientation = splits[2].charAt(0);
+ offset = Integer.parseInt(splits[3]);
+ }
+
+ public void addRD(String line){
+ // RD TBEOG48.y1 619 0 0
+ String[] splits = line.split(" ");
+ id = splits[1];
+ paddedLength = Integer.parseInt(splits[2]);
+ }
+
+ public void addQA(String line){
+ String[] splits = line.split(" ");
+ alignClippingStart = Integer.parseInt(splits[3]);
+ alignClippingEnd = Integer.parseInt(splits[4]);
+ }
+
+ public int getClippingLength(){
+ return (alignClippingEnd-alignClippingStart+1);
+ }
+
+ /**
+ * @return the lineRD
+ */
+ public int getLineRD() {
+ return lineRD;
+ }
+
+ /**
+ * @param lineRD the lineRD to set
+ */
+ public void setLineRD(int lineRD) {
+ this.lineRD = lineRD;
+ }
+
+ /**
+ * @return the id
+ */
+ public String getId() {
+ return id;
+ }
+
+ /**
+ * @param id the id to set
+ */
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ /**
+ * @return the orientation
+ */
+ public char getOrientation() {
+ return orientation;
+ }
+
+ /**
+ * @param orientation the orientation to set
+ */
+ public void setOrientation(char orientation) {
+ this.orientation = orientation;
+ }
+
+ /**
+ * @return the paddedSequence
+ */
+ public String getPaddedSequence() {
+ return paddedSequence;
+ }
+
+ /**
+ * @param paddedSequence the paddedSequence to set
+ */
+ public void setPaddedSequence(String paddedSequence) {
+ this.paddedSequence = paddedSequence;
+ }
+
+ /**
+ * @return the offset
+ */
+ public int getOffset() {
+ return offset;
+ }
+
+ /**
+ * @param offset the offset to set
+ */
+ public void setOffset(int offset) {
+ this.offset = offset;
+ }
+
+ /**
+ * @return the paddedLength
+ */
+ public int getPaddedLength() {
+ return paddedLength;
+ }
+
+ /**
+ * @param paddedLength the paddedLength to set
+ */
+ public void setPaddedLength(int paddedLength) {
+ this.paddedLength = paddedLength;
+ }
+
+ /**
+ * @return the alignClippingStart
+ */
+ public int getAlignClippingStart() {
+ return alignClippingStart;
+ }
+
+ /**
+ * @param alignClippingStart the alignClippingStart to set
+ */
+ public void setAlignClippingStart(int alignClippingStart) {
+ this.alignClippingStart = alignClippingStart;
+ }
+
+ /**
+ * @return the alignClippingEnd
+ */
+ public int getAlignClippingEnd() {
+ return alignClippingEnd;
+ }
+
+ /**
+ * @param alignClippingEnd the alignClippingEnd to set
+ */
+ public void setAlignClippingEnd(int alignClippingEnd) {
+ this.alignClippingEnd = alignClippingEnd;
+ }
+
+ public int getLineAF() {
+ return lineAF;
+ }
+
+ public void setLineAF(int lineAF) {
+ this.lineAF = lineAF;
+ }
+
+ public int getLineQA() {
+ return lineQA;
+ }
+
+ public void setLineQA(int lineQA) {
+ this.lineQA = lineQA;
+ }
+}
\ No newline at end of file
diff --git a/src/de/rki/ng4/surankco/data/Assembly.java b/src/de/rki/ng4/surankco/data/Assembly.java
new file mode 100755
index 0000000..59db0e7
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/Assembly.java
@@ -0,0 +1,30 @@
+package de.rki.ng4.surankco.data;
+
+import java.util.Iterator;
+import java.util.Vector;
+
+public abstract class Assembly {
+
+ protected String contigFile;
+ protected String readFile;
+
+ protected Vector<Contig> contigs;
+
+ public abstract Vector<Contig> getContigs();
+
+ public abstract Vector<Vector<Integer>> getPooledReadQualities(
+ Vector<Vector<String>> readIds);
+
+ public abstract Vector<Vector<Integer>> getReadLengths(Vector<Vector<String>> readIds);
+
+ public void lengthFilter(int minContigSize) {
+// int counter = 0;
+ for (Iterator<Contig> iterator = contigs.iterator(); iterator.hasNext(); ){
+ if (iterator.next().getLength() < minContigSize){
+ iterator.remove();
+// counter++;
+ }
+ }
+// System.out.println("removed " + counter + " small contigs");
+ }
+}
diff --git a/src/de/rki/ng4/surankco/data/AssemblyAce.java b/src/de/rki/ng4/surankco/data/AssemblyAce.java
new file mode 100755
index 0000000..7a3c5ea
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/AssemblyAce.java
@@ -0,0 +1,162 @@
+package de.rki.ng4.surankco.data;
+
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.Assembly;
+import de.rki.ng4.surankco.files.Ace;
+
+public class AssemblyAce extends Assembly {
+
+ private FastqReads reads;
+
+ public AssemblyAce(String aceFileName, String readFileName, String fastqParameter){
+ contigs = new Vector<Contig>();
+
+ contigFile = aceFileName;
+ readFile = readFileName;
+
+ Ace aceIn = new Ace(aceFileName);
+ aceIn.readConsensusSequences();
+ aceIn.readQualityValues();
+
+ reads = new FastqReads(readFileName, fastqParameter);
+
+ convertContigs(aceIn);
+ }
+
+ /*
+ * convert Ace Contigs & Fastq reads to internal Contigs
+ */
+ private void convertContigs(Ace aceIn){
+ Contig contig;
+ Read read;
+ Vector<Read> reads;
+ for (AceContig aceContig : aceIn.getContigs()){
+ contig = new Contig(aceContig.getId());
+ contig.setLength(aceContig.getQualityValues().size());
+ contig.setBaseCount(aceContig.getBaseCount());
+ contig.setReadCount(aceContig.getReadCount());
+ contig.setBaseSeqmentCount(aceContig.getBaseSeqmentCount());
+ contig.setConsensusSequence(aceContig.getConsensusSequence());
+ contig.setQualityValues(aceContig.getQualityValues());
+
+ contig.setNucleotideCount(countNucleotides(aceContig));
+
+ reads = new Vector<Read>();
+ for (AceRead aceRead : aceContig.getReads()){
+ read = new Read(aceRead.getId());
+ read.setOrientation(aceRead.getOrientation());
+ read.setOffset(aceRead.getOffset());
+ read.setPaddedLength(aceRead.getPaddedLength());
+// read.setPaddedSequence(aceRead.getPaddedSequence());
+ read.setAlignClippingStart(aceRead.getAlignClippingStart());
+ read.setAlignClippingEnd(aceRead.getAlignClippingEnd());
+ reads.add(read);
+ }
+ contig.setReads(reads);
+ contigs.add(contig);
+ }
+ }
+
+ private int[][] countNucleotides(AceContig c){
+ int[][] nucleotides = new int[c.getConsensusSequence().length()][6];
+
+ String readSeq;
+ int offset, contigPos;
+ boolean clipped;
+ for (AceRead read : c.getReads()){
+ readSeq = read.getPaddedSequence().toUpperCase();
+ offset = read.getOffset();
+ clipped = false;
+ for (int readPos=(read.getAlignClippingStart()-1);
+ readPos<read.getAlignClippingEnd(); readPos++){
+ contigPos = readPos + offset - 1;
+
+ // workaround for reads out of contig range
+ if ((contigPos < 0) || (contigPos >= nucleotides.length)){
+ clipped = true;
+ }
+ else{
+
+ switch (readSeq.charAt(readPos)){
+ case 'A':
+ nucleotides[contigPos][0]++;
+ break;
+ case 'C':
+ nucleotides[contigPos][1]++;
+ break;
+ case 'G':
+ nucleotides[contigPos][2]++;
+ break;
+ case 'T':
+ nucleotides[contigPos][3]++;
+ break;
+ case 'N':
+ nucleotides[contigPos][4]++;
+ break;
+ case '*':
+ nucleotides[contigPos][5]++;
+ break;
+ default:
+ }
+
+ }
+ }
+ if (clipped) System.out.println("warning: clipped read " + read.getId());
+ }
+
+ return nucleotides;
+ }
+
+ @Override
+ public Vector<Contig> getContigs() {
+ return contigs;
+ }
+
+ @Override
+ public Vector<Vector<Integer>> getPooledReadQualities(
+ Vector<Vector<String>> readIds) {
+ Vector<Vector<Integer>> readQualities = reads.getPooledReadQualities(readIds);
+ reads.readMatchCheck(readQualities, getIds(), contigFile, readFile);
+ return readQualities;
+ }
+
+ @Override
+ public Vector<Vector<Integer>> getReadLengths(Vector<Vector<String>> readIds) {
+ Vector<Vector<Integer>> readLengths = reads.getReadLengths(readIds);
+ reads.readMatchCheck(readLengths, getIds(), contigFile, readFile);
+ return readLengths;
+ }
+
+ public Vector<String> getIds(){
+ Vector<String> ids = new Vector<String>();
+ for (Contig c : contigs){
+ ids.add(c.getId());
+ }
+ return ids;
+ }
+
+ @SuppressWarnings("unused")
+ private Vector<Vector<Character>> buildAlignment(AceContig c){
+ Vector<Vector<Character>> alignment = new Vector<Vector<Character>>();
+
+ for (Character ch : c.getConsensusSequence().toCharArray()){
+ alignment.add(new Vector<Character>());
+ alignment.lastElement().add(ch);
+ }
+
+ int offset;
+ int contigPos;
+
+ for (AceRead read : c.getReads()){
+ offset = read.getOffset();
+ for (int readPos=(read.getAlignClippingStart()-1);
+ readPos<read.getAlignClippingEnd(); readPos++){
+ contigPos = readPos + offset - 1;
+ alignment.get(contigPos).add(read.getPaddedSequence().charAt(readPos));
+ }
+ }
+
+ return alignment;
+ }
+}
diff --git a/src/de/rki/ng4/surankco/data/AssemblySam.java b/src/de/rki/ng4/surankco/data/AssemblySam.java
new file mode 100755
index 0000000..3d98888
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/AssemblySam.java
@@ -0,0 +1,174 @@
+package de.rki.ng4.surankco.data;
+
+import htsjdk.samtools.SAMRecord;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.files.Fasta;
+import de.rki.ng4.surankco.files.Sam;
+
+public class AssemblySam extends Assembly {
+
+ public static void main(String[] args){
+ String samfile = "data/ex1.sam";
+ String fastafile = "data/ex1.fa";
+ AssemblySam test = new AssemblySam(fastafile, samfile);
+ for (Contig contig : test.getContigs()){
+ System.out.println(contig.getId());
+ }
+ }
+
+ public AssemblySam(String fastaFileName, String samFileName){
+ contigs = new Vector<Contig>();
+ contigFile = fastaFileName;
+ readFile = samFileName;
+
+ Fasta fasta = new Fasta(fastaFileName);
+ Vector<FastaContig> fastaContigs = fasta.parseFasta();
+
+ // for (FastaContig contig : fastaContigs){
+ // System.out.println(contig.getId() + " (length=" + contig.getSequence().length() + ")");
+ // System.out.println(contig.getSequence());
+ // }
+
+ Sam sam = new Sam(samFileName);
+ HashMap<String, SamContig> samContigs = sam.getContigs();
+
+ convertContigs(fastaContigs, samContigs);
+
+ // for (SamContig contig : samContigs.values()){
+ // System.out.println(contig.getId());
+ // }
+ }
+
+ /*
+ * convert Fasta Contigs & Sam Reads to internal Contigs
+ */
+ private void convertContigs(Vector<FastaContig> fastaContigs, HashMap<String, SamContig> samContigs){
+ SamContig samContig;
+ Contig contig;
+ Read read;
+ Vector<Read> reads;
+ for (FastaContig fastaContig : fastaContigs){
+ if (samContigs.containsKey(fastaContig.getId())){
+ samContig = samContigs.get(fastaContig.getId());
+
+ contig = new Contig(fastaContig.getId());
+
+ contig.setLength(fastaContig.getSequence().length());
+ contig.setConsensusSequence(fastaContig.getSequence());
+
+ contig.setReadCount(samContig.getReads().size());
+
+ contig.setNucleotideCount(countNucleotides(samContig.getAlignment()));
+
+ // contig.setBaseCount(fastaContig.getSequence().length());
+ // contig.setBaseSeqmentCount(fastaContig.getBaseSeqmentCount());
+ // contig.setQualityValues(fastaContig.getQualityValues());
+
+ reads = new Vector<Read>();
+ for (SAMRecord samRead : samContig.getReads()){
+ read = new Read(samRead.getReadName());
+ read.setOffset(samRead.getUnclippedStart());
+ read.setAlignClippingStart(samRead.getAlignmentStart() - samRead.getUnclippedStart() + 1);
+ read.setAlignClippingEnd(samRead.getAlignmentEnd() - samRead.getUnclippedEnd() + 1);
+ // read.setPaddedSequence(samRead.getReadString()); // sam read sequence is not padded
+
+ if (samRead.getReadNegativeStrandFlag()) read.setOrientation('C');
+ else read.setOrientation('U');
+
+ read.setQualityValues(parsePhred(samRead.getBaseQualityString()));
+
+ // not sure if correct
+ read.setPaddedLength(samRead.getCigar().getReferenceLength());
+
+ reads.add(read);
+ }
+ contig.setReads(reads);
+ contigs.add(contig);
+ }
+ else {
+ System.out.println("warning: skipped contig " + fastaContig.getId() +
+ " (no reads in sam file)");
+ }
+ }
+ }
+
+ private int[][] countNucleotides(ArrayList<ArrayList<Byte>> alignment) {
+ int[][] nucleotides = new int[alignment.size()][6];
+
+ for (int i=0; i<alignment.size(); i++){
+ for (Byte nucleotide : alignment.get(i)){
+ switch ((char) nucleotide.byteValue()){
+ case 'A':
+ nucleotides[i][0]++;
+ break;
+ case 'C':
+ nucleotides[i][1]++;
+ break;
+ case 'G':
+ nucleotides[i][2]++;
+ break;
+ case 'T':
+ nucleotides[i][3]++;
+ break;
+ case 'N':
+ nucleotides[i][4]++;
+ break;
+ case '*':
+ nucleotides[i][5]++;
+ break;
+ default:
+ }
+ }
+ }
+
+ return nucleotides;
+ }
+
+ private Vector<Integer> parsePhred(String charQualities){
+ Vector<Integer> intQualities = new Vector<Integer>();
+ for (char qual : charQualities.toCharArray()){
+ intQualities.add((int) qual - 33);
+ }
+ return intQualities;
+ }
+
+ @Override
+ public Vector<Contig> getContigs() {
+ return contigs;
+ }
+
+ @Override
+ public Vector<Vector<Integer>> getPooledReadQualities(Vector<Vector<String>> readIds) {
+ Vector<Vector<Integer>> pooledReadQualities = new Vector<Vector<Integer>>();
+
+ for (Contig contig : contigs){
+ Vector<Integer> readQualities = new Vector<Integer>();
+ for (Read read : contig.getReads()){
+ readQualities.addAll(read.getQualityValues());
+ }
+ pooledReadQualities.add(readQualities);
+ }
+
+ return pooledReadQualities;
+ }
+
+ @Override
+ public Vector<Vector<Integer>> getReadLengths(Vector<Vector<String>> readIds) {
+ Vector<Vector<Integer>> readLengths = new Vector<Vector<Integer>>();
+
+ for (Contig contig : contigs){
+ Vector<Integer> lengths = new Vector<Integer>();
+ for (Read read : contig.getReads()){
+ lengths.add(read.getQualityValues().size());
+ }
+ readLengths.add(lengths);
+ }
+
+ return readLengths;
+ }
+
+}
diff --git a/src/de/rki/ng4/surankco/data/Contig.java b/src/de/rki/ng4/surankco/data/Contig.java
new file mode 100755
index 0000000..68f51e1
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/Contig.java
@@ -0,0 +1,100 @@
+package de.rki.ng4.surankco.data;
+
+import java.util.Vector;
+
+/*
+ * Addapter Class for different Contig data, e.g. from Ace, Fasta, ...
+ */
+public class Contig {
+
+ private String id;
+ private int length;
+ private int baseCount;
+ private int readCount;
+ private int baseSeqmentCount;
+
+ private String consensusSequence;
+ private Vector<Integer> qualityValues;
+ private int[][] nucleotideCounts;
+
+ private Vector<Read> reads;
+
+
+ public Contig(String id) {
+ this.id = id;
+ }
+
+ // getter
+ public String getId() {
+ return id;
+ }
+
+ public Integer getLength() {
+ return length;
+ }
+
+ public Integer getBaseCount() {
+ return baseCount;
+ }
+
+ public Integer getReadCount() {
+ return readCount;
+ }
+
+ public Integer getBaseSeqmentCount() {
+ return baseSeqmentCount;
+ }
+
+ public Vector<Read> getReads() {
+ return reads;
+ }
+
+ public String getConsensusSequence() {
+ return consensusSequence;
+ }
+
+ public Vector<Integer> getQualityValues() {
+ return qualityValues;
+ }
+
+ public int[][] getNucleotideCount() {
+ return nucleotideCounts;
+ }
+
+ // setter
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public void setLength(int length) {
+ this.length = length;
+ }
+
+ public void setBaseCount(int baseCount) {
+ this.baseCount = baseCount;
+ }
+
+ public void setReadCount(int readCount) {
+ this.readCount = readCount;
+ }
+
+ public void setBaseSeqmentCount(int baseSeqmentCount) {
+ this.baseSeqmentCount = baseSeqmentCount;
+ }
+
+ public void setConsensusSequence(String consensusSequence) {
+ this.consensusSequence = consensusSequence;
+ }
+
+ public void setQualityValues(Vector<Integer> qualityValues) {
+ this.qualityValues = qualityValues;
+ }
+
+ public void setNucleotideCount(int[][] nucleotideCount) {
+ this.nucleotideCounts = nucleotideCount;
+ }
+
+ public void setReads(Vector<Read> reads) {
+ this.reads = reads;
+ }
+}
diff --git a/src/de/rki/ng4/surankco/data/FastaContig.java b/src/de/rki/ng4/surankco/data/FastaContig.java
new file mode 100755
index 0000000..024cefc
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/FastaContig.java
@@ -0,0 +1,24 @@
+package de.rki.ng4.surankco.data;
+
+public class FastaContig {
+
+ private String id;
+ private StringBuffer sequence;
+
+ public FastaContig(String header){
+ this.id = header.substring(1).split(" ")[0];
+ sequence = new StringBuffer();
+ }
+
+ public void appendSequence(String sequence){
+ this.sequence.append(sequence);
+ }
+
+ public String getSequence(){
+ return sequence.toString();
+ }
+
+ public String getId() {
+ return id;
+ }
+}
diff --git a/src/de/rki/ng4/surankco/data/FastqReads.java b/src/de/rki/ng4/surankco/data/FastqReads.java
new file mode 100755
index 0000000..5540ded
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/FastqReads.java
@@ -0,0 +1,109 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.data;
+
+import java.util.HashMap;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.feature.Main;
+import de.rki.ng4.surankco.files.Fastq;
+import de.rki.ng4.surankco.files.Qual;
+
+public class FastqReads {
+
+ public Vector<String> readId;
+ public Vector<Vector<Integer>> readQuality;
+
+ HashMap<String,Integer> lengthMap;
+ HashMap<String,Vector<Integer>> qualityMap;
+
+ public FastqReads(String filename, String parameter) {
+ readId = new Vector<String>();
+ readQuality = new Vector<Vector<Integer>>();
+
+ String readfile_suffix = filename.substring(filename.lastIndexOf(".")).toLowerCase();
+
+ if (readfile_suffix.matches(".qual") || readfile_suffix.matches(".qua") ){
+ new Qual(filename, this);
+ }
+ else if(readfile_suffix.matches(".fastq")){
+ new Fastq(filename, this, parameter);
+ }
+
+ if (Main.DEBUGMODE){
+ int readnumber = Math.min(readId.size(), 5);
+ System.out.println("read quality control view (" + readnumber + " of " + readId.size() + " reads)");
+ for (int i=0; i<readnumber; i++){
+ System.out.println(readId.get(i));
+ System.out.println(readQuality.get(i));
+ }
+ }
+
+ calcQualityMap();
+ }
+
+
+ private void calcQualityMap(){
+ qualityMap = new HashMap<String, Vector<Integer>>();
+ for (int i=0; i<readId.size(); i++){
+ qualityMap.put(readId.get(i), readQuality.get(i));
+ }
+ }
+
+ public Vector<Vector<Integer>> getPooledReadQualities(Vector<Vector<String>> contigReads){
+
+ Vector<Vector<Integer>> qualities = new Vector<Vector<Integer>>();
+
+ for (Vector<String> vs : contigReads){
+ qualities.add(new Vector<Integer>());
+ for (String s : vs){
+ try {
+ qualities.lastElement().addAll(qualityMap.get(s));
+ }
+ catch (NullPointerException e){
+// e.printStackTrace();
+ System.out.println("\tread missing (check split.regex): " + s);
+ }
+ }
+ }
+ return qualities;
+ }
+
+ public Vector<Vector<Integer>> getReadLengths(Vector<Vector<String>> contigReads){
+ Vector<Vector<Integer>> lengths = new Vector<Vector<Integer>>();
+ for (Vector<String> vs : contigReads){
+ lengths.add(new Vector<Integer>());
+ for (String s : vs){
+ try {
+ lengths.lastElement().add(qualityMap.get(s).size());
+ }
+ catch (NullPointerException e){
+// e.printStackTrace();
+ System.out.println("\tread missing (check split.regex): " + s);
+ }
+ }
+ }
+ return lengths;
+ }
+
+ public void readMatchCheck(Vector<Vector<Integer>> readMapping, Vector<String> contigId, String contigfile, String readfile){
+ Vector<String> contigsWithNoMatches = new Vector<String>();
+
+ for (int i=0; i<readMapping.size(); i++){
+ if (readMapping.get(i).size()==0){
+ contigsWithNoMatches.add(contigId.get(i));
+ }
+ }
+
+ if (contigsWithNoMatches.size() > 0){
+ StringBuffer output = new StringBuffer();
+ output.append("error: no reads matching in " + contigfile + " and " + readfile + " for contigs:\n");
+ output.append(contigsWithNoMatches.toString());
+ System.out.println(output.toString());
+ System.exit(1);
+ }
+ }
+}
diff --git a/src/de/rki/ng4/surankco/data/Read.java b/src/de/rki/ng4/surankco/data/Read.java
new file mode 100755
index 0000000..2876c7a
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/Read.java
@@ -0,0 +1,91 @@
+package de.rki.ng4.surankco.data;
+
+import java.util.Vector;
+
+public class Read {
+
+ private String id;
+ private char orientation;
+ private int offset;
+
+ private int paddedLength;
+// private String paddedSequence; // not need anymore, nucleotide counts
+
+ private int alignClippingStart;
+ private int alignClippingEnd;
+
+ private Vector<Integer> qualityValues;
+
+ public Read(String id){
+ this.id = id;
+ }
+
+ public Integer getClippingLength() {
+ return (alignClippingEnd-alignClippingStart+1);
+ }
+
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public char getOrientation() {
+ return orientation;
+ }
+
+ public void setOrientation(char orientation) {
+ this.orientation = orientation;
+ }
+
+ public int getOffset() {
+ return offset;
+ }
+
+ public void setOffset(int offset) {
+ this.offset = offset;
+ }
+
+ public int getPaddedLength() {
+ return paddedLength;
+ }
+
+ public void setPaddedLength(int paddedLength) {
+ this.paddedLength = paddedLength;
+ }
+
+// public String getPaddedSequence() {
+// return paddedSequence;
+// }
+
+// public void setPaddedSequence(String paddedSequence) {
+// this.paddedSequence = paddedSequence;
+// }
+
+ public int getAlignClippingStart() {
+ return alignClippingStart;
+ }
+
+ public void setAlignClippingStart(int alignClippingStart) {
+ this.alignClippingStart = alignClippingStart;
+ }
+
+ public int getAlignClippingEnd() {
+ return alignClippingEnd;
+ }
+
+ public void setAlignClippingEnd(int alignClippingEnd) {
+ this.alignClippingEnd = alignClippingEnd;
+ }
+
+ public Vector<Integer> getQualityValues() {
+ return qualityValues;
+ }
+
+ public void setQualityValues(Vector<Integer> qualityValues) {
+ this.qualityValues = qualityValues;
+ }
+}
diff --git a/src/de/rki/ng4/surankco/data/Reads.java b/src/de/rki/ng4/surankco/data/Reads.java
new file mode 100755
index 0000000..7847eb8
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/Reads.java
@@ -0,0 +1,109 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.data;
+
+import java.util.HashMap;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.feature.Main;
+import de.rki.ng4.surankco.files.Fastq;
+import de.rki.ng4.surankco.files.Qual;
+
+public class Reads {
+
+ public Vector<String> readId;
+ public Vector<Vector<Integer>> readQuality;
+
+ HashMap<String,Integer> lengthMap;
+ HashMap<String,Vector<Integer>> qualityMap;
+
+ public Reads(String filename, String parameter) {
+ readId = new Vector<String>();
+ readQuality = new Vector<Vector<Integer>>();
+
+ String readfile_suffix = filename.substring(filename.lastIndexOf(".")).toLowerCase();
+
+ if (readfile_suffix.matches(".qual") || readfile_suffix.matches(".qua") ){
+ new Qual(filename, this);
+ }
+ else if(readfile_suffix.matches(".fastq")){
+ new Fastq(filename, this, parameter);
+ }
+
+ if (Main.DEBUGMODE){
+ int readnumber = Math.min(readId.size(), 5);
+ System.out.println("read quality control view (" + readnumber + " of " + readId.size() + " reads)");
+ for (int i=0; i<readnumber; i++){
+ System.out.println(readId.get(i));
+ System.out.println(readQuality.get(i));
+ }
+ }
+
+ calcQualityMap();
+ }
+
+
+ private void calcQualityMap(){
+ qualityMap = new HashMap<String, Vector<Integer>>();
+ for (int i=0; i<readId.size(); i++){
+ qualityMap.put(readId.get(i), readQuality.get(i));
+ }
+ }
+
+ public Vector<Vector<Integer>> getPooledReadQualities(Vector<Vector<String>> contigReads){
+
+ Vector<Vector<Integer>> qualities = new Vector<Vector<Integer>>();
+
+ for (Vector<String> vs : contigReads){
+ qualities.add(new Vector<Integer>());
+ for (String s : vs){
+ try {
+ qualities.lastElement().addAll(qualityMap.get(s));
+ }
+ catch (NullPointerException e){
+// e.printStackTrace();
+ System.out.println("\tread missing (check split.regex): " + s);
+ }
+ }
+ }
+ return qualities;
+ }
+
+ public Vector<Vector<Integer>> getReadLengths(Vector<Vector<String>> contigReads){
+ Vector<Vector<Integer>> lengths = new Vector<Vector<Integer>>();
+ for (Vector<String> vs : contigReads){
+ lengths.add(new Vector<Integer>());
+ for (String s : vs){
+ try {
+ lengths.lastElement().add(qualityMap.get(s).size());
+ }
+ catch (NullPointerException e){
+// e.printStackTrace();
+ System.out.println("\tread missing (check split.regex): " + s);
+ }
+ }
+ }
+ return lengths;
+ }
+
+ public void readMatchCheck(Vector<Vector<Integer>> readMapping, Vector<String> contigId, String contigfile, String readfile){
+ Vector<String> contigsWithNoMatches = new Vector<String>();
+
+ for (int i=0; i<readMapping.size(); i++){
+ if (readMapping.get(i).size()==0){
+ contigsWithNoMatches.add(contigId.get(i));
+ }
+ }
+
+ if (contigsWithNoMatches.size() > 0){
+ StringBuffer output = new StringBuffer();
+ output.append("error: no reads matching in " + contigfile + " and " + readfile + " for contigs:\n");
+ output.append(contigsWithNoMatches.toString());
+ System.out.println(output.toString());
+ System.exit(1);
+ }
+ }
+}
diff --git a/src/de/rki/ng4/surankco/data/SamContig.java b/src/de/rki/ng4/surankco/data/SamContig.java
new file mode 100755
index 0000000..39a70d9
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/SamContig.java
@@ -0,0 +1,43 @@
+package de.rki.ng4.surankco.data;
+
+import htsjdk.samtools.SAMRecord;
+
+import java.util.ArrayList;
+
+public class SamContig {
+
+ private String id;
+
+ private ArrayList<SAMRecord> reads;
+ private ArrayList<ArrayList<Byte>> alignment;
+
+ public SamContig(String id){
+ this.id = id;
+ reads = new ArrayList<SAMRecord>();
+ alignment = new ArrayList<ArrayList<Byte>>();
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public ArrayList<SAMRecord> getReads() {
+ return reads;
+ }
+
+ public void addRead(SAMRecord read) {
+ this.reads.add(read);
+ }
+
+ public ArrayList<ArrayList<Byte>> getAlignment() {
+ return alignment;
+ }
+
+ public void addAlignmentColumn(ArrayList<Byte> alignmentColumn) {
+ this.alignment.add(alignmentColumn);
+ }
+}
diff --git a/src/de/rki/ng4/surankco/feature/Coverage.java b/src/de/rki/ng4/surankco/feature/Coverage.java
new file mode 100755
index 0000000..f2ffdf2
--- /dev/null
+++ b/src/de/rki/ng4/surankco/feature/Coverage.java
@@ -0,0 +1,166 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.feature;
+
+import java.util.Vector;
+
+import de.rki.ng4.surankco.tools.Statistic;
+
+
+
+public class Coverage {
+
+ private Vector<Vector<Integer>> coverages;
+
+ private Vector<Vector<Integer>> coverageLeftEnd;
+ private Vector<Vector<Integer>> coverageRightEnd;
+
+ private Vector<Double> coverageMinEndMean;
+ private Vector<Double> coverageMinEndSD;
+ private Vector<Double> coverageMinEndMedian;
+ private Vector<Double> coverageMinEndMAD;
+ private Vector<Double> coverageMinEndMin;
+ private Vector<Double> coverageMinEndMax;
+
+ private Vector<Double> coverageMaxEndMean;
+ private Vector<Double> coverageMaxEndSD;
+ private Vector<Double> coverageMaxEndMedian;
+ private Vector<Double> coverageMaxEndMAD;
+ private Vector<Double> coverageMaxEndMin;
+ private Vector<Double> coverageMaxEndMax;
+
+ boolean calced = false;
+
+ public Coverage(Vector<Vector<Integer>> coverages){
+ this.coverages = coverages;
+ }
+
+ public boolean calcEndCovs(Vector<Double> endSize){
+ boolean sizeCheck = endSize.size() == coverages.size();
+ if(sizeCheck){
+ coverageLeftEnd = new Vector<Vector<Integer>>();
+ coverageRightEnd = new Vector<Vector<Integer>>();
+ for(int i=0; i<coverages.size(); i++){
+ coverageLeftEnd.add(new Vector<Integer>(coverages.get(i).subList(0, Math.min(endSize.get(i).intValue(), coverages.get(i).size()))));
+ coverageRightEnd.add(new Vector<Integer>(coverages.get(i).subList(Math.max(0, coverages.get(i).size()-endSize.get(i).intValue()), coverages.get(i).size())));
+ }
+ }
+ calcStats();
+ return sizeCheck;
+ }
+
+ private void calcStats(){
+ coverageMinEndMean = new Vector<Double>();
+ coverageMinEndSD = new Vector<Double>();
+ coverageMinEndMedian = new Vector<Double>();
+ coverageMinEndMAD = new Vector<Double>();
+ coverageMinEndMin = new Vector<Double>();
+ coverageMinEndMax = new Vector<Double>();
+
+ coverageMaxEndMean = new Vector<Double>();
+ coverageMaxEndSD = new Vector<Double>();
+ coverageMaxEndMedian = new Vector<Double>();
+ coverageMaxEndMAD = new Vector<Double>();
+ coverageMaxEndMin = new Vector<Double>();
+ coverageMaxEndMax = new Vector<Double>();
+
+ Vector<Double> leftMean = Statistic.means(coverageLeftEnd);
+ Vector<Double> leftSD = Statistic.sds(coverageLeftEnd);
+ Vector<Double> leftMedian = Statistic.medians(coverageLeftEnd);
+ Vector<Double> leftMAD = Statistic.mads(coverageLeftEnd);
+ Vector<Double> leftMin = Statistic.mins(coverageLeftEnd);
+ Vector<Double> leftMax = Statistic.maxs(coverageLeftEnd);
+
+ Vector<Double> rightMean = Statistic.means(coverageRightEnd);
+ Vector<Double> rightSD = Statistic.sds(coverageRightEnd);
+ Vector<Double> rightMedian = Statistic.medians(coverageRightEnd);
+ Vector<Double> rightMAD = Statistic.mads(coverageRightEnd);
+ Vector<Double> rightMin = Statistic.mins(coverageRightEnd);
+ Vector<Double> rightMax = Statistic.maxs(coverageRightEnd);
+
+ for (int i=0; i<leftMean.size(); i++){
+ if (leftMean.get(i) <= rightMean.get(i)){
+ coverageMinEndMean.add(leftMean.get(i));
+ coverageMinEndSD.add(leftSD.get(i));
+ coverageMinEndMedian.add(leftMedian.get(i));
+ coverageMinEndMAD.add(leftMAD.get(i));
+ coverageMinEndMin.add(leftMin.get(i));
+ coverageMinEndMax.add(leftMax.get(i));
+
+ coverageMaxEndMean.add(rightMean.get(i));
+ coverageMaxEndSD.add(rightSD.get(i));
+ coverageMaxEndMedian.add(rightMedian.get(i));
+ coverageMaxEndMAD.add(rightMAD.get(i));
+ coverageMaxEndMin.add(rightMin.get(i));
+ coverageMaxEndMax.add(rightMax.get(i));
+ }
+ else{
+ coverageMinEndMean.add(rightMean.get(i));
+ coverageMinEndSD.add(rightSD.get(i));
+ coverageMinEndMedian.add(rightMedian.get(i));
+ coverageMinEndMAD.add(rightMAD.get(i));
+ coverageMinEndMin.add(rightMin.get(i));
+ coverageMinEndMax.add(rightMax.get(i));
+
+ coverageMaxEndMean.add(leftMean.get(i));
+ coverageMaxEndSD.add(leftSD.get(i));
+ coverageMaxEndMedian.add(leftMedian.get(i));
+ coverageMaxEndMAD.add(leftMAD.get(i));
+ coverageMaxEndMin.add(leftMin.get(i));
+ coverageMaxEndMax.add(leftMax.get(i));
+ }
+ }
+ calced = true;
+ }
+
+ public Vector<Double> getCoverageMinEndMean(){
+ return coverageMinEndMean;
+ }
+
+ public Vector<Double> getCoverageMinEndSD(){
+ return coverageMinEndSD;
+ }
+
+ public Vector<Double> getCoverageMinEndMedian(){
+ return coverageMinEndMedian;
+ }
+
+ public Vector<Double> getCoverageMinEndMAD(){
+ return coverageMinEndMAD;
+ }
+
+ public Vector<Double> getCoverageMinEndMin(){
+ return coverageMinEndMin;
+ }
+
+ public Vector<Double> getCoverageMinEndMax(){
+ return coverageMinEndMax;
+ }
+
+ public Vector<Double> getCoverageMaxEndMean(){
+ return coverageMaxEndMean;
+ }
+
+ public Vector<Double> getCoverageMaxEndSD(){
+ return coverageMaxEndSD;
+ }
+
+ public Vector<Double> getCoverageMaxEndMedian(){
+ return coverageMaxEndMedian;
+ }
+
+ public Vector<Double> getCoverageMaxEndMAD(){
+ return coverageMaxEndMAD;
+ }
+
+ public Vector<Double> getCoverageMaxEndMin(){
+ return coverageMaxEndMin;
+ }
+
+ public Vector<Double> getCoverageMaxEndMax(){
+ return coverageMaxEndMax;
+ }
+}
diff --git a/src/de/rki/ng4/surankco/feature/Kmer.java b/src/de/rki/ng4/surankco/feature/Kmer.java
new file mode 100755
index 0000000..a33876f
--- /dev/null
+++ b/src/de/rki/ng4/surankco/feature/Kmer.java
@@ -0,0 +1,279 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.feature;
+
+import java.util.HashMap;
+import java.util.Map.Entry;
+import java.util.Vector;
+
+public class Kmer {
+
+ private Vector<String> consensusSequences;
+ private int kmerSize;
+
+ private Vector<Integer> kmerEndUniquenessMin;
+ private Vector<Integer> kmerEndUniquenessMax;
+
+ public Kmer(Vector<String> conSeqs, int kmerSize){
+ this.kmerSize = kmerSize;
+ consensusSequences = new Vector<String>();
+ StringBuffer temp;
+ for (String s : conSeqs) {
+ temp = new StringBuffer();
+ for (Character ch : s.toUpperCase().toCharArray()){
+ if (ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T'){
+ temp.append(ch);
+ }
+ }
+ consensusSequences.add(temp.toString());
+ }
+ }
+
+ public Vector<Integer> getKmerGlobalUniqueness(){
+ Vector<HashMap<KmerKey,Boolean>> kmers = getKmers(kmerSize, consensusSequences);
+ Vector<Integer> uniqueKmerCount = new Vector<Integer>();
+ boolean unique;
+ int count;
+ for (int i=0; i<kmers.size(); i++){
+ count = 0;
+ for (KmerKey kmer : kmers.get(i).keySet()){
+ unique = true;
+ for (int j=0; j<kmers.size(); j++){
+ if (i!=j){
+ if (kmers.get(j).containsKey(kmer)){
+ unique = false;
+ break;
+ }
+ }
+ }
+ if (unique){
+ count++;
+ }
+ }
+ uniqueKmerCount.add(count);
+ }
+ return uniqueKmerCount;
+ }
+
+ public Vector<Integer> getKmerEndUniquenessMin(int endLength){
+ if (kmerEndUniquenessMin == null)
+ calcKmerEndUniqueness(kmerSize, endLength);
+ return kmerEndUniquenessMin;
+ }
+
+ public Vector<Integer> getKmerEndUniquenessMax(int endLength){
+ if (kmerEndUniquenessMax == null)
+ calcKmerEndUniqueness(kmerSize, endLength);
+ return kmerEndUniquenessMax;
+ }
+
+ public Vector<Integer> getKmerSize(){
+ Vector<Integer> kmerSizes = new Vector<Integer>();
+ for (int i=0; i<consensusSequences.size(); i++){
+ kmerSizes.add(kmerSize);
+ }
+ return kmerSizes;
+ }
+
+ private void calcKmerEndUniqueness(int kmerSize, int endLength){
+ Vector<HashMap<KmerKey,Boolean>> kmersLeft = getKmers(kmerSize, sequencePrefixes(endLength));
+ Vector<HashMap<KmerKey,Boolean>> kmersRight = getKmers(kmerSize, sequenceSuffixes(endLength));
+ kmerEndUniquenessMin = new Vector<Integer>();
+ kmerEndUniquenessMax = new Vector<Integer>();
+ int left, right;
+ boolean unique;
+ for (int i=0; i<kmersLeft.size(); i++){
+ left = 0;
+ for (KmerKey kmer : kmersLeft.get(i).keySet()){
+ unique = true;
+ for (int j=0; j<kmersLeft.size(); j++){
+ if (i!=j){
+ if (kmersLeft.get(j).containsKey(kmer) || kmersRight.get(j).containsKey(kmer)){
+ unique = false;
+ break;
+ }
+ }
+ }
+ if (unique){
+ left++;
+ }
+ }
+ right = 0;
+ for (KmerKey kmer : kmersRight.get(i).keySet()){
+ unique = true;
+ for (int j=0; j<kmersRight.size(); j++){
+ if (i!=j){
+ if (kmersLeft.get(j).containsKey(kmer) || kmersRight.get(j).containsKey(kmer)){
+ unique = false;
+ break;
+ }
+ }
+ }
+ if (unique){
+ right++;
+ }
+ }
+ kmerEndUniquenessMax.add(Math.max(left, right));
+ kmerEndUniquenessMin.add(Math.min(left, right));
+ }
+ }
+
+ private Vector<HashMap<KmerKey, Boolean>> getKmers(int kmerSize, Vector<String> sequences){
+ Vector<HashMap<KmerKey,Boolean>> kmers = new Vector<HashMap<KmerKey,Boolean>>();
+ for (String seq : sequences){
+ kmers.add(new HashMap<KmerKey, Boolean>());
+ String kmer;
+ for (int i=0; i<=seq.length()-kmerSize; i++){
+ kmer = seq.substring(i, i+kmerSize);
+ kmers.lastElement().put(new KmerKey(kmer), true);
+
+ }
+ }
+ return kmers;
+ }
+
+ public Vector<Vector<Double>> getKmerDistances(){
+ Vector<HashMap<KmerKey, Boolean>> kmers = getKmers(kmerSize, consensusSequences);
+ Vector<Vector<Double>> pairwiseDistances = new Vector<Vector<Double>>();
+ for (HashMap<KmerKey, Boolean> kmerMapA : kmers){
+ pairwiseDistances.add(new Vector<Double>());
+ for (HashMap<KmerKey, Boolean> kmerMapB : kmers){
+ if (!kmerMapA.equals(kmerMapB))
+ pairwiseDistances.lastElement().add(editDistance(kmerMapA,kmerMapB));
+ }
+ }
+ return pairwiseDistances;
+ }
+
+ private double editDistance(HashMap<KmerKey, Boolean> kmerMapA, HashMap<KmerKey, Boolean> kmerMapB){
+ double distance = 0;
+ double sizeA = kmerMapA.size();
+ double sizeB = 0;
+ for (Entry<KmerKey, Boolean> entry : kmerMapA.entrySet()){
+ if (kmerMapB.containsKey(entry.getKey())) sizeB++;
+ }
+ for (Entry<KmerKey, Boolean> entry : kmerMapA.entrySet()){
+ if (kmerMapB.containsKey(entry.getKey()))
+ distance += Math.abs((1/sizeA) - (1/sizeB));
+ else
+ distance += (1/sizeA);
+ }
+ return distance;
+ }
+
+ private Vector<String> sequencePrefixes(int length){
+ Vector<String> prefixes = new Vector<String>();
+ for (String s : consensusSequences){
+ prefixes.add(s.substring(0, Math.min(length, s.length())));
+ }
+ return prefixes;
+ }
+
+ private Vector<String> sequenceSuffixes(int length){
+ Vector<String> suffixes = new Vector<String>();
+ for (String s : consensusSequences){
+ suffixes.add(s.substring(Math.max(0, s.length()-length), s.length()));
+ }
+ return suffixes;
+ }
+
+ class MutableInt {
+ private int value = 0;
+ public void inc () { ++value; }
+ public int get () { return value; }
+ public String toString(){
+ return Integer.toString(value);
+ }
+ }
+
+ private class KmerKey{
+ private String kmer;
+ private String revcomp;
+
+ private int hash;
+
+ public KmerKey(String kmer){
+ this.kmer = kmer.toUpperCase();
+ this.revcomp = reverseCompl(this.kmer);
+ initHash();
+ }
+
+ @Override
+ public String toString(){
+ return kmer + "<>" + revcomp;
+ }
+
+ private String reverseCompl(String kmer){
+ StringBuffer revcomp = new StringBuffer();
+ for (Character ch : kmer.toCharArray()){
+ switch (ch){
+ case 'A':
+ revcomp.append('T');
+ break;
+ case 'C':
+ revcomp.append('G');
+ break;
+ case 'G':
+ revcomp.append('C');
+ break;
+ case 'T':
+ revcomp.append('A');
+ break;
+ default:
+ revcomp.append(ch);
+ }
+ }
+ return revcomp.reverse().toString();
+ }
+
+ @Override
+ public boolean equals(Object obj){
+ if(obj == this) {
+ return true;
+ }
+ if(!(obj instanceof KmerKey)) {
+ return false;
+ }
+
+ KmerKey kk = (KmerKey) obj;
+ return (this.kmer.matches(kk.kmer) || this.kmer.matches(kk.revcomp));
+ }
+
+ @Override
+ public int hashCode(){
+ return hash;
+ }
+
+ private void initHash(){
+ hash = Math.max(hash(kmer), hash(revcomp));
+ }
+
+ private int hash(String s){
+ int hash = 0;
+ for (Character ch : s.toCharArray()){
+ switch (ch){
+ case 'A':
+ hash = hash << 2;
+ hash = hash | 0;
+ break;
+ case 'C':
+ hash = hash << 2;
+ hash = hash | 1;
+ break;
+ case 'G':
+ hash = hash << 2;
+ hash = hash | 2;
+ break;
+ case 'T':
+ hash = hash << 2;
+ hash = hash | 3;
+ break;
+ }
+ }
+ return hash;
+ }
+ }
+}
diff --git a/src/de/rki/ng4/surankco/feature/Main.java b/src/de/rki/ng4/surankco/feature/Main.java
new file mode 100755
index 0000000..f8edc50
--- /dev/null
+++ b/src/de/rki/ng4/surankco/feature/Main.java
@@ -0,0 +1,257 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.feature;
+
+import java.util.Arrays;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.Assembly;
+import de.rki.ng4.surankco.data.AssemblyAce;
+import de.rki.ng4.surankco.data.AssemblySam;
+import de.rki.ng4.surankco.files.ExportR;
+import de.rki.ng4.surankco.tools.Statistic;
+
+/**
+ * Part of the feature generation/calculation of SuRankCo
+ * Main procedure, initializes file imports, features extractions and file exports
+ *
+ * @author Mathias Kuhring
+ *
+ */
+public class Main {
+
+ // debugging
+ public final static boolean DEBUGMODE = false;
+
+ // simple timer use with timerStart() and timerStop()
+ public static long timer;
+
+
+ /**
+ *
+ *
+ * @param args Arguments and their order are described in the surankco-feature R-script
+ */
+ public static void main(String[] args) {
+
+ if (Main.DEBUGMODE){ System.out.println(Arrays.toString(args)); }
+
+ // prepare export object to collect features
+ ExportR e = new ExportR();
+
+ // import ace file and read qualities
+ if (Main.DEBUGMODE){ System.out.println("-> import assembly"); }
+ String contigfile = args[0];
+ String readfile = args[1];
+ String fastqParameter = args[5];
+
+ boolean ace = contigfile.endsWith("ace");
+ Assembly assembly;
+ if (ace){
+ assembly = new AssemblyAce(contigfile, readfile, fastqParameter);
+ }
+ else{
+ assembly = new AssemblySam(contigfile, readfile);
+ }
+ System.gc();
+
+ // contig lenght filter
+ assembly.lengthFilter(Integer.parseInt(args[8]));
+
+ // calculate several features
+ Several features = new Several(assembly.getContigs());
+
+ e.addStringVariable("ContigID", features.getIds());
+ e.addIntegerVariable("Length", features.getContigLength());
+ if (ace){ // !!! no BaseCount and BaseSegmentCount for sam contigs yet, because no padded sequence available yet
+ e.addIntegerVariable("BaseCount", features.getBaseCounts());
+ e.addIntegerVariable("BaseSegmentCount", features.getBaseSeqmentCounts());
+ }
+ e.addIntegerVariable("ReadCount", features.getReadCounts());
+ e.addDoubleVariable("ReadComplementFraction", features.getReadComplementFraction());
+
+ if (Main.DEBUGMODE){ System.out.println("-> contig qualities"); }
+
+ if (ace){ // !!! no quality values for sam contigs yet
+ Vector<Vector<Integer>> contigQualities = features.getQualityValues();
+ e.addDoubleVariable("ContigQualitiesMean", Statistic.means(contigQualities));
+ e.addDoubleVariable("ContigQualitiesSD", Statistic.sds(contigQualities));
+ e.addDoubleVariable("ContigQualitiesMedian", Statistic.medians(contigQualities));
+ e.addDoubleVariable("ContigQualitiesMAD", Statistic.mads(contigQualities));
+ e.addDoubleVariable("ContigQualitiesMin", Statistic.mins(contigQualities));
+ e.addDoubleVariable("ContigQualitiesMax", Statistic.maxs(contigQualities));
+ }
+
+ System.gc();
+
+ // calculate read quality features
+ if (Main.DEBUGMODE){ System.out.println("-> read qualities"); }
+
+ String regex = args[6];
+
+ Vector<Vector<String>> readIds = features.getReadIds(regex);
+ Vector<Vector<Integer>> readQualities = assembly.getPooledReadQualities(readIds);
+
+ e.addDoubleVariable("ReadQualitiesMean", Statistic.means(readQualities));
+ e.addDoubleVariable("ReadQualitiesSD", Statistic.sds(readQualities));
+ e.addDoubleVariable("ReadQualitiesMedian", Statistic.medians(readQualities));
+ e.addDoubleVariable("ReadQualitiesMAD", Statistic.mads(readQualities));
+ e.addDoubleVariable("ReadQualitiesMin", Statistic.mins(readQualities));
+ e.addDoubleVariable("ReadQualitiesMax", Statistic.maxs(readQualities));
+
+ if (Main.DEBUGMODE){ System.out.println("-> read lengths"); }
+
+ Vector<Vector<Integer>> readLengths = assembly.getReadLengths(readIds);
+
+ Vector<Double> readLengthsMean = Statistic.means(readLengths);
+ e.addDoubleVariable("ReadLengthsMean", readLengthsMean);
+ e.addDoubleVariable("ReadLengthsSD", Statistic.sds(readLengths));
+ e.addDoubleVariable("ReadLengthsMedian", Statistic.medians(readLengths));
+ e.addDoubleVariable("ReadLengthsMAD", Statistic.mads(readLengths));
+ e.addDoubleVariable("ReadLengthsMin", Statistic.mins(readLengths));
+ e.addDoubleVariable("ReadLengthsMax", Statistic.maxs(readLengths));
+
+ assembly = null;
+ System.gc();
+
+ // calculate read length features
+ if (Main.DEBUGMODE){ System.out.println("-> read lengths padded"); }
+ Vector<Vector<Integer>> readPaddedLengths = features.getReadPaddedLengths();
+ e.addDoubleVariable("ReadLengthsPaddedMean", Statistic.means(readPaddedLengths));
+ e.addDoubleVariable("ReadLengthsPaddedSD", Statistic.sds(readPaddedLengths));
+ e.addDoubleVariable("ReadLengthsPaddedMedian", Statistic.medians(readPaddedLengths));
+ e.addDoubleVariable("ReadLengthsPaddedMAD", Statistic.mads(readPaddedLengths));
+ e.addDoubleVariable("ReadLengthsPaddedMin", Statistic.mins(readPaddedLengths));
+ e.addDoubleVariable("ReadLengthsPaddedMax", Statistic.maxs(readPaddedLengths));
+
+ if (Main.DEBUGMODE){ System.out.println("-> read lengths clipping"); }
+ Vector<Vector<Integer>> readClippingLengths = features.getReadClippingLengths();
+ e.addDoubleVariable("ReadLengthsClippingMean", Statistic.means(readClippingLengths));
+ e.addDoubleVariable("ReadLengthsClippingSD", Statistic.sds(readClippingLengths));
+ e.addDoubleVariable("ReadLengthsClippingMedian", Statistic.medians(readClippingLengths));
+ e.addDoubleVariable("ReadLengthsClippingMAD", Statistic.mads(readClippingLengths));
+ e.addDoubleVariable("ReadLengthsClippingMin", Statistic.mins(readClippingLengths));
+ e.addDoubleVariable("ReadLengthsClippingMax", Statistic.maxs(readClippingLengths));
+
+ if (Main.DEBUGMODE){ System.out.println("-> gc-content"); }
+ e.addDoubleVariable("GcFraction", features.getGcContent());
+
+ System.gc();
+
+ // calculate coverage features
+ if (Main.DEBUGMODE){ System.out.println("-> coverage"); timerStart(); }
+
+ Vector<Vector<Integer>> coverageGlobal = features.getGlobalCoverages();
+ e.addDoubleVariable("CoverageGlobalMean", Statistic.means(coverageGlobal));
+ e.addDoubleVariable("CoverageGlobalSD", Statistic.sds(coverageGlobal));
+ e.addDoubleVariable("CoverageGlobalMedian", Statistic.medians(coverageGlobal));
+ e.addDoubleVariable("CoverageGlobalMAD", Statistic.mads(coverageGlobal));
+ e.addDoubleVariable("CoverageGlobalMin", Statistic.mins(coverageGlobal));
+ e.addDoubleVariable("CoverageGlobalMax", Statistic.maxs(coverageGlobal));
+ if (Main.DEBUGMODE){ timerStop(); }
+
+ if (Main.DEBUGMODE){ System.out.println("-> coverage 2"); timerStart(); }
+ Coverage cov = new Coverage(coverageGlobal);
+ if (cov.calcEndCovs(readLengthsMean)){
+ e.addDoubleVariable("CoverageMinEndMean", cov.getCoverageMinEndMean());
+ e.addDoubleVariable("CoverageMinEndSD", cov.getCoverageMinEndSD());
+ e.addDoubleVariable("CoverageMinEndMedian", cov.getCoverageMinEndMedian());
+ e.addDoubleVariable("CoverageMinEndMAD", cov.getCoverageMinEndMAD());
+ e.addDoubleVariable("CoverageMinEndMin", cov.getCoverageMinEndMin());
+ e.addDoubleVariable("CoverageMinEndMax", cov.getCoverageMinEndMax());
+ e.addDoubleVariable("CoverageMaxEndMean", cov.getCoverageMaxEndMean());
+ e.addDoubleVariable("CoverageMaxEndSD", cov.getCoverageMaxEndSD());
+ e.addDoubleVariable("CoverageMaxEndMedian", cov.getCoverageMaxEndMedian());
+ e.addDoubleVariable("CoverageMaxEndMAD", cov.getCoverageMaxEndMAD());
+ e.addDoubleVariable("CoverageMaxEndMin", cov.getCoverageMaxEndMin());
+ e.addDoubleVariable("CoverageMaxEndMax", cov.getCoverageMaxEndMax());
+ }
+ if (Main.DEBUGMODE){ timerStop(); }
+
+ if (Main.DEBUGMODE){ System.out.println("-> core coverage"); timerStart(); }
+ Vector<Vector<Integer>> coreCoverage = features.getCoreCoverages();
+ e.addDoubleVariable("CoreCoverageGlobalMean", Statistic.means(coreCoverage));
+ e.addDoubleVariable("CoreCoverageGlobalSD", Statistic.sds(coreCoverage));
+ e.addDoubleVariable("CoreCoverageGlobalMedian", Statistic.medians(coreCoverage));
+ e.addDoubleVariable("CoreCoverageGlobalMAD", Statistic.mads(coreCoverage));
+ e.addDoubleVariable("CoreCoverageGlobalMin", Statistic.mins(coreCoverage));
+ e.addDoubleVariable("CoreCoverageGlobalMax", Statistic.maxs(coreCoverage));
+ if (Main.DEBUGMODE){ timerStop(); }
+
+ // catch coverage vector size error
+ boolean devError = false;
+ for (int i=0; i<coverageGlobal.size(); i++){
+ if (devError = coverageGlobal.get(i).size() != coreCoverage.get(i).size()){
+ System.out.println("dev.error: different vector size of coverage (" + coverageGlobal.get(i).size() +
+ ") and core coverage (" + coreCoverage.get(i).size() + ") for contig " + features.getIds().get(i));
+ }
+ }
+ if (devError){
+ System.exit(1);
+ }
+
+ if (Main.DEBUGMODE){ System.out.println("-> core coverage 2"); timerStart(); }
+ Coverage corecov = new Coverage(coreCoverage);
+ if (corecov.calcEndCovs(readLengthsMean)){
+ e.addDoubleVariable("CoreCoverageMinEndMean", corecov.getCoverageMinEndMean());
+ e.addDoubleVariable("CoreCoverageMinEndSD", corecov.getCoverageMinEndSD());
+ e.addDoubleVariable("CoreCoverageMinEndMedian", corecov.getCoverageMinEndMedian());
+ e.addDoubleVariable("CoreCoverageMinEndMAD", corecov.getCoverageMinEndMAD());
+ e.addDoubleVariable("CoreCoverageMinEndMin", corecov.getCoverageMinEndMin());
+ e.addDoubleVariable("CoreCoverageMinEndMax", corecov.getCoverageMinEndMax());
+ e.addDoubleVariable("CoreCoverageMaxEndMean", corecov.getCoverageMaxEndMean());
+ e.addDoubleVariable("CoreCoverageMaxEndSD", corecov.getCoverageMaxEndSD());
+ e.addDoubleVariable("CoreCoverageMaxEndMedian", corecov.getCoverageMaxEndMedian());
+ e.addDoubleVariable("CoreCoverageMaxEndMAD", corecov.getCoverageMaxEndMAD());
+ e.addDoubleVariable("CoreCoverageMaxEndMin", corecov.getCoverageMaxEndMin());
+ e.addDoubleVariable("CoreCoverageMaxEndMax", corecov.getCoverageMaxEndMax());
+ }
+ if (Main.DEBUGMODE){ timerStop(); }
+
+ System.gc();
+
+ // calculate kmer features
+ boolean calcKmers = Boolean.parseBoolean(args[7]);
+ if (calcKmers){
+ if (Main.DEBUGMODE){ System.out.println("-> kmer"); timerStart(); }
+ int kmerSize = 8;
+ int ends = (int) (Statistic.mean(Statistic.means(readLengths))*0.8);
+ Kmer kmer = new Kmer(features.getConsensusSequences(), kmerSize);
+ e.addIntegerVariable("KmerSize", kmer.getKmerSize());
+ e.addIntegerVariable("KmerGlobalUniqueness", kmer.getKmerGlobalUniqueness());
+ e.addIntegerVariable("KmerEndUniquenessMin", kmer.getKmerEndUniquenessMin(ends));
+ e.addIntegerVariable("KmerEndUniquenessMax", kmer.getKmerEndUniquenessMax(ends));
+ e.addDoubleVariable("KmerDistanceMin", Statistic.mins(kmer.getKmerDistances()));
+ if (Main.DEBUGMODE){ timerStop(); }
+ }
+
+
+ if (Main.DEBUGMODE){ System.out.println("-> export"); }
+
+ System.gc();
+
+ // export features to table format text
+ String outFilename = args[2];
+ e.write(outFilename);
+
+ // export Coverage and CoreCoverage to fasta like text
+ System.gc();
+ e.writeCoverage(args[3], features.getIds(), coverageGlobal);
+ System.gc();
+ e.writeCoverage(args[4], features.getIds(), coreCoverage);
+ }
+
+ // starts a simple timer
+ private static void timerStart(){
+ timer = System.currentTimeMillis();
+ }
+
+ // stops the simple timer
+ private static void timerStop(){
+ System.out.println((System.currentTimeMillis() - timer) + " ms");
+ }
+
+}
diff --git a/src/de/rki/ng4/surankco/feature/Several.java b/src/de/rki/ng4/surankco/feature/Several.java
new file mode 100755
index 0000000..e2bdea7
--- /dev/null
+++ b/src/de/rki/ng4/surankco/feature/Several.java
@@ -0,0 +1,287 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.feature;
+
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.Contig;
+import de.rki.ng4.surankco.data.Read;
+
+public class Several {
+
+ private Vector<Contig> contigs;
+
+ public Several(Vector<Contig> contigs){
+ this.contigs = contigs;
+ }
+
+
+ public void printContigInfo(){
+ for (Contig c : contigs){
+ System.out.println(c.getId());
+ System.out.println(c.getQualityValues().toString());
+ }
+ }
+
+
+ public Vector<String> getIds(){
+ Vector<String> ids = new Vector<String>();
+ for (Contig c : contigs){
+ ids.add(c.getId());
+ }
+ return ids;
+ }
+
+ public Vector<Integer> getBaseCounts(){
+ Vector<Integer> baseCounts = new Vector<Integer>();
+ for (Contig c : contigs){
+ baseCounts.add(c.getBaseCount());
+ }
+ return baseCounts;
+ }
+
+ public Vector<Integer> getReadCounts(){
+ Vector<Integer> readCounts = new Vector<Integer>();
+ for (Contig c : contigs){
+ readCounts.add(c.getReadCount());
+ }
+ return readCounts;
+ }
+
+ public Vector<Integer> getBaseSeqmentCounts(){
+ Vector<Integer> baseSeqmentCounts = new Vector<Integer>();
+ for (Contig c : contigs){
+ baseSeqmentCounts.add(c.getBaseSeqmentCount());
+ }
+ return baseSeqmentCounts;
+ }
+
+ public Vector<Integer> getContigLength(){
+ Vector<Integer> length = new Vector<Integer>();
+ for (Contig c : contigs){
+ length.add(c.getLength());
+ }
+ return length;
+ }
+
+ public Vector<Double> getReadComplementFraction(){
+ Vector<Double> fraction = new Vector<Double>();
+ double complements;
+ for (Contig c : contigs){
+ complements = 0;
+ for (Read read : c.getReads()){
+ if (read.getOrientation() == 'C') complements++;
+ }
+ fraction.add(complements / (double) c.getReads().size());
+ }
+ return fraction;
+ }
+
+ public Vector<String> getConsensusSequences(){
+ Vector<String> conSeqs = new Vector<String>();
+ for (Contig c : contigs){
+ conSeqs.add(c.getConsensusSequence());
+ }
+ return conSeqs;
+ }
+
+ public Vector<Vector<Integer>> getQualityValues(){
+ Vector<Vector<Integer>> qualityValues = new Vector<Vector<Integer>>();
+ for (Contig c : contigs){
+ qualityValues.add(c.getQualityValues());
+ }
+ return qualityValues;
+ }
+
+ // Collects contig read ids and uses a regex (if indicated) to split off additional suffixes in contrast to the original read names in fastq/qual files
+ public Vector<Vector<String>> getReadIds(String regex){
+ Vector<Vector<String>> readIds = new Vector<Vector<String>>();
+
+ if (regex.matches("nosplit")){
+ for (Contig c : contigs){
+ readIds.add(new Vector<String>());
+ for (Read read : c.getReads()){
+ readIds.lastElement().add(read.getId());
+ }
+ }
+ }
+ else {
+ for (Contig c : contigs){
+ readIds.add(new Vector<String>());
+ for (Read read : c.getReads()){
+ readIds.lastElement().add(read.getId().split(regex)[0]);
+ }
+ }
+ }
+
+ return readIds;
+ }
+
+ public Vector<Vector<Integer>> getReadPaddedLengths(){
+ Vector<Vector<Integer>> lengths = new Vector<Vector<Integer>>();
+ for (Contig c : contigs){
+ lengths.add(new Vector<Integer>());
+ for (Read read : c.getReads()){
+ lengths.lastElement().add(read.getPaddedLength());
+ }
+ }
+ return lengths;
+ }
+
+ public Vector<Vector<Integer>> getReadClippingLengths(){
+ Vector<Vector<Integer>> clippings = new Vector<Vector<Integer>>();
+ for (Contig c : contigs){
+ clippings.add(new Vector<Integer>());
+ for (Read read : c.getReads()){
+ clippings.lastElement().add(read.getClippingLength());
+ }
+ }
+ return clippings;
+ }
+
+ public void printConsensusSequences(){
+ for (Contig c : contigs){
+ System.out.println(c.getId() + " (l=" + c.getConsensusSequence().length() + "):\t" + c.getConsensusSequence().substring(0,Math.min(100, c.getConsensusSequence().length())) + "...");
+ }
+ }
+
+ public void printContigReads(int contigNum){
+ Contig c = contigs.get(contigNum-1);
+ System.out.println("Reads in Contig " + c.getId() + ":");
+ for (Read read : c.getReads()){
+ System.out.println(read.getId() + ":\t" + read.getOrientation() + " " + read.getOffset());
+ }
+ }
+
+ public Vector<Vector<Integer>> getGlobalCoverages(){
+ Vector<Vector<Integer>> coverages = new Vector<Vector<Integer>>();
+
+ for (Contig c : contigs){
+ coverages.add(calcCoverage(c.getConsensusSequence(), c.getNucleotideCount()));
+ }
+
+ return coverages;
+ }
+
+ public Vector<Vector<Integer>> getCoreCoverages(){
+ Vector<Vector<Integer>> ccoverages = new Vector<Vector<Integer>>();
+
+ for (Contig c : contigs){
+ ccoverages.add(calcCoreCoverage(c.getConsensusSequence(), c.getNucleotideCount()));
+ }
+
+ return ccoverages;
+ }
+
+ private Vector<Integer> calcCoverage(String consensus, int[][] nucCounts){
+ Vector<Integer> coverage = new Vector<Integer>();
+ int nucCountSize = nucCounts.length;
+ int consensusSize = consensus.length();
+ for (int i=0; i<consensus.length(); i++){
+ if (consensus.charAt(i) != '*'){
+ coverage.add(nucCounts[i][0] + nucCounts[i][1] + nucCounts[i][2] +
+ nucCounts[i][3] + nucCounts[i][4] + nucCounts[i][5]);
+ }
+ //else System.out.print("*");
+ }
+ return coverage;
+ }
+
+ // Supported Nucleotides:
+ // A Adenin
+ // C Cytosin
+ // G Guanin
+ // T Thymin
+ // R G A (PuRine)
+ // Y T C (PYrimidine)
+ // K G T (Ketone)
+ // M A C (AMinogruppen)
+ // S G C (Starke Wechselwirkung)
+ // W A T (Weiche Wechselwirkung)
+ // B G T C (nicht A) (B kommt nach A)
+ // D G A T (nicht C) (D kommt nach C)
+ // H A C T (nicht G) (H kommt nach G)
+ // V G C A (nicht T, nicht U) (V kommt nach U)
+ // N A G C T (aNy)
+ private Vector<Integer> calcCoreCoverage(String consensus, int[][] nucCounts){
+ Vector<Integer> ccoverage = new Vector<Integer>();
+ for (int i=0; i<consensus.length(); i++){
+ switch (consensus.charAt(i)){
+ case 'A': // A Adenin
+ ccoverage.add(nucCounts[i][0]);
+ break;
+ case 'C': // C Cytosin
+ ccoverage.add(nucCounts[i][1]);
+ break;
+ case 'G': // G Guanin
+ ccoverage.add(nucCounts[i][2]);
+ break;
+ case 'T': // T Thymin
+ ccoverage.add(nucCounts[i][3]);
+ break;
+ case 'R': // R G A (PuRine)
+ ccoverage.add(Math.max(nucCounts[i][2], nucCounts[i][0]));
+ break;
+ case 'Y': // Y T C (PYrimidine)
+ ccoverage.add(Math.max(nucCounts[i][3], nucCounts[i][1]));
+ break;
+ case 'K': // K G T (Ketone)
+ ccoverage.add(Math.max(nucCounts[i][2], nucCounts[i][3]));
+ break;
+ case 'M': // M A C (AMinogruppen)
+ ccoverage.add(Math.max(nucCounts[i][0], nucCounts[i][1]));
+ break;
+ case 'S': // S G C (Starke Wechselwirkung)
+ ccoverage.add(Math.max(nucCounts[i][2], nucCounts[i][1]));
+ break;
+ case 'W': // W A T (Weiche Wechselwirkung)
+ ccoverage.add(Math.max(nucCounts[i][0], nucCounts[i][3]));
+ break;
+ case 'B': // B G T C (nicht A) (B kommt nach A)
+ ccoverage.add(Math.max(Math.max(nucCounts[i][2], nucCounts[i][3]), nucCounts[i][1]));
+ break;
+ case 'D': // D G A T (nicht C) (D kommt nach C)
+ ccoverage.add(Math.max(Math.max(nucCounts[i][2], nucCounts[i][0]), nucCounts[i][3]));
+ break;
+ case 'H': // H A C T (nicht G) (H kommt nach G)
+ ccoverage.add(Math.max(Math.max(nucCounts[i][0], nucCounts[i][1]), nucCounts[i][3]));
+ break;
+ case 'V': // V G C A (nicht T, nicht U) (V kommt nach U)
+ ccoverage.add(Math.max(Math.max(nucCounts[i][2], nucCounts[i][1]), nucCounts[i][0]));
+ break;
+ case 'N': // N A G C T (aNy)
+ ccoverage.add(Math.max(Math.max(nucCounts[i][0], nucCounts[i][1]),
+ Math.max(nucCounts[i][2], nucCounts[i][3])));
+ break;
+ case '*':
+ //System.out.print("*");
+ break;
+ default:
+ ccoverage.add(nucCounts[i][0] + nucCounts[i][1] + nucCounts[i][2] +
+ nucCounts[i][3] + nucCounts[i][4] + nucCounts[i][5]);
+ System.out.println("warning: unsupported nucleotide in consensus at position " + i + ": " + consensus.charAt(i) + " -> using full coverage for this position");
+ }
+ }
+ return ccoverage;
+ }
+
+ public Vector<Double> getGcContent(){
+ Vector<Double> content = new Vector<Double>();
+ double gc, at;
+ for (Contig c : contigs){
+ at = 0;
+ gc = 0;
+ for (Character ch : c.getConsensusSequence().toUpperCase().toCharArray()){
+ if (ch == 'A' || ch == 'T') at++;
+ if (ch == 'G' || ch == 'C') gc++;
+ }
+ content.add(gc/(gc+at));
+ }
+ return content;
+ }
+
+
+}
diff --git a/src/de/rki/ng4/surankco/files/Ace.java b/src/de/rki/ng4/surankco/files/Ace.java
new file mode 100755
index 0000000..46307f8
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/Ace.java
@@ -0,0 +1,175 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.files;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.AceRead;
+import de.rki.ng4.surankco.data.AceContig;
+
+
+public class Ace {
+
+ private String fileName;
+ private Vector<AceContig> contigs;
+
+ public Ace(String fileName){
+ this.fileName = fileName;
+ contigs = new Vector<AceContig>();
+ try {
+ LineNumberReader acebr = new LineNumberReader(new FileReader(new File(fileName)));
+ try {
+ String line;
+ int readCounter = 0;
+ while ((line = acebr.readLine()) != null){
+ if (line.startsWith("CO ")){
+ contigs.add(new AceContig(line));
+ contigs.lastElement().setLineCO( acebr.getLineNumber()-1 );
+ readCounter = 0;
+ }
+ else if (line.startsWith("BQ")){
+ contigs.lastElement().setLineBQ( acebr.getLineNumber()-1 );
+ }
+ else if (line.startsWith("AF ")){
+ contigs.lastElement().addRead(new AceRead());
+ contigs.lastElement().getReads().lastElement().addAF(line);
+ }
+ else if (line.startsWith("RD ")){
+ readCounter++;
+
+ contigs.lastElement().getReads().get(readCounter-1).addRD(line);
+ contigs.lastElement().getReads().get(readCounter-1).setLineRD( acebr.getLineNumber()-1 );
+
+ StringBuffer readSeq = new StringBuffer();
+ while ((line = acebr.readLine()) != null && line.length() != 0){
+ readSeq.append(line);
+ }
+ contigs.lastElement().getReads().get(readCounter-1).setPaddedSequence( readSeq.toString() );
+ }
+ else if (line.startsWith("QA ")){
+ contigs.lastElement().getReads().get(readCounter-1).addQA(line);
+ }
+ }
+ }
+ finally {
+ acebr.close();
+ }
+ }
+ catch (FileNotFoundException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + fileName);
+ System.exit(1);
+ }
+ catch (IOException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + fileName);
+ System.exit(1);
+ }
+ }
+
+ public void readConsensusSequences(){
+ String line;
+ StringBuffer consensus;
+ try {
+ BufferedReader acebr = new BufferedReader(new FileReader(new File(fileName)));
+ int lineCounter = 0;
+ try {
+ for (AceContig c : contigs){
+ consensus = new StringBuffer();
+ while (lineCounter < c.getLineCO()){
+ if ((line = acebr.readLine()) == null) break;
+ lineCounter++;
+ }
+
+ if ((line = acebr.readLine()) != null && line.startsWith("CO")){
+ lineCounter++;
+ while ((line = acebr.readLine()) != null && line.length() != 0){
+ lineCounter++;
+ consensus.append(line);
+ }
+ }
+ lineCounter++;
+ c.setConsensusSequence( consensus.toString().toUpperCase() );
+ }
+ }
+ finally {
+ acebr.close();
+ }
+ }
+ catch (FileNotFoundException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + fileName);
+ System.exit(1);
+ }
+ catch (IOException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + fileName);
+ System.exit(1);
+ }
+ }
+
+ public void readQualityValues(){
+ String line;
+ String[] splits;
+ try {
+ BufferedReader acebr = new BufferedReader(new FileReader(new File(fileName)));
+ int lineCounter = 0;
+ try {
+ for (AceContig c : contigs){
+ while (lineCounter < c.getLineBQ()){
+ if ((line = acebr.readLine()) == null) break;
+ lineCounter++;
+ }
+
+ if ((line = acebr.readLine()) != null && line.matches("BQ")){
+ lineCounter++;
+ while ((line = acebr.readLine()) != null && line.length() != 0){
+ lineCounter++;
+ splits = line.split(" ");
+ for (String s : splits){
+ c.getQualityValues().add(Integer.parseInt(s));
+ }
+ }
+ }
+ lineCounter++;
+ }
+ }
+ finally {
+ acebr.close();
+ }
+ }
+ catch (FileNotFoundException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + fileName);
+ System.exit(1);
+ }
+ catch (IOException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + fileName);
+ System.exit(1);
+ }
+ }
+
+ /**
+ * @return the contigs
+ */
+ public Vector<AceContig> getContigs() {
+ return contigs;
+ }
+
+ /**
+ * @param contigs the contigs to set
+ */
+ public void setContigs(Vector<AceContig> contigs) {
+ this.contigs = contigs;
+ }
+}
diff --git a/src/de/rki/ng4/surankco/files/ExportR.java b/src/de/rki/ng4/surankco/files/ExportR.java
new file mode 100755
index 0000000..417fb25
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/ExportR.java
@@ -0,0 +1,131 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.files;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.util.Vector;
+
+public class ExportR {
+
+ private Vector<String> header;
+ private Vector<Vector<String>> data;
+
+ public ExportR(){
+ header = new Vector<String>();
+ data = new Vector<Vector<String>>();
+ }
+
+ public void addStringVariable(String name, Vector<String> values){
+ header.add(name);
+ data.add(values);
+ }
+
+ public void addIntegerVariable(String name, Vector<Integer> values){
+ header.add(name);
+ data.add(ints(values));
+ }
+
+ public void addDoubleVariable(String name, Vector<Double> values){
+ header.add(name);
+ data.add(doubles(values));
+ }
+
+ public void write(String fileName){
+ try {
+ BufferedWriter bw = new BufferedWriter(new FileWriter(new File(fileName)));
+ try {
+ for (String s : header){
+ bw.append(s + " ");
+ }
+ bw.newLine();
+
+ for (int i=0; i<data.firstElement().size(); i++){
+ for (int j=0; j<data.size(); j++){
+ bw.append(data.get(j).get(i) + " ");
+ }
+ bw.newLine();
+ }
+ }
+ finally {
+ bw.close();
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.out.println("error: problems writing file: " + fileName);
+ System.exit(1);
+ }
+ }
+
+ public static Vector<String> intVecAsRString(Vector<Vector<Integer>> vecs){
+ Vector<String> output = new Vector<String>();
+ StringBuffer string;
+ for (Vector<Integer> v : vecs){
+ string = new StringBuffer("\"");
+ for (Integer i : v){
+ string.append(i + " ");
+ }
+ string.replace(string.length(), string.length(), "\"");
+ output.add(string.toString());
+ }
+ return output;
+ }
+
+ private static Vector<String> ints(Vector<Integer> vecs){
+ Vector<String> output = new Vector<String>();
+ for (Integer i : vecs){
+ output.add(i.toString());
+ }
+ return output;
+ }
+
+ private static Vector<String> doubles(Vector<Double> vecs){
+ Vector<String> output = new Vector<String>();
+ for (Double i : vecs){
+ output.add(i.toString());
+ }
+ return output;
+ }
+
+ public void writeCoverage(String fileName, Vector<String> ids, Vector<Vector<Integer>> covs) {
+ try {
+ BufferedWriter bw = new BufferedWriter(new FileWriter(new File(fileName)));
+ try {
+ DecimalFormat df = new DecimalFormat( "000" );
+
+ for (int i=0; i<ids.size(); i++){
+ bw.append(">" + ids.get(i));
+ bw.newLine();
+
+
+
+ int count = 0;
+ for (int j=0; j<covs.get(i).size(); j++){
+
+ bw.append( df.format(covs.get(i).get(j)) + " " );
+ count++;
+
+ if (count == 15){
+ bw.newLine();
+ count = 0;
+ }
+ }
+ bw.newLine();
+ }
+ }
+ finally {
+ bw.close();
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.out.println("error: problems writing file: " + fileName);
+ System.exit(1);
+ }
+ }
+}
diff --git a/src/de/rki/ng4/surankco/files/Fasta.java b/src/de/rki/ng4/surankco/files/Fasta.java
new file mode 100755
index 0000000..c82b5b4
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/Fasta.java
@@ -0,0 +1,52 @@
+package de.rki.ng4.surankco.files;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.FastaContig;
+
+public class Fasta {
+
+ private String filename;
+
+ public Fasta(String filename){
+ this.filename = filename;
+ }
+
+ public Vector<FastaContig> parseFasta(){
+ Vector<FastaContig> contigs = new Vector<FastaContig>();
+ try {
+ BufferedReader br = new BufferedReader(new FileReader(new File(filename)));
+ try {
+ String line;
+ while ((line = br.readLine()) != null){
+ if (line.startsWith(">")){
+ contigs.add(new FastaContig(line));
+ }
+ else {
+ contigs.lastElement().appendSequence(line.trim().toUpperCase());
+ }
+ }
+ }
+ finally {
+ br.close();
+ }
+ }
+ catch (FileNotFoundException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + filename);
+ System.exit(1);
+ }
+ catch (IOException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + filename);
+ System.exit(1);
+ }
+ return contigs;
+ }
+
+}
diff --git a/src/de/rki/ng4/surankco/files/Fastq.java b/src/de/rki/ng4/surankco/files/Fastq.java
new file mode 100755
index 0000000..08db3ee
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/Fastq.java
@@ -0,0 +1,217 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.files;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.FastqReads;
+import de.rki.ng4.surankco.feature.Main;
+
+/**
+ * Import of read qualities from Fastq-Files
+ *
+ * @author Mathias Kuhring
+ *
+ */
+public class Fastq {
+
+ String fastqParameter;
+ String fastqVersion;
+
+ Vector<String> quality;
+
+ public Fastq(String filename, FastqReads reads, String parameter) {
+ quality = new Vector<String>();
+
+ fastqParameter = parameter;
+
+ try {
+ BufferedReader br = new BufferedReader(new FileReader(new File(filename)));
+ try {
+ String line;
+ String[] splits;
+ while ((line = br.readLine()) != null){
+ if (line.startsWith("@")){
+ // parse read id
+ splits = line.split(" ");
+ reads.readId.add(splits[0].substring(1));
+ // skip read sequence
+ br.readLine();
+ // skip +
+ br.readLine();
+ // read quality
+ line = br.readLine();
+ quality.add(line);
+ }
+ }
+ }
+ finally {
+ br.close();
+ }
+ }
+ catch (FileNotFoundException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + filename);
+ System.exit(1);
+ }
+ catch (IOException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + filename);
+ System.exit(1);
+ }
+
+ if (Main.DEBUGMODE){
+ System.out.println(reads.readId.get(0));
+ System.out.println(quality.get(0));
+ }
+
+ reads.readQuality.addAll(parseFastqQuality());
+ }
+
+ // Coordination of Fastq Quality conversion
+ private Vector<Vector<Integer>> parseFastqQuality(){
+ // convert from ascii to integer
+ Vector<Vector<Integer>> intQuality = asciiToNumber();
+ // determine fastq version
+ fastqVersion = getFastqVersion(new Vector<Vector<Integer>>(intQuality.subList(0, Math.min(intQuality.size(), 10))), fastqParameter);
+ // convert to illumina1.8+
+ intQuality = convertToIllumina18(intQuality);
+
+ return intQuality;
+ }
+
+ // Convert different fastq quality versions to illumina1.8+
+ private Vector<Vector<Integer>> convertToIllumina18(Vector<Vector<Integer>> intQuality) {
+ int illumina18Cutoff = 41;
+
+ switch (fastqVersion) {
+ case "sanger":
+ intQuality = shiftQualities(intQuality, 33, illumina18Cutoff);
+ break;
+ case "solexa":
+ intQuality = shiftQualities(intQuality, 64, 62);
+ intQuality = convertSolexaToPhred(intQuality);
+ intQuality = shiftQualities(intQuality, 0, illumina18Cutoff);
+ break;
+ case "illumina13":
+ intQuality = shiftQualities(intQuality, 64, illumina18Cutoff);
+ break;
+ case "illumina15":
+ intQuality = shiftQualities(intQuality, 64, illumina18Cutoff);
+ break;
+ case "illumina18":
+ intQuality = shiftQualities(intQuality, 33, illumina18Cutoff);
+ break;
+ default:
+ System.out.println("error: bad fastq version");
+ System.exit(1);
+ break;
+ }
+ return intQuality;
+ }
+
+ // converts Solexa Quality to Phred Quality: qSolexy = -10 x log10(Pe / (1-Pe)) -> qPhred = -10 x log10(Pe)
+ // according to Cock et. al 2009, The Sanger FASTQ file format...
+ private Vector<Vector<Integer>> convertSolexaToPhred(Vector<Vector<Integer>> intQuality) {
+ double qSolexa = 0;
+ int converted = 0;
+
+ for (int i=0; i<intQuality.size(); i++){
+ for (int j=0; j<intQuality.get(i).size(); j++){
+ qSolexa = intQuality.get(i).get(j);
+ converted = (int) Math.round(10d * Math.log10( Math.pow(10d,(qSolexa/10d)) + 1d ));
+ intQuality.get(i).set(j,converted);
+ }
+ }
+
+ return intQuality;
+ }
+
+ // Scaling of Qualities
+ private Vector<Vector<Integer>> shiftQualities(Vector<Vector<Integer>> intQuality, int shift, int cutoff) {
+ int shifted = 0;
+
+ for (int i=0; i<intQuality.size(); i++){
+ for (int j=0; j<intQuality.get(i).size(); j++){
+ shifted = Math.min((intQuality.get(i).get(j) - shift), cutoff);
+ intQuality.get(i).set(j,shifted);
+ }
+ }
+
+ return intQuality;
+ }
+
+ // Converts the fastq quality ascii strings to integer vectors
+ private Vector<Vector<Integer>> asciiToNumber() {
+ Vector<Vector<Integer>> intQuality = new Vector<Vector<Integer>>();
+
+ for (String read : quality){
+ intQuality.add(new Vector<Integer>());
+ for (char qual : read.toCharArray()){
+ intQuality.lastElement().add((int) qual);
+ }
+ }
+
+ return intQuality;
+ }
+
+ // Determines Fastq version, estimating if parameter is "auto"
+ private String getFastqVersion(Vector<Vector<Integer>> intQuality, String parameter){
+ Vector<String> version = new Vector<String>();
+ if (parameter.matches("auto")){
+ int minQual = Integer.MAX_VALUE;
+ int maxQual = Integer.MIN_VALUE;
+ for (Vector<Integer> vec : intQuality){
+ for (Integer qual : vec){
+ minQual = Math.min(minQual, qual);
+ maxQual = Math.max(maxQual, qual);
+ }
+ }
+
+ if (Main.DEBUGMODE){
+ System.out.println("min qual (ascii): " + minQual);
+ System.out.println("max qual (ascii): " + maxQual);
+ }
+
+ if (minQual >= 33 && maxQual <= 126){
+ version.add("sanger");
+ }
+ if (minQual >= 35 && maxQual <= 74){
+ version.add("illumina18");
+ }
+ if (minQual >= 59 && maxQual <= 126){
+ version.add("solexa");
+ }
+ if (minQual >= 64 && maxQual <= 126){
+ version.add("illumina13");
+ }
+ if (minQual >= 66 && maxQual <= 126){
+ version.add("illumina15");
+ }
+ }
+ else{
+ version.add(parameter);
+ }
+ if (version.size() > 1){
+ System.out.println("error: fastq version dedection is ambiguous " + version + ".\n please set version manually!");
+ System.exit(1);
+ }
+ if (version.size() == 0){
+ System.out.println("error: fastq version dedection failed, unknown version.\n please set version manually!");
+ System.exit(1);
+ }
+ if (Main.DEBUGMODE){
+ System.out.println("detected fastq version: " + version.get(0));
+ }
+ return version.get(0);
+ }
+
+}
\ No newline at end of file
diff --git a/src/de/rki/ng4/surankco/files/Qual.java b/src/de/rki/ng4/surankco/files/Qual.java
new file mode 100755
index 0000000..923f5bb
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/Qual.java
@@ -0,0 +1,61 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.files;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.FastqReads;
+
+/**
+ * Import of read qualities from Qual/Qua-Files
+ *
+ * @author Mathias Kuhring
+ *
+ */
+public class Qual {
+
+ public Qual(String filename, FastqReads reads) {
+
+ try {
+ BufferedReader br = new BufferedReader(new FileReader(new File(filename)));
+ try {
+ String line;
+ String[] splits;
+ while ((line = br.readLine()) != null){
+ if (line.startsWith(">")){
+ splits = line.split(" ");
+ reads.readId.add(splits[0].substring(1));
+ reads.readQuality.add(new Vector<Integer>());
+ }
+ else {
+ splits = line.split(" ");
+ for (String s : splits){
+ reads.readQuality.lastElement().add(Integer.parseInt(s));
+ }
+ }
+ }
+ }
+ finally {
+ br.close();
+ }
+ }
+ catch (FileNotFoundException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + filename);
+ System.exit(1);
+ }
+ catch (IOException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + filename);
+ System.exit(1);
+ }
+ }
+}
diff --git a/src/de/rki/ng4/surankco/files/Sam.java b/src/de/rki/ng4/surankco/files/Sam.java
new file mode 100755
index 0000000..043fdd7
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/Sam.java
@@ -0,0 +1,72 @@
+package de.rki.ng4.surankco.files;
+
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.util.SamLocusIterator;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import de.rki.ng4.surankco.data.SamContig;
+
+public class Sam {
+
+ public static void main(String[] args){
+ Sam test = new Sam("data/ex1.sam");
+ for (SamContig contig : test.getContigs().values()){
+ System.out.println(contig.getId());
+ }
+ }
+
+ private HashMap<String,SamContig> contigs;
+
+ public Sam(String filename) {
+
+ contigs = new HashMap<String,SamContig>();
+
+ final SamReader readReader = SamReaderFactory.makeDefault().open(new File(filename));
+
+ // System.out.println(readReader.getFileHeader());
+
+ String refId;
+ for (final SAMRecord samRecord : readReader) {
+
+ if (!samRecord.getReadUnmappedFlag()){
+ refId = samRecord.getReferenceName();
+ if (!contigs.containsKey(refId)){
+ contigs.put(refId, new SamContig(refId));
+ }
+ contigs.get(refId).addRead(samRecord);
+ }
+
+ // System.out.println(samRecord);
+ }
+
+ final SamReader alignmentReader = SamReaderFactory.makeDefault().open(new File(filename));
+
+ for (final SamLocusIterator.LocusInfo locus : new SamLocusIterator(alignmentReader)){
+ refId = locus.getSequenceName();
+
+ if (contigs.containsKey(locus.getSequenceName())){
+ ArrayList<Byte> alignmentColumn = new ArrayList<Byte>();
+
+ List<SamLocusIterator.RecordAndOffset> recs = locus.getRecordAndPositions();
+
+// System.out.println(refId + "\t" + locus.getPosition() + ": ");
+ for (SamLocusIterator.RecordAndOffset rec : recs){
+ alignmentColumn.add(rec.getReadBase());
+ // System.out.print((char) rec.getReadBase() + " ");
+ }
+ // System.out.println();
+ contigs.get(locus.getSequenceName()).addAlignmentColumn(alignmentColumn);
+ }
+ }
+ }
+
+ public HashMap<String,SamContig> getContigs(){
+ return contigs;
+ }
+}
diff --git a/src/de/rki/ng4/surankco/scoring/Main.java b/src/de/rki/ng4/surankco/scoring/Main.java
new file mode 100755
index 0000000..1875c8d
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/Main.java
@@ -0,0 +1,32 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring;
+
+public class Main {
+
+ // debugging
+ public final static boolean DEBUGMODE = false;
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+
+ String filename = args[0];
+ String outputFile = args[1];
+
+ if (Main.DEBUGMODE){ System.out.println("parse pretty psl file..."); }
+ PSLPrettyParser psl = new PSLPrettyParser(filename);
+
+ if (Main.DEBUGMODE){ System.out.println("calculate metrics..."); }
+ MetricsPSL metrics = new MetricsPSL(psl.getHits());
+ metrics.calculateMetrics();
+ if (Main.DEBUGMODE){ System.out.println("export metrics..."); }
+ metrics.export(outputFile);
+
+ }
+
+}
diff --git a/src/de/rki/ng4/surankco/scoring/MetricsPSL.java b/src/de/rki/ng4/surankco/scoring/MetricsPSL.java
new file mode 100755
index 0000000..c167fbb
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/MetricsPSL.java
@@ -0,0 +1,349 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.scoring.scores.NormedContigLength1;
+import de.rki.ng4.surankco.scoring.scores.NormedContigLength2;
+import de.rki.ng4.surankco.scoring.scores.Score;
+
+
+public class MetricsPSL {
+
+ private Vector<PSLHit> hits;
+ private Vector<Vector<Boolean>> matchSequences;
+
+ private Vector<String> metricsHeader;
+ private Vector<Vector<Double>> metricsValues;
+
+ public MetricsPSL(Vector<PSLHit> hits){
+ this.hits = hits;
+
+ matchSequences = new Vector<Vector<Boolean>>();
+ for (PSLHit hit : hits){
+ matchSequences.add(hit.getExtendedMatchSequence());
+ }
+
+ metricsHeader = new Vector<String>();
+ metricsValues = new Vector<Vector<Double>>();
+ }
+
+ public void calculateMetrics(){
+ Vector<Double> matchCounts = matchCount();
+ Vector<Double> errorCounts = errorCount(matchCounts);
+ Vector<Double> errorSubtractions = errorSubtraction(errorCounts);
+
+ Vector<Double> normMatchCounts1 = normMatchCount1(matchCounts);
+ Vector<Double> normMatchCounts2 = normMatchCount2(matchCounts);
+ Vector<Double> normErrorCounts1 = normErrorCount1(errorCounts);
+ Vector<Double> normErrorCounts2 = normErrorCount2(errorCounts);
+ Vector<Double> normErrorSubtractions = normErrorSubtraction(errorSubtractions);
+
+ Vector<Double> maxContErrors = maxContError();
+ Vector<Double> maxRegionErrors = maxRegionError(100);
+
+ Vector<Double> maxEndErrorStretches = maxEndErrorStretch();
+ Vector<Double> maxEndErrorCounts = maxEndErrorCount(100);
+ Vector<Double> maxEndContErrors = maxEndContError(100);
+
+ metricsValues.add(matchCounts);
+ metricsValues.add(errorCounts);
+ metricsValues.add(errorSubtractions);
+
+ metricsValues.add(normMatchCounts1);
+ metricsValues.add(normMatchCounts2);
+ metricsValues.add(normErrorCounts1);
+ metricsValues.add(normErrorCounts2);
+ metricsValues.add(normErrorSubtractions);
+
+ metricsValues.add(maxContErrors);
+ metricsValues.add(maxRegionErrors);
+
+ metricsValues.add(maxEndErrorStretches);
+ metricsValues.add(maxEndErrorCounts);
+ metricsValues.add(maxEndContErrors);
+
+
+ // New Scoring implementation
+ Vector<Score> scores = new Vector<Score>();
+ scores.add(new NormedContigLength1());
+ scores.add(new NormedContigLength2());
+
+ Vector<Vector<Double>> results = new Vector<Vector<Double>>();
+ for (Score score : scores){
+ metricsHeader.add(score.getScoreName());
+ results.add(new Vector<Double>());
+ }
+
+ for (PSLHit hit : hits){
+ for (int i=0; i<scores.size(); i++){
+ results.get(i).add(scores.get(i).calculate(hit));
+ }
+ }
+
+ for (Vector<Double> result : results){
+ metricsValues.add(result);
+ }
+
+ }
+
+ public void printMetrics(){
+ System.out.print("ContigID ");
+ for (String header : metricsHeader){
+ System.out.print(header + " ");
+ }
+ System.out.println();
+
+ for (int i=0; i<hits.size(); i++){
+ System.out.print(hits.get(i).getQueryName() + " ");
+ for (int j=0; j<metricsValues.size(); j++){
+ System.out.print(metricsValues.get(j).get(i) + " ");
+ }
+ System.out.println();
+ }
+ }
+
+ public void export(String fileName){
+ try {
+ BufferedWriter mbw = new BufferedWriter(new FileWriter(new File(fileName)));
+
+ try {
+ mbw.append("ContigID ");
+ for (String header : metricsHeader){
+ mbw.append(header + " ");
+ }
+ mbw.newLine();
+
+ for (int i=0; i<hits.size(); i++){
+ mbw.append(hits.get(i).getQueryName() + " ");
+ for (int j=0; j<metricsValues.size(); j++){
+ mbw.append(metricsValues.get(j).get(i) + " ");
+ }
+ mbw.newLine();
+ }
+ }
+ finally {
+ mbw.close();
+ }
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ private Vector<Double> matchCount(){
+ metricsHeader.add("MatchCount");
+ Vector<Double> values = new Vector<Double>();
+ double count;
+ for (Vector<Boolean> matchSeq : matchSequences){
+ count = 0;
+ for (Boolean match : matchSeq){
+ if (match) count++;
+ }
+ values.add(count);
+ }
+ return(values);
+ }
+
+ private Vector<Double> errorCount(Vector<Double> matchCounts){
+ metricsHeader.add("ErrorCount");
+ Vector<Double> values = new Vector<Double>();
+ for (int i=0; i<hits.size(); i++){
+ values.add(matchSequences.get(i).size() - matchCounts.get(i));
+ }
+ return(values);
+ }
+
+ private Vector<Double> errorSubtraction(Vector<Double> errorCounts){
+ metricsHeader.add("ErrorSubtraction");
+ Vector<Double> values = new Vector<Double>();
+ for (int i=0; i<hits.size(); i++){
+ values.add( Math.max(0, (hits.get(i).getQuerySize() - errorCounts.get(i))) ); // set to 0 if error count > contig size, e.g. big gaps
+ }
+ return(values);
+ }
+
+ private Vector<Double> normMatchCount1(Vector<Double> matchCounts){
+ metricsHeader.add("NormedMatchCount1");
+ Vector<Double> values = new Vector<Double>();
+ for (int i=0; i<hits.size(); i++){
+ values.add(matchCounts.get(i) / hits.get(i).getQuerySize());
+ }
+ return(values);
+ }
+
+ private Vector<Double> normMatchCount2(Vector<Double> matchCounts){
+ metricsHeader.add("NormedMatchCount2");
+ Vector<Double> values = new Vector<Double>();
+ for (int i=0; i<hits.size(); i++){
+ values.add(matchCounts.get(i) / matchSequences.get(i).size());
+ }
+ return(values);
+ }
+
+ private Vector<Double> normErrorCount1(Vector<Double> errorCounts){
+ metricsHeader.add("NormedErrorCount1");
+ Vector<Double> values = new Vector<Double>();
+ for (int i=0; i<hits.size(); i++){
+ values.add( Math.min(1, (errorCounts.get(i) / hits.get(i).getQuerySize())) ); // set to 1 if error count > contig size, e.g. big gaps
+ }
+ return(values);
+ }
+
+ private Vector<Double> normErrorCount2(Vector<Double> errorCounts){
+ metricsHeader.add("NormedErrorCount2");
+ Vector<Double> values = new Vector<Double>();
+ for (int i=0; i<hits.size(); i++){
+ values.add(errorCounts.get(i) / matchSequences.get(i).size());
+ }
+ return(values);
+ }
+
+ private Vector<Double> normErrorSubtraction(Vector<Double> errorSubtractions){
+ metricsHeader.add("NormedErrorSubtraction");
+ Vector<Double> values = new Vector<Double>();
+ for (int i=0; i<hits.size(); i++){
+ values.add( Math.max(0, (errorSubtractions.get(i) / hits.get(i).getQuerySize())) ); // set to 0 if error count > contig size, e.g. big gaps
+ }
+ return(values);
+ }
+
+ private Vector<Double> maxContError(){
+ metricsHeader.add("MaxContiguousError");
+ Vector<Double> values = new Vector<Double>();
+ double max, count;
+ for (int i=0; i<hits.size(); i++){
+ max = 0;
+ count = 0;
+ for (Boolean b : matchSequences.get(i)){
+ if (b){
+ max = Math.max(max, count);
+ count = 0;
+ }
+ else count++;
+ }
+ max = Math.max(max, count);
+ values.add( Math.min(1, (max / hits.get(i).getQuerySize())) ); // normed by contig size and set to 1 if error count > contig size, e.g. very big gaps
+ }
+ return(values);
+ }
+
+ private Vector<Double> maxRegionError(int regionSize){
+ metricsHeader.add("MaxRegionError");
+ Vector<Double> values = new Vector<Double>();
+ double max, count;
+ Vector<Boolean> matchSeq;
+ for (int i=0; i<hits.size(); i++){
+ max = 0;
+ count = 0;
+ matchSeq = matchSequences.get(i);
+ for (int j=0; j<Math.min(regionSize,matchSeq.size()); j++){
+ if (!matchSeq.get(j)) count++;
+ max = count;
+ }
+ for (int j=Math.min(regionSize, matchSeq.size()); j<matchSeq.size(); j++){
+ if (!matchSeq.get(j-regionSize)) count--;
+ if (!matchSeq.get(j)) count++;
+ max = Math.max(max, count);
+ }
+ values.add(max);
+ }
+ return(values);
+ }
+
+
+ private Vector<Double> maxEndErrorStretch(){
+ metricsHeader.add("MaxEndErrorStretch");
+ Vector<Double> values = new Vector<Double>();
+ double left, right;
+ int index;
+ Vector<Boolean> matchSeq;
+ for (int i=0; i<hits.size(); i++){
+ left = 0;
+ right = 0;
+ matchSeq = matchSequences.get(i);
+
+ // left error stretch size
+ index = 0;
+ while (!matchSeq.get(index++)){
+ left++;
+ }
+
+ // right error stretch size
+ index = matchSeq.size()-1;
+ while (!matchSeq.get(index--)){
+ right++;
+ }
+
+ values.add( Math.max(left, right) / hits.get(i).getQuerySize() ); // normed by contig size
+ }
+ return(values);
+ }
+
+ private Vector<Double> maxEndErrorCount(int endSize){
+ metricsHeader.add("MaxEndErrorCount");
+ Vector<Double> values = new Vector<Double>();
+ double left, right;
+ Vector<Boolean> matchSeq;
+ for (int i=0; i<hits.size(); i++){
+ left = 0;
+ right = 0;
+ matchSeq = matchSequences.get(i);
+
+ for (int j=0; j<Math.min(endSize,matchSeq.size()); j++){
+ if (!matchSeq.get(j)) left++;
+ }
+
+ for (int j=matchSeq.size()-1; j>=Math.max(matchSeq.size()-endSize,0); j--){
+ if (!matchSeq.get(j)) right++;
+ }
+
+ values.add(Math.max(left, right));
+ }
+ return(values);
+ }
+
+ private Vector<Double> maxEndContError(int endSize){
+ metricsHeader.add("MaxEndContError");
+ Vector<Double> values = new Vector<Double>();
+ double left, right, count;
+ Vector<Boolean> matchSeq;
+ for (int i=0; i<hits.size(); i++){
+ left = 0;
+ right = 0;
+ matchSeq = matchSequences.get(i);
+
+ count = 0;
+ for (int j=0; j<Math.min(endSize,matchSeq.size()); j++){
+ if (matchSeq.get(j)){
+ left = Math.max(left, count);
+ count = 0;
+ }
+ else count++;
+ }
+ left = Math.max(left, count);
+
+ count = 0;
+ for (int j=matchSeq.size()-1; j>=Math.max(matchSeq.size()-endSize,0); j--){
+ if (matchSeq.get(j)){
+ right = Math.max(right, count);
+ count = 0;
+ }
+ else count++;
+ }
+ right = Math.max(right, count);
+
+ values.add(Math.max(left, right));
+ }
+ return(values);
+ }
+
+}
diff --git a/src/de/rki/ng4/surankco/scoring/PSLHit.java b/src/de/rki/ng4/surankco/scoring/PSLHit.java
new file mode 100755
index 0000000..80e2d4b
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/PSLHit.java
@@ -0,0 +1,139 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring;
+
+import java.util.Vector;
+
+public class PSLHit {
+
+ private String qName, strand;
+ private int qStart, qEnd, qSize;
+
+ private String tName;
+ private int tStart, tEnd, tSize;
+
+ private Vector<String> lines;
+
+ private String query, target;
+
+ private Vector<Boolean> matchSeq;
+ private Vector<Boolean> matchSeqExt;
+
+
+ public PSLHit(String header){
+ parseHeader(header);
+ lines = new Vector<String>();
+ }
+
+ private void parseHeader(String header){
+ // >contig00001:7626+41627 of 41627 gi|170079663|ref|NC_010473.1|:563130+597131 of 4686137
+ String[] mainSplits = header.split(" ");
+
+ String[] namePositionSplits = mainSplits[0].split("\\:");
+ String[] positionSplits;
+ if (namePositionSplits[1].contains("+")){
+ positionSplits = namePositionSplits[1].split("\\+");
+ strand = "+";
+ }
+ else{
+ positionSplits = namePositionSplits[1].split("\\-");
+ strand = "-";
+ }
+ qName = namePositionSplits[0].substring(1);
+ qStart = Integer.parseInt(positionSplits[0]);
+ qEnd = Integer.parseInt(positionSplits[1]);
+ qSize = Integer.parseInt(mainSplits[2]);
+
+ namePositionSplits = mainSplits[3].split("\\:");
+ positionSplits = namePositionSplits[1].split("\\+");
+ tName = namePositionSplits[0];
+ tStart = Integer.parseInt(positionSplits[0]);
+ tEnd = Integer.parseInt(positionSplits[1]);
+ tSize = Integer.parseInt(mainSplits[5]);
+ }
+
+ private String getHeader(){
+ return ">" + qName + ":" + qStart + strand + qEnd + " of " + qSize + " " +
+ tName + ":" + tStart + "+" + tEnd + " of " + tSize;
+ }
+
+ private String getAlignment(){
+ StringBuffer buffer = new StringBuffer();
+ for (String line : lines){
+ buffer.append(line + "\n");
+ }
+ return buffer.substring(0, buffer.length()-1);
+ }
+
+ public String toString(){
+ return getHeader() + "\n" + getAlignment();
+ }
+
+ public void appendAlignment(String line) {
+ lines.add(line);
+ }
+
+ public void parseAlignment() {
+ StringBuffer query = new StringBuffer();
+ StringBuffer target = new StringBuffer();
+
+ int index = 0;
+
+ while (index+2 <= lines.size()){
+ query.append(lines.get(index++));
+ index++;
+ target.append(lines.get(index++));
+ index++;
+ }
+
+ this.query = query.toString().toUpperCase();
+ this.target = target.toString().toUpperCase();
+ }
+
+ public Vector<Boolean> getMatchSequence(){
+ if (matchSeq != null) return matchSeq;
+
+ matchSeq = new Vector<Boolean>();
+
+ for (int i=0; i<query.length(); i++){
+ matchSeq.add(query.charAt(i) == target.charAt(i));
+ }
+
+ return matchSeq;
+ }
+
+ public Vector<Boolean> getExtendedMatchSequence(){
+ if (matchSeqExt != null) return matchSeqExt;
+
+ matchSeqExt = new Vector<Boolean>();
+
+ int prefix = qStart;
+ for (int j=0; j<prefix; j++){
+ matchSeqExt.add(false);
+ }
+
+ matchSeqExt.addAll(getMatchSequence());
+
+ int suffix = qSize-qEnd;
+ for (int j=0; j<suffix; j++){
+ matchSeqExt.add(false);
+ }
+
+ return matchSeqExt;
+ }
+
+ public String getQueryName(){
+ return qName;
+ }
+
+ public int getQuerySize(){
+ return qSize;
+ }
+
+ public int getTargetSize(){
+ return tSize;
+ }
+}
diff --git a/src/de/rki/ng4/surankco/scoring/PSLPrettyParser.java b/src/de/rki/ng4/surankco/scoring/PSLPrettyParser.java
new file mode 100755
index 0000000..009acfe
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/PSLPrettyParser.java
@@ -0,0 +1,93 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.util.Vector;
+
+
+public class PSLPrettyParser {
+
+ private String fileName;
+ private Vector<PSLHit> hits;
+
+ public PSLPrettyParser(String fileName){
+ this.fileName = fileName;
+ hits = new Vector<PSLHit>();
+
+ try {
+ LineNumberReader reader = new LineNumberReader(new FileReader(new File(fileName)));
+ try {
+ String line;
+
+ while ((line = reader.readLine()) != null){
+ if (line.startsWith(">")){
+ hits.add(new PSLHit(line));
+ }
+ else {
+ hits.lastElement().appendAlignment(line);
+ }
+ }
+
+ for (PSLHit hit : hits){
+ hit.parseAlignment();
+ }
+
+ }
+ finally {
+ reader.close();
+ }
+ }
+ catch (FileNotFoundException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + fileName);
+ System.exit(1);
+ }
+ catch (IOException e) {
+ e.printStackTrace();
+ System.out.println("error: problems reading file: " + fileName);
+ System.exit(1);
+ }
+ }
+
+ public void printPrettyPslHits(){
+ for (PSLHit hit : hits){
+ System.out.print(hit.toString());
+ }
+ }
+
+ public void export(String filename){
+ try {
+ BufferedWriter writer = new BufferedWriter(new FileWriter(new File(filename)));
+
+ try {
+ for (PSLHit hit : hits){
+ writer.write(hit.toString());
+ writer.newLine();
+ }
+ }
+ finally {
+ writer.close();
+ }
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.out.println("error: problems writing file: " + fileName);
+ System.exit(1);
+ }
+ }
+
+ public Vector<PSLHit> getHits() {
+ return hits;
+ }
+
+}
\ No newline at end of file
diff --git a/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength1.java b/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength1.java
new file mode 100755
index 0000000..8319359
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength1.java
@@ -0,0 +1,29 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring.scores;
+
+import java.util.Vector;
+
+import de.rki.ng4.surankco.scoring.PSLHit;
+
+public class NormedContigLength1 extends Score{
+
+ private static final String SCORE_NAME = "NormedContigLength1";
+
+ @Override
+ public String getScoreName() {
+ return SCORE_NAME;
+ }
+
+ @Override
+ public double calculate(PSLHit hit) {
+
+ int contigSize = hit.getQuerySize();
+ Vector<Boolean> matchSequence = hit.getExtendedMatchSequence();
+
+ return (double) contigSize / (double) matchSequence.size();
+ }
+}
diff --git a/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength2.java b/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength2.java
new file mode 100755
index 0000000..ef4f78b
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength2.java
@@ -0,0 +1,28 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring.scores;
+
+import de.rki.ng4.surankco.scoring.PSLHit;
+
+
+public class NormedContigLength2 extends Score{
+
+ private static final String SCORE_NAME = "NormedContigLength2";
+
+ @Override
+ public String getScoreName() {
+ return SCORE_NAME;
+ }
+
+ @Override
+ public double calculate(PSLHit hit) {
+
+ int contigSize = hit.getQuerySize();
+ int referenceSize = hit.getTargetSize();
+
+ return (double) contigSize / (double) referenceSize;
+ }
+}
diff --git a/src/de/rki/ng4/surankco/scoring/scores/Score.java b/src/de/rki/ng4/surankco/scoring/scores/Score.java
new file mode 100755
index 0000000..b89b9aa
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/scores/Score.java
@@ -0,0 +1,15 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring.scores;
+
+import de.rki.ng4.surankco.scoring.PSLHit;
+
+public abstract class Score {
+
+ public abstract String getScoreName();
+ public abstract double calculate(PSLHit hit);
+
+}
diff --git a/src/de/rki/ng4/surankco/tools/Statistic.java b/src/de/rki/ng4/surankco/tools/Statistic.java
new file mode 100755
index 0000000..d5e4154
--- /dev/null
+++ b/src/de/rki/ng4/surankco/tools/Statistic.java
@@ -0,0 +1,111 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.tools;
+
+import java.util.Collections;
+import java.util.Vector;
+
+public class Statistic {
+
+ public static <N extends Number> double mad(Vector<N> data){
+ final double scalefactor = 1.4826;
+ double median = median(data);
+ Vector<Double> diffs = new Vector<Double>();
+ for (N num : data){ diffs.add(Math.abs(num.doubleValue()-median)); }
+ return median(diffs) * scalefactor;
+ }
+
+ public static <N extends Number> Vector<Double> mads(Vector<Vector<N>> data){
+ Vector<Double> mads = new Vector<Double>();
+ for (Vector<N> vn : data){ mads.add(mad(vn)); }
+ return mads;
+ }
+
+ public static <N extends Number> double max(Vector<N> data){
+ double max = Double.MIN_VALUE;
+ for (N num : data){ max = Math.max(max, num.doubleValue()); }
+ return max;
+ }
+
+ public static <N extends Number> Vector<Double> maxs(Vector<Vector<N>> data){
+ Vector<Double> maxs = new Vector<Double>();
+ for (Vector<N> vn : data){ maxs.add(max(vn)); }
+ return maxs;
+ }
+
+ public static <N extends Number> double mean(Vector<N> data){
+ return sum(data)/data.size();
+ }
+
+ public static <N extends Number> Vector<Double> means(Vector<Vector<N>> data){
+ Vector<Double> means = new Vector<Double>();
+ for (Vector<N> vn : data){ means.add(mean(vn)); }
+ return means;
+ }
+
+ public static <N extends Number> double median(Vector<N> data){
+ Vector<Double> copy = new Vector<Double>();
+ for (N num : data){ copy.add(num.doubleValue()); }
+ Collections.sort(copy);
+ int n = copy.size();
+ if (n % 2 == 1){ return copy.get(((n+1)/2) -1); }
+ else{ return 0.5 * (copy.get((n/2) -1) + copy.get(((n/2)+1) -1)); }
+ }
+
+ public static <N extends Number> Vector<Double> medians(Vector<Vector<N>> data){
+ Vector<Double> medians = new Vector<Double>();
+ for (Vector<N> vn : data){ medians.add(median(vn)); }
+ return medians;
+ }
+
+ public static <N extends Number> double min(Vector<N> data){
+ double min = Double.MAX_VALUE;
+ for (N num : data){ min = Math.min(min, num.doubleValue()); }
+ return min;
+ }
+
+ public static <N extends Number> Vector<Double> mins(Vector<Vector<N>> data){
+ Vector<Double> mins = new Vector<Double>();
+ for (Vector<N> vn : data){ mins.add(min(vn)); }
+ return mins;
+ }
+
+ public static <N extends Number> double sd(Vector<N> data){
+ return Math.sqrt(var(data));
+ }
+
+ public static <N extends Number> Vector<Double> sds(Vector<Vector<N>> data){
+ Vector<Double> sds = new Vector<Double>();
+ for (Vector<N> vn : data){ sds.add(sd(vn)); }
+ return sds;
+ }
+
+ public static <N extends Number> double sum(Vector<N> data){
+ double sum = 0;
+ for (N num : data){ sum += num.doubleValue(); }
+ return sum;
+ }
+
+ public static <N extends Number> Vector<Double> sums(Vector<Vector<N>> data){
+ Vector<Double> sums = new Vector<Double>();
+ for (Vector<N> vn : data){ sums.add(sum(vn)); }
+ return sums;
+ }
+
+ public static <N extends Number> double var(Vector<N> data){
+ double var = 0;
+ double mean = mean(data);
+ for (N num : data){ var += Math.pow((num.doubleValue()-mean), 2); }
+ return var/(data.size()-1);
+ }
+
+ public static <N extends Number> Vector<Double> vars(Vector<Vector<N>> data){
+ Vector<Double> vars = new Vector<Double>();
+ for (Vector<N> vn : data){ vars.add(var(vn)); }
+ return vars;
+ }
+
+}
diff --git a/surankco-feature b/surankco-feature
new file mode 100755
index 0000000..433a6a5
--- /dev/null
+++ b/surankco-feature
@@ -0,0 +1,111 @@
+#!/usr/bin/Rscript
+
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# surankco-feature: feature generation from contigs (ACE format) and
+# corresponding reads (QUAL or FASTQ format)
+
+
+# get script path
+args <- commandArgs(trailingOnly = FALSE)
+script.arg <- "--file="
+script.name <- sub(script.arg, "", args[grep(script.arg, args)])
+script.path <- dirname(script.name)
+
+
+# testing/debugging
+# args <- c("--directory=data", "--split.regex=\\.\\d+-")
+# script.path <- getwd()
+DEBUGGING <- FALSE
+
+
+# sources and libraries
+source(paste(script.path, '/r/parameter.R', sep=""))
+source(paste(script.path, '/r/feature.R', sep=""))
+source(paste(script.path, '/r/functions.R', sep=""))
+source(paste(script.path, '/r/coverage.R', sep=""))
+source(paste(script.path, '/r/covcurv.R', sep=""))
+source(paste(script.path, '/r/import.R', sep=""))
+loadPackages(c("optparse","parallel"), quietly=TRUE)
+
+
+# parsing parameter
+cat("prepare files\n")
+parameters <- parseSurankcoFeature()
+files <- parameters$files
+
+if (DEBUGGING){
+ print(args)
+ print(parameters)
+ print(files)
+}
+
+
+input.file.assembly <- files[ ,parameters$assembly.format]
+input.file.quality <- files[ ,parameters$read.quality.format]
+output.file.features <- paste(rownames(files), ".feature.tmp", sep="")
+output.file.coverage <- paste(rownames(files), ".coverage.tmp", sep="")
+output.file.ccoverage <- paste(rownames(files), ".ccoverage.tmp", sep="")
+
+# extract features in java and prepare/export data for R
+# multicore?
+cat("extract features (part 1): ")
+for (i in 1:nrow(files)){
+ cat(paste0(i, " "))
+ java.call <- paste("java",
+ paste("-Xms", parameters$memory, "G", sep=""),
+ paste("-Xmx", parameters$memory, "G", sep=""),
+ "-Dcom.ibm.tools.attach.enable=no -jar",
+ paste(script.path, "surankco.jar", sep="/"),
+ input.file.assembly[i], # args[0] Assembly File
+ input.file.quality[i], # args[1] Fasta/Qual File
+ output.file.features[i], # args[2] Tmp Feature Output File
+ output.file.coverage[i], # args[3] Tmp Coverage Output File
+ output.file.ccoverage[i], # args[4] Tmp CCoverage Output File
+ parameters$fastq.version, # args[5] Fastq version
+ parameters$split.regex, # args[6] Read name split regex
+ parameters$kmer.features, # args[7] kmer on/off
+ parameters$contig.size.filter) # args[8] minimum contig size
+
+ if(DEBUGGING){
+ print(java.call)
+ }
+
+ if(code <- system(java.call, ignore.stdout=FALSE)){
+ complainAndStop("java not successfully executed", code)
+ }
+}
+cat("\n")
+
+cat("extract features (part 2)\n")
+# import java features and prepared data (coverages)
+features.raw <- importFeatures(input.file.assembly,
+ input.file.quality,
+ output.file.features,
+ output.file.coverage,
+ output.file.ccoverage)
+
+# extract features in R
+features.raw <- attachEGS(features.raw, parameters$expected.genome.size)
+features.final <- extractFeatures(features.raw,
+ parameters$threads)
+#parameters$expected.genome.size)
+
+# export features as csv
+cat("export features\n")
+final.file.features <- paste(rownames(files), ".features.txt", sep="")
+for (i in 1:nrow(files)){
+ write.table(features.final[[i]], file=final.file.features[i],
+ sep="\t", dec = ".", col.names=TRUE, row.names=FALSE)
+}
+
+# unlink tmp files
+if (!DEBUGGING){
+ cat("remove temporary files\n")
+ unlink(c(output.file.features,output.file.coverage,output.file.ccoverage))
+}
+
+# done
+cat("surankco-feature calculations done\n")
diff --git a/surankco-prediction b/surankco-prediction
new file mode 100755
index 0000000..72982e9
--- /dev/null
+++ b/surankco-prediction
@@ -0,0 +1,78 @@
+#!/usr/bin/Rscript
+
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# surankco-prediction: prediction of scores and ranking of contigs using
+# trained random forests (from surankco-training) and
+# contig features (from surankco-feature)
+
+
+# get script path
+args <- commandArgs(trailingOnly = FALSE)
+script.arg <- "--file="
+script.name <- sub(script.arg, "", args[grep(script.arg, args)])
+script.path <- dirname(script.name)
+
+
+# testing/debugging
+# args <- c("--features=data/adenoABC.features.txt", "-r", "surankco_RFs.RData")
+# script.path <- getwd()
+DEBUGGING <- FALSE
+
+
+# sources and libraries
+source(paste(script.path, '/r/parameter.R', sep=""))
+source(paste(script.path, '/r/import.R', sep=""))
+source(paste(script.path, '/r/rf.R', sep=""))
+source(paste(script.path, '/r/voting.R', sep=""))
+# ...
+loadPackages(c("optparse","randomForest"), quietly=TRUE)
+
+
+# parsing parameter
+cat("prepare files\n")
+parameters <- parseSurankcoPrediction()
+
+if (DEBUGGING){
+ print(args)
+ print(parameters)
+ print(parameters$files.features)
+ print(parameters$files.rf)
+}
+
+# import feature files and RFs
+cat("import features and RFs\n")
+features <- read.table(file=parameters$files.features, sep="\t",
+ dec = ".", header=TRUE,
+ colClasses=get.colClasses(parameters$files.features, 3))
+rf.loaded <- load(parameters$files.rf)
+if (!all(c("rfs") %in% rf.loaded)){
+ print("error")
+}
+
+# prepare datacat
+cat("prepare data\n")
+features <- dataFilter(features)
+metadata <- features[ ,1:3]
+features <- features[ ,-c(1:3)]
+
+# prediction
+cat("predict scores\n")
+prediction <- scorePrediction(rfs=rfs, input=features, type="response")
+weighting <- scorePrediction(rfs=rfs, input=features, type="prob")
+
+# voting and ranking
+cat("vote and rank\n")
+final.scores <- weightedVotingSum(scoreList=prediction, weightList=weighting)
+results <- cbind(metadata, SurankcoContigScore=final.scores)
+results <- results[order(results$SurankcoContigScore, decreasing=TRUE),]
+
+# export rankings
+cat("export ranking\n")
+write.table(results, file=parameters$output.filename,
+ sep="\t", dec = ".", col.names=TRUE, row.names=FALSE)
+
+# done
+cat("surankco-prediction calculations done\n")
\ No newline at end of file
diff --git a/surankco-score b/surankco-score
new file mode 100755
index 0000000..fbb01f1
--- /dev/null
+++ b/surankco-score
@@ -0,0 +1,151 @@
+#!/usr/bin/Rscript
+
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# surankco-score: alignment of contigs (FASTA format) and reference genomes
+# (FASTA format) and score calculation
+
+
+# get script path
+args <- commandArgs(trailingOnly = FALSE)
+script.arg <- "--file="
+script.name <- sub(script.arg, "", args[grep(script.arg, args)])
+script.path <- dirname(script.name)
+
+
+# testing/debugging
+# args <- c("--directory=data")
+# script.path <- getwd()
+DEBUGGING <- FALSE
+
+
+# sources and libraries
+source(paste(script.path, '/r/parameter.R', sep=""))
+source(paste(script.path, '/r/scores.R', sep=""))
+source(paste(script.path, '/r/expofits.R', sep=""))
+source(paste(script.path, '/r/import.R', sep=""))
+loadPackages(c("optparse","MASS"), quietly=TRUE)
+
+
+# parsing parameter
+cat("prepare files\n")
+parameters <- parseSurankcoScore()
+files <- parameters$files
+
+if (DEBUGGING){
+ print(args)
+ print(parameters)
+ print(files)
+}
+
+files$scores <- vector(mode="character", length=nrow(files))
+
+# external processes
+cat("align and score contigs: ")
+for (i in 1:nrow(files)){
+ cat(paste0(i, " "))
+ input.file.assembly <- files[i, parameters$assembly.suffix]
+ input.file.reference <- files[i, parameters$reference.suffix]
+ output.file.psl <- paste(rownames(files)[i], ".psl.tmp", sep="")
+ output.file.filter <- paste(rownames(files)[i], ".filter.tmp", sep="")
+ output.file.pretty <- paste(rownames(files)[i], ".pretty.tmp", sep="")
+ output.file.scores <- paste(rownames(files)[i], ".scores.tmp", sep="")
+ files$scores[i] <- output.file.scores
+
+ # build alignments
+ blat.call <- paste(paste("blat", sep=""),
+ input.file.reference, input.file.assembly,
+ output.file.psl, "-minIdentity=0")
+
+ if (DEBUGGING){
+ print(blat.call)
+ }
+
+ if(code <- system(blat.call, ignore.stdout=TRUE)){
+ complainAndStop("blat not successfully executed", code)
+ }
+
+ # filter alignments
+ filter.call <- paste(paste(script.path, "/r/pslMatchFilter", sep=""),
+ output.file.psl, output.file.filter)
+
+ if (DEBUGGING){
+ print(filter.call)
+ }
+
+ if(code <- system(filter.call, ignore.stdout=FALSE)){
+ complainAndStop("pslMatchFilter not successfully executed", code)
+ }
+
+ # extract alignments from psl
+ pretty.call <- paste(paste("pslPretty", sep=""),
+ output.file.filter, input.file.reference,
+ input.file.assembly, output.file.pretty, "-long")
+
+ if (DEBUGGING){
+ print(pretty.call)
+ }
+
+ if(code <- system(pretty.call, ignore.stdout=FALSE)){
+ complainAndStop("pslPretty not successfully executed", code)
+ }
+
+ # calculate scores in Java and export data for R
+ java.call <- paste("java",
+ paste("-Xms", parameters$memory, "G", sep=""),
+ paste("-Xmx", parameters$memory, "G", sep=""),
+ "-cp",
+ paste(script.path, "surankco.jar", sep="/"),
+ "de.rki.ng4.surankco.scoring.Main",
+ output.file.pretty,
+ output.file.scores)
+
+ if (DEBUGGING){
+ print(java.call)
+ }
+
+ if(code <- system(java.call, ignore.stdout=FALSE)){
+ complainAndStop("java not successfully executed", code)
+ }
+
+ if (!DEBUGGING){
+ unlink(c(output.file.psl, output.file.filter, output.file.pretty))
+ }
+}
+cat("\n")
+
+# import Java score
+cat("finalize scores\n")
+scores.raw <- importScores(files$scores,
+ files[ ,parameters$assembly.suffix],
+ files[ ,parameters$reference.suffix])
+
+# post process scores
+# remove redundant and highly correlating scores
+# (???could be moved to training/prediction to export a full set of score???)
+scores.final <- correctScores(scores.raw)
+
+# calculate fittings
+# print score distributions and threshold suggestions
+cat("export histograms\n")
+expoFit(do.call(rbind, scores.final), TRUE, parameters$pdf.histograms)
+
+# export score as csv
+cat("export scores\n")
+for (i in 1:nrow(files)){
+ output.file.scores <- paste(rownames(files)[i], ".scores.txt", sep="")
+
+ write.table(scores.final[[i]], file=output.file.scores,
+ sep="\t", dec = ".", col.names=TRUE, row.names=FALSE)
+}
+
+if (!DEBUGGING){
+ # unlink tmp filescat
+ cat("remove temporary files\n")
+ unlink(files$scores)
+}
+
+# done
+cat("surankco-score calculations done\n")
diff --git a/surankco-training b/surankco-training
new file mode 100755
index 0000000..f8071ad
--- /dev/null
+++ b/surankco-training
@@ -0,0 +1,93 @@
+#!/usr/bin/Rscript
+
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany,
+# All rights reserved. For details, please note the license.txt.
+
+# surankco-training: training of random forest using contig features (from
+# surankco-feature) and contig scores (surankco-score)
+
+
+# get script path
+args <- commandArgs(trailingOnly = FALSE)
+script.arg <- "--file="
+script.name <- sub(script.arg, "", args[grep(script.arg, args)])
+script.path <- dirname(script.name)
+
+
+# testing/debugging
+# args <- c("--directory=data")
+# script.path <- getwd()
+DEBUGGING <- FALSE
+
+
+# sources and libraries
+source(paste(script.path, '/r/parameter.R', sep=""))
+source(paste(script.path, '/r/import.R', sep=""))
+source(paste(script.path, '/r/rf.R', sep=""))
+source(paste(script.path, '/r/scores.R', sep=""))
+source(paste(script.path, '/r/expofits.R', sep=""))
+loadPackages(c("optparse","MASS","randomForest"), quietly=TRUE)
+
+
+# parsing parameter
+cat("prepare files\n")
+parameters <- parseSurankcoTraining()
+files <- parameters$files
+
+
+if (DEBUGGING){
+ print(args)
+ print(parameters)
+ print(files)
+}
+
+
+# import feature and score files
+cat("import features and scores\n")
+features <- readSurankcoFeatures(files$features.txt)
+scores <- readSurankcoFeatures(files$scores.txt)
+
+
+# merge and preparate data
+cat("prepare and merge data\n")
+training <- selectContigs(features,scores)
+training <- dataFilter(training)
+
+firstFeature <- which(colnames(training)=="Length")
+firstScore <- which(colnames(training)=="NormedMatchCount1")
+lastScore <- ncol(training)
+
+input <- training[,firstFeature:(firstScore-1)]
+targets <- training[,firstScore:lastScore]
+
+cat("divide into classes\n")
+if (DEBUGGING){
+ cat("before:\n")
+ summary(targets)
+}
+# set thresholds/classes (maybe with fitting)
+if ("manual.thresholds" %in% names(parameters)){
+ targets <- manualClasses(targets, parameters$manual.thresholds)
+}else{
+ targets <- expoClasses(targets, parameters$exponential.quantile)
+}
+
+# training
+if (DEBUGGING){ # show always?
+ cat("after:\n")
+ summary(targets)
+
+ write.table(cbind(training$ContigID,targets), file="classification.tmp",
+ sep="\t", dec = ".", col.names=TRUE, row.names=FALSE)
+}
+cat("train random forests\n")
+rfs <- rfTraining(input, targets)
+
+# export the RFs
+cat("export random forests\n")
+save(rfs, file=parameters$output.filename)
+#save(rfs, file="surankco_RFs.RData")
+
+# done
+cat("surankco-training calculations done\n")
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/surankco.git
More information about the debian-med-commit
mailing list