[med-svn] [surankco] 01/03: Imported Upstream version 0.0.r5+dfsg

Andreas Tille tille at debian.org
Thu Mar 24 13:58:49 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository surankco.

commit 8076ab1a04a14f23ba8688e2aa5dc039213d10dc
Author: Andreas Tille <tille at debian.org>
Date:   Thu Mar 24 14:48:36 2016 +0100

    Imported Upstream version 0.0.r5+dfsg
---
 license.txt                                        |  24 +
 r/covcurv.R                                        | 167 ++++++
 r/coverage.R                                       |  47 ++
 r/expofits.R                                       | 134 +++++
 r/feature.R                                        | 164 ++++++
 r/functions.R                                      |  35 ++
 r/import.R                                         |  81 +++
 r/parameter.R                                      | 577 +++++++++++++++++++++
 r/pslMatchFilter                                   |  27 +
 r/rf.R                                             | 103 ++++
 r/scores.R                                         |  88 ++++
 r/voting.R                                         |  42 ++
 readme.txt                                         | 354 +++++++++++++
 src/de/rki/ng4/surankco/data/AceContig.java        | 220 ++++++++
 src/de/rki/ng4/surankco/data/AceRead.java          | 182 +++++++
 src/de/rki/ng4/surankco/data/Assembly.java         |  30 ++
 src/de/rki/ng4/surankco/data/AssemblyAce.java      | 162 ++++++
 src/de/rki/ng4/surankco/data/AssemblySam.java      | 174 +++++++
 src/de/rki/ng4/surankco/data/Contig.java           | 100 ++++
 src/de/rki/ng4/surankco/data/FastaContig.java      |  24 +
 src/de/rki/ng4/surankco/data/FastqReads.java       | 109 ++++
 src/de/rki/ng4/surankco/data/Read.java             |  91 ++++
 src/de/rki/ng4/surankco/data/Reads.java            | 109 ++++
 src/de/rki/ng4/surankco/data/SamContig.java        |  43 ++
 src/de/rki/ng4/surankco/feature/Coverage.java      | 166 ++++++
 src/de/rki/ng4/surankco/feature/Kmer.java          | 279 ++++++++++
 src/de/rki/ng4/surankco/feature/Main.java          | 257 +++++++++
 src/de/rki/ng4/surankco/feature/Several.java       | 287 ++++++++++
 src/de/rki/ng4/surankco/files/Ace.java             | 175 +++++++
 src/de/rki/ng4/surankco/files/ExportR.java         | 131 +++++
 src/de/rki/ng4/surankco/files/Fasta.java           |  52 ++
 src/de/rki/ng4/surankco/files/Fastq.java           | 217 ++++++++
 src/de/rki/ng4/surankco/files/Qual.java            |  61 +++
 src/de/rki/ng4/surankco/files/Sam.java             |  72 +++
 src/de/rki/ng4/surankco/scoring/Main.java          |  32 ++
 src/de/rki/ng4/surankco/scoring/MetricsPSL.java    | 349 +++++++++++++
 src/de/rki/ng4/surankco/scoring/PSLHit.java        | 139 +++++
 .../rki/ng4/surankco/scoring/PSLPrettyParser.java  |  93 ++++
 .../scoring/scores/NormedContigLength1.java        |  29 ++
 .../scoring/scores/NormedContigLength2.java        |  28 +
 src/de/rki/ng4/surankco/scoring/scores/Score.java  |  15 +
 src/de/rki/ng4/surankco/tools/Statistic.java       | 111 ++++
 surankco-feature                                   | 111 ++++
 surankco-prediction                                |  78 +++
 surankco-score                                     | 151 ++++++
 surankco-training                                  |  93 ++++
 46 files changed, 6013 insertions(+)

diff --git a/license.txt b/license.txt
new file mode 100755
index 0000000..205b3db
--- /dev/null
+++ b/license.txt
@@ -0,0 +1,24 @@
+Copyright (c) 2014,
+Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, 
+are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * The name of the author may not be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL Mathias Kuhring BE LIABLE FOR ANY DIRECT, 
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/r/covcurv.R b/r/covcurv.R
new file mode 100755
index 0000000..7899081
--- /dev/null
+++ b/r/covcurv.R
@@ -0,0 +1,167 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+
+# Calculate 25% and 50% drops in the contig coverage
+# Input:
+#  coverage           List of basewise coverage per contig
+#  readlengthsmean    List of ReadLengthMean per contig
+calcFeatureCoverageDrops <- function(coverage, readlengthsmean){
+  
+  drop.result <- data.frame(CoverageDrop25Count=vector(mode="numeric", length(coverage)), 
+                            CoverageDrop25Max=vector(mode="numeric", length(coverage)),
+                            CoverageDrop50Count=vector(mode="numeric", length(coverage)),
+                            CoverageDrop50Max=vector(mode="numeric", length(coverage)))
+  
+  drops <- lapply(1:length(coverage), function(i){ coverageCurve(coverage[[i]],readlengthsmean[i], 0.75) })
+  # if no contig has drops, 'apply' returns 'numeric(0)' instead of a list with lenght = # contigs
+  # so length > 0 has to be tested
+  if (length(drops)>0){
+    for (i in 1:length(drops)){
+      drop.result$CoverageDrop25Count[i] = dim(drops[[i]])[1]
+      if (drop.result$CoverageDrop25Count[i] > 0)
+        drop.result$CoverageDrop25Max[i] = max(c(drops[[i]][,4]-drops[[i]][,2],drops[[i]][,6]-drops[[i]][,2]))
+    }
+  }
+  
+  drops <- lapply(1:length(coverage), function(i){ coverageCurve(coverage[[i]],readlengthsmean[i], 0.50) })
+  if (length(drops)>0){
+    for (i in 1:length(drops)){
+      drop.result$CoverageDrop50Count[i] = dim(drops[[i]])[1]
+      if (drop.result$CoverageDrop50Count[i] > 0)
+        drop.result$CoverageDrop50Max[i] = max(c(drops[[i]][,4]-drops[[i]][,2],drops[[i]][,6]-drops[[i]][,2]))
+    }
+  }
+  
+  return(drop.result)
+}
+
+
+dupdel <- function(data){
+  l = length(data)
+  for (i in l:2)
+    if (data[i] == data[i-1]){
+      data <- data[-i]
+    }
+  return(data)
+}
+
+window <- function(data, w, i){
+  return (data[(max(1, i-w)):(min(i+w, length(data)))])
+}
+
+left <- function(data, w, i){
+  return (data[(max(1, i-w)):(i-1)])
+}
+
+right <- function(data, w, i){
+  return (data[(i+1):(min(i+w, length(data)))])
+}
+
+leftMax <- function(data, w, i){
+  d <- left(data, w, i)
+  y <- max(d)
+  idx <- which(d==y)
+  x <- tail(idx,1) + (i-w-1)
+  return(c(x,y))
+}
+
+rightMax <- function(data, w, i){
+  d <- right(data, w, i)
+  y <- max(d)
+  idx <- which(d==y)
+  x <- head(idx,1) + i
+  return(c(x,y))
+}
+
+size <- function(cl, rl){
+  if (rl <= 0.1 * cl)
+    return (rl)
+  else
+    return (cl * 0.1)
+}
+
+mySmooth <- function(data, w){
+  l <- length(data)
+  output <- vector(mode="numeric", length=l)
+  for (i in 1:l){
+    output[i] = mean(window(data, w, i))
+  }
+  return(output)
+}
+
+covDrop <- function(data, w, p){
+  l <- length(data)
+  maxl <- vector(mode="numeric")
+  maxr <- vector(mode="numeric")
+  min <- vector(mode="numeric")
+  # find minima
+  for (i in 2:(l-1)){
+    left <- leftMax(data, w, i)
+    right <- rightMax(data, w, i)
+    if ((data[i]/left[2]<=p) && (data[i]/right[2]<=p)){
+      maxl <- rbind(maxl, left)
+      maxr <- rbind(maxr, right)
+      min <- rbind(min, c(i,data[i]))
+    }
+  }
+  # merge minima
+  maxlf <- vector(mode="numeric")
+  maxrf <- vector(mode="numeric")
+  minf <- vector(mode="numeric")
+  if (length(dim(min))>0){
+    group <- 1
+    if (dim(min)[1] > 1){
+      for (i in 2:dim(min)[1]){
+        if (maxl[i,1] < min[tail(group,1),1]){
+          group <- c(group,i)
+        }
+        else{
+          maxlf <- rbind(maxlf, c(mean(maxl[group,1][maxl[group,2]==max(maxl[group,2])]), max(maxl[group,2])))
+          maxrf <- rbind(maxrf, c(mean(maxr[group,1][maxr[group,2]==max(maxr[group,2])]), max(maxr[group,2])))
+          minf <- rbind(minf, c(mean(min[group,1][min[group,2]==min(min[group,2])]), min(min[group,2])))
+          group <- i
+        }
+      }
+    }
+    maxlf <- rbind(maxlf, c(mean(maxl[group,1][maxl[group,2]==max(maxl[group,2])]), max(maxl[group,2])))
+    maxrf <- rbind(maxrf, c(mean(maxr[group,1][maxr[group,2]==max(maxr[group,2])]), max(maxr[group,2])))
+    minf <- rbind(minf, c(mean(min[group,1][min[group,2]==min(min[group,2])]), min(min[group,2])))
+  }
+  return(cbind(minf,maxlf,maxrf))
+}
+
+
+covcurv <- function(input, num, p){
+  cov <- unlist(input$Coverage[num])
+  x <- 1:length(cov)
+  r <- input$ReadLengthsMean[num]
+  w <- size(length(cov),r)
+  sm <- mySmooth(cov,w)
+  title <-  paste('contig', num, ', drop', (1-p)*100,'%',collapse=' ')
+  plot(cov,col="grey", main=title, xlab='contig', ylab='coverage')
+  lines(sm, col='red', lwd=2)
+  test <- covDrop(sm, 2*w, p)
+  if (length(test) > 0){
+    points(test[,1],test[,2],col='blue',lwd=2, cex=2)
+    points(test[,3],test[,4],col='purple',lwd=2, cex=2)
+    points(test[,5],test[,6],col='purple',lwd=2, cex=2)
+  }
+}
+
+coverageCurve <- function(cov, read, p){  
+  w <- size(length(cov),read)
+  sm <- mySmooth(cov,w)
+  drops <- covDrop(sm, 2*w, p)
+  return(drops)
+}
+
+mypdf <- function(input, max){
+  pdf(height=5)
+  for (i in 1:max){
+    covcurv(input,i,0.75)
+    covcurv(input,i,0.50)
+  }
+  dev.off()
+}
\ No newline at end of file
diff --git a/r/coverage.R b/r/coverage.R
new file mode 100755
index 0000000..0f61871
--- /dev/null
+++ b/r/coverage.R
@@ -0,0 +1,47 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+
+# BaseConfirmation for the ends
+endConfirmation <- function(coverage, corecoverage, endSize){
+  
+  num <- length(coverage)
+  leftconf <- vector(mode="list", length=num)
+  rightconf <- vector(mode="list", length=num)
+  for (i in 1:num){
+    leftc <- getEnd(coverage[[i]], endSize[i], "left")
+    leftcc <- getEnd(corecoverage[[i]], endSize[i], "left")
+    leftconf[[i]] <- apply(cbind(leftcc,leftc), 1, mybinom)
+    
+    rightc <- getEnd(coverage[[i]], endSize[i], "right")
+    rightcc <- getEnd(corecoverage[[i]], endSize[i], "right")
+    rightconf[[i]] <- apply(cbind(rightcc,rightc), 1, mybinom)
+  }
+    
+  leftright <- data.frame(sapply(leftconf, mean), sapply(leftconf, sd),
+                          sapply(leftconf, median), sapply(leftconf, mad),
+                          sapply(leftconf, min), sapply(leftconf, max),
+                          sapply(rightconf, mean), sapply(rightconf, sd),
+                          sapply(rightconf, median), sapply(rightconf, mad),
+                          sapply(rightconf, min), sapply(rightconf, max))
+  
+  ordered <- t(apply(leftright, 1, orderMinMax))
+  colnames(ordered) <- c('BaseConfirmationMinEndMean', 'BaseConfirmationMinEndSD',
+                         'BaseConfirmationMinEndMedian', 'BaseConfirmationMinEndMad',
+                         'BaseConfirmationMinEndMin', 'BaseConfirmationMinEndMax',
+                         'BaseConfirmationMaxEndMean', 'BaseConfirmationMaxEndSD',
+                         'BaseConfirmationMaxEndMedian', 'BaseConfirmationMaxEndMad',
+                         'BaseConfirmationMaxEndMin', 'BaseConfirmationMaxEndMax')
+  return(ordered)
+}
+
+orderMinMax <- function(line){
+  if (line[1] <= line[7]) return(line)
+  else return(cbind(line[7:12],line[1:6]))
+}
+
+getEnd <- function(sequence, endsize, side){
+  if (side=='right') sequence <- rev(sequence)
+  sequence[1:min(endsize,length(sequence))]
+}
\ No newline at end of file
diff --git a/r/expofits.R b/r/expofits.R
new file mode 100755
index 0000000..05d80d9
--- /dev/null
+++ b/r/expofits.R
@@ -0,0 +1,134 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+#
+getExpoClassThresholds <- function(training, quantile=0.25){
+  
+  training$NormedMatchCount1 <- 1 - training$NormedMatchCount1
+  training$NormedMatchCount2 <- 1 - training$NormedMatchCount2
+  training$NormedContigLength1 <- 1 - training$NormedContigLength1
+  
+  firstMetric <- which(colnames(training)=="NormedMatchCount1")
+  lastMetric <- ncol(training)
+  
+  quant <- vector(mode="numeric")
+
+  quantiles <- rep(quantile, length(firstMetric:lastMetric))
+  names(quantiles) <- names(training)[firstMetric:lastMetric]
+  
+  for (i in firstMetric:lastMetric){
+    fit <- fitExpDist(training[[i]], plot=F, main=names(training)[i])
+    quant <- append( quant, expPosByQuantile(fit, quantile=quantiles[names(training)[i]], plot=F) )
+  }
+  
+  names(quant) <- names(training)[firstMetric:lastMetric]
+  
+  quant["NormedMatchCount1"] <- 1 - quant["NormedMatchCount1"]
+  quant["NormedMatchCount2"] <- 1 - quant["NormedMatchCount2"]
+  quant["NormedContigLength1"] <- 1 - quant["NormedContigLength1"]
+  
+  return(quant)
+}
+
+
+#
+expoFit <- function(scores, pdf=TRUE, 
+                    filename=paste("expofit_", 
+                                   format(Sys.time(), "%Y-%m-%d_%H-%M-%S"),
+                                   ".pdf", sep="")){
+        
+    fM <- which(colnames(scores)=="NormedMatchCount1")
+    lM <- ncol(scores)
+    metrics <- scores[ ,fM:lM]
+    
+    metrics$NormedMatchCount1 <- 1 - metrics$NormedMatchCount1
+    metrics$NormedMatchCount2 <- 1 - metrics$NormedMatchCount2
+    metrics$NormedContigLength1 <- 1 - metrics$NormedContigLength1
+    
+    if (pdf){
+      pdf(file=filename, width=5.875, height=5.875, onefile=TRUE)
+    }
+    
+    par.default <- par()
+    par.mar <- c(3, 3, 2, 1)
+    par.mgp <- c(1.5, 0.5, 0)
+    par(mfrow=c(4,2),mar=par.mar, mgp=par.mgp)
+    
+    fM <- which(colnames(metrics)=="NormedMatchCount1")
+    lM <- ncol(metrics)
+        
+    quantiles <- c(0.50, 0.50, 0.50, 0.50, 0.50, 0.25, 0.33, 0.50, 0.33, 0.25, 0.25, 0.25)
+    quantiles <- rep(0.25, length(fM:lM))
+    names(quantiles) <- names(metrics)[fM:lM]
+    
+    for (name in names(metrics)[fM:lM]){
+      fit <- fitExpDist(metrics[[name]], plot=T, main=name)
+      quant <- expPosByQuantile(fit, quantile=quantiles[name], metrics[[name]], plot=T)
+    }
+    
+#     par(par.default)
+    
+    if (pdf) garbage <- dev.off()
+}
+
+
+# fit an exponential distribution to a histogram of metric
+fitExpDist <- function(metric, plot=FALSE, logIt=FALSE, ...){ 
+  
+  # use logarithmic scale (+1 shift cause of possible zeros)
+  if (logIt) metric <- log(metric +1)
+  
+  # fit exponential distribution
+  require(MASS)
+  fit <- fitdistr(metric, densfun="exponential")
+  
+  # plot it
+  if (plot){
+    breaks <- getBreaks(metric)
+    
+    h <- hist(metric, breaks=breaks, freq=F, plot=plot, ...)
+    x <- h$mids
+    y <- h$density
+    
+    lines(x, dexp(x, rate=fit$estimate), col="red") 
+  }
+  
+  return(fit)
+}
+
+expPosByQuantile <- function(fit, quantile=0.50, metric, plot=FALSE, expIt=FALSE, ...){
+  
+  # calc quantiles from exponential fit
+  quant <- qexp(quantile, rate=fit$estimate)
+#   quant <- qnorm(quantile, mean=fit$estimate["mean"], sd=fit$estimate["sd"])
+  
+  # plot abline at quant positions
+  if (plot){
+    y <- diff(par("usr")[3:4])*0.6 + par("usr")[3]
+    for (i in 1:length(quantile)){
+      abline(v=quant[i], col="red", ...)
+      lab <- paste(100*quantile[i], "% quantile", "\n# <=", ":", sum(metric<=quant[i]), "\n#  >", ":", sum(metric>quant[i]), sep="")
+      text(x=quant[i],y=y, pos=4, labels=lab, col="black")
+    }
+  }
+  
+  # reverse logarithmic scale (-1 to reverse +1 shift)
+  if (expIt) quants <- exp(quant) -1
+  
+  return(quant)
+}
+
+
+getBreaks <- function(metric){
+  
+  if (diff(range(metric)) <= 1){ # if normed metric
+    breaks=seq(from=0.00, to=1.00, by=0.01)
+  } else if (all(floor(metric) == metric, na.rm = TRUE)) { # if integer scale 
+    breaks=(min(metric)-0.5):(max(metric)+0.5)
+  } else { # other scales, e.g. log scale
+    breaks=100
+  }
+  
+  return(breaks)
+}
\ No newline at end of file
diff --git a/r/feature.R b/r/feature.R
new file mode 100755
index 0000000..4095bfa
--- /dev/null
+++ b/r/feature.R
@@ -0,0 +1,164 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# imports the tmp features from the java routine
+importFeatures <- function(assembly.files, quality.files, 
+                           features.files, coverage.files, ccoverage.files){
+  
+  rawdata <- vector('list', length(features.files))
+  if (DEBUGGING){ print("feature import") }
+  for (i in 1:length(features.files)){
+    if (DEBUGGING){ print(as.character(features.files[i])) }
+    
+    rawdata[[i]] <- cbind(Assembly=assembly.files[i],
+                          ReadQuality=quality.files[i],
+                          read.table(features.files[i], header=TRUE, 
+                                     colClasses=get.colClasses(features.files[i])))
+    rawdata[[i]]$Coverage <- read.coverage(coverage.files[i])
+    rawdata[[i]]$CoreCoverage <- read.coverage(ccoverage.files[i])
+  }
+  if (DEBUGGING){ print("import done") }
+  return(rawdata)
+}
+
+
+# Attach an expected genome size from egs to every data.frame in feature.list
+attachEGS <- function(feature.list, egs){
+  if (length(egs)==1){
+    egs <- rep(egs, length(feature.list))
+  }
+  for (i in 1:length(feature.list)){
+    attr(feature.list[[i]], "egs") <- egs[i]
+  }
+  return(feature.list)
+}
+
+
+# Extract further features, check if single or multi core should be used
+extractFeatures <- function(rawdata, cores=1){
+  if (DEBUGGING){ cat("feature extraction... ") }
+  
+  if (cores>1) results <- mcFeatures(rawdata, cores)
+  else      results <- scFeatures(rawdata)
+  
+  if (DEBUGGING){ cat("done") }
+  return(results)
+}
+
+
+# Singlecore feature extraction
+scFeatures <- function(rawdata){
+  results <- lapply(rawdata, function(x){ return(features(x)) })
+  return(results)
+}
+
+
+# Multicore feature extraction
+mcFeatures <- function(rawdata, cores){
+#   require("parallel")
+  results <- mclapply(rawdata, function(x){ return(features(x)) }, mc.cores=cores)
+  return(results)
+}
+
+
+# Step by step feature extraction
+features <- function(result){
+
+  # Convert ContigID from factor to characters
+  result$ContigID <- as.character(result$ContigID)  
+  
+  # GenomeSize & GenomeRelation
+  egs = attr(result, "egs", exact=TRUE)
+  if (DEBUGGING) print(paste("egs:", egs))
+  if (egs == 0){
+    result <- insert(result, sum(result$Length), 'EstimatedGenomeSize', 'ReadCount')    
+  }
+  else{
+    result <- insert(result, egs, 'EstimatedGenomeSize', 'ReadCount')  
+  }
+  result <- insert(result, result$Length/result$EstimatedGenomeSize, 'GenomeRelation', 'EstimatedGenomeSize')
+  
+  # N50 & N50Relation
+  result <- insert(result, N50(result$Length), 'N50', 'GenomeRelation')
+  result <- insert(result, result$Length/result$N50, 'N50Relation', 'N50')
+  
+  # ReadLengthQuotients
+  result <- insert(result, result$ReadLengthsMean/result$ReadLengthsPaddedMean, 'ReadLengthsQuotientMean', 'ReadLengthsPaddedMax')
+  result <- insert(result, result$ReadLengthsMedian/result$ReadLengthsPaddedMedian, 'ReadLengthsQuotientMedian', 'ReadLengthsQuotientMean')
+  result <- insert(result, result$ReadLengthsMin/result$ReadLengthsPaddedMin, 'ReadLengthsQuotientMin', 'ReadLengthsQuotientMedian')
+  result <- insert(result, result$ReadLengthsMax/result$ReadLengthsPaddedMax, 'ReadLengthsQuotientMax', 'ReadLengthsQuotientMin')
+  
+  # ReadLengthQuotions Old: had to remove ReadLengthsQuotientSD and ReadLengthsQuotientMAD
+  # They can become NAs if the ReadLengthsPaddesSD/MAD are all 0 (division by zero -> NA). 
+  # This occures only if the PaddedReadLength of all reads of a contig is exactly the same (i.e. no gaps at all),
+  # which is extremely unlikely in a real assembly. However, we observed data were this occures.
+#   result <- insert(result, result$ReadLengthsMean/result$ReadLengthsPaddedMean, 'ReadLengthsQuotientMean', 'ReadLengthsPaddedMax')
+#   result <- insert(result, result$ReadLengthsSD/result$ReadLengthsPaddedSD, 'ReadLengthsQuotientSD', 'ReadLengthsQuotientMean')
+#   result <- insert(result, result$ReadLengthsMedian/result$ReadLengthsPaddedMedian, 'ReadLengthsQuotientMedian', 'ReadLengthsQuotientSD')
+#   result <- insert(result, result$ReadLengthsMAD/result$ReadLengthsPaddedMAD, 'ReadLengthsQuotientMAD', 'ReadLengthsQuotientMedian')
+#   result <- insert(result, result$ReadLengthsMin/result$ReadLengthsPaddedMin, 'ReadLengthsQuotientMin', 'ReadLengthsQuotientMAD')
+#   result <- insert(result, result$ReadLengthsMax/result$ReadLengthsPaddedMax, 'ReadLengthsQuotientMax', 'ReadLengthsQuotientMin')
+  
+  # CoverageComparison
+  result <- insert(result, result$CoverageGlobalMean/mean(result$CoverageGlobalMean),
+                   'CoverageGlobalMeanComparison', 'CoverageMaxEndMax')
+  result <- insert(result, result$CoverageMinEndMean/mean(result$CoverageMinEndMean),
+                   'CoverageMinEndMeanComparison', 'CoverageGlobalMeanComparison')
+  result <- insert(result, result$CoverageMaxEndMean/mean(result$CoverageMaxEndMean),
+                   'CoverageMaxEndMeanComparison', 'CoverageMinEndMeanComparison')
+  
+  # BaseConfirmation
+  conf <- apply(cbind(result$CoreCoverage, result$Coverage), 1, multibinom)
+  result <- insert(result, sapply(conf, mean), 'BaseConfirmationGlobalMean', 'CoreCoverageMaxEndMax')
+  result <- insert(result, sapply(conf, sd), 'BaseConfirmationGlobalSD', 'BaseConfirmationGlobalMean')
+  result <- insert(result, sapply(conf, median), 'BaseConfirmationGlobalMedian', 'BaseConfirmationGlobalSD')
+  result <- insert(result, sapply(conf, mad), 'BaseConfirmationGlobalMAD', 'BaseConfirmationGlobalMedian')
+  result <- insert(result, sapply(conf, min), 'BaseConfirmationGlobalMin', 'BaseConfirmationGlobalMAD')
+  result <- insert(result, sapply(conf, max), 'BaseConfirmationGlobalMax', 'BaseConfirmationGlobalMin')
+  
+  pos <- which(names(result)=='BaseConfirmationGlobalMax')
+  endconf <- endConfirmation(result$Coverage, result$CoreCoverage, result$ReadLengthsMean)
+  result <- cbind(result[1:pos], endconf, result[(pos+1):length(result)])
+  
+  # CoverageDrop
+  result <- cbind(result, calcFeatureCoverageDrops(result$Coverage, result$ReadLengthsMean))
+  
+  # remove Coverega and CoreCoverage
+  result$Coverage <- NULL
+  result$CoreCoverage <- NULL
+  
+  return(result)
+}
+
+
+export <- function(results){
+  for (i in 1:length(results)){
+    filename <- paste("output", as.character(i), ".csv", sep="")
+    write.table(results[[i]],file=filename,sep=",",row.names=F)
+  }
+}
+
+featureReduction <- function(){
+  featCor <- cor(training[,3:(length(training)-1)],training[,3:(length(training)-1)])
+  index <- is.na(featCor)
+  featCor2 <- replace(featCor, index, 0)
+  #   corrgram(featCor2, type='cor')
+  
+  i=1
+  while (i<=dim(featCor2)[1]){
+    idx <- abs(featCor2[i,])<0.20
+    idx[i] <- TRUE
+    featCor2 <- featCor2[idx,idx]
+    i <- i+1
+  }
+  
+  featNames <- rownames(featCor2)
+  featNames <- c(names(training)[1:2], featNames, names(training)[length(training)])
+  return(featNames)
+  
+  featCor <- cor(training[,featNames],training[,featNames])
+  index <- is.na(featCor)
+  featCor2 <- replace(featCor, index, 0)
+  corrgram(featCor2, type='cor')
+}
\ No newline at end of file
diff --git a/r/functions.R b/r/functions.R
new file mode 100755
index 0000000..fea100c
--- /dev/null
+++ b/r/functions.R
@@ -0,0 +1,35 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# calculation of N50
+N50 <- function(contigLengths){
+  median(rep(contigLengths,contigLengths))
+}
+
+
+baseconf <- function(corecoverage, coverage, prob){
+  output <- vector(mode="numeric", length=length(coverage))
+  for (i in 1:length(coverage)){
+    output[i] <- binom.test(corecoverage[i], coverage[i], prob, "greater")$p.value
+  }
+  return(output)
+}
+
+mybinom <- function(covpair){
+  if (covpair[2]<covpair[1]) covpair[2] <- covpair[1]
+  return(binom.test(covpair[1], covpair[2], 0.98, "greater")$p.value)
+}
+
+multibinom <- function(covpairs){
+  return(apply(cbind(unlist(covpairs[1]),unlist(covpairs[2])), 1, mybinom))
+}
+
+# insert vector "input" with colname "name" 
+# into data.frame "data" after column "after"
+insert <- function(data, input, name, after){
+  pos <- which(names(data)==after)
+  output <- cbind(data[1:pos], input, data[(pos+1):length(data)])
+  colnames(output)[(pos+1)] <- name
+  return(output)
+}
\ No newline at end of file
diff --git a/r/import.R b/r/import.R
new file mode 100755
index 0000000..47861b5
--- /dev/null
+++ b/r/import.R
@@ -0,0 +1,81 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# Import of a quality or coverage file in fasta like style but single quality 
+# values are separated by spaces
+# Code inspired by the function "read.fasta" from the R package "seqinr"
+#
+# Coverage example:
+# >Contig00001
+# 31 31 31 37 35 37 37 37 39 39 39 32 35 10 34 36 38 36 25 31 32 37 33 32 36 36 
+#
+read.coverage <- function(file){
+    
+  if (file.exists(file)){
+    if (DEBUGGING){ cat(paste("read file:", file, "\n")) }
+    
+    lines <- readLines(file)
+    
+    ind <- which(substr(lines, 1L, 1L) == ">")
+    nseq <- length(ind)
+    if (nseq == 0) {
+      stop("no line starting with a > character found")
+    }
+    start <- ind + 1
+    end <- ind - 1
+    end <- c(end[-1], length(lines))
+        
+    coverage <- lapply(seq_len(nseq), function(i) { 
+      as.integer(do.call(c, strsplit(lines[start[i]:end[i]], " "))) })
+    
+    titles <- lapply(seq_len(nseq), function(i) {
+      firstword <- strsplit(lines[ind[i]], " ")[[1]][1]
+      substr(firstword, 2, nchar(firstword)) })
+    
+    names(coverage) <- titles
+    
+#     workaround for zero coverage positions:
+#     pseude count, increase all by 1
+    coverage <- lapply(coverage, "+", 1)
+    
+    return(coverage)
+    
+  } else {
+    cat(paste("can not read file:", file, "\n"))
+  }
+}
+
+
+# import of files with SuRankCo features, as exported from the feature module
+readSurankcoFeatures <- function(files){
+  features <- vector(mode="list", length=length(files))
+  for (i in 1:length(files)){
+    features[[i]] <- read.table(file=files[i], sep="\t", dec = ".", header=TRUE,
+                                colClasses=get.colClasses(files[i], 3))
+  }
+  return(features)
+}
+
+
+# import of files with SuRankCo scores, as exported from the score module
+readSurankcoScores <- function(files){
+  scores <- vector(mode="list", length=length(files))
+  for (i in 1:length(files)){
+    scores[[i]] <- read.table(file=files[i], sep="\t", dec = ".", header=TRUE,
+                              colClasses=get.colClasses(files[i], 3))
+  }
+  return(scores)
+}
+
+
+# predefine the column classes for read.table imports
+# the first char.num columns will be character and the rest numeric
+get.colClasses <- function(filename, char.num=1){
+  feature.names <- scan(filename, what="character", nlines=1, quiet=TRUE)
+  colClasses <- c(rep("character", char.num), 
+                  rep("numeric", length(feature.names)-char.num))
+  names(colClasses) <- feature.names
+  if (DEBUGGING){ print(colClasses) }
+  return(colClasses)
+}
\ No newline at end of file
diff --git a/r/parameter.R b/r/parameter.R
new file mode 100755
index 0000000..f0c114a
--- /dev/null
+++ b/r/parameter.R
@@ -0,0 +1,577 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# Function for parameter parsing, file checks and preparation, 
+# package loading and error handling
+
+
+loadPackages <- function(names, quietly=TRUE){
+  # Tries to require R packages/libraries and stop executions if failing
+  #
+  # Args:
+  #   names: vector with package names
+  #
+  # Return:
+  #   Nothing, but will stop execution if packages are missing
+  
+  check <- vector(mode="logical", length=length(names))
+  
+  for (i in 1:length(names)){
+    suppressMessages(check[i] <- require(names[i], 
+                                         quietly=quietly, 
+                                         character.only=TRUE))
+  }
+  
+  if (!all(check)){
+    complainAndStop("missing R-package(s)", names[!check])
+  }
+}
+
+
+# surankco-feature argument parser
+parseSurankcoFeature <- function(args=commandArgs(trailingOnly = TRUE)){
+  # Parses parameters for the surankco feature module
+  # 
+  # Args:
+  #   args: commandline parameters, default = commandArgs(trailingOnly = TRUE) 
+  #
+  # Returns:
+  #   List of parameter values as provided by parse_args (optparse package)
+  
+  parser.options <- list(
+    # Input parameter
+    make_option(opt_str=c("-a", "--assemblies"),
+                action="store",
+                help="Indicate a list of assembly files (comma separated), correct 
+                suffixes are mandatory (e.g. \".ace\")"),
+    
+    make_option(opt_str=c("-d", "--directory"),
+                action="store",
+                help="Indicate a directory containing assembly files with indicated 
+                format (default: ace, see parameter -f)"),
+    
+    make_option(opt_str=c("-f", "--assembly.format"),
+                action="store",
+                default="ace",
+                help="Indicate assembly/contig format (resp. suffix), 
+                either \"ace\" (default) or \"contigs.fasta\""),
+    
+    make_option(opt_str=c("-r", "--read.quality.format"),
+                action="store",
+                default="no value",
+                help="Indicate the read quality format: qual, qua or fastq for ACE 
+                (default=\"qual\") resp. sam or bam for contigs.fasta (default=\"sam\")"),
+    
+    make_option(opt_str=c("-q", "--fastq.version"),
+                action="store",
+                default="illumina18",
+                help="Indicate the fastq version: auto, sanger, solexa, illumina13, 
+                illumina15, illumina18 (default). Only needed for ACE assemblies."),
+    
+    # Read name regex
+    make_option(opt_str=c("-s", "--split.regex"),
+                action="store",
+                #default=".[0-9]*-",
+                default="nosplit",
+                type="character",
+                help="Indicate a regular expression to cutoff read names (e.g. if 
+                modified by the assembler). Only needed for ACE assemblies. 
+                Note, if a backslash \"\\\" is needed use \"\\\\\\\\\"!"),
+    
+    #     make_option(opt_str=c("-n", "--name.regex"),
+    #                 action="store",
+    #           help="Indicate a regular expression to match read names
+    #                 (e.g. if modified by the assembler)"),
+    
+    # Performance parameter
+    make_option(opt_str=c("-t", "--threads"),
+                action="store",
+                default=1,
+                type="integer",
+                help="Indicate a number of cores or threads to use. Might speed up 
+                some parallelized operations (default: 1)"),
+    
+    make_option(opt_str=c("-m", "--memory"),
+                action="store",
+                default=32,
+                type="double",
+                help="Indicate the maximum memory usage (in Gb) of Javas virtual 
+                machine (default: 32). Try to increase if big data sets report 
+                heap space problems."),
+    
+    # Feature parameter
+    make_option(opt_str=c("-k", "--kmer.features"),
+                action="store_true",
+                default=FALSE,
+                help="Indicates whether k-mer features should be computed 
+                (experimental, very long runtime) or not (default)"),
+    
+    make_option(opt_str=c("-g", "--expected.genome.size"),
+                action="store",
+                default="0",
+                type="character",
+                help="Indicate a list of expected genome sizes (comma separated) or 
+                one value for all assemblies. Default is 0, which will estimate 
+                the genome sizes as sum of contig lengths."),
+    
+    make_option(opt_str=c("-c", "--contig.size.filter"),
+                action="store",
+                default="0",
+                type="integer",
+                help="Indicate a minimum contig size. Default: 0")
+  )
+  
+  myparser <- OptionParser(usage="usage: surankco-feature [options]",
+                           option_list=parser.options,
+                           prog = NULL, description = "", epilogue = "")
+  
+  parameters <- parse_args(object = myparser,
+                           args = args,
+                           print_help_and_exit = TRUE,
+                           positional_arguments = FALSE)
+  
+  
+  # check obligatory parameter
+  checkObligatoryParameters(parameters, need.xor=c("assemblies", "directory"))
+  
+  # check assembly.format
+  if (!(parameters$assembly.format %in% c("ace", "contigs.fasta"))){
+    complainAndStop(paste("invalid \"assembly.format\" (resp. suffix)", 
+                          "(expecting ace or contigs.fasta)"),
+                    parameters$assembly.format)
+  }
+  
+  # check read.quality.format
+  if (parameters$assembly.format == "ace"){
+    if (parameters$read.quality.format == "no value") parameters$read.quality.format <- "qual"
+    else if (!(parameters$read.quality.format %in% c("qual", "qua", "fastq"))){
+      complainAndStop(paste("invalid \"read.quality.format\"", 
+                            "(expecting qual, qua, fastq)"),
+                      parameters$read.quality.format)
+    }
+  }
+  if (parameters$assembly.format == "contigs.fasta"){
+    if (parameters$read.quality.format == "no value") parameters$read.quality.format <- "sam"
+    else if (!(parameters$read.quality.format %in% c("sam", "bam"))){
+      complainAndStop(paste("invalid \"read.quality.format\"", 
+                            "(expecting sam or bam)"),
+                      parameters$read.quality.format)
+    }
+  }
+  
+  # check files
+  parameters$files <- prepareFiles(parameters, "assemblies",
+                                   parameters$assembly.format,
+                                   parameters$read.quality.format)
+  
+  # parameter checks
+  # fastq.version
+  if (!(parameters$fastq.version %in% c("auto", "sanger", "solexa",
+                                        "illumina13", "illumina15", 
+                                        "illumina18"))){
+    complainAndStop(paste("invalid \"fastq.version\"", 
+                          "(expecting auto, sanger, solexa, illumina13, ", 
+                          "illumina15 or illumina18)"),
+                    parameters$fastq.version)
+  }
+  
+  # threads
+  if (is.na(parameters$threads) || parameters$threads < 1){
+    complainAndStop(paste("invalid thread number", 
+                          "(expecting a positiv integer)"),
+                    parameters$threads)
+  }
+  parameters$threads <- min(parameters$threads, nrow(parameters$files))
+  
+  # memory
+  if (is.na(parameters$memory) || parameters$memory <= 0){
+    complainAndStop(paste("invalid memory value ", 
+                          "(expecting a positiv number)"),
+                    parameters$memory)
+  }
+  
+  # expected.genome.size
+  splits <- as.numeric(strsplit(parameters$expected.genome.size, ",")[[1]])
+  if (any(is.na(splits)) || any(floor(splits)!=splits) || any(splits < 0)){
+    complainAndStop(paste("invalid values of \"expected.genome.size\"", 
+                          "(expecting positive integers only)"), splits)
+  }
+  if (length(parameters$expected.genome.size) > 1){
+    if (nrow(parameters$files) != length(parameters$expected.genome.size)){
+      complainAndStop(paste("number of expected.genome.size values and assembly", 
+                            "files not matching"),
+                      paste(length(parameters$expected.genome.size), "vs",
+                            nrow(parameters$files)))
+    }
+  }
+  parameters$expected.genome.size <- splits
+  
+  # contig.size.filter
+  if (is.na(parameters$contig.size.filter) || parameters$contig.size.filter < 0){
+    complainAndStop(paste("invalid minimum contig size", 
+                          "(expecting a positiv integer >= 0)"),
+                    parameters$contig.size.filter)
+  }
+  
+  return(parameters)
+}
+
+
+# surankco-score argument parser
+parseSurankcoScore <- function(args = commandArgs(trailingOnly = TRUE)){
+  # Parses parameters for the surankco score module
+  # 
+  # Args:
+  #   args: commandline parameters, default = commandArgs(trailingOnly = TRUE) 
+  #
+  # Returns:
+  #   List of parameter values as provided by parse_args (optparse package)
+  
+  parser.options <- list(
+    # Input parameter
+    make_option(opt_str=c("-a", "--assemblies"),
+                action="store",
+                help="Indicate a list of assembly files (comma separated), correct 
+                suffixes are mandatory (default \"*.contigs.fasta\")"),
+    
+    make_option(opt_str=c("-d", "--directory"),
+                action="store",
+                help="Indicate a directory containing assembly files with indicated 
+                format (e.g. \"*.contigs.fasta\")"),
+    
+    make_option(opt_str=c("-f", "--assembly.suffix"),
+                action="store",
+                default="contigs.fasta",
+                help="Indicate assembly format/suffix, default=\"contigs.fasta\""),
+    
+    make_option(opt_str=c("-r", "--reference.suffix"),
+                action="store",
+                default="ref.fasta",
+                help="Indicate the reference format/suffix, default=\"ref.fasta\""),
+    
+    make_option(opt_str=c("-p", "--pdf.histograms"),
+                action="store",
+                default="surankco_score_histograms.pdf",
+                help="Indicate a name for the score histogram pdf  
+                (default = \"surankco_score_histograms.pdf\")"),
+    
+    make_option(opt_str=c("-m", "--memory"),
+                action="store",
+                default=32,
+                type="integer",
+                help="Indicate the maximum memory usage (in Gb) of Javas virtual 
+                machine (default: 32). Try to increase if big data sets report 
+                heap space problems.")
+  )
+  
+  myparser <- OptionParser(usage="usage: surankco-score [options]",
+                           option_list=parser.options,
+                           prog = NULL, description = "", epilogue = "")
+  
+  parameters <- parse_args(object = myparser,
+                           args = args,
+                           print_help_and_exit = TRUE,
+                           positional_arguments = FALSE)
+  
+  
+  # check obligatory parameter
+  checkObligatoryParameters(parameters, need.xor=c("assemblies", "directory"))
+  
+  # check files
+  parameters$files <- prepareFiles(parameters, "assemblies",
+                                   parameters$assembly.suffix,
+                                   parameters$reference.suffix)
+  
+  # parameter checks
+  # pdf.histograms
+  if (!file.exists(dirname(parameters$pdf.histograms))){
+    complainAndStop("can not find path for \"pdf.histograms\"",
+                    parameters$pdf.histograms)
+  }
+  
+  # memory
+  if (is.na(parameters$memory) || parameters$memory <= 0){
+    complainAndStop(paste("invalid memory value ", 
+                          "(expecting a positiv number)"),
+                    parameters$memory)
+  }
+  
+  return(parameters)
+}
+
+
+# surankco-training argument parser
+parseSurankcoTraining <- function(args = commandArgs(trailingOnly = TRUE)){
+  # Parses parameters for the surankco training module
+  # 
+  # Args:
+  #   args: commandline parameters, default = commandArgs(trailingOnly = TRUE) 
+  #
+  # Returns:
+  #   List of parameter values as provided by parse_args (optparse package)
+  
+  parser.options <- list(
+    # Input parameter
+    make_option(opt_str=c("-f", "--features"),
+                action="store",
+                help="Indicate a list of surankco feature files (comma separated), 
+                correct suffixes are mandatory (\"*.features.txt\")"),
+    
+    make_option(opt_str=c("-d", "--directory"),
+                action="store",
+                help="Indicate a directory containing surankco feature files 
+                (\"*.features.txt\")"),
+    
+    make_option(opt_str=c("-o", "--output.filename"),
+                action="store",
+                default="surankco_rfs.RData",
+                help="Indicate a name for the export of the random forest classifiers 
+                (default = \"surankco_rfs.RData\")"),
+    
+    make_option(opt_str=c("-e", "--exponential.quantile"),
+                action="store",
+                default=0.25,
+                help="Indicate an exponential distribution quantile to divide scores 
+                (default: 0.25)"),
+    
+    make_option(opt_str=c("-m", "--manual.thresholds"),
+                action="store",
+                help="Indicate manual thresholds instead of quantiles, one per score 
+                each (comma separated) in the same order as in the score file")
+  )
+  
+  myparser <- OptionParser(usage="usage: surankco-training [options]",
+                           option_list=parser.options,
+                           prog = NULL, description = "", epilogue = "")
+  
+  parameters <- parse_args(object = myparser,
+                           args = args,
+                           print_help_and_exit = TRUE,
+                           positional_arguments = FALSE)
+  
+  
+  # check (obligatory) parameters
+  checkObligatoryParameters(parameters, need.xor=c("features", "directory"))
+  
+  # check files
+  parameters$features.suffix <- "features.txt"
+  parameters$scores.suffix <- "scores.txt"
+  parameters$files <- prepareFiles(parameters, "features",
+                                   parameters$features.suffix,
+                                   parameters$scores.suffix)
+  
+  # parameter checks
+  # output.filename
+  if (!file.exists(dirname(parameters$output.filename))){
+    complainAndStop("can not find path for \"output.filename\"",
+                    parameters$output.filename)
+  }
+  
+  # exponential.quantile
+  if (is.na(parameters$exponential.quantile) ||
+        parameters$exponential.quantile <= 0 || 
+        parameters$exponential.quantile >= 1){
+    complainAndStop("\"exponential.quantile\" out of range (0,1)",
+                    parameters$exponential.quantile)
+  }
+  
+  # manual.thresholds
+  if ("manual.thresholds" %in% names(parameters)){
+    splits <- as.numeric(strsplit(parameters$manual.thresholds, ",")[[1]])
+    in.range <- 0 <= splits[-c(5,7)] & splits[-c(5,7)] <= 1 
+    in.range <- append(in.range, 0 <= splits[c(5,7)] & splits[c(5,7)] <= 100)
+    if (length(splits) != 8){
+      complainAndStop("wrong number of \"manual.thresholds\" (expecting 8)", splits)
+    }
+    if (is.na(splits) || !all(in.range)){
+      complainAndStop(paste0("out of range \"manual.thresholds\" (expecting ", 
+                             "[0,1] resp. [0,100] for threshold 5 and 7)"), splits)
+    }
+    parameters$manual.thresholds <- splits
+  }
+  
+  return(parameters)
+}
+
+
+# surankco-prediction argument parser
+parseSurankcoPrediction <- function(args = commandArgs(trailingOnly = TRUE)){
+  # Parses parameters for the surankco prediction module
+  # 
+  # Args:
+  #   args: commandline parameters, default = commandArgs(trailingOnly = TRUE) 
+  #
+  # Returns:
+  #   List of parameter values as provided by parse_args (optparse package)
+  
+  parser.options <- list(
+    # Input parameter
+    make_option(opt_str=c("-f", "--features"),
+                action="store",
+                help="Indicate one surankco feature file, a correct suffixe is 
+                mandatory (\"*.features.txt\")"),
+    
+    #     make_option(opt_str=c("-d", "--directory"),
+    #                 action="store",
+    #                 help="Indicate a directory containing surankco feature files 
+    #                 (\"*.features.txt\")"),
+    
+    make_option(opt_str=c("-r", "--random.forests"),
+                action="store",
+                help="Indicate a surankco random forests file (\"*.RData\")"),
+    
+    make_option(opt_str=c("-o", "--output.filename"),
+                action="store",
+                default="surankco_results.txt",
+                help="Indicate a name for the final results file")
+  )
+  
+  myparser <- OptionParser(usage="usage: surankco-prediction [options]",
+                           option_list=parser.options,
+                           prog = NULL, description = "", epilogue = "")
+  
+  parameters <- parse_args(object = myparser,
+                           args = args,
+                           print_help_and_exit = TRUE,
+                           positional_arguments = FALSE)
+  
+  
+  # check obligatory parameter
+  checkObligatoryParameters(parameters, need.all=c("features", "random.forests"))
+  
+  # check files
+  parameters$features.suffix <- "features.txt"
+  parameters$rf.suffix <- "RData"
+  parameters$files.features <- prepareFiles(parameters, "features", parameters$features.suffix)
+  parameters$files.rf <- prepareFiles(parameters, "random.forests", parameters$rf.suffix)
+  parameters$files.features <- parameters$files.features$features.txt
+  parameters$files.rf <- parameters$files.rf$RData
+  
+  # parameter checks
+  # output.filename
+  if (!file.exists(dirname(parameters$output.filename))){
+    complainAndStop("can not find path for \"output.filename\"",
+                    parameters$output.filename)
+  }
+  
+  return(parameters)
+}
+
+
+checkObligatoryParameters <- function(values, need.all=NULL, need.xor=NULL){
+  # Checks whether all obligatory parameters were indicated
+  #
+  # Args:
+  #   values:   list of parameter values as provided 
+  #             by parse_args (optparse package)
+  #   need.all: vector of obligatory parameters
+  #   need.xor: vector of parameters where only one is needed (exclusive)
+  #
+  # Returns:
+  #   Nothing, but will stop execution if obligatory parameters are missing
+  
+  if (!is.null(need.all)){
+    if (!all(test <- need.all %in% names(values))){
+      complainAndStop("missing parameter(s)", need.all[!test])
+    }
+  }
+  
+  if (!is.null(need.xor)){
+    if (sum(test <- need.xor %in% names(values)) > 1){
+      complainAndStop("conflicting parameters", need.xor[test])
+    }
+    if (sum(test <- need.xor %in% names(values)) < 1){
+      complainAndStop("missing parameters (either)", need.xor)
+    }
+  }
+}
+
+
+prepareFiles <- function(parameters, file.parameter, 
+                         first.suffix, additional.suffixes=NULL){
+  # prepares a list of files from either a directory or a filelist
+  
+  if ("directory" %in% names(parameters)){
+    filelist <- paste(sep="", parameters$directory, "/", 
+                      list.files(path=parameters$directory,
+                                 pattern=paste(first.suffix, "$", sep="")))
+  }
+  else{
+    filelist <- strsplit(parameters[[file.parameter]], split=",", fixed=TRUE)[[1]]
+  }
+  
+  files <- checkFiles(filelist, first.suffix, additional.suffixes)
+  
+  return(files)
+}
+
+
+checkFiles <- function(filelist, first.suffix, additional.suffixes=NULL){
+  # Checks if files have the expected suffix and if files exists,
+  # includes files with same prefix but different suffix
+  # 
+  # Args:
+  #   filelist:             
+  #   first.suffix:         
+  #   additional.suffixes:  
+  #
+  # Returns:
+  #   A data.frame containing filename columns per suffix or
+  #   will stop execution if files are missing
+  
+  suffix <- paste(".", first.suffix, "$", sep="")
+  
+  # check first.suffix 
+  missing.suffix <- !grepl(pattern=suffix, x=filelist)
+  if (any(missing.suffix)){
+    complainAndStop(paste("missing suffix -", first.suffix, 
+                          "- for file(s)", sep=""),
+                    filelist[missing.suffix], 
+                    item.start=":\n\t",
+                    item.sep="\n\t")
+  }
+  
+  prefix <- sub(pattern=suffix, replacement="", x=filelist)
+  files <- data.frame(filelist, stringsAsFactors=FALSE)
+  colnames(files) <- first.suffix
+  rownames(files) <- prefix
+  
+  # build filenames from additional.suffixes
+  if (!is.null(additional.suffixes)){
+    for (add.suf in additional.suffixes){
+      files[add.suf] <- paste(prefix, add.suf, sep=".")
+    }
+  }
+  
+  # check file existence
+  missing.files <- !file.exists(unlist(files))
+  if (any(missing.files)){
+    complainAndStop("missing file(s)",
+                    unlist(files)[missing.files],
+                    item.start=":\n\t",
+                    item.sep="\n\t")
+  }
+  
+  return(files)
+}
+
+
+complainAndStop <- function(message, items=NULL, 
+                            item.start=": ", item.sep=", ", item.end=""){
+  # Prints an error message and stops the programm using stop()
+  #
+  # Args:
+  #   message:    error message to print
+  #   items:      list of items to add to the message (e.g. missing files)
+  #   item.start: separator between message and items (e.g. ": ")
+  #   item.sep:   separator for the item list (e.g. ", ", "\n\t")
+  
+  if (is.null(items)){
+    stop(message, call.=FALSE)
+  }
+  else{
+    items <- paste(items, collapse=item.sep)
+    stop(paste(message, items, sep=item.start), call.=FALSE)
+  } 
+}
\ No newline at end of file
diff --git a/r/pslMatchFilter b/r/pslMatchFilter
new file mode 100755
index 0000000..2aa38d8
--- /dev/null
+++ b/r/pslMatchFilter
@@ -0,0 +1,27 @@
+#!/usr/bin/Rscript
+
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# R script to filter a BLAT psl file for maximum matching hits
+args <- commandArgs(trailingOnly = TRUE)
+
+# prepare psl column names
+pslnames <- "match mismatch repmatch Ns Qgapcount Qgapbases Tgapcount Tgapbases strand Qname Qsize Qstart Qend Tname Tsize Tstart Tend blockcount blockSizes qStarts tStarts"
+pslnames <- strsplit(pslnames, split="[ \t]+")[[1]]	   					      	     				     				
+
+# import the psl file
+filename <- args[1]
+pslheader <- readLines(filename, n=5)
+psldata <- read.table(filename, sep="\t", skip=5)
+colnames(psldata) <- pslnames
+
+# sort psl data by query name and match count, keep entry with maximum match per query
+sorted <- psldata[order(psldata$Qname, -psldata$match), ]
+uniques <- sorted[!duplicated(sorted$Qname), ]
+
+# export the filter psl data
+filename <- args[2]
+write(pslheader, filename)
+write.table(uniques, filename, append=T, quote=F, sep="\t", row.names=F, col.names=F)
diff --git a/r/rf.R b/r/rf.R
new file mode 100755
index 0000000..f2e235d
--- /dev/null
+++ b/r/rf.R
@@ -0,0 +1,103 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# functions handling the training and prediction of contig qualities with random forest
+#
+# author:   Mathias Kuhring
+
+
+# random forest training of contig qualities
+# returns a list of randomForest objects, one per target column
+rfTraining <- function(input, targets, balance=FALSE){
+  if (DEBUGGING){ print('forest training') }
+  
+  rfList <- vector("list", ncol(targets))
+  names(rfList) <- names(targets)
+  library("randomForest")
+  for (i in 1:ncol(targets)){
+    if (DEBUGGING){ print(i) }
+    
+    # balancing
+    index <- 1:dim(input)[1]
+    if (balance){
+      if (DEBUGGING){
+		print("balancing...")
+		print(paste("size before:", length(index)))
+	  }
+      index <- balancingUp(training[,i])
+      if (DEBUGGING){ print(paste("size after:", length(index))) }
+    }
+
+#     vars <- 3:ncol(input)
+    
+    # training
+    rfList[[i]] <- randomForest(x=input[index, ], y=targets[index,i], 
+                                importance=TRUE, localImp=TRUE, ntree=100)
+  }
+  
+  if (DEBUGGING){ print('training done') }
+  return(rfList)
+}
+
+
+# random forest prediction of contig qualities
+# returns a list of predictions per metric
+scorePrediction <- function(rfs, input, type){
+  if (DEBUGGING){ print("score prediction") }
+  
+  predictions <- vector("list", length(rfs))
+  names(predictions) <- names(rfs)
+  
+#   vars <- 3:ncol(input)
+
+  require("randomForest") 
+  for (i in 1:length(rfs)){
+    # prediction
+    predictions[[i]] <- predict(rfs[[i]], input, type=type)
+  }
+  
+  if (DEBUGGING){ print("prediction done") }
+  return(predictions)
+}
+
+
+selectContigs <- function(features, scores){
+  
+  number <- length(features)
+  combination <- vector(mode="list", number)
+  
+  for (i in 1:number){
+    features[[i]]$Assembly <- NULL
+    features[[i]]$ReadQuality <- NULL
+    scores[[i]]$Assembly <- NULL
+    scores[[i]]$Reference <- NULL
+    combination[[i]] <- merge(features[[i]], scores[[i]], by="ContigID")
+  }
+  
+  return( do.call(rbind, combination) )
+}
+
+dataFilter <- function(data){
+  if (DEBUGGING){ print('data filtering') }
+  check <- vector('logical',nrow(data))
+  for (i in 1:ncol(data)){
+    check <- check | is.infinite(data[,i])
+    check <- check | is.nan(data[,i])
+    check <- check | is.na(data[,i])
+  }
+  
+  # (Temporary) Remove Contigs with MatchCount bigger than Length (shouldn't be)
+  if ("NormedMatchCount1" %in% names(data)){
+    check <- check | data$NormedMatchCount1 > 1
+  }
+  
+  #   # Remove MIRA Contigs with repeats
+  #   check <- check | grepl("*rep*", data$ContigID)
+  
+  data <- data[!check,]
+  if (sum(check)>0) cat(paste("warning: removed", sum(check), " inconsistent contigs\n"))
+  
+  if (DEBUGGING){ print('filtering done') }
+  return(data)
+}
\ No newline at end of file
diff --git a/r/scores.R b/r/scores.R
new file mode 100755
index 0000000..25e98fb
--- /dev/null
+++ b/r/scores.R
@@ -0,0 +1,88 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# import of the scores from the java routine
+importScores <- function(score.files, assembly.files, reference.files){
+  scores <- vector('list', length(score.files))
+  if (DEBUGGING){ print("metrics import") }
+  for (i in 1:length(score.files)){
+    if (DEBUGGING){ print(as.character(score.files[i])) }
+    scores[[i]] <- cbind(Assembly=assembly.files[i],
+                         Reference=reference.files[i],
+                         read.table(score.files[i], header=TRUE,
+                                    colClasses=get.colClasses(score.files[i])))
+  }
+  if (DEBUGGING){ print("import done") }
+  return(scores)
+}
+
+
+# removes contig duplicates with same ID (e.g. different local matches)
+# returns best contig per ID depending on filter metric values (highest MatchCount per default)
+bestDuplicate <- function(metrics, filter="MatchCount", highest=TRUE){
+  if (highest){ mod <- -1 } else{ mod <- 1 }
+  sorted <- metrics[order(metrics$ContigID, mod * metrics[filter]), ]
+  uniques <- sorted[!duplicated(sorted$ContigID),]
+  return(uniques)
+}
+
+
+# removes redundant and correlating scores
+correctScores <- function(metrics){
+  for (i in 1:length(metrics)){
+    metrics[[i]]$Sample <- NULL
+    
+    # unnormalized scores are not further used
+    metrics[[i]]$MatchCount <- NULL
+    metrics[[i]]$ErrorCount <- NULL
+    metrics[[i]]$ErrorSubtraction <- NULL
+    
+    # following scores are directly correlated and yield no further information
+    metrics[[i]]$NormedErrorCount2 <- NULL
+    metrics[[i]]$NormedErrorSubtraction <- NULL
+    metrics[[i]]$MaxEndContError <- NULL
+    
+    metrics[[i]]$NormedContigLength2 <- NULL
+  }
+  return(metrics)
+}
+
+
+# separates the scores into two classes each based on the manual thresholds
+manualClasses <- function(scores, thresholds){
+  return(regToClassByThresholds(scores, thresholds))
+}
+
+
+# separates the scores into two classes each based on exponential fittings
+expoClasses <- function(scores, quantile){
+  
+  thresholds <- getExpoClassThresholds(scores, quantile)
+  scores <- regToClassByThresholds(scores, thresholds)
+  
+  return(scores)
+}
+
+
+# separates data into classes by given thresholds
+regToClassByThresholds <- function(scores, thresholds){  
+  firstMetric <- which(colnames(scores)=="NormedMatchCount1")
+  lastMetric <- ncol(scores)
+  
+  for (i in firstMetric:lastMetric){
+    
+    if (any(colnames(scores)[i] == 
+              c("NormedMatchCount1", "NormedMatchCount2", 
+                "NormedContigLength1", "VitorScore")))
+    {
+      scores[i] <- as.factor(scores[i] >= thresholds[i])
+    }
+    else
+    {
+      scores[i] <- as.factor(scores[i] <= thresholds[i])
+    }
+  }
+  
+  return(scores)
+}
\ No newline at end of file
diff --git a/r/voting.R b/r/voting.R
new file mode 100755
index 0000000..94d96cb
--- /dev/null
+++ b/r/voting.R
@@ -0,0 +1,42 @@
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+
+# metric vote, by default TRUE when half of metrics/scores TRUE, but adjustable with min
+voting <- function(scoreList, min=(length(scoreList)/2)){ 
+  pro <- rep(0, length(scoreList[[1]]))
+  contra <- rep(0, length(scoreList[[1]]))
+  
+  for (i in 1:length(scoreList)){
+    pro <- pro + as.numeric(as.logical(scoreList[[i]]))
+    contra <- contra + as.numeric(!as.logical(scoreList[[i]]))
+  }
+  
+  choise <- (pro >= min)
+  return(choise)
+}
+
+
+# returns the sum of TRUE votes
+votingSum <- function(scoreList){ 
+  pro <- rep(0, length(scoreList[[1]]))
+  
+  for (i in 1:length(scoreList)){
+    pro <- pro + as.numeric(as.logical(scoreList[[i]]))
+  }
+
+  return(pro)
+}
+
+
+# returns the sum of weighted TRUE votes
+weightedVotingSum <- function(scoreList, weightList){ 
+  pro <- rep(0, length(scoreList[[1]]))
+  
+  for (i in 1:length(scoreList)){
+    pro <- pro + as.numeric(as.logical(scoreList[[i]])) * weightList[[i]][,"TRUE"]
+  }
+  
+  return(pro)
+}
\ No newline at end of file
diff --git a/readme.txt b/readme.txt
new file mode 100755
index 0000000..2812b08
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,354 @@
+SuRankCo: Supervised Ranking of Contigs in de novo Assemblies
+
+Author:  Mathias Kuhring
+Contact: KuhringM at rki.de
+
+
+UPDATE
+------
+Jun2015:
+* Minor changes to enable BAM support, too.
+
+Feb2015:
+* Added support for FASTA/SAM assemblies in addition to ACE/FASTQ(QUAL).
+  NOTE: features of FASTA/SAM assemblies will not include BaseCount, 
+  BaseSeqmentCount and ContigQualities, yet.
+
+
+INTRODUCTION
+------------
+
+SuRankCo is a machine learning based software to score and rank contigs from de
+novo assemblies of next generation sequencing data. It trains with alignments of
+contigs with known reference genomes and predicts scores and ranking for contigs 
+which have no related reference genome yet.
+
+For more details about SuRankCo and its functioning, please see
+
+ "SuRankCo: Supervised Ranking of Contigs in de novo Assemblies"
+ Mathias Kuhring, Piotr Wojtek Dabrowski, Andreas Nitsche and Bernhard Y. Renard
+ (Submitted manuscript)
+
+PLEASE NOTE, it is recommended to read the paper and this readme.txt file before 
+using SuRankCo.
+
+	
+INSTRUCTIONS
+------------
+SuRankCo consists of four modules: 
+- surankco-feature:     feature generation from contigs (ACE or FASTA) and 
+                        corresponding reads (QUAL, QUA, FASTQ resp. SAM or BAM)
+- surankco-score:       alignment of unpadded(!) contigs (FASTA format) and 
+                        reference genomes (FASTA format) and score calculation
+- surankco-training:    training of random forest using contig features (from 
+                        surankco-feature) and contig scores (surankco-score)
+- surankco-prediction:  prediction of scores and ranking of contigs using 
+                        trained random forests (from surankco-training) and 
+                        contig features (from surankco-feature)
+
+Different combinations of these modules allow for two workflows, training and 
+prediction. For the training, use surankco-feature and surankco-score with the 
+same contigs followed by surankco-training. For the prediction, use 
+surankco-feature with new contigs followed by surankco-prediction.
+
+
+SYSTEM REQUIREMENTS
+-------------------
+The following software and libraries are required to run SuRankCo:
+- Java 7 
+  (Source: http://www.java.com)
+- GNU R 3 including Rscript 
+  (Source: http://www.r-project.org/)
+- additional GNU R Packages: optparse, MASS, randomForest 
+  (Please refer to the R manuals on how to install packages)
+- Blat and accompanying pslPretty
+  (Source: http://hgdownload.cse.ucsc.edu/admin/exe/)
+
+All required executables (java, Rscript, blat and pslPretty) have to be 
+available in the PATH environment variable. Please refer to the operating 
+systems manual on how to set up the PATH variable, if manual setup is necessary.
+
+NOTE: For Mac/OSX the R version 3.0.3 is recommended. The necessary package 
+randomForest is not yet compatible to the latest version 3.1
+(please refer to the troubleshooting section for more details).
+
+
+USER INTERFACE & EXECUTION
+--------------------------
+SuRankCo is a command-line based software with four executables equivalent to 
+the four modules. Use the -h flag for more parameter details. surankco-feature, 
+surankco-score and surankco-training can process the contigs of several 
+assemblies at once. Thus a list of applicable files or a directory can be 
+indicated. The directory will be scanned for all files that have an applicable 
+suffix. Input file examples are provided below.
+
+surankco-feature
+ - Expects pairs of either ACE and FASTQ/QUAL/QUA or FASTA and SAM/BAM files. 
+   You provide either a list of ACE or FASTA files or a directory with such.
+   NOTE: features of FASTA/SAM/BAM assemblies will not include BaseCount, 
+   BaseSeqmentCount and ContigQualities, yet.
+ - The expected suffix for the contig FASTAs is ".contigs.fasta" to prevent
+   confusion with reference genome fasta files.
+ - FASTQ/QUAL/QUA or SAM/BAM files are automatically detected and expected to 
+   share the same prefix such as their corresponding ACE or FASTA files, resp. 
+ - For ACE: QUAL (default), QUA or FASTQ format can be selected per parameter as well 
+   as the FASTQ version (default: Illumina1.8+).
+ - The FASTQ version is alternatively auto detectable, but the identification is 
+   not (!) 100% reliable and thus uncertainties will stop the process and
+   suggestions for manual settings will be provided.
+ - Some de novo assembler may extend the read names with extra information. A 
+   Java compatible regular expression can be indicated to cut of the extensions 
+   to enable matching between ACE and QUAL/FASTQ reads. E.g., a Newbler read 
+   "myread.000089727.26-146.fm1165.to1077" may by trimmed to the original name 
+   "myread.000089727" by using the regex "\\\\.\\\\d+-".
+   NOTE, if a backslash "\" is needed use "\\\\"!
+ - Some feature generation steps can be executed in parallel if more than one
+   assembly (resp. ACE or FASTA file) is processed. Thus, a number of threads  
+   can be indicated.
+ - The memory used by the Java virtual machine can be adjusted, e.g. if the
+   default (32 GB) is too high for the system or too low for big assemblies.
+ - The feature "Genome Relation" requires an expected genome size. It can be 
+   indicated or roughly estimated as the sum of contig lengths.
+ 
+ -> The output is a set of TXT files (one per input ACE or FASTA file) 
+    containing the calculated features in a table like format. The files are  
+	named with same paths and prefixes as the corresponding assembly files and  
+	the suffix ".features.txt".
+
+surankco-score
+ - Expects a list of contig FASTA files or a directory with FASTA files. 
+ - The expected suffix for the contig FASTAs is ".contigs.fasta" (variable).
+ - Reference genome FASTAs are automatically detected and expected to share the 
+   same prefix as corresponding contig FASTAs followed by the suffix 
+   ".ref.fasta" (variable).
+ - The memory used by the Java virtual machine can be adjusted, e.g. if the
+   default (32 GB) is too high for the system or too low for big assemblies.
+
+ -> The output is a set of TXT files (one per input contig FASTA file) 
+    containing the calculated scores in a table like format. The files are named 
+    with same paths and prefixes as the corresponding FASTA files and the suffix 
+    ".scores.txt". Additionally, a PDF file with histograms per score calculated 
+    over all processed contigs is exported to support the class separation 
+    threshold selection in surankco-training. The default filename
+	"surankco_score_histograms.pdf" can be customized.
+    (Please refer to the indicated paper for details about the thresholds)
+
+surankco-training
+ - Expects a list of SuRankCo feature files or a directory with feature files.
+ - The expected suffix for the feature files is ".features.txt".
+ - Corresponding SuRankCo score files are automatically detected and expected to 
+   share the same prefix as the corresponding feature files followed by the 
+   suffix ".scores.txt".
+ - For the class separation, manual thresholds per score or a quantile for 
+   automatic threshold selection with exponential fittings can be indicated.
+   Manual thresholds need range [0,1], except for threshold 5 and 7 
+   (MaxRegionError and MaxEndErrorCount) which need range [0,100]. Most scores 
+   get separated into <= or > of the threshold, except for NormedMatchCount1, 
+   NormedMatchCount2 and NormedContigLength1 which use >= or <.
+   The exponential quantile needs the range [0,1] and uses <= or > in all cases,
+   since NormedMatchCount1, NormedMatchCount2 and NormedContigLength1 have to be
+   reversed anyway to enable exponential fittings.
+ 
+ -> The output is an R data file containing the trained random forests. 
+    The default file name is "surankco_rfs.RData" but can be changed.
+ 
+surankco-prediction
+ - Expects one SuRankCo feature file with suffix ".features.txt".
+ - Expects a R data file with a random forest trained by surankco-training and 
+   the suffix ".RData".
+
+ -> The output is a table formatted text file (named "surankco_results.txt" per 
+    default, but can be changed), sorted by decreasing scores.
+	
+    The file contains four columns:
+    "Assembly"	          : the filename of the corresponding assembly file
+    "ReadQuality"	      : the filename of the corresponding read quality file
+    "ContigID"	          : the contig id as in the corresponding assembly and 
+                            feature file
+    "SurankcoContigScore" : the final SuRankCo contig score
+
+
+INPUT FILE & EXECUTION EXAMPLES
+-------------------------------
+The following file lists illustrate the required file combinations and suffixes.
+
+Training
+surankco-feature:
+	assembly1.ace, assembly2.ace, assemblyXYZ.ace
+	assembly1.qual, assembly2.qual, assemblyXYZ.qual
+surankco-score:
+	assembly1.contigs.fasta, assembly2.contigs.fasta, assemblyXYZ.contigs.fasta
+	assembly1.ref.fasta, assembly2.ref.fasta, assemblyXYZ.ref.fasta
+surankco-training:
+	assembly1.features.txt, assembly2.features.txt, assemblyXYZ.features.txt
+	assembly1.scores.txt, assembly2.scores.txt, assemblyXYZ.scores.txt
+
+Prediction
+surankco-feature:
+	assembly_new.ace 
+	assembly_new.qual
+surankco-prediction:
+	assembly_new.txt
+	surankco_rfs.RData
+
+The following calls show example parameter uses (short flags and long flags).
+The example calls assume all files to be in the execution directory (-a) or in 
+the indicated directory (-d). However, full pathnames can be indicated.
+Note, file list and directory parameters are mutually exclusive but indicating
+one of them is mandatory. Other parameters are optional due to default values.
+manual.thresholds has higher priority than exponential.quantile when indicated.
+
+surankco-feature 
+    -a assembly1.ace,assembly2.ace,assemblyXYZ.ace
+    -d /home/user/mydata/
+    -r qual
+    -q illumina18
+    -s ////.////d+-
+    -t 4
+    -m 32
+    -k
+    -g 1234567,2000000,111111
+	
+    --assemblies=assembly1.ace,assembly2.ace,assemblyXYZ.ace
+    --directory=/home/user/mydata/
+    --read.quality.format=qual
+    --fastq.version=illumina18
+    --split.regex=////.////d+-
+    --threads=4
+    --memory=32
+    --kmer.features
+    --expected.genome.size=1234567,2000000,111111
+	
+surankco-score 
+    -a assembly1.contigs.fasta,assembly2.contigs.fasta,assemblyXYZ.contigs.fasta
+    -d /home/user/mydata/
+    -f contigs.fasta
+    -r ref.fasta
+    -p /home/user/mydata/surankco_score_histograms.pdf
+    -m 32
+	
+    --assemblies=assembly1.contigs.fasta,assembly2.contigs.fasta,assemblyXYZ.contigs.fasta
+    --directory=/home/user/mydata/
+    --assembly.suffix=contigs.fasta
+    --reference.suffix=ref.fasta
+    --pdf.histograms=/home/user/mydata/surankco_score_histograms.pdf
+    --memory=32
+	
+surankco-training 
+    -f assembly1.features.txt,assembly2.features.txt,assemblyXYZ.features.txt
+    -d /home/user/mydata/
+    -o /home/user/mydata/mydata_rfs.RData
+    -e 0.5
+    -m 0.95,0.95,0.05,0.05,5,0.05,5,0.95
+
+    --features=assembly1.features.txt,assembly2.features.txt,assemblyXYZ.features.txt
+    --directory=/home/user/mydata/
+    --output.filename=/home/user/mydata/mydata_rfs.RData
+    --exponential.quantile=0.5
+    --manual.thresholds=0.95,0.95,0.05,0.05,5,0.05,5,0.95
+	
+surankco-prediction 
+    -f assembly_new.txt
+    -r surankco_rfs.RData
+    -o my_results.txt
+
+    --features=assembly_new.txt 
+    --random.forests=surankco_rfs.RData
+    --output.filename=my_results.txt
+
+
+TROUBLESHOOTING
+---------------
+P1: [OSX] Error loading package "randomForest" in R 3.1
+ Error in dyn.load(file, DLLpath = DLLpath, ...) : 
+  kann shared object '/Library/Frameworks/R.framework/Versions/3.1/Resources/
+  library/randomForest/libs/randomForest.so' nicht laden:
+  dlopen(/Library/Frameworks/R.framework/Versions/3.1/Resources/library/
+  randomForest/libs/randomForest.so, 6): Library not loaded: /Library/
+  Frameworks/R.framework/Versions/3.0/Resources/lib/libgfortran.2.dylib
+  Referenced from: /Library/Frameworks/R.framework/Versions/3.1/Resources/
+  library/randomForest/libs/randomForest.so
+  Reason: image not found
+ Fehler: Laden von Paket oder Namensraum für ‘randomForest’ fehlgeschlagen
+
+ randomForest tries to load external libraries of older R Version 3.0.*. 
+ However, providing them by installing R 3.0.* in parallel leads to 
+ incompatibilities (see next problem P2). Therefore, using R 3.0.3 solely is 
+ recommended for OSX.
+
+P2: [OSX] Memory error running suranco-training with R 3.1
+ *** caught segfault ***
+ address 0x18, cause 'memory not mapped'
+
+ R 3.1 doesn't provide the required versions of external libraries for the
+ randomForest package (see previous problem P1). Providing the libraries of 
+ older R Versions leads to incompatibilities with R 3.1 and seems to produce a 
+ segfault. Therefore, using R 3.0.3 solely is recommended for OSX.
+
+P3: Permission denied when executing surankco, e.g.:
+ -bash: /home/kuhringm/workspace/surankco/surankco-score: Permission denied
+ 
+ The surankco scripts might have lost their execution permission. In a terminal, 
+ go to the suranko directory and run "chmod +x surankco-feature surankco-score 
+ surankco-training surankco-prediction".
+ 
+P4: suranco-score terminates with following Permission denied error:
+ sh: 1: /home/kuhringm/workspace/surankco/r/pslMatchFilter: Permission denied
+ Error: pslMatchFilter not successfully executed: 126
+ Execution halted
+
+ pslMatchFilter needs execution permissions. In a terminal, go to the suranko
+ directory and run "chmod +x r/pslMatchFilter".
+
+P4: surankco-score prints dexp/NaN warning messages:
+ Warning messages:
+ 1: In dexp(x, estimate, log = TRUE) : NaNs produced
+ 2: In dexp(x, rate = fit$estimate) : NaNs produced
+
+ Some scores don't have enough variance to fit an exponential distribution. This
+ is not critical in surankco-score, but might be a problem for the training
+ (see problem P6).
+
+P5: surankco-training prints dexp/NaN warning messages:
+ Warning messages:
+ 1: In dexp(x, estimate, log = TRUE) : NaNs produced
+ 
+ Some scores don't have enough variance to fit an exponential distribution.
+ The training will not be possible (see problem P6).
+
+P6: surankco-training terminates with following training error
+ Error in randomForest.default(x = input[index, ], y = targets[index, i],  :
+  Need at least two classes to do classification.
+ Calls: rfTraining -> randomForest -> randomForest.default
+ Execution halted
+ 
+ Some scores don't provide two classes. Either the variance of these scores is
+ indeed to low or the thresholds for class separation are poorly chosen. The
+ variance might be increased by providing more contigs for the training.
+ 
+	
+--------------------------------------------------------------------------------
+Copyright (c) 2014, 
+Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, 
+are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * The name of the author may not be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL Mathias Kuhring BE LIABLE FOR ANY DIRECT, 
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/src/de/rki/ng4/surankco/data/AceContig.java b/src/de/rki/ng4/surankco/data/AceContig.java
new file mode 100755
index 0000000..196aec6
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/AceContig.java
@@ -0,0 +1,220 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.data;
+
+import java.util.Vector;
+
+public class AceContig{
+	
+	private int[][] nucleotides;
+	
+	private int lineCO;
+	private int lineBQ;
+
+	private String header;
+
+	private String id;
+	private int baseCount;
+	private int readCount;
+	private int baseSeqmentCount;
+	private String orientation;
+
+	private String consensusSequence;
+
+	private Vector<Integer> qualityValues;
+
+	private Vector<AceRead> reads;
+
+	public AceContig(String header){
+		// CO 1 30502 510 273 U
+		this.header = header;
+		
+		String[] splits = header.split(" ");
+		
+		id = splits[1];
+		baseCount = Integer.parseInt(splits[2]);
+		readCount = Integer.parseInt(splits[3]);
+		baseSeqmentCount = Integer.parseInt(splits[4]);
+		orientation = splits[5];
+		
+		qualityValues = new Vector<Integer>();
+		reads = new Vector<AceRead>();
+	}
+	
+	public void addRead(AceRead read){
+		reads.add(read);
+	}
+	
+
+	/**
+	 * @return the nucleotides
+	 */
+	public int[][] getNucleotides() {
+		return nucleotides;
+	}
+
+	/**
+	 * @param nucleotides the nucleotides to set
+	 */
+	public void setNucleotides(int[][] nucleotides) {
+		this.nucleotides = nucleotides;
+	}
+
+	/**
+	 * @return the lineCO
+	 */
+	public int getLineCO() {
+		return lineCO;
+	}
+
+	/**
+	 * @param lineCO the lineCO to set
+	 */
+	public void setLineCO(int lineCO) {
+		this.lineCO = lineCO;
+	}
+
+	/**
+	 * @return the lineBQ
+	 */
+	public int getLineBQ() {
+		return lineBQ;
+	}
+
+	/**
+	 * @param lineBQ the lineBQ to set
+	 */
+	public void setLineBQ(int lineBQ) {
+		this.lineBQ = lineBQ;
+	}
+
+	/**
+	 * @return the header
+	 */
+	public String getHeader() {
+		return header;
+	}
+
+	/**
+	 * @param header the header to set
+	 */
+	public void setHeader(String header) {
+		this.header = header;
+	}
+
+	/**
+	 * @return the id
+	 */
+	public String getId() {
+		return id;
+	}
+
+	/**
+	 * @param id the id to set
+	 */
+	public void setId(String id) {
+		this.id = id;
+	}
+
+	/**
+	 * @return the baseCount
+	 */
+	public int getBaseCount() {
+		return baseCount;
+	}
+
+	/**
+	 * @param baseCount the baseCount to set
+	 */
+	public void setBaseCount(int baseCount) {
+		this.baseCount = baseCount;
+	}
+
+	/**
+	 * @return the readCount
+	 */
+	public int getReadCount() {
+		return readCount;
+	}
+
+	/**
+	 * @param readCount the readCount to set
+	 */
+	public void setReadCount(int readCount) {
+		this.readCount = readCount;
+	}
+
+	/**
+	 * @return the baseSeqmentCount
+	 */
+	public int getBaseSeqmentCount() {
+		return baseSeqmentCount;
+	}
+
+	/**
+	 * @param baseSeqmentCount the baseSeqmentCount to set
+	 */
+	public void setBaseSeqmentCount(int baseSeqmentCount) {
+		this.baseSeqmentCount = baseSeqmentCount;
+	}
+
+	/**
+	 * @return the orientation
+	 */
+	public String getOrientation() {
+		return orientation;
+	}
+
+	/**
+	 * @param orientation the orientation to set
+	 */
+	public void setOrientation(String orientation) {
+		this.orientation = orientation;
+	}
+
+	/**
+	 * @return the consensusSequence
+	 */
+	public String getConsensusSequence() {
+		return consensusSequence;
+	}
+
+	/**
+	 * @param consensusSequence the consensusSequence to set
+	 */
+	public void setConsensusSequence(String consensusSequence) {
+		this.consensusSequence = consensusSequence;
+	}
+
+	/**
+	 * @return the qualityValues
+	 */
+	public Vector<Integer> getQualityValues() {
+		return qualityValues;
+	}
+
+	/**
+	 * @param qualityValues the qualityValues to set
+	 */
+	public void setQualityValues(Vector<Integer> qualityValues) {
+		this.qualityValues = qualityValues;
+	}
+
+	/**
+	 * @return the reads
+	 */
+	public Vector<AceRead> getReads() {
+		return reads;
+	}
+
+	/**
+	 * @param reads the reads to set
+	 */
+	public void setReads(Vector<AceRead> reads) {
+		this.reads = reads;
+	}
+	
+}
diff --git a/src/de/rki/ng4/surankco/data/AceRead.java b/src/de/rki/ng4/surankco/data/AceRead.java
new file mode 100755
index 0000000..53d6595
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/AceRead.java
@@ -0,0 +1,182 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.data;
+
+public class AceRead {
+
+	private int lineAF;
+	private int lineRD;
+	private int lineQA;
+		
+	// AF
+	private String id;
+	private char orientation;
+	private int offset;
+	
+	// RD
+	private int paddedLength;
+	
+	private String paddedSequence;
+	
+	// QA
+	private int alignClippingStart;
+	private int alignClippingEnd;
+
+	public AceRead(){
+	}
+	
+	public void addAF(String line){
+		String[] splits = line.split(" ");
+		id = splits[1];
+		orientation = splits[2].charAt(0);
+		offset = Integer.parseInt(splits[3]);
+	}
+	
+	public void addRD(String line){
+		// RD TBEOG48.y1 619 0 0
+		String[] splits = line.split(" ");
+		id = splits[1];
+		paddedLength = Integer.parseInt(splits[2]);
+	}
+
+	public void addQA(String line){
+		String[] splits = line.split(" ");
+		alignClippingStart = Integer.parseInt(splits[3]);
+		alignClippingEnd = Integer.parseInt(splits[4]);
+	}
+	
+	public int getClippingLength(){
+		return (alignClippingEnd-alignClippingStart+1);
+	}
+
+	/**
+	 * @return the lineRD
+	 */
+	public int getLineRD() {
+		return lineRD;
+	}
+
+	/**
+	 * @param lineRD the lineRD to set
+	 */
+	public void setLineRD(int lineRD) {
+		this.lineRD = lineRD;
+	}
+
+	/**
+	 * @return the id
+	 */
+	public String getId() {
+		return id;
+	}
+
+	/**
+	 * @param id the id to set
+	 */
+	public void setId(String id) {
+		this.id = id;
+	}
+
+	/**
+	 * @return the orientation
+	 */
+	public char getOrientation() {
+		return orientation;
+	}
+
+	/**
+	 * @param orientation the orientation to set
+	 */
+	public void setOrientation(char orientation) {
+		this.orientation = orientation;
+	}
+
+	/**
+	 * @return the paddedSequence
+	 */
+	public String getPaddedSequence() {
+		return paddedSequence;
+	}
+
+	/**
+	 * @param paddedSequence the paddedSequence to set
+	 */
+	public void setPaddedSequence(String paddedSequence) {
+		this.paddedSequence = paddedSequence;
+	}
+
+	/**
+	 * @return the offset
+	 */
+	public int getOffset() {
+		return offset;
+	}
+
+	/**
+	 * @param offset the offset to set
+	 */
+	public void setOffset(int offset) {
+		this.offset = offset;
+	}
+
+	/**
+	 * @return the paddedLength
+	 */
+	public int getPaddedLength() {
+		return paddedLength;
+	}
+
+	/**
+	 * @param paddedLength the paddedLength to set
+	 */
+	public void setPaddedLength(int paddedLength) {
+		this.paddedLength = paddedLength;
+	}
+
+	/**
+	 * @return the alignClippingStart
+	 */
+	public int getAlignClippingStart() {
+		return alignClippingStart;
+	}
+
+	/**
+	 * @param alignClippingStart the alignClippingStart to set
+	 */
+	public void setAlignClippingStart(int alignClippingStart) {
+		this.alignClippingStart = alignClippingStart;
+	}
+
+	/**
+	 * @return the alignClippingEnd
+	 */
+	public int getAlignClippingEnd() {
+		return alignClippingEnd;
+	}
+
+	/**
+	 * @param alignClippingEnd the alignClippingEnd to set
+	 */
+	public void setAlignClippingEnd(int alignClippingEnd) {
+		this.alignClippingEnd = alignClippingEnd;
+	}
+
+	public int getLineAF() {
+		return lineAF;
+	}
+
+	public void setLineAF(int lineAF) {
+		this.lineAF = lineAF;
+	}
+
+	public int getLineQA() {
+		return lineQA;
+	}
+
+	public void setLineQA(int lineQA) {
+		this.lineQA = lineQA;
+	}
+}
\ No newline at end of file
diff --git a/src/de/rki/ng4/surankco/data/Assembly.java b/src/de/rki/ng4/surankco/data/Assembly.java
new file mode 100755
index 0000000..59db0e7
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/Assembly.java
@@ -0,0 +1,30 @@
+package de.rki.ng4.surankco.data;
+
+import java.util.Iterator;
+import java.util.Vector;
+
+public abstract class Assembly {
+	
+	protected String contigFile;
+	protected String readFile;
+	
+	protected Vector<Contig> contigs;
+
+	public abstract Vector<Contig> getContigs();
+
+	public abstract Vector<Vector<Integer>> getPooledReadQualities(
+			Vector<Vector<String>> readIds);
+
+	public abstract Vector<Vector<Integer>> getReadLengths(Vector<Vector<String>> readIds);
+
+	public void lengthFilter(int minContigSize) {
+//		int counter = 0;
+		for (Iterator<Contig> iterator = contigs.iterator(); iterator.hasNext(); ){
+			if (iterator.next().getLength() < minContigSize){
+				iterator.remove();
+//				counter++;
+			}
+		}
+//		System.out.println("removed " + counter + " small contigs");
+	}
+}
diff --git a/src/de/rki/ng4/surankco/data/AssemblyAce.java b/src/de/rki/ng4/surankco/data/AssemblyAce.java
new file mode 100755
index 0000000..7a3c5ea
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/AssemblyAce.java
@@ -0,0 +1,162 @@
+package de.rki.ng4.surankco.data;
+
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.Assembly;
+import de.rki.ng4.surankco.files.Ace;
+
+public class AssemblyAce extends Assembly {
+
+	private FastqReads reads;
+	
+	public AssemblyAce(String aceFileName, String readFileName, String fastqParameter){
+		contigs = new Vector<Contig>();
+		
+		contigFile = aceFileName;
+		readFile = readFileName;
+		
+		Ace aceIn = new Ace(aceFileName);
+		aceIn.readConsensusSequences();
+		aceIn.readQualityValues();
+
+		reads = new FastqReads(readFileName, fastqParameter);
+		
+		convertContigs(aceIn);
+	}
+	
+	/*
+	 * convert Ace Contigs & Fastq reads to internal Contigs
+	 */
+	private void convertContigs(Ace aceIn){
+		Contig contig;
+		Read read;
+		Vector<Read> reads;
+		for (AceContig aceContig : aceIn.getContigs()){
+			contig = new Contig(aceContig.getId());
+			contig.setLength(aceContig.getQualityValues().size());
+			contig.setBaseCount(aceContig.getBaseCount());
+			contig.setReadCount(aceContig.getReadCount());
+			contig.setBaseSeqmentCount(aceContig.getBaseSeqmentCount());
+			contig.setConsensusSequence(aceContig.getConsensusSequence());
+			contig.setQualityValues(aceContig.getQualityValues());
+			
+			contig.setNucleotideCount(countNucleotides(aceContig));
+			
+			reads = new Vector<Read>();
+			for (AceRead aceRead : aceContig.getReads()){
+				read = new Read(aceRead.getId());
+				read.setOrientation(aceRead.getOrientation());
+				read.setOffset(aceRead.getOffset());
+				read.setPaddedLength(aceRead.getPaddedLength());
+//				read.setPaddedSequence(aceRead.getPaddedSequence());
+				read.setAlignClippingStart(aceRead.getAlignClippingStart());
+				read.setAlignClippingEnd(aceRead.getAlignClippingEnd());
+				reads.add(read);
+			}
+			contig.setReads(reads);
+			contigs.add(contig);
+		}
+	}
+
+	private int[][] countNucleotides(AceContig c){
+		int[][] nucleotides = new int[c.getConsensusSequence().length()][6];
+
+		String readSeq;
+		int offset, contigPos;
+		boolean clipped;
+		for (AceRead read : c.getReads()){
+			readSeq = read.getPaddedSequence().toUpperCase();
+			offset = read.getOffset();
+			clipped = false;
+			for (int readPos=(read.getAlignClippingStart()-1); 
+					readPos<read.getAlignClippingEnd(); readPos++){
+				contigPos = readPos + offset - 1;
+
+				// workaround for reads out of contig range
+				if ((contigPos < 0) || (contigPos >= nucleotides.length)){
+					clipped = true;
+				}
+				else{
+				
+				switch (readSeq.charAt(readPos)){
+				case 'A':
+					nucleotides[contigPos][0]++;
+					break;
+				case 'C':
+					nucleotides[contigPos][1]++;
+					break;
+				case 'G':
+					nucleotides[contigPos][2]++;
+					break;
+				case 'T':
+					nucleotides[contigPos][3]++;
+					break;
+				case 'N':
+					nucleotides[contigPos][4]++;
+					break;
+				case '*':
+					nucleotides[contigPos][5]++;
+					break;
+				default:
+				}
+				
+				}
+			}
+			if (clipped) System.out.println("warning: clipped read " + read.getId());
+		}
+
+		return nucleotides;
+	}
+	
+	@Override
+	public Vector<Contig> getContigs() {
+		return contigs;
+	}
+
+	@Override
+	public Vector<Vector<Integer>> getPooledReadQualities(
+			Vector<Vector<String>> readIds) {
+		Vector<Vector<Integer>> readQualities = reads.getPooledReadQualities(readIds);
+		reads.readMatchCheck(readQualities, getIds(), contigFile, readFile);
+		return readQualities;
+	}
+
+	@Override
+	public Vector<Vector<Integer>> getReadLengths(Vector<Vector<String>> readIds) {
+		Vector<Vector<Integer>> readLengths = reads.getReadLengths(readIds);
+		reads.readMatchCheck(readLengths, getIds(), contigFile, readFile);
+		return readLengths;
+	}
+	
+	public Vector<String> getIds(){
+		Vector<String> ids = new Vector<String>();
+		for (Contig c : contigs){
+			ids.add(c.getId());
+		}
+		return ids;
+	}
+	
+	@SuppressWarnings("unused")
+	private Vector<Vector<Character>> buildAlignment(AceContig c){
+		Vector<Vector<Character>> alignment = new Vector<Vector<Character>>();
+
+		for (Character ch : c.getConsensusSequence().toCharArray()){
+			alignment.add(new Vector<Character>());
+			alignment.lastElement().add(ch);
+		}
+
+		int offset;
+		int contigPos;
+
+		for (AceRead read : c.getReads()){
+			offset = read.getOffset();
+			for (int readPos=(read.getAlignClippingStart()-1); 
+					readPos<read.getAlignClippingEnd(); readPos++){
+				contigPos = readPos + offset - 1;
+				alignment.get(contigPos).add(read.getPaddedSequence().charAt(readPos));
+			}
+		}
+
+		return alignment;
+	}
+}
diff --git a/src/de/rki/ng4/surankco/data/AssemblySam.java b/src/de/rki/ng4/surankco/data/AssemblySam.java
new file mode 100755
index 0000000..3d98888
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/AssemblySam.java
@@ -0,0 +1,174 @@
+package de.rki.ng4.surankco.data;
+
+import htsjdk.samtools.SAMRecord;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.files.Fasta;
+import de.rki.ng4.surankco.files.Sam;
+
+public class AssemblySam extends Assembly {
+
+	public static void main(String[] args){
+		String samfile = "data/ex1.sam";
+		String fastafile = "data/ex1.fa";
+		AssemblySam test = new AssemblySam(fastafile, samfile);
+		for (Contig contig : test.getContigs()){
+			System.out.println(contig.getId());
+		}
+	}
+
+	public AssemblySam(String fastaFileName, String samFileName){
+		contigs = new Vector<Contig>();
+		contigFile = fastaFileName;
+		readFile = samFileName;
+
+		Fasta fasta = new Fasta(fastaFileName);
+		Vector<FastaContig> fastaContigs = fasta.parseFasta();
+
+		//		for (FastaContig contig : fastaContigs){
+		//			System.out.println(contig.getId() + " (length=" + contig.getSequence().length() + ")");
+		//			System.out.println(contig.getSequence());
+		//		}
+
+		Sam sam = new Sam(samFileName);
+		HashMap<String, SamContig> samContigs = sam.getContigs();
+
+		convertContigs(fastaContigs, samContigs);
+
+		//		for (SamContig contig : samContigs.values()){
+		//			System.out.println(contig.getId());
+		//		}
+	}
+
+	/*
+	 * convert Fasta Contigs & Sam Reads to internal Contigs
+	 */
+	private void convertContigs(Vector<FastaContig> fastaContigs, HashMap<String, SamContig> samContigs){
+		SamContig samContig;
+		Contig contig;
+		Read read;
+		Vector<Read> reads;
+		for (FastaContig fastaContig : fastaContigs){
+			if (samContigs.containsKey(fastaContig.getId())){
+				samContig = samContigs.get(fastaContig.getId());
+
+				contig = new Contig(fastaContig.getId());
+
+				contig.setLength(fastaContig.getSequence().length());
+				contig.setConsensusSequence(fastaContig.getSequence());
+
+				contig.setReadCount(samContig.getReads().size());
+
+				contig.setNucleotideCount(countNucleotides(samContig.getAlignment()));
+
+				//			contig.setBaseCount(fastaContig.getSequence().length());
+				//			contig.setBaseSeqmentCount(fastaContig.getBaseSeqmentCount());
+				//			contig.setQualityValues(fastaContig.getQualityValues());
+
+				reads = new Vector<Read>();
+				for (SAMRecord samRead : samContig.getReads()){
+					read = new Read(samRead.getReadName());
+					read.setOffset(samRead.getUnclippedStart());
+					read.setAlignClippingStart(samRead.getAlignmentStart() - samRead.getUnclippedStart() + 1);
+					read.setAlignClippingEnd(samRead.getAlignmentEnd() - samRead.getUnclippedEnd() + 1);
+					//				read.setPaddedSequence(samRead.getReadString()); // sam read sequence is not padded
+
+					if (samRead.getReadNegativeStrandFlag()) read.setOrientation('C');
+					else read.setOrientation('U');
+
+					read.setQualityValues(parsePhred(samRead.getBaseQualityString()));
+
+					// not sure if correct
+					read.setPaddedLength(samRead.getCigar().getReferenceLength());
+
+					reads.add(read);
+				}
+				contig.setReads(reads);
+				contigs.add(contig);
+			}
+			else {
+				System.out.println("warning: skipped contig " + fastaContig.getId() +
+						" (no reads in sam file)");
+			}	
+		}
+	}
+
+	private int[][] countNucleotides(ArrayList<ArrayList<Byte>> alignment) {
+		int[][] nucleotides = new int[alignment.size()][6];
+
+		for (int i=0; i<alignment.size(); i++){
+			for (Byte nucleotide : alignment.get(i)){
+				switch ((char) nucleotide.byteValue()){
+				case 'A':
+					nucleotides[i][0]++;
+					break;
+				case 'C':
+					nucleotides[i][1]++;
+					break;
+				case 'G':
+					nucleotides[i][2]++;
+					break;
+				case 'T':
+					nucleotides[i][3]++;
+					break;
+				case 'N':
+					nucleotides[i][4]++;
+					break;
+				case '*':
+					nucleotides[i][5]++;
+					break;
+				default:
+				}
+			}
+		}
+
+		return nucleotides;
+	}
+
+	private Vector<Integer> parsePhred(String charQualities){
+		Vector<Integer> intQualities = new Vector<Integer>();
+		for (char qual : charQualities.toCharArray()){
+			intQualities.add((int) qual - 33);
+		}
+		return intQualities;
+	}
+
+	@Override
+	public Vector<Contig> getContigs() {
+		return contigs;
+	}
+
+	@Override
+	public Vector<Vector<Integer>> getPooledReadQualities(Vector<Vector<String>> readIds) {
+		Vector<Vector<Integer>> pooledReadQualities = new Vector<Vector<Integer>>();
+
+		for (Contig contig : contigs){
+			Vector<Integer> readQualities = new Vector<Integer>();
+			for (Read read : contig.getReads()){
+				readQualities.addAll(read.getQualityValues());
+			}
+			pooledReadQualities.add(readQualities);
+		}
+
+		return pooledReadQualities;
+	}
+
+	@Override
+	public Vector<Vector<Integer>> getReadLengths(Vector<Vector<String>> readIds) {
+		Vector<Vector<Integer>> readLengths = new Vector<Vector<Integer>>();
+
+		for (Contig contig : contigs){
+			Vector<Integer> lengths = new Vector<Integer>();
+			for (Read read : contig.getReads()){
+				lengths.add(read.getQualityValues().size());
+			}
+			readLengths.add(lengths);
+		}
+
+		return readLengths;
+	}
+
+}
diff --git a/src/de/rki/ng4/surankco/data/Contig.java b/src/de/rki/ng4/surankco/data/Contig.java
new file mode 100755
index 0000000..68f51e1
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/Contig.java
@@ -0,0 +1,100 @@
+package de.rki.ng4.surankco.data;
+
+import java.util.Vector;
+
+/*
+ * Addapter Class for different Contig data, e.g. from Ace, Fasta, ...
+ */
+public class Contig {
+	
+	private String id;
+	private int length;
+	private int baseCount;
+	private int readCount;
+	private int baseSeqmentCount;
+
+	private String consensusSequence;
+	private Vector<Integer> qualityValues;
+	private int[][] nucleotideCounts;
+	
+	private Vector<Read> reads;
+
+		
+	public Contig(String id) {
+		this.id = id;
+	}
+	
+	// getter
+	public String getId() {
+		return id;
+	}
+	
+	public Integer getLength() {
+		return length;
+	}
+
+	public Integer getBaseCount() {
+		return baseCount;
+	}
+
+	public Integer getReadCount() {
+		return readCount;
+	}
+
+	public Integer getBaseSeqmentCount() {
+		return baseSeqmentCount;
+	}
+
+	public Vector<Read> getReads() {
+		return reads;
+	}
+
+	public String getConsensusSequence() {
+		return consensusSequence;
+	}
+	
+	public Vector<Integer> getQualityValues() {
+		return qualityValues;
+	}
+
+	public int[][] getNucleotideCount() {
+		return nucleotideCounts;
+	}
+
+	// setter
+	public void setId(String id) {
+		this.id = id;
+	}
+
+	public void setLength(int length) {
+		this.length = length;
+	}
+	
+	public void setBaseCount(int baseCount) {
+		this.baseCount = baseCount;
+	}
+
+	public void setReadCount(int readCount) {
+		this.readCount = readCount;
+	}
+
+	public void setBaseSeqmentCount(int baseSeqmentCount) {
+		this.baseSeqmentCount = baseSeqmentCount;
+	}
+
+	public void setConsensusSequence(String consensusSequence) {
+		this.consensusSequence = consensusSequence;
+	}
+
+	public void setQualityValues(Vector<Integer> qualityValues) {
+		this.qualityValues = qualityValues;
+	}
+
+	public void setNucleotideCount(int[][] nucleotideCount) {
+		this.nucleotideCounts = nucleotideCount;
+	}
+	
+	public void setReads(Vector<Read> reads) {
+		this.reads = reads;
+	}
+}
diff --git a/src/de/rki/ng4/surankco/data/FastaContig.java b/src/de/rki/ng4/surankco/data/FastaContig.java
new file mode 100755
index 0000000..024cefc
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/FastaContig.java
@@ -0,0 +1,24 @@
+package de.rki.ng4.surankco.data;
+
+public class FastaContig {
+
+	private String id;
+	private StringBuffer sequence;
+	
+	public FastaContig(String header){
+		this.id = header.substring(1).split(" ")[0];
+		sequence = new StringBuffer();
+	}
+	
+	public void appendSequence(String sequence){
+		this.sequence.append(sequence);
+	}
+	
+	public String getSequence(){
+		return sequence.toString();
+	}
+
+	public String getId() {
+		return id;
+	}
+}
diff --git a/src/de/rki/ng4/surankco/data/FastqReads.java b/src/de/rki/ng4/surankco/data/FastqReads.java
new file mode 100755
index 0000000..5540ded
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/FastqReads.java
@@ -0,0 +1,109 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.data;
+
+import java.util.HashMap;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.feature.Main;
+import de.rki.ng4.surankco.files.Fastq;
+import de.rki.ng4.surankco.files.Qual;
+
+public class FastqReads {
+
+	public Vector<String> readId;
+	public Vector<Vector<Integer>> readQuality;
+	
+	HashMap<String,Integer> lengthMap;
+	HashMap<String,Vector<Integer>> qualityMap;
+	
+	public FastqReads(String filename, String parameter) {
+		readId = new Vector<String>();	
+		readQuality = new Vector<Vector<Integer>>();
+		
+		String readfile_suffix = filename.substring(filename.lastIndexOf(".")).toLowerCase();
+		
+		if (readfile_suffix.matches(".qual") || readfile_suffix.matches(".qua") ){
+			new Qual(filename, this);
+		}
+		else if(readfile_suffix.matches(".fastq")){
+			new Fastq(filename, this, parameter);
+		}
+		
+		if (Main.DEBUGMODE){
+			int readnumber = Math.min(readId.size(), 5);
+			System.out.println("read quality control view (" + readnumber + " of " + readId.size() + " reads)");
+			for (int i=0; i<readnumber; i++){
+				System.out.println(readId.get(i));
+				System.out.println(readQuality.get(i));
+			}
+		}
+		
+		calcQualityMap();
+	}
+
+	
+	private void calcQualityMap(){
+		qualityMap = new HashMap<String, Vector<Integer>>();
+		for (int i=0; i<readId.size(); i++){
+			qualityMap.put(readId.get(i), readQuality.get(i));
+		}
+	}
+	
+	public Vector<Vector<Integer>> getPooledReadQualities(Vector<Vector<String>> contigReads){
+		
+		Vector<Vector<Integer>> qualities = new Vector<Vector<Integer>>();
+		
+		for (Vector<String> vs : contigReads){
+			qualities.add(new Vector<Integer>());
+			for (String s : vs){
+				try {
+					qualities.lastElement().addAll(qualityMap.get(s));
+				}
+				catch (NullPointerException e){
+//					e.printStackTrace();
+					System.out.println("\tread missing (check split.regex): " + s);
+				}
+			}
+		}
+		return qualities;
+	}
+	
+	public Vector<Vector<Integer>> getReadLengths(Vector<Vector<String>> contigReads){
+		Vector<Vector<Integer>> lengths = new Vector<Vector<Integer>>();
+		for (Vector<String> vs : contigReads){
+			lengths.add(new Vector<Integer>());
+			for (String s : vs){
+				try {
+					lengths.lastElement().add(qualityMap.get(s).size());
+				}
+				catch (NullPointerException e){
+//					e.printStackTrace();
+					System.out.println("\tread missing (check split.regex): " + s);
+				}
+			}
+		}
+		return lengths;
+	}
+	
+	public void readMatchCheck(Vector<Vector<Integer>> readMapping, Vector<String> contigId, String contigfile, String readfile){
+		Vector<String> contigsWithNoMatches = new Vector<String>();
+		
+		for (int i=0; i<readMapping.size(); i++){
+			if (readMapping.get(i).size()==0){
+				contigsWithNoMatches.add(contigId.get(i));
+			}
+		}
+		
+		if (contigsWithNoMatches.size() > 0){
+			StringBuffer output = new StringBuffer();
+			output.append("error: no reads matching in " + contigfile + " and " + readfile + " for contigs:\n");
+			output.append(contigsWithNoMatches.toString());
+			System.out.println(output.toString());
+			System.exit(1);
+		}
+	}
+}
diff --git a/src/de/rki/ng4/surankco/data/Read.java b/src/de/rki/ng4/surankco/data/Read.java
new file mode 100755
index 0000000..2876c7a
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/Read.java
@@ -0,0 +1,91 @@
+package de.rki.ng4.surankco.data;
+
+import java.util.Vector;
+
+public class Read {
+
+	private String id;
+	private char orientation;
+	private int offset;
+	
+	private int paddedLength;
+//	private String paddedSequence; // not need anymore, nucleotide counts
+	
+	private int alignClippingStart;
+	private int alignClippingEnd;
+	
+	private Vector<Integer> qualityValues;
+	
+	public Read(String id){
+		this.id = id;
+	}
+	
+	public Integer getClippingLength() {
+		return (alignClippingEnd-alignClippingStart+1);
+	}
+	
+		
+	public String getId() {
+		return id;
+	}
+	
+	public void setId(String id) {
+		this.id = id;
+	}
+	
+	public char getOrientation() {
+		return orientation;
+	}
+	
+	public void setOrientation(char orientation) {
+		this.orientation = orientation;
+	}
+	
+	public int getOffset() {
+		return offset;
+	}
+	
+	public void setOffset(int offset) {
+		this.offset = offset;
+	}
+	
+	public int getPaddedLength() {
+		return paddedLength;
+	}
+	
+	public void setPaddedLength(int paddedLength) {
+		this.paddedLength = paddedLength;
+	}
+	
+//	public String getPaddedSequence() {
+//		return paddedSequence;
+//	}
+	
+//	public void setPaddedSequence(String paddedSequence) {
+//		this.paddedSequence = paddedSequence;
+//	}
+	
+	public int getAlignClippingStart() {
+		return alignClippingStart;
+	}
+	
+	public void setAlignClippingStart(int alignClippingStart) {
+		this.alignClippingStart = alignClippingStart;
+	}
+	
+	public int getAlignClippingEnd() {
+		return alignClippingEnd;
+	}
+	
+	public void setAlignClippingEnd(int alignClippingEnd) {
+		this.alignClippingEnd = alignClippingEnd;
+	}
+
+	public Vector<Integer> getQualityValues() {
+		return qualityValues;
+	}
+
+	public void setQualityValues(Vector<Integer> qualityValues) {
+		this.qualityValues = qualityValues;
+	}	
+}
diff --git a/src/de/rki/ng4/surankco/data/Reads.java b/src/de/rki/ng4/surankco/data/Reads.java
new file mode 100755
index 0000000..7847eb8
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/Reads.java
@@ -0,0 +1,109 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.data;
+
+import java.util.HashMap;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.feature.Main;
+import de.rki.ng4.surankco.files.Fastq;
+import de.rki.ng4.surankco.files.Qual;
+
+public class Reads {
+
+	public Vector<String> readId;
+	public Vector<Vector<Integer>> readQuality;
+	
+	HashMap<String,Integer> lengthMap;
+	HashMap<String,Vector<Integer>> qualityMap;
+	
+	public Reads(String filename, String parameter) {
+		readId = new Vector<String>();	
+		readQuality = new Vector<Vector<Integer>>();
+		
+		String readfile_suffix = filename.substring(filename.lastIndexOf(".")).toLowerCase();
+		
+		if (readfile_suffix.matches(".qual") || readfile_suffix.matches(".qua") ){
+			new Qual(filename, this);
+		}
+		else if(readfile_suffix.matches(".fastq")){
+			new Fastq(filename, this, parameter);
+		}
+		
+		if (Main.DEBUGMODE){
+			int readnumber = Math.min(readId.size(), 5);
+			System.out.println("read quality control view (" + readnumber + " of " + readId.size() + " reads)");
+			for (int i=0; i<readnumber; i++){
+				System.out.println(readId.get(i));
+				System.out.println(readQuality.get(i));
+			}
+		}
+		
+		calcQualityMap();
+	}
+
+	
+	private void calcQualityMap(){
+		qualityMap = new HashMap<String, Vector<Integer>>();
+		for (int i=0; i<readId.size(); i++){
+			qualityMap.put(readId.get(i), readQuality.get(i));
+		}
+	}
+	
+	public Vector<Vector<Integer>> getPooledReadQualities(Vector<Vector<String>> contigReads){
+		
+		Vector<Vector<Integer>> qualities = new Vector<Vector<Integer>>();
+		
+		for (Vector<String> vs : contigReads){
+			qualities.add(new Vector<Integer>());
+			for (String s : vs){
+				try {
+					qualities.lastElement().addAll(qualityMap.get(s));
+				}
+				catch (NullPointerException e){
+//					e.printStackTrace();
+					System.out.println("\tread missing (check split.regex): " + s);
+				}
+			}
+		}
+		return qualities;
+	}
+	
+	public Vector<Vector<Integer>> getReadLengths(Vector<Vector<String>> contigReads){
+		Vector<Vector<Integer>> lengths = new Vector<Vector<Integer>>();
+		for (Vector<String> vs : contigReads){
+			lengths.add(new Vector<Integer>());
+			for (String s : vs){
+				try {
+					lengths.lastElement().add(qualityMap.get(s).size());
+				}
+				catch (NullPointerException e){
+//					e.printStackTrace();
+					System.out.println("\tread missing (check split.regex): " + s);
+				}
+			}
+		}
+		return lengths;
+	}
+	
+	public void readMatchCheck(Vector<Vector<Integer>> readMapping, Vector<String> contigId, String contigfile, String readfile){
+		Vector<String> contigsWithNoMatches = new Vector<String>();
+		
+		for (int i=0; i<readMapping.size(); i++){
+			if (readMapping.get(i).size()==0){
+				contigsWithNoMatches.add(contigId.get(i));
+			}
+		}
+		
+		if (contigsWithNoMatches.size() > 0){
+			StringBuffer output = new StringBuffer();
+			output.append("error: no reads matching in " + contigfile + " and " + readfile + " for contigs:\n");
+			output.append(contigsWithNoMatches.toString());
+			System.out.println(output.toString());
+			System.exit(1);
+		}
+	}
+}
diff --git a/src/de/rki/ng4/surankco/data/SamContig.java b/src/de/rki/ng4/surankco/data/SamContig.java
new file mode 100755
index 0000000..39a70d9
--- /dev/null
+++ b/src/de/rki/ng4/surankco/data/SamContig.java
@@ -0,0 +1,43 @@
+package de.rki.ng4.surankco.data;
+
+import htsjdk.samtools.SAMRecord;
+
+import java.util.ArrayList;
+
+public class SamContig {
+
+	private String id;
+	
+	private ArrayList<SAMRecord> reads;
+	private ArrayList<ArrayList<Byte>> alignment;
+	
+	public SamContig(String id){
+		this.id = id;
+		reads = new ArrayList<SAMRecord>();
+		alignment = new ArrayList<ArrayList<Byte>>();
+	}
+
+	public String getId() {
+		return id;
+	}
+
+	public void setId(String id) {
+		this.id = id;
+	}
+
+	public ArrayList<SAMRecord> getReads() {
+		return reads;
+	}
+
+	public void addRead(SAMRecord read) {
+		this.reads.add(read);
+	}
+
+	public ArrayList<ArrayList<Byte>> getAlignment() {
+		return alignment;
+	}
+
+	public void addAlignmentColumn(ArrayList<Byte> alignmentColumn) {
+		this.alignment.add(alignmentColumn);
+	}
+}
diff --git a/src/de/rki/ng4/surankco/feature/Coverage.java b/src/de/rki/ng4/surankco/feature/Coverage.java
new file mode 100755
index 0000000..f2ffdf2
--- /dev/null
+++ b/src/de/rki/ng4/surankco/feature/Coverage.java
@@ -0,0 +1,166 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.feature;
+
+import java.util.Vector;
+
+import de.rki.ng4.surankco.tools.Statistic;
+
+
+
+public class Coverage {
+
+	private Vector<Vector<Integer>> coverages;
+	
+	private  Vector<Vector<Integer>> coverageLeftEnd;
+	private  Vector<Vector<Integer>> coverageRightEnd;
+	
+	private  Vector<Double> coverageMinEndMean;
+	private  Vector<Double> coverageMinEndSD;
+	private  Vector<Double> coverageMinEndMedian;
+	private  Vector<Double> coverageMinEndMAD;
+	private  Vector<Double> coverageMinEndMin;
+	private  Vector<Double> coverageMinEndMax;
+	
+	private  Vector<Double> coverageMaxEndMean;
+	private  Vector<Double> coverageMaxEndSD;
+	private  Vector<Double> coverageMaxEndMedian;
+	private  Vector<Double> coverageMaxEndMAD;
+	private  Vector<Double> coverageMaxEndMin;
+	private  Vector<Double> coverageMaxEndMax;
+	
+	boolean calced = false;
+	
+	public Coverage(Vector<Vector<Integer>> coverages){
+		this.coverages = coverages;
+	}
+	
+	public boolean calcEndCovs(Vector<Double> endSize){
+		boolean sizeCheck = endSize.size() == coverages.size();
+		if(sizeCheck){
+			coverageLeftEnd = new Vector<Vector<Integer>>();
+			coverageRightEnd = new Vector<Vector<Integer>>();
+			for(int i=0; i<coverages.size(); i++){
+				coverageLeftEnd.add(new Vector<Integer>(coverages.get(i).subList(0, Math.min(endSize.get(i).intValue(), coverages.get(i).size()))));
+				coverageRightEnd.add(new Vector<Integer>(coverages.get(i).subList(Math.max(0, coverages.get(i).size()-endSize.get(i).intValue()), coverages.get(i).size())));
+			}
+		}
+		calcStats();
+		return sizeCheck;
+	}
+		
+	private void calcStats(){
+		coverageMinEndMean = new Vector<Double>();
+		coverageMinEndSD = new Vector<Double>();
+		coverageMinEndMedian = new Vector<Double>();
+		coverageMinEndMAD = new Vector<Double>();
+		coverageMinEndMin = new Vector<Double>();
+		coverageMinEndMax = new Vector<Double>();
+		
+		coverageMaxEndMean = new Vector<Double>();
+		coverageMaxEndSD = new Vector<Double>();
+		coverageMaxEndMedian = new Vector<Double>();
+		coverageMaxEndMAD = new Vector<Double>();
+		coverageMaxEndMin = new Vector<Double>();
+		coverageMaxEndMax = new Vector<Double>();
+		
+		Vector<Double> leftMean = Statistic.means(coverageLeftEnd);
+		Vector<Double> leftSD = Statistic.sds(coverageLeftEnd);
+		Vector<Double> leftMedian = Statistic.medians(coverageLeftEnd);
+		Vector<Double> leftMAD = Statistic.mads(coverageLeftEnd);
+		Vector<Double> leftMin = Statistic.mins(coverageLeftEnd);
+		Vector<Double> leftMax = Statistic.maxs(coverageLeftEnd);
+		
+		Vector<Double> rightMean = Statistic.means(coverageRightEnd);
+		Vector<Double> rightSD = Statistic.sds(coverageRightEnd);
+		Vector<Double> rightMedian = Statistic.medians(coverageRightEnd);
+		Vector<Double> rightMAD = Statistic.mads(coverageRightEnd);
+		Vector<Double> rightMin = Statistic.mins(coverageRightEnd);
+		Vector<Double> rightMax = Statistic.maxs(coverageRightEnd);
+		
+		for (int i=0; i<leftMean.size(); i++){
+			if (leftMean.get(i) <= rightMean.get(i)){
+				coverageMinEndMean.add(leftMean.get(i));
+				coverageMinEndSD.add(leftSD.get(i));
+				coverageMinEndMedian.add(leftMedian.get(i));
+				coverageMinEndMAD.add(leftMAD.get(i));
+				coverageMinEndMin.add(leftMin.get(i));
+				coverageMinEndMax.add(leftMax.get(i));
+				
+				coverageMaxEndMean.add(rightMean.get(i));
+				coverageMaxEndSD.add(rightSD.get(i));
+				coverageMaxEndMedian.add(rightMedian.get(i));
+				coverageMaxEndMAD.add(rightMAD.get(i));
+				coverageMaxEndMin.add(rightMin.get(i));
+				coverageMaxEndMax.add(rightMax.get(i));
+			}
+			else{
+				coverageMinEndMean.add(rightMean.get(i));
+				coverageMinEndSD.add(rightSD.get(i));
+				coverageMinEndMedian.add(rightMedian.get(i));
+				coverageMinEndMAD.add(rightMAD.get(i));
+				coverageMinEndMin.add(rightMin.get(i));
+				coverageMinEndMax.add(rightMax.get(i));
+				
+				coverageMaxEndMean.add(leftMean.get(i));
+				coverageMaxEndSD.add(leftSD.get(i));
+				coverageMaxEndMedian.add(leftMedian.get(i));
+				coverageMaxEndMAD.add(leftMAD.get(i));
+				coverageMaxEndMin.add(leftMin.get(i));
+				coverageMaxEndMax.add(leftMax.get(i));
+			}
+		}
+		calced = true;
+	}
+	
+	public Vector<Double> getCoverageMinEndMean(){
+		return coverageMinEndMean;
+	}
+	
+	public Vector<Double> getCoverageMinEndSD(){
+		return coverageMinEndSD;
+	}
+	
+	public Vector<Double> getCoverageMinEndMedian(){
+		return coverageMinEndMedian;
+	}
+	
+	public Vector<Double> getCoverageMinEndMAD(){
+		return coverageMinEndMAD;
+	}
+	
+	public Vector<Double> getCoverageMinEndMin(){
+		return coverageMinEndMin;
+	}
+	
+	public Vector<Double> getCoverageMinEndMax(){
+		return coverageMinEndMax;
+	}
+	
+	public Vector<Double> getCoverageMaxEndMean(){
+		return coverageMaxEndMean;
+	}
+	
+	public Vector<Double> getCoverageMaxEndSD(){
+		return coverageMaxEndSD;
+	}
+	
+	public Vector<Double> getCoverageMaxEndMedian(){
+		return coverageMaxEndMedian;
+	}
+	
+	public Vector<Double> getCoverageMaxEndMAD(){
+		return coverageMaxEndMAD;
+	}
+	
+	public Vector<Double> getCoverageMaxEndMin(){
+		return coverageMaxEndMin;
+	}
+	
+	public Vector<Double> getCoverageMaxEndMax(){
+		return coverageMaxEndMax;
+	}
+}
diff --git a/src/de/rki/ng4/surankco/feature/Kmer.java b/src/de/rki/ng4/surankco/feature/Kmer.java
new file mode 100755
index 0000000..a33876f
--- /dev/null
+++ b/src/de/rki/ng4/surankco/feature/Kmer.java
@@ -0,0 +1,279 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.feature;
+
+import java.util.HashMap;
+import java.util.Map.Entry;
+import java.util.Vector;
+
+public class Kmer {
+
+	private Vector<String> consensusSequences;
+	private int kmerSize;
+	
+	private Vector<Integer> kmerEndUniquenessMin;
+	private Vector<Integer> kmerEndUniquenessMax;
+	
+	public Kmer(Vector<String> conSeqs, int kmerSize){
+		this.kmerSize = kmerSize;
+		consensusSequences = new Vector<String>();
+		StringBuffer temp;
+		for (String s : conSeqs) {
+			temp = new StringBuffer();
+			for (Character ch : s.toUpperCase().toCharArray()){
+				if (ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T'){
+					temp.append(ch);
+				}
+			}
+			consensusSequences.add(temp.toString());
+		}
+	}
+	
+	public Vector<Integer> getKmerGlobalUniqueness(){
+		Vector<HashMap<KmerKey,Boolean>> kmers = getKmers(kmerSize, consensusSequences);
+		Vector<Integer> uniqueKmerCount = new Vector<Integer>();
+		boolean unique;
+		int count;
+		for (int i=0; i<kmers.size(); i++){
+			count = 0;
+			for (KmerKey kmer : kmers.get(i).keySet()){
+				unique = true;
+				for (int j=0; j<kmers.size(); j++){
+					if (i!=j){
+						if (kmers.get(j).containsKey(kmer)){
+							unique = false;
+							break;
+						}
+					}
+				}
+				if (unique){
+					count++;
+				}
+			}
+			uniqueKmerCount.add(count);
+		}
+		return uniqueKmerCount;
+	}
+	
+	public Vector<Integer> getKmerEndUniquenessMin(int endLength){
+		if (kmerEndUniquenessMin == null)
+			calcKmerEndUniqueness(kmerSize, endLength);
+		return kmerEndUniquenessMin;
+	}
+	
+	public Vector<Integer> getKmerEndUniquenessMax(int endLength){
+		if (kmerEndUniquenessMax == null)
+			calcKmerEndUniqueness(kmerSize, endLength);
+		return kmerEndUniquenessMax;
+	}
+	
+	public Vector<Integer> getKmerSize(){
+		Vector<Integer> kmerSizes = new Vector<Integer>();
+		for (int i=0; i<consensusSequences.size(); i++){
+			kmerSizes.add(kmerSize);
+		}
+		return kmerSizes;
+	}
+	
+	private void calcKmerEndUniqueness(int kmerSize, int endLength){
+		Vector<HashMap<KmerKey,Boolean>> kmersLeft = getKmers(kmerSize, sequencePrefixes(endLength));
+		Vector<HashMap<KmerKey,Boolean>> kmersRight = getKmers(kmerSize, sequenceSuffixes(endLength));
+		kmerEndUniquenessMin = new Vector<Integer>();
+		kmerEndUniquenessMax = new Vector<Integer>();
+		int left, right;
+		boolean unique;
+		for (int i=0; i<kmersLeft.size(); i++){
+			left = 0;
+			for (KmerKey kmer : kmersLeft.get(i).keySet()){
+				unique = true;
+				for (int j=0; j<kmersLeft.size(); j++){
+					if (i!=j){
+						if (kmersLeft.get(j).containsKey(kmer) || kmersRight.get(j).containsKey(kmer)){
+							unique = false;
+							break;
+						}
+					}
+				}
+				if (unique){
+					left++;
+				}
+			}
+			right = 0;
+			for (KmerKey kmer : kmersRight.get(i).keySet()){
+				unique = true;
+				for (int j=0; j<kmersRight.size(); j++){
+					if (i!=j){
+						if (kmersLeft.get(j).containsKey(kmer) || kmersRight.get(j).containsKey(kmer)){
+							unique = false;
+							break;
+						}
+					}
+				}
+				if (unique){
+					right++;
+				}
+			}
+			kmerEndUniquenessMax.add(Math.max(left, right));
+			kmerEndUniquenessMin.add(Math.min(left, right));
+		}
+	}
+	
+	private Vector<HashMap<KmerKey, Boolean>> getKmers(int kmerSize, Vector<String> sequences){		
+		Vector<HashMap<KmerKey,Boolean>> kmers = new Vector<HashMap<KmerKey,Boolean>>();
+		for (String seq : sequences){
+			kmers.add(new HashMap<KmerKey, Boolean>());
+			String kmer;
+			for (int i=0; i<=seq.length()-kmerSize; i++){
+				kmer = seq.substring(i, i+kmerSize);
+				kmers.lastElement().put(new KmerKey(kmer), true);
+				
+			}
+		}
+		return kmers;
+	}
+
+	public Vector<Vector<Double>> getKmerDistances(){
+		Vector<HashMap<KmerKey, Boolean>> kmers = getKmers(kmerSize, consensusSequences);
+		Vector<Vector<Double>> pairwiseDistances = new Vector<Vector<Double>>();
+		for (HashMap<KmerKey, Boolean> kmerMapA : kmers){
+			pairwiseDistances.add(new Vector<Double>());
+			for (HashMap<KmerKey, Boolean> kmerMapB : kmers){
+				if (!kmerMapA.equals(kmerMapB))
+					pairwiseDistances.lastElement().add(editDistance(kmerMapA,kmerMapB));
+			}
+		}
+		return pairwiseDistances;
+	}
+
+	private double editDistance(HashMap<KmerKey, Boolean> kmerMapA, HashMap<KmerKey, Boolean> kmerMapB){
+		double distance = 0;
+		double sizeA = kmerMapA.size();
+		double sizeB = 0;	
+		for (Entry<KmerKey, Boolean> entry : kmerMapA.entrySet()){
+			if (kmerMapB.containsKey(entry.getKey())) sizeB++;
+		}
+		for (Entry<KmerKey, Boolean> entry : kmerMapA.entrySet()){
+			if (kmerMapB.containsKey(entry.getKey()))
+				distance += Math.abs((1/sizeA) - (1/sizeB));
+			else
+				distance += (1/sizeA);
+		}
+		return distance;
+	}
+
+	private Vector<String> sequencePrefixes(int length){
+		Vector<String> prefixes = new Vector<String>();
+		for (String s : consensusSequences){
+			prefixes.add(s.substring(0, Math.min(length, s.length())));
+		}
+		return prefixes;
+	}
+
+	private Vector<String> sequenceSuffixes(int length){
+		Vector<String> suffixes = new Vector<String>();
+		for (String s : consensusSequences){
+			suffixes.add(s.substring(Math.max(0, s.length()-length), s.length()));
+		}
+		return suffixes;
+	}
+
+	class MutableInt {
+		private int value = 0;
+		public void inc () { ++value; }
+		public int get () { return value; }
+		public String toString(){
+			return Integer.toString(value);
+		}
+	}
+	
+	private class KmerKey{
+		private String kmer;
+		private String revcomp;
+		
+		private int hash;
+
+		public KmerKey(String kmer){
+			this.kmer = kmer.toUpperCase();
+			this.revcomp = reverseCompl(this.kmer);
+			initHash();
+		}
+		
+		@Override
+		public String toString(){
+			return kmer + "<>" + revcomp;
+		}
+		
+		private String reverseCompl(String kmer){
+			StringBuffer revcomp = new StringBuffer();
+			for (Character ch : kmer.toCharArray()){
+				switch (ch){
+				case 'A':
+					revcomp.append('T');
+					break;
+				case 'C':
+					revcomp.append('G');
+					break;
+				case 'G':
+					revcomp.append('C');
+					break;
+				case 'T':
+					revcomp.append('A');
+					break;
+				default:
+					revcomp.append(ch);				
+				}
+			}
+			return revcomp.reverse().toString();
+		}
+		
+		@Override
+		public boolean equals(Object obj){
+			if(obj == this) {
+				return true;
+			}
+			if(!(obj instanceof KmerKey)) {
+				return false;
+			}
+			
+			KmerKey kk = (KmerKey) obj;
+			return (this.kmer.matches(kk.kmer) || this.kmer.matches(kk.revcomp));
+		}
+		
+		@Override
+		public int hashCode(){
+			return hash;
+		}
+		
+		private void initHash(){
+			hash = Math.max(hash(kmer), hash(revcomp));
+		}
+		
+		private int hash(String s){
+			int hash = 0;
+			for (Character ch : s.toCharArray()){
+				switch (ch){
+				case 'A':
+					hash = hash << 2;
+					hash = hash | 0;
+					break;
+				case 'C':
+					hash = hash << 2;
+					hash = hash | 1;
+					break;
+				case 'G':
+					hash = hash << 2;
+					hash = hash | 2;
+					break;
+				case 'T':
+					hash = hash << 2;
+					hash = hash | 3;
+					break;
+				}
+			}
+			return hash;
+		}
+	}
+}
diff --git a/src/de/rki/ng4/surankco/feature/Main.java b/src/de/rki/ng4/surankco/feature/Main.java
new file mode 100755
index 0000000..f8edc50
--- /dev/null
+++ b/src/de/rki/ng4/surankco/feature/Main.java
@@ -0,0 +1,257 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.feature;
+
+import java.util.Arrays;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.Assembly;
+import de.rki.ng4.surankco.data.AssemblyAce;
+import de.rki.ng4.surankco.data.AssemblySam;
+import de.rki.ng4.surankco.files.ExportR;
+import de.rki.ng4.surankco.tools.Statistic;
+
+/**
+ * Part of the feature generation/calculation of SuRankCo
+ * Main procedure, initializes file imports, features extractions and file exports
+ * 
+ * @author Mathias Kuhring
+ *
+ */
+public class Main {
+
+	// debugging
+	public final static boolean DEBUGMODE = false;
+	
+	// simple timer use with timerStart() and timerStop()
+	public static long timer;
+	
+	
+	/**
+	 * 
+	 * 
+	 * @param args Arguments and their order are described in the surankco-feature R-script 
+	 */
+	public static void main(String[] args) {
+		
+		if (Main.DEBUGMODE){ System.out.println(Arrays.toString(args)); }
+		
+		// prepare export object to collect features
+		ExportR e = new ExportR();
+
+		// import ace file and read qualities
+		if (Main.DEBUGMODE){ System.out.println("-> import assembly"); }
+		String contigfile = args[0];
+		String readfile = args[1];
+		String fastqParameter = args[5];
+		
+		boolean ace = contigfile.endsWith("ace");
+		Assembly assembly;
+		if (ace){
+			assembly = new AssemblyAce(contigfile, readfile, fastqParameter);			
+		}
+		else{
+			assembly = new AssemblySam(contigfile, readfile);
+		}
+		System.gc();
+		
+		// contig lenght filter
+		assembly.lengthFilter(Integer.parseInt(args[8]));
+		
+		// calculate several features
+		Several features = new Several(assembly.getContigs());
+
+		e.addStringVariable("ContigID", features.getIds());
+		e.addIntegerVariable("Length", features.getContigLength());
+		if (ace){ // !!! no BaseCount and BaseSegmentCount for sam contigs yet, because no padded sequence available yet
+			e.addIntegerVariable("BaseCount", features.getBaseCounts());
+			e.addIntegerVariable("BaseSegmentCount", features.getBaseSeqmentCounts());
+		}
+		e.addIntegerVariable("ReadCount", features.getReadCounts());
+		e.addDoubleVariable("ReadComplementFraction", features.getReadComplementFraction());
+
+		if (Main.DEBUGMODE){ System.out.println("-> contig qualities"); }
+		
+		if (ace){ // !!! no quality values for sam contigs yet
+			Vector<Vector<Integer>> contigQualities = features.getQualityValues();
+			e.addDoubleVariable("ContigQualitiesMean", Statistic.means(contigQualities));
+			e.addDoubleVariable("ContigQualitiesSD", Statistic.sds(contigQualities));
+			e.addDoubleVariable("ContigQualitiesMedian", Statistic.medians(contigQualities));
+			e.addDoubleVariable("ContigQualitiesMAD", Statistic.mads(contigQualities));
+			e.addDoubleVariable("ContigQualitiesMin", Statistic.mins(contigQualities));
+			e.addDoubleVariable("ContigQualitiesMax", Statistic.maxs(contigQualities));
+		}
+
+		System.gc();
+		
+		// calculate read quality features
+		if (Main.DEBUGMODE){ System.out.println("-> read qualities"); }
+		
+		String regex = args[6];
+		
+		Vector<Vector<String>> readIds = features.getReadIds(regex);
+		Vector<Vector<Integer>> readQualities = assembly.getPooledReadQualities(readIds);
+	
+		e.addDoubleVariable("ReadQualitiesMean", Statistic.means(readQualities));
+		e.addDoubleVariable("ReadQualitiesSD", Statistic.sds(readQualities));
+		e.addDoubleVariable("ReadQualitiesMedian", Statistic.medians(readQualities));
+		e.addDoubleVariable("ReadQualitiesMAD", Statistic.mads(readQualities));
+		e.addDoubleVariable("ReadQualitiesMin", Statistic.mins(readQualities));
+		e.addDoubleVariable("ReadQualitiesMax", Statistic.maxs(readQualities));
+
+		if (Main.DEBUGMODE){ System.out.println("-> read lengths"); }
+		
+		Vector<Vector<Integer>> readLengths = assembly.getReadLengths(readIds);
+		
+		Vector<Double> readLengthsMean = Statistic.means(readLengths);
+		e.addDoubleVariable("ReadLengthsMean", readLengthsMean);
+		e.addDoubleVariable("ReadLengthsSD", Statistic.sds(readLengths));
+		e.addDoubleVariable("ReadLengthsMedian", Statistic.medians(readLengths));
+		e.addDoubleVariable("ReadLengthsMAD", Statistic.mads(readLengths));
+		e.addDoubleVariable("ReadLengthsMin", Statistic.mins(readLengths));
+		e.addDoubleVariable("ReadLengthsMax", Statistic.maxs(readLengths));
+		
+		assembly = null;
+		System.gc();
+		
+		// calculate read length features
+		if (Main.DEBUGMODE){ System.out.println("-> read lengths padded"); }	
+		Vector<Vector<Integer>> readPaddedLengths = features.getReadPaddedLengths();
+		e.addDoubleVariable("ReadLengthsPaddedMean", Statistic.means(readPaddedLengths));
+		e.addDoubleVariable("ReadLengthsPaddedSD", Statistic.sds(readPaddedLengths));
+		e.addDoubleVariable("ReadLengthsPaddedMedian", Statistic.medians(readPaddedLengths));
+		e.addDoubleVariable("ReadLengthsPaddedMAD", Statistic.mads(readPaddedLengths));
+		e.addDoubleVariable("ReadLengthsPaddedMin", Statistic.mins(readPaddedLengths));
+		e.addDoubleVariable("ReadLengthsPaddedMax", Statistic.maxs(readPaddedLengths));
+
+		if (Main.DEBUGMODE){ System.out.println("-> read lengths clipping"); }
+		Vector<Vector<Integer>> readClippingLengths = features.getReadClippingLengths();
+		e.addDoubleVariable("ReadLengthsClippingMean", Statistic.means(readClippingLengths));
+		e.addDoubleVariable("ReadLengthsClippingSD", Statistic.sds(readClippingLengths));
+		e.addDoubleVariable("ReadLengthsClippingMedian", Statistic.medians(readClippingLengths));
+		e.addDoubleVariable("ReadLengthsClippingMAD", Statistic.mads(readClippingLengths));
+		e.addDoubleVariable("ReadLengthsClippingMin", Statistic.mins(readClippingLengths));
+		e.addDoubleVariable("ReadLengthsClippingMax", Statistic.maxs(readClippingLengths));
+
+		if (Main.DEBUGMODE){ System.out.println("-> gc-content"); }
+		e.addDoubleVariable("GcFraction", features.getGcContent());
+
+		System.gc();
+		
+		// calculate coverage features
+		if (Main.DEBUGMODE){ System.out.println("-> coverage"); timerStart(); }
+
+		Vector<Vector<Integer>> coverageGlobal = features.getGlobalCoverages();
+		e.addDoubleVariable("CoverageGlobalMean", Statistic.means(coverageGlobal));
+		e.addDoubleVariable("CoverageGlobalSD", Statistic.sds(coverageGlobal));
+		e.addDoubleVariable("CoverageGlobalMedian", Statistic.medians(coverageGlobal));
+		e.addDoubleVariable("CoverageGlobalMAD", Statistic.mads(coverageGlobal));
+		e.addDoubleVariable("CoverageGlobalMin", Statistic.mins(coverageGlobal));
+		e.addDoubleVariable("CoverageGlobalMax", Statistic.maxs(coverageGlobal));
+		if (Main.DEBUGMODE){ timerStop(); }
+				
+		if (Main.DEBUGMODE){ System.out.println("-> coverage 2"); timerStart(); }
+		Coverage cov = new Coverage(coverageGlobal);
+		if (cov.calcEndCovs(readLengthsMean)){
+			e.addDoubleVariable("CoverageMinEndMean", cov.getCoverageMinEndMean());
+			e.addDoubleVariable("CoverageMinEndSD", cov.getCoverageMinEndSD());
+			e.addDoubleVariable("CoverageMinEndMedian", cov.getCoverageMinEndMedian());
+			e.addDoubleVariable("CoverageMinEndMAD", cov.getCoverageMinEndMAD());
+			e.addDoubleVariable("CoverageMinEndMin", cov.getCoverageMinEndMin());
+			e.addDoubleVariable("CoverageMinEndMax", cov.getCoverageMinEndMax());
+			e.addDoubleVariable("CoverageMaxEndMean", cov.getCoverageMaxEndMean());
+			e.addDoubleVariable("CoverageMaxEndSD", cov.getCoverageMaxEndSD());
+			e.addDoubleVariable("CoverageMaxEndMedian", cov.getCoverageMaxEndMedian());
+			e.addDoubleVariable("CoverageMaxEndMAD", cov.getCoverageMaxEndMAD());
+			e.addDoubleVariable("CoverageMaxEndMin", cov.getCoverageMaxEndMin());
+			e.addDoubleVariable("CoverageMaxEndMax", cov.getCoverageMaxEndMax());
+		}
+		if (Main.DEBUGMODE){ timerStop(); }
+	
+		if (Main.DEBUGMODE){ System.out.println("-> core coverage"); timerStart(); }
+		Vector<Vector<Integer>> coreCoverage = features.getCoreCoverages();
+		e.addDoubleVariable("CoreCoverageGlobalMean", Statistic.means(coreCoverage));
+		e.addDoubleVariable("CoreCoverageGlobalSD", Statistic.sds(coreCoverage));
+		e.addDoubleVariable("CoreCoverageGlobalMedian", Statistic.medians(coreCoverage));
+		e.addDoubleVariable("CoreCoverageGlobalMAD", Statistic.mads(coreCoverage));
+		e.addDoubleVariable("CoreCoverageGlobalMin", Statistic.mins(coreCoverage));
+		e.addDoubleVariable("CoreCoverageGlobalMax", Statistic.maxs(coreCoverage));
+		if (Main.DEBUGMODE){ timerStop(); }
+		
+		// catch coverage vector size error
+		boolean devError = false;
+		for (int i=0; i<coverageGlobal.size(); i++){
+			if (devError = coverageGlobal.get(i).size() != coreCoverage.get(i).size()){
+				System.out.println("dev.error: different vector size of coverage (" + coverageGlobal.get(i).size() +
+						") and core coverage (" + coreCoverage.get(i).size() + ") for contig " + features.getIds().get(i));
+			}
+		}
+		if (devError){
+			System.exit(1);
+		}
+		
+		if (Main.DEBUGMODE){ System.out.println("-> core coverage 2"); timerStart(); }
+		Coverage corecov = new Coverage(coreCoverage);
+		if (corecov.calcEndCovs(readLengthsMean)){
+			e.addDoubleVariable("CoreCoverageMinEndMean", corecov.getCoverageMinEndMean());
+			e.addDoubleVariable("CoreCoverageMinEndSD", corecov.getCoverageMinEndSD());
+			e.addDoubleVariable("CoreCoverageMinEndMedian", corecov.getCoverageMinEndMedian());
+			e.addDoubleVariable("CoreCoverageMinEndMAD", corecov.getCoverageMinEndMAD());
+			e.addDoubleVariable("CoreCoverageMinEndMin", corecov.getCoverageMinEndMin());
+			e.addDoubleVariable("CoreCoverageMinEndMax", corecov.getCoverageMinEndMax());
+			e.addDoubleVariable("CoreCoverageMaxEndMean", corecov.getCoverageMaxEndMean());
+			e.addDoubleVariable("CoreCoverageMaxEndSD", corecov.getCoverageMaxEndSD());
+			e.addDoubleVariable("CoreCoverageMaxEndMedian", corecov.getCoverageMaxEndMedian());
+			e.addDoubleVariable("CoreCoverageMaxEndMAD", corecov.getCoverageMaxEndMAD());
+			e.addDoubleVariable("CoreCoverageMaxEndMin", corecov.getCoverageMaxEndMin());
+			e.addDoubleVariable("CoreCoverageMaxEndMax", corecov.getCoverageMaxEndMax());
+		}
+		if (Main.DEBUGMODE){ timerStop(); }
+
+		System.gc();
+		
+		// calculate kmer features
+		boolean calcKmers = Boolean.parseBoolean(args[7]);
+		if (calcKmers){
+			if (Main.DEBUGMODE){ System.out.println("-> kmer"); timerStart(); }
+			int kmerSize = 8;
+			int ends = (int) (Statistic.mean(Statistic.means(readLengths))*0.8);
+			Kmer kmer = new Kmer(features.getConsensusSequences(), kmerSize);
+			e.addIntegerVariable("KmerSize", kmer.getKmerSize());
+			e.addIntegerVariable("KmerGlobalUniqueness", kmer.getKmerGlobalUniqueness());
+			e.addIntegerVariable("KmerEndUniquenessMin", kmer.getKmerEndUniquenessMin(ends));
+			e.addIntegerVariable("KmerEndUniquenessMax", kmer.getKmerEndUniquenessMax(ends));
+			e.addDoubleVariable("KmerDistanceMin", Statistic.mins(kmer.getKmerDistances()));
+			if (Main.DEBUGMODE){ timerStop(); }
+		}
+
+		
+		if (Main.DEBUGMODE){ System.out.println("-> export"); }
+
+		System.gc();
+		
+		// export features to table format text
+		String outFilename = args[2];
+		e.write(outFilename);
+		
+		// export Coverage and CoreCoverage to fasta like text
+		System.gc();
+		e.writeCoverage(args[3], features.getIds(), coverageGlobal);
+		System.gc();
+		e.writeCoverage(args[4], features.getIds(), coreCoverage);
+	}
+	
+	// starts a simple timer
+	private static void timerStart(){
+		timer = System.currentTimeMillis();
+	}
+	
+	// stops the simple timer
+	private static void timerStop(){
+		System.out.println((System.currentTimeMillis() - timer) + " ms");
+	}
+	
+}
diff --git a/src/de/rki/ng4/surankco/feature/Several.java b/src/de/rki/ng4/surankco/feature/Several.java
new file mode 100755
index 0000000..e2bdea7
--- /dev/null
+++ b/src/de/rki/ng4/surankco/feature/Several.java
@@ -0,0 +1,287 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.feature;
+
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.Contig;
+import de.rki.ng4.surankco.data.Read;
+
+public class Several {
+
+	private Vector<Contig> contigs;
+
+	public Several(Vector<Contig> contigs){
+		this.contigs = contigs;
+	}
+
+
+	public void printContigInfo(){
+		for (Contig c : contigs){
+			System.out.println(c.getId());
+			System.out.println(c.getQualityValues().toString());
+		}
+	}
+
+
+	public Vector<String> getIds(){
+		Vector<String> ids = new Vector<String>();
+		for (Contig c : contigs){
+			ids.add(c.getId());
+		}
+		return ids;
+	}
+
+	public Vector<Integer> getBaseCounts(){
+		Vector<Integer> baseCounts = new Vector<Integer>();
+		for (Contig c : contigs){
+			baseCounts.add(c.getBaseCount());
+		}
+		return baseCounts;
+	}
+
+	public Vector<Integer> getReadCounts(){
+		Vector<Integer> readCounts = new Vector<Integer>();
+		for (Contig c : contigs){
+			readCounts.add(c.getReadCount());
+		}
+		return readCounts;
+	}
+
+	public Vector<Integer> getBaseSeqmentCounts(){
+		Vector<Integer> baseSeqmentCounts = new Vector<Integer>();
+		for (Contig c : contigs){
+			baseSeqmentCounts.add(c.getBaseSeqmentCount());
+		}
+		return baseSeqmentCounts;
+	}
+
+	public Vector<Integer> getContigLength(){
+		Vector<Integer> length = new Vector<Integer>();
+		for (Contig c : contigs){
+			length.add(c.getLength());
+		}
+		return length;
+	}
+
+	public Vector<Double> getReadComplementFraction(){
+		Vector<Double> fraction = new Vector<Double>();
+		double complements;
+		for (Contig c : contigs){
+			complements = 0;
+			for (Read read : c.getReads()){
+				if (read.getOrientation() == 'C') complements++;
+			}
+			fraction.add(complements / (double) c.getReads().size());
+		}
+		return fraction;
+	}
+
+	public Vector<String> getConsensusSequences(){
+		Vector<String> conSeqs = new Vector<String>();
+		for (Contig c : contigs){
+			conSeqs.add(c.getConsensusSequence());
+		}
+		return conSeqs;
+	}
+
+	public Vector<Vector<Integer>> getQualityValues(){
+		Vector<Vector<Integer>> qualityValues = new Vector<Vector<Integer>>();
+		for (Contig c : contigs){
+			qualityValues.add(c.getQualityValues());
+		}
+		return qualityValues;
+	}
+
+	// Collects contig read ids and uses a regex (if indicated) to split off additional suffixes in contrast to the original read names in fastq/qual files
+	public Vector<Vector<String>> getReadIds(String regex){
+		Vector<Vector<String>> readIds = new Vector<Vector<String>>();
+		
+		if (regex.matches("nosplit")){
+			for (Contig c : contigs){
+				readIds.add(new Vector<String>());
+				for (Read read : c.getReads()){
+					readIds.lastElement().add(read.getId());
+				}
+			}
+		}
+		else {
+			for (Contig c : contigs){
+				readIds.add(new Vector<String>());
+				for (Read read : c.getReads()){
+					readIds.lastElement().add(read.getId().split(regex)[0]);
+				}
+			}
+		}
+		
+		return readIds;
+	}
+	
+	public Vector<Vector<Integer>> getReadPaddedLengths(){
+		Vector<Vector<Integer>> lengths = new Vector<Vector<Integer>>();
+		for (Contig c : contigs){
+			lengths.add(new Vector<Integer>());
+			for (Read read : c.getReads()){
+				lengths.lastElement().add(read.getPaddedLength());
+			}
+		}
+		return lengths;
+	}
+
+	public Vector<Vector<Integer>> getReadClippingLengths(){
+		Vector<Vector<Integer>> clippings = new Vector<Vector<Integer>>();
+		for (Contig c : contigs){
+			clippings.add(new Vector<Integer>());
+			for (Read read : c.getReads()){
+				clippings.lastElement().add(read.getClippingLength());
+			}
+		}
+		return clippings;
+	}
+
+	public void printConsensusSequences(){
+		for (Contig c : contigs){
+			System.out.println(c.getId() + " (l=" + c.getConsensusSequence().length() + "):\t" + c.getConsensusSequence().substring(0,Math.min(100, c.getConsensusSequence().length())) + "...");
+		}
+	}
+
+	public void printContigReads(int contigNum){
+		Contig c = contigs.get(contigNum-1);
+		System.out.println("Reads in Contig " + c.getId() + ":");
+		for (Read read : c.getReads()){
+			System.out.println(read.getId() + ":\t" + read.getOrientation() + " " + read.getOffset());
+		}
+	}
+
+	public Vector<Vector<Integer>> getGlobalCoverages(){
+		Vector<Vector<Integer>> coverages = new Vector<Vector<Integer>>();
+
+		for (Contig c : contigs){
+			coverages.add(calcCoverage(c.getConsensusSequence(), c.getNucleotideCount()));
+		}
+
+		return coverages;
+	}
+
+	public Vector<Vector<Integer>> getCoreCoverages(){
+		Vector<Vector<Integer>> ccoverages = new Vector<Vector<Integer>>();
+
+		for (Contig c : contigs){
+			ccoverages.add(calcCoreCoverage(c.getConsensusSequence(), c.getNucleotideCount()));
+		}
+
+		return ccoverages;
+	}
+
+	private Vector<Integer> calcCoverage(String consensus, int[][] nucCounts){
+		Vector<Integer> coverage = new Vector<Integer>();
+		int nucCountSize = nucCounts.length;
+		int consensusSize = consensus.length();
+		for (int i=0; i<consensus.length(); i++){
+			if (consensus.charAt(i) != '*'){
+				coverage.add(nucCounts[i][0] + nucCounts[i][1] + nucCounts[i][2] + 
+						nucCounts[i][3] + nucCounts[i][4] + nucCounts[i][5]);
+			}
+			//else System.out.print("*");
+		}
+		return coverage;
+	}
+
+	//	Supported Nucleotides:
+	//	A 	Adenin
+	//	C 	Cytosin
+	//	G 	Guanin
+	//	T 	Thymin
+	//	R 	G A (PuRine)
+	//	Y 	T C (PYrimidine)
+	//	K 	G T (Ketone)
+	//	M 	A C (AMinogruppen)
+	//	S 	G C (Starke Wechselwirkung)
+	//	W 	A T (Weiche Wechselwirkung)
+	//	B 	G T C (nicht A) (B kommt nach A)
+	//	D 	G A T (nicht C) (D kommt nach C)
+	//	H 	A C T (nicht G) (H kommt nach G)
+	//	V 	G C A (nicht T, nicht U) (V kommt nach U)
+	//	N 	A G C T (aNy)
+	private Vector<Integer> calcCoreCoverage(String consensus, int[][] nucCounts){
+		Vector<Integer> ccoverage = new Vector<Integer>();
+		for (int i=0; i<consensus.length(); i++){
+			switch (consensus.charAt(i)){
+			case 'A': // A 	Adenin
+				ccoverage.add(nucCounts[i][0]);
+				break;
+			case 'C': // C 	Cytosin
+				ccoverage.add(nucCounts[i][1]);
+				break;
+			case 'G': // G 	Guanin
+				ccoverage.add(nucCounts[i][2]);
+				break;
+			case 'T': // T 	Thymin
+				ccoverage.add(nucCounts[i][3]);
+				break;
+			case 'R': // R 	G A (PuRine)
+				ccoverage.add(Math.max(nucCounts[i][2], nucCounts[i][0]));
+				break;
+			case 'Y': // Y 	T C (PYrimidine)
+				ccoverage.add(Math.max(nucCounts[i][3], nucCounts[i][1]));
+				break;
+			case 'K': // K 	G T (Ketone)
+				ccoverage.add(Math.max(nucCounts[i][2], nucCounts[i][3]));
+				break;
+			case 'M': // M 	A C (AMinogruppen)
+				ccoverage.add(Math.max(nucCounts[i][0], nucCounts[i][1]));
+				break;		
+			case 'S': // S 	G C (Starke Wechselwirkung)
+				ccoverage.add(Math.max(nucCounts[i][2], nucCounts[i][1]));
+				break;
+			case 'W': // W 	A T (Weiche Wechselwirkung)
+				ccoverage.add(Math.max(nucCounts[i][0], nucCounts[i][3]));
+				break;
+			case 'B': // B 	G T C (nicht A) (B kommt nach A)
+				ccoverage.add(Math.max(Math.max(nucCounts[i][2], nucCounts[i][3]), nucCounts[i][1]));
+				break;
+			case 'D': // D 	G A T (nicht C) (D kommt nach C)
+				ccoverage.add(Math.max(Math.max(nucCounts[i][2], nucCounts[i][0]), nucCounts[i][3]));
+				break;
+			case 'H': // H 	A C T (nicht G) (H kommt nach G)
+				ccoverage.add(Math.max(Math.max(nucCounts[i][0], nucCounts[i][1]), nucCounts[i][3]));
+				break;
+			case 'V': // V 	G C A (nicht T, nicht U) (V kommt nach U)
+				ccoverage.add(Math.max(Math.max(nucCounts[i][2], nucCounts[i][1]), nucCounts[i][0]));
+				break;
+			case 'N': // N 	A G C T (aNy)
+				ccoverage.add(Math.max(Math.max(nucCounts[i][0], nucCounts[i][1]), 
+						Math.max(nucCounts[i][2], nucCounts[i][3])));
+				break;
+			case '*':
+				//System.out.print("*");
+				break;
+			default:
+				ccoverage.add(nucCounts[i][0] + nucCounts[i][1] + nucCounts[i][2] + 
+						nucCounts[i][3] + nucCounts[i][4] + nucCounts[i][5]);
+				System.out.println("warning: unsupported nucleotide in consensus at position " + i + ": " + consensus.charAt(i) + " -> using full coverage for this position");
+			}
+		}
+		return ccoverage;
+	}
+
+	public Vector<Double> getGcContent(){
+		Vector<Double> content = new Vector<Double>();
+		double gc, at; 
+		for (Contig c : contigs){
+			at = 0;
+			gc = 0;
+			for (Character ch : c.getConsensusSequence().toUpperCase().toCharArray()){
+				if (ch == 'A' || ch == 'T') at++;
+				if (ch == 'G' || ch == 'C') gc++;
+			}
+			content.add(gc/(gc+at));
+		}
+		return content;
+	}
+
+
+}
diff --git a/src/de/rki/ng4/surankco/files/Ace.java b/src/de/rki/ng4/surankco/files/Ace.java
new file mode 100755
index 0000000..46307f8
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/Ace.java
@@ -0,0 +1,175 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.files;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.AceRead;
+import de.rki.ng4.surankco.data.AceContig;
+
+
+public class Ace {
+
+	private String fileName;
+	private Vector<AceContig> contigs;
+
+	public Ace(String fileName){
+		this.fileName = fileName;
+		contigs = new Vector<AceContig>();
+		try {
+			LineNumberReader acebr = new LineNumberReader(new FileReader(new File(fileName)));
+			try {
+				String line;
+				int readCounter = 0;
+				while ((line = acebr.readLine()) != null){
+					if (line.startsWith("CO ")){
+						contigs.add(new AceContig(line));
+						contigs.lastElement().setLineCO( acebr.getLineNumber()-1 );
+						readCounter = 0;
+					}
+					else if (line.startsWith("BQ")){
+						contigs.lastElement().setLineBQ( acebr.getLineNumber()-1 );
+					}
+					else if (line.startsWith("AF ")){
+						contigs.lastElement().addRead(new AceRead());
+						contigs.lastElement().getReads().lastElement().addAF(line);
+					}
+					else if (line.startsWith("RD ")){
+						readCounter++;
+						
+						contigs.lastElement().getReads().get(readCounter-1).addRD(line);
+						contigs.lastElement().getReads().get(readCounter-1).setLineRD( acebr.getLineNumber()-1 );
+						
+						StringBuffer readSeq = new StringBuffer();
+						while ((line = acebr.readLine()) != null && line.length() != 0){
+							readSeq.append(line);
+						}
+						contigs.lastElement().getReads().get(readCounter-1).setPaddedSequence( readSeq.toString() );
+					}
+					else if (line.startsWith("QA ")){
+						contigs.lastElement().getReads().get(readCounter-1).addQA(line);
+					}
+				}
+			}
+			finally {
+				acebr.close();
+			}
+		} 
+		catch (FileNotFoundException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + fileName);
+			System.exit(1);
+		} 
+		catch (IOException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + fileName);
+			System.exit(1);
+		}
+	}
+
+	public void readConsensusSequences(){
+		String line;
+		StringBuffer consensus;
+		try {
+			BufferedReader acebr = new BufferedReader(new FileReader(new File(fileName)));
+			int lineCounter = 0;
+			try {
+				for (AceContig c : contigs){
+					consensus = new StringBuffer();
+					while (lineCounter < c.getLineCO()){
+						if ((line = acebr.readLine()) == null) break;
+						lineCounter++;
+					}
+
+					if ((line = acebr.readLine()) != null && line.startsWith("CO")){
+						lineCounter++;
+						while ((line = acebr.readLine()) != null && line.length() != 0){
+							lineCounter++;
+							consensus.append(line);
+						}
+					}
+					lineCounter++;
+					c.setConsensusSequence( consensus.toString().toUpperCase() );
+				}
+			}
+			finally {
+				acebr.close();
+			}
+		} 
+		catch (FileNotFoundException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + fileName);
+			System.exit(1);
+		} 
+		catch (IOException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + fileName);
+			System.exit(1);
+		}
+	}
+
+	public void readQualityValues(){
+		String line;
+		String[] splits;
+		try {
+			BufferedReader acebr = new BufferedReader(new FileReader(new File(fileName)));
+			int lineCounter = 0;
+			try {
+				for (AceContig c : contigs){
+					while (lineCounter < c.getLineBQ()){
+						if ((line = acebr.readLine()) == null) break;
+						lineCounter++;
+					}
+
+					if ((line = acebr.readLine()) != null && line.matches("BQ")){
+						lineCounter++;
+						while ((line = acebr.readLine()) != null && line.length() != 0){
+							lineCounter++;
+							splits = line.split(" ");
+							for (String s : splits){
+								c.getQualityValues().add(Integer.parseInt(s));
+							}
+						}
+					}
+					lineCounter++;
+				}
+			}
+			finally {
+				acebr.close();
+			}
+		} 
+		catch (FileNotFoundException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + fileName);
+			System.exit(1);
+		} 
+		catch (IOException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + fileName);
+			System.exit(1);
+		}
+	}
+
+	/**
+	 * @return the contigs
+	 */
+	public Vector<AceContig> getContigs() {
+		return contigs;
+	}
+
+	/**
+	 * @param contigs the contigs to set
+	 */
+	public void setContigs(Vector<AceContig> contigs) {
+		this.contigs = contigs;
+	}
+}
diff --git a/src/de/rki/ng4/surankco/files/ExportR.java b/src/de/rki/ng4/surankco/files/ExportR.java
new file mode 100755
index 0000000..417fb25
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/ExportR.java
@@ -0,0 +1,131 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.files;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.util.Vector;
+
+public class ExportR {
+
+	private Vector<String> header;
+	private Vector<Vector<String>> data;
+
+	public ExportR(){
+		header = new Vector<String>();
+		data = new Vector<Vector<String>>();
+	}
+
+	public void addStringVariable(String name, Vector<String> values){
+		header.add(name);
+		data.add(values);
+	}
+	
+	public void addIntegerVariable(String name, Vector<Integer> values){
+		header.add(name);
+		data.add(ints(values));
+	}
+	
+	public void addDoubleVariable(String name, Vector<Double> values){
+		header.add(name);
+		data.add(doubles(values));
+	}
+
+	public void write(String fileName){
+		try {
+			BufferedWriter bw = new BufferedWriter(new FileWriter(new File(fileName)));
+			try {
+				for (String s : header){
+					bw.append(s + " ");
+				}
+				bw.newLine();				
+
+				for (int i=0; i<data.firstElement().size(); i++){
+					for (int j=0; j<data.size(); j++){
+						bw.append(data.get(j).get(i) + " ");
+					}
+					bw.newLine();
+				}
+			}
+			finally {
+				bw.close();
+			}
+		} catch (IOException e) {
+			e.printStackTrace();
+			System.out.println("error: problems writing file: " + fileName);
+			System.exit(1);
+		}
+	}
+
+	public static Vector<String> intVecAsRString(Vector<Vector<Integer>> vecs){
+		Vector<String> output = new Vector<String>();
+		StringBuffer string;
+		for (Vector<Integer> v : vecs){
+			string = new StringBuffer("\"");
+			for (Integer i : v){
+				string.append(i + " ");
+			}
+			string.replace(string.length(), string.length(), "\"");
+			output.add(string.toString());
+		}
+		return output;
+	}
+	
+	private static Vector<String> ints(Vector<Integer> vecs){
+		Vector<String> output = new Vector<String>();
+		for (Integer i : vecs){
+			output.add(i.toString());
+		}
+		return output;
+	}
+	
+	private static Vector<String> doubles(Vector<Double> vecs){
+		Vector<String> output = new Vector<String>();
+		for (Double i : vecs){
+			output.add(i.toString());
+		}
+		return output;
+	}
+
+	public void writeCoverage(String fileName, Vector<String> ids, Vector<Vector<Integer>> covs) {
+		try {
+			BufferedWriter bw = new BufferedWriter(new FileWriter(new File(fileName)));
+			try {
+				DecimalFormat df =   new DecimalFormat( "000" );
+				
+				for (int i=0; i<ids.size(); i++){
+					bw.append(">" + ids.get(i));
+					bw.newLine();
+					
+					
+					
+					int count = 0;
+					for (int j=0; j<covs.get(i).size(); j++){
+						
+						bw.append( df.format(covs.get(i).get(j)) + " " );
+						count++;
+						
+						if (count == 15){
+							bw.newLine();
+							count = 0;
+						}
+					}
+					bw.newLine();
+				}
+			}
+			finally {
+				bw.close();
+			}
+		} catch (IOException e) {
+			e.printStackTrace();
+			System.out.println("error: problems writing file: " + fileName);
+			System.exit(1);
+		}
+	}
+}
diff --git a/src/de/rki/ng4/surankco/files/Fasta.java b/src/de/rki/ng4/surankco/files/Fasta.java
new file mode 100755
index 0000000..c82b5b4
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/Fasta.java
@@ -0,0 +1,52 @@
+package de.rki.ng4.surankco.files;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.FastaContig;
+
+public class Fasta {
+
+	private String filename;
+	
+	public Fasta(String filename){
+		this.filename = filename;
+	}
+	
+	public Vector<FastaContig> parseFasta(){
+		Vector<FastaContig> contigs = new Vector<FastaContig>();
+		try {
+			BufferedReader br = new BufferedReader(new FileReader(new File(filename)));
+			try {
+				String line;
+				while ((line = br.readLine()) != null){
+					if (line.startsWith(">")){
+						contigs.add(new FastaContig(line));
+					}
+					else {
+						contigs.lastElement().appendSequence(line.trim().toUpperCase());
+					}
+				}
+			}
+			finally {
+				br.close();
+			}
+		} 
+		catch (FileNotFoundException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + filename);
+			System.exit(1);
+		} 
+		catch (IOException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + filename);
+			System.exit(1);
+		}
+		return contigs;
+	}
+	
+}
diff --git a/src/de/rki/ng4/surankco/files/Fastq.java b/src/de/rki/ng4/surankco/files/Fastq.java
new file mode 100755
index 0000000..08db3ee
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/Fastq.java
@@ -0,0 +1,217 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.files;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.FastqReads;
+import de.rki.ng4.surankco.feature.Main;
+
+/**
+ * Import of read qualities from Fastq-Files
+ * 
+ * @author Mathias Kuhring
+ *
+ */
+public class Fastq {
+
+	String fastqParameter;
+	String fastqVersion;
+	
+	Vector<String> quality;
+	
+	public Fastq(String filename, FastqReads reads, String parameter) {
+		quality = new Vector<String>();
+		
+		fastqParameter = parameter;
+		
+		try {
+			BufferedReader br = new BufferedReader(new FileReader(new File(filename)));
+			try {
+				String line;
+				String[] splits;
+				while ((line = br.readLine()) != null){
+					if (line.startsWith("@")){
+						// parse read id
+						splits = line.split(" ");
+						reads.readId.add(splits[0].substring(1));
+						// skip read sequence
+						br.readLine();
+						// skip +
+						br.readLine();
+						// read quality
+						line = br.readLine();
+						quality.add(line);
+					}
+				}
+			}
+			finally {
+				br.close();
+			}
+		} 
+		catch (FileNotFoundException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + filename);
+			System.exit(1);
+		} 
+		catch (IOException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + filename);
+			System.exit(1);
+		}
+		
+		if (Main.DEBUGMODE){
+			System.out.println(reads.readId.get(0));
+			System.out.println(quality.get(0));
+		}
+		
+		reads.readQuality.addAll(parseFastqQuality());
+	}	
+	
+	// Coordination of Fastq Quality conversion
+	private Vector<Vector<Integer>> parseFastqQuality(){
+		// convert from ascii to integer
+		Vector<Vector<Integer>> intQuality = asciiToNumber();
+		// determine fastq version
+		fastqVersion = getFastqVersion(new Vector<Vector<Integer>>(intQuality.subList(0, Math.min(intQuality.size(), 10))), fastqParameter);
+		// convert to illumina1.8+
+		intQuality = convertToIllumina18(intQuality);
+		
+		return intQuality;
+	}
+	
+	// Convert different fastq quality versions to illumina1.8+
+	private Vector<Vector<Integer>> convertToIllumina18(Vector<Vector<Integer>> intQuality) {
+		int illumina18Cutoff = 41;
+		
+		switch (fastqVersion) {
+		case "sanger":
+			intQuality = shiftQualities(intQuality, 33, illumina18Cutoff);
+			break;
+		case "solexa":
+			intQuality = shiftQualities(intQuality, 64, 62);
+			intQuality = convertSolexaToPhred(intQuality);
+			intQuality = shiftQualities(intQuality, 0, illumina18Cutoff);
+			break;
+		case "illumina13":
+			intQuality = shiftQualities(intQuality, 64, illumina18Cutoff);
+			break;
+		case "illumina15":
+			intQuality = shiftQualities(intQuality, 64, illumina18Cutoff);
+			break;
+		case "illumina18":
+			intQuality = shiftQualities(intQuality, 33, illumina18Cutoff);
+			break;
+		default:
+			System.out.println("error: bad fastq version");
+			System.exit(1);
+			break;
+		}
+		return intQuality;
+	}
+
+	// converts Solexa Quality to Phred Quality: qSolexy = -10 x log10(Pe / (1-Pe)) -> qPhred = -10 x log10(Pe)
+	// according to Cock et. al 2009, The Sanger FASTQ file format...
+	private Vector<Vector<Integer>> convertSolexaToPhred(Vector<Vector<Integer>> intQuality) {
+		double qSolexa = 0;
+		int converted = 0;
+		
+		for (int i=0; i<intQuality.size(); i++){
+			for (int j=0; j<intQuality.get(i).size(); j++){
+				qSolexa = intQuality.get(i).get(j);
+				converted = (int) Math.round(10d * Math.log10( Math.pow(10d,(qSolexa/10d)) + 1d ));
+				intQuality.get(i).set(j,converted);
+			}
+		}
+		
+		return intQuality;
+	}
+
+	// Scaling of Qualities
+	private Vector<Vector<Integer>> shiftQualities(Vector<Vector<Integer>> intQuality, int shift, int cutoff) {
+		int shifted = 0;
+		
+		for (int i=0; i<intQuality.size(); i++){
+			for (int j=0; j<intQuality.get(i).size(); j++){
+				shifted = Math.min((intQuality.get(i).get(j) - shift), cutoff);
+				intQuality.get(i).set(j,shifted);
+			}
+		}
+		
+		return intQuality;
+	}
+
+	// Converts the fastq quality ascii strings to integer vectors
+	private Vector<Vector<Integer>> asciiToNumber() {
+		Vector<Vector<Integer>> intQuality = new Vector<Vector<Integer>>();
+		
+		for (String read : quality){
+			intQuality.add(new Vector<Integer>());
+			for (char qual : read.toCharArray()){
+				intQuality.lastElement().add((int) qual);
+			}
+		}
+		
+		return intQuality;
+	}
+
+	// Determines Fastq version, estimating if parameter is "auto"
+	private String getFastqVersion(Vector<Vector<Integer>> intQuality, String parameter){
+		Vector<String> version = new Vector<String>();
+		if (parameter.matches("auto")){
+			int minQual = Integer.MAX_VALUE;
+			int maxQual = Integer.MIN_VALUE;
+			for (Vector<Integer> vec : intQuality){
+				for (Integer qual : vec){
+					minQual = Math.min(minQual, qual);
+					maxQual = Math.max(maxQual, qual);
+				}
+			}
+			
+			if (Main.DEBUGMODE){
+				System.out.println("min qual (ascii): " + minQual);	
+				System.out.println("max qual (ascii): " + maxQual);	
+			}
+			
+			if (minQual >= 33 && maxQual <= 126){
+				version.add("sanger");
+			}
+			if (minQual >= 35 && maxQual <= 74){
+				version.add("illumina18");
+			}
+			if (minQual >= 59 && maxQual <= 126){
+				version.add("solexa");
+			}
+			if (minQual >= 64 && maxQual <= 126){
+				version.add("illumina13");
+			}
+			if (minQual >= 66 && maxQual <= 126){
+				version.add("illumina15");
+			}
+		}
+		else{
+			version.add(parameter);
+		}
+		if (version.size() > 1){
+			System.out.println("error: fastq version dedection is ambiguous " + version + ".\n       please set version manually!");
+			System.exit(1);
+		}
+		if (version.size() == 0){
+			System.out.println("error: fastq version dedection failed, unknown version.\n       please set version manually!");
+			System.exit(1);
+		}
+		if (Main.DEBUGMODE){
+			System.out.println("detected fastq version: " + version.get(0));			
+		}
+		return version.get(0);
+	}
+		
+}
\ No newline at end of file
diff --git a/src/de/rki/ng4/surankco/files/Qual.java b/src/de/rki/ng4/surankco/files/Qual.java
new file mode 100755
index 0000000..923f5bb
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/Qual.java
@@ -0,0 +1,61 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.files;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.data.FastqReads;
+
+/**
+ * Import of read qualities from Qual/Qua-Files
+ * 
+ * @author Mathias Kuhring
+ *
+ */
+public class Qual {
+	
+	public Qual(String filename, FastqReads reads) {
+		
+		try {
+			BufferedReader br = new BufferedReader(new FileReader(new File(filename)));
+			try {
+				String line;
+				String[] splits;
+				while ((line = br.readLine()) != null){
+					if (line.startsWith(">")){
+						splits = line.split(" ");
+						reads.readId.add(splits[0].substring(1));
+						reads.readQuality.add(new Vector<Integer>());
+					}
+					else {
+						splits = line.split(" ");
+						for (String s : splits){
+							reads.readQuality.lastElement().add(Integer.parseInt(s));
+						}
+					}
+				}
+			}
+			finally {
+				br.close();
+			}
+		} 
+		catch (FileNotFoundException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + filename);
+			System.exit(1);
+		} 
+		catch (IOException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + filename);
+			System.exit(1);
+		}
+	}
+}
diff --git a/src/de/rki/ng4/surankco/files/Sam.java b/src/de/rki/ng4/surankco/files/Sam.java
new file mode 100755
index 0000000..043fdd7
--- /dev/null
+++ b/src/de/rki/ng4/surankco/files/Sam.java
@@ -0,0 +1,72 @@
+package de.rki.ng4.surankco.files;
+
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.util.SamLocusIterator;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import de.rki.ng4.surankco.data.SamContig;
+
+public class Sam {
+
+	public static void main(String[] args){
+		Sam test = new Sam("data/ex1.sam");
+		for (SamContig contig : test.getContigs().values()){
+			System.out.println(contig.getId());
+		}
+	}
+
+	private HashMap<String,SamContig> contigs;
+
+	public Sam(String filename) {
+
+		contigs = new HashMap<String,SamContig>();
+
+		final SamReader readReader = SamReaderFactory.makeDefault().open(new File(filename));
+
+		//		System.out.println(readReader.getFileHeader());
+
+		String refId;
+		for (final SAMRecord samRecord : readReader) {
+
+			if (!samRecord.getReadUnmappedFlag()){
+				refId = samRecord.getReferenceName();
+				if (!contigs.containsKey(refId)){
+					contigs.put(refId, new SamContig(refId));
+				}
+				contigs.get(refId).addRead(samRecord);
+			}
+
+			//			System.out.println(samRecord);
+		}
+
+		final SamReader alignmentReader = SamReaderFactory.makeDefault().open(new File(filename));
+
+		for (final SamLocusIterator.LocusInfo locus : new SamLocusIterator(alignmentReader)){
+			refId = locus.getSequenceName();
+
+			if (contigs.containsKey(locus.getSequenceName())){
+				ArrayList<Byte> alignmentColumn = new ArrayList<Byte>();
+				
+				List<SamLocusIterator.RecordAndOffset> recs = locus.getRecordAndPositions();
+
+//				System.out.println(refId + "\t" + locus.getPosition() + ": ");
+				for (SamLocusIterator.RecordAndOffset rec : recs){
+					alignmentColumn.add(rec.getReadBase());
+					//				System.out.print((char) rec.getReadBase() + " ");
+				}
+				//			System.out.println();
+				contigs.get(locus.getSequenceName()).addAlignmentColumn(alignmentColumn);
+			}
+		}
+	}
+
+	public HashMap<String,SamContig> getContigs(){
+		return contigs;
+	}
+}
diff --git a/src/de/rki/ng4/surankco/scoring/Main.java b/src/de/rki/ng4/surankco/scoring/Main.java
new file mode 100755
index 0000000..1875c8d
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/Main.java
@@ -0,0 +1,32 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring;
+
+public class Main {
+
+	// debugging
+	public final static boolean DEBUGMODE = false;
+	
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+		
+		String filename = args[0];
+		String outputFile = args[1];
+				
+		if (Main.DEBUGMODE){ System.out.println("parse pretty psl file..."); }
+		PSLPrettyParser psl = new PSLPrettyParser(filename);
+		
+		if (Main.DEBUGMODE){ System.out.println("calculate metrics..."); }
+		MetricsPSL metrics = new MetricsPSL(psl.getHits());
+		metrics.calculateMetrics();
+		if (Main.DEBUGMODE){ System.out.println("export metrics..."); }
+		metrics.export(outputFile);
+
+	}
+
+}
diff --git a/src/de/rki/ng4/surankco/scoring/MetricsPSL.java b/src/de/rki/ng4/surankco/scoring/MetricsPSL.java
new file mode 100755
index 0000000..c167fbb
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/MetricsPSL.java
@@ -0,0 +1,349 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Vector;
+
+import de.rki.ng4.surankco.scoring.scores.NormedContigLength1;
+import de.rki.ng4.surankco.scoring.scores.NormedContigLength2;
+import de.rki.ng4.surankco.scoring.scores.Score;
+
+
+public class MetricsPSL {
+		
+	private Vector<PSLHit> hits;
+	private Vector<Vector<Boolean>> matchSequences;
+	
+	private Vector<String> metricsHeader;
+	private Vector<Vector<Double>> metricsValues;
+	
+	public MetricsPSL(Vector<PSLHit> hits){
+		this.hits = hits;
+		
+		matchSequences = new Vector<Vector<Boolean>>();
+		for (PSLHit hit : hits){
+			matchSequences.add(hit.getExtendedMatchSequence());
+		}
+		
+		metricsHeader = new Vector<String>();
+		metricsValues = new Vector<Vector<Double>>();
+	}
+		
+	public void calculateMetrics(){
+		Vector<Double> matchCounts = matchCount();
+		Vector<Double> errorCounts = errorCount(matchCounts);
+		Vector<Double> errorSubtractions = errorSubtraction(errorCounts);
+		
+		Vector<Double> normMatchCounts1 = normMatchCount1(matchCounts);
+		Vector<Double> normMatchCounts2 = normMatchCount2(matchCounts);
+		Vector<Double> normErrorCounts1 = normErrorCount1(errorCounts);
+		Vector<Double> normErrorCounts2 = normErrorCount2(errorCounts);
+		Vector<Double> normErrorSubtractions = normErrorSubtraction(errorSubtractions);
+		
+		Vector<Double> maxContErrors = maxContError();
+		Vector<Double> maxRegionErrors = maxRegionError(100);
+		
+		Vector<Double> maxEndErrorStretches = maxEndErrorStretch();
+		Vector<Double> maxEndErrorCounts = maxEndErrorCount(100);
+		Vector<Double> maxEndContErrors = maxEndContError(100);
+			
+		metricsValues.add(matchCounts);
+		metricsValues.add(errorCounts);
+		metricsValues.add(errorSubtractions);
+		
+		metricsValues.add(normMatchCounts1);
+		metricsValues.add(normMatchCounts2);
+		metricsValues.add(normErrorCounts1);
+		metricsValues.add(normErrorCounts2);
+		metricsValues.add(normErrorSubtractions);
+		
+		metricsValues.add(maxContErrors);
+		metricsValues.add(maxRegionErrors);
+		
+		metricsValues.add(maxEndErrorStretches);
+		metricsValues.add(maxEndErrorCounts);
+		metricsValues.add(maxEndContErrors);
+		
+		
+		// New Scoring implementation
+		Vector<Score> scores = new Vector<Score>();
+		scores.add(new NormedContigLength1());
+		scores.add(new NormedContigLength2());
+		
+		Vector<Vector<Double>> results = new Vector<Vector<Double>>();
+		for (Score score : scores){
+			metricsHeader.add(score.getScoreName());
+			results.add(new Vector<Double>());
+		}
+		
+		for (PSLHit hit : hits){
+			for (int i=0; i<scores.size(); i++){
+				results.get(i).add(scores.get(i).calculate(hit));
+			}
+		}		
+
+		for (Vector<Double> result : results){
+			metricsValues.add(result);
+		}
+		
+	}
+	
+	public void printMetrics(){
+		System.out.print("ContigID ");
+		for (String header : metricsHeader){
+			System.out.print(header + " ");
+		}
+		System.out.println();
+		
+		for (int i=0; i<hits.size(); i++){
+			System.out.print(hits.get(i).getQueryName() + " ");
+			for (int j=0; j<metricsValues.size(); j++){
+				System.out.print(metricsValues.get(j).get(i) + " ");
+			}
+			System.out.println();
+		}
+	}
+	
+	public void export(String fileName){
+		try {
+			BufferedWriter mbw = new BufferedWriter(new FileWriter(new File(fileName)));
+			
+			try {
+				mbw.append("ContigID ");
+				for (String header : metricsHeader){
+					mbw.append(header + " ");
+				}
+				mbw.newLine();
+				
+				for (int i=0; i<hits.size(); i++){
+					mbw.append(hits.get(i).getQueryName() + " ");
+					for (int j=0; j<metricsValues.size(); j++){
+						mbw.append(metricsValues.get(j).get(i) + " ");
+					}
+					mbw.newLine();
+				}
+			}
+			finally {
+				mbw.close();
+			}
+			
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+	}
+	
+	private Vector<Double> matchCount(){
+		metricsHeader.add("MatchCount");
+		Vector<Double> values = new Vector<Double>();
+		double count;
+		for (Vector<Boolean> matchSeq : matchSequences){
+			count = 0;
+			for (Boolean match : matchSeq){
+				if (match) count++;
+			}
+			values.add(count);
+		}
+		return(values);
+	}
+	
+	private Vector<Double> errorCount(Vector<Double> matchCounts){
+		metricsHeader.add("ErrorCount");
+		Vector<Double> values = new Vector<Double>();
+		for (int i=0; i<hits.size(); i++){
+			values.add(matchSequences.get(i).size() - matchCounts.get(i));
+		}
+		return(values);
+	}
+	
+	private Vector<Double> errorSubtraction(Vector<Double> errorCounts){
+		metricsHeader.add("ErrorSubtraction");
+		Vector<Double> values = new Vector<Double>();
+		for (int i=0; i<hits.size(); i++){
+			values.add( Math.max(0, (hits.get(i).getQuerySize() - errorCounts.get(i))) ); // set to 0 if error count > contig size, e.g. big gaps
+		}
+		return(values);
+	}
+	
+	private Vector<Double> normMatchCount1(Vector<Double> matchCounts){
+		metricsHeader.add("NormedMatchCount1");
+		Vector<Double> values = new Vector<Double>();
+		for (int i=0; i<hits.size(); i++){
+			values.add(matchCounts.get(i) / hits.get(i).getQuerySize());
+		}
+		return(values);
+	}
+	
+	private Vector<Double> normMatchCount2(Vector<Double> matchCounts){
+		metricsHeader.add("NormedMatchCount2");
+		Vector<Double> values = new Vector<Double>();
+		for (int i=0; i<hits.size(); i++){
+			values.add(matchCounts.get(i) / matchSequences.get(i).size());
+		}
+		return(values);
+	}
+	
+	private Vector<Double> normErrorCount1(Vector<Double> errorCounts){
+		metricsHeader.add("NormedErrorCount1");
+		Vector<Double> values = new Vector<Double>();
+		for (int i=0; i<hits.size(); i++){
+			values.add( Math.min(1, (errorCounts.get(i) / hits.get(i).getQuerySize())) ); // set to 1 if error count > contig size, e.g. big gaps
+		}
+		return(values);
+	}
+	
+	private Vector<Double> normErrorCount2(Vector<Double> errorCounts){
+		metricsHeader.add("NormedErrorCount2");
+		Vector<Double> values = new Vector<Double>();
+		for (int i=0; i<hits.size(); i++){
+			values.add(errorCounts.get(i) / matchSequences.get(i).size());
+		}
+		return(values);
+	}
+	
+	private Vector<Double> normErrorSubtraction(Vector<Double> errorSubtractions){
+		metricsHeader.add("NormedErrorSubtraction");
+		Vector<Double> values = new Vector<Double>();
+		for (int i=0; i<hits.size(); i++){
+			values.add( Math.max(0, (errorSubtractions.get(i) / hits.get(i).getQuerySize())) );  // set to 0 if error count > contig size, e.g. big gaps
+		}
+		return(values);
+	}
+	
+	private Vector<Double> maxContError(){
+		metricsHeader.add("MaxContiguousError");
+		Vector<Double> values = new Vector<Double>();
+		double max, count;
+		for (int i=0; i<hits.size(); i++){
+			max = 0;
+			count = 0;
+			for (Boolean b : matchSequences.get(i)){
+				if (b){
+					max = Math.max(max, count);
+					count = 0;
+				}
+				else count++;
+			}
+			max = Math.max(max, count);
+			values.add( Math.min(1, (max / hits.get(i).getQuerySize())) ); // normed by contig size and set to 1 if error count > contig size, e.g. very big gaps
+		}
+		return(values);
+	}
+	
+	private Vector<Double> maxRegionError(int regionSize){
+		metricsHeader.add("MaxRegionError");
+		Vector<Double> values = new Vector<Double>();
+		double max, count;
+		Vector<Boolean> matchSeq;
+		for (int i=0; i<hits.size(); i++){
+			max = 0;
+			count = 0;
+			matchSeq = matchSequences.get(i);
+			for (int j=0; j<Math.min(regionSize,matchSeq.size()); j++){
+				if (!matchSeq.get(j)) count++;
+				max = count;
+			}
+			for (int j=Math.min(regionSize, matchSeq.size()); j<matchSeq.size(); j++){
+				if (!matchSeq.get(j-regionSize)) count--;
+				if (!matchSeq.get(j)) count++;
+				max = Math.max(max, count);
+			}
+			values.add(max);
+		}
+		return(values);
+	}
+	
+	
+	private Vector<Double> maxEndErrorStretch(){
+		metricsHeader.add("MaxEndErrorStretch");
+		Vector<Double> values = new Vector<Double>();
+		double left, right;
+		int index;
+		Vector<Boolean> matchSeq;
+		for (int i=0; i<hits.size(); i++){
+			left = 0;
+			right = 0;
+			matchSeq = matchSequences.get(i);
+			
+			// left error stretch size
+			index = 0;
+			while (!matchSeq.get(index++)){
+				left++;
+			}
+			
+			// right error stretch size
+			index = matchSeq.size()-1;
+			while (!matchSeq.get(index--)){
+				right++;
+			}
+			
+			values.add( Math.max(left, right) / hits.get(i).getQuerySize() ); // normed by contig size
+		}
+		return(values);
+	}
+	
+	private Vector<Double> maxEndErrorCount(int endSize){
+		metricsHeader.add("MaxEndErrorCount");
+		Vector<Double> values = new Vector<Double>();
+		double left, right;
+		Vector<Boolean> matchSeq;
+		for (int i=0; i<hits.size(); i++){
+			left = 0;
+			right = 0;
+			matchSeq = matchSequences.get(i);
+			
+			for (int j=0; j<Math.min(endSize,matchSeq.size()); j++){
+				if (!matchSeq.get(j)) left++;
+			}
+			
+			for (int j=matchSeq.size()-1; j>=Math.max(matchSeq.size()-endSize,0); j--){
+				if (!matchSeq.get(j)) right++;
+			}
+			
+			values.add(Math.max(left, right));
+		}
+		return(values);
+	}
+	
+	private Vector<Double> maxEndContError(int endSize){
+		metricsHeader.add("MaxEndContError");
+		Vector<Double> values = new Vector<Double>();
+		double left, right, count;
+		Vector<Boolean> matchSeq;
+		for (int i=0; i<hits.size(); i++){
+			left = 0;
+			right = 0;
+			matchSeq = matchSequences.get(i);
+			
+			count = 0;
+			for (int j=0; j<Math.min(endSize,matchSeq.size()); j++){
+				if (matchSeq.get(j)){
+					left = Math.max(left, count);
+					count = 0;
+				}
+				else count++;
+			}
+			left = Math.max(left, count);
+			
+			count = 0;
+			for (int j=matchSeq.size()-1; j>=Math.max(matchSeq.size()-endSize,0); j--){
+				if (matchSeq.get(j)){
+					right = Math.max(right, count);
+					count = 0;
+				}
+				else count++;
+			}
+			right = Math.max(right, count);
+			
+			values.add(Math.max(left, right));
+		}
+		return(values);
+	}
+
+}
diff --git a/src/de/rki/ng4/surankco/scoring/PSLHit.java b/src/de/rki/ng4/surankco/scoring/PSLHit.java
new file mode 100755
index 0000000..80e2d4b
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/PSLHit.java
@@ -0,0 +1,139 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring;
+
+import java.util.Vector;
+
+public class PSLHit {
+
+	private String qName, strand;
+	private int qStart, qEnd, qSize;
+	
+	private String tName;
+	private int tStart, tEnd, tSize;
+	
+	private Vector<String> lines;
+	
+	private String query, target;
+	
+	private Vector<Boolean> matchSeq;
+	private Vector<Boolean> matchSeqExt;
+	
+	
+	public PSLHit(String header){
+		parseHeader(header);
+		lines = new Vector<String>();
+	}
+	
+	private void parseHeader(String header){
+		// >contig00001:7626+41627 of 41627 gi|170079663|ref|NC_010473.1|:563130+597131 of 4686137
+		String[] mainSplits = header.split(" ");
+		
+		String[] namePositionSplits = mainSplits[0].split("\\:");
+		String[] positionSplits;
+		if (namePositionSplits[1].contains("+")){
+			positionSplits = namePositionSplits[1].split("\\+");
+			strand = "+";
+		}
+		else{
+			positionSplits = namePositionSplits[1].split("\\-");
+			strand = "-";
+		}
+		qName = namePositionSplits[0].substring(1);
+		qStart = Integer.parseInt(positionSplits[0]);
+		qEnd = Integer.parseInt(positionSplits[1]);
+		qSize = Integer.parseInt(mainSplits[2]);
+		
+		namePositionSplits = mainSplits[3].split("\\:");
+		positionSplits = namePositionSplits[1].split("\\+");
+		tName = namePositionSplits[0];
+		tStart = Integer.parseInt(positionSplits[0]);
+		tEnd = Integer.parseInt(positionSplits[1]);
+		tSize = Integer.parseInt(mainSplits[5]);
+	}
+	
+	private String getHeader(){
+		return ">" + qName + ":" + qStart + strand + qEnd + " of " + qSize + " " +
+					tName + ":" + tStart + "+" + tEnd + " of " + tSize;
+	}
+	
+	private String getAlignment(){
+		StringBuffer buffer = new StringBuffer();
+		for (String line : lines){
+			buffer.append(line + "\n");
+		}
+		return buffer.substring(0, buffer.length()-1);
+	}
+	
+	public String toString(){
+		return getHeader() + "\n" + getAlignment();
+	}
+
+	public void appendAlignment(String line) {
+		lines.add(line);
+	}
+
+	public void parseAlignment() {
+		StringBuffer query = new StringBuffer();
+		StringBuffer target = new StringBuffer();
+		
+		int index = 0;
+		
+		while (index+2 <= lines.size()){
+			query.append(lines.get(index++));
+			index++;
+			target.append(lines.get(index++));
+			index++;
+		}
+		
+		this.query = query.toString().toUpperCase();
+		this.target = target.toString().toUpperCase();
+	}
+	
+	public Vector<Boolean> getMatchSequence(){
+		if (matchSeq != null) return matchSeq;
+		
+		matchSeq = new Vector<Boolean>();
+		
+		for (int i=0; i<query.length(); i++){
+			matchSeq.add(query.charAt(i) == target.charAt(i));
+		}
+		
+		return matchSeq;
+	}
+	
+	public Vector<Boolean> getExtendedMatchSequence(){
+		if (matchSeqExt != null) return matchSeqExt;
+		
+		matchSeqExt = new Vector<Boolean>();
+		
+		int prefix = qStart;
+		for (int j=0; j<prefix; j++){
+			matchSeqExt.add(false);
+		}
+		
+		matchSeqExt.addAll(getMatchSequence());
+		
+		int suffix = qSize-qEnd;
+		for (int j=0; j<suffix; j++){
+			matchSeqExt.add(false);
+		}
+		
+		return matchSeqExt;
+	}
+	
+	public String getQueryName(){
+		return qName;
+	}
+	
+	public int getQuerySize(){
+		return qSize;
+	}
+
+	public int getTargetSize(){
+		return tSize;
+	}
+}
diff --git a/src/de/rki/ng4/surankco/scoring/PSLPrettyParser.java b/src/de/rki/ng4/surankco/scoring/PSLPrettyParser.java
new file mode 100755
index 0000000..009acfe
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/PSLPrettyParser.java
@@ -0,0 +1,93 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.util.Vector;
+
+
+public class PSLPrettyParser {
+
+	private String fileName;
+	private Vector<PSLHit> hits;
+	
+	public PSLPrettyParser(String fileName){
+		this.fileName = fileName;
+		hits = new Vector<PSLHit>();	
+		
+		try {
+			LineNumberReader reader = new LineNumberReader(new FileReader(new File(fileName)));
+			try {
+				String line;
+
+				while ((line = reader.readLine()) != null){
+					if (line.startsWith(">")){
+						hits.add(new PSLHit(line));
+					}
+					else {
+						hits.lastElement().appendAlignment(line);
+					}
+				}
+				
+				for (PSLHit hit : hits){
+					hit.parseAlignment();
+				}
+				
+			}
+			finally {
+				reader.close();
+			}
+		} 
+		catch (FileNotFoundException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + fileName);
+			System.exit(1);
+		} 
+		catch (IOException e) {
+			e.printStackTrace();
+			System.out.println("error: problems reading file: " + fileName);
+			System.exit(1);
+		}
+	}
+	
+	public void printPrettyPslHits(){
+		for (PSLHit hit : hits){
+			System.out.print(hit.toString());
+		}
+	}
+	
+	public void export(String filename){
+		try {
+			BufferedWriter writer = new BufferedWriter(new FileWriter(new File(filename)));
+			
+			try {
+				for (PSLHit hit : hits){
+					writer.write(hit.toString());
+					writer.newLine();
+				}
+			}
+			finally {
+				writer.close();
+			}
+			
+		} catch (IOException e) {
+			e.printStackTrace();
+			System.out.println("error: problems writing file: " + fileName);
+			System.exit(1);
+		}
+	}
+
+	public Vector<PSLHit> getHits() {
+		return hits;
+	}
+	
+}
\ No newline at end of file
diff --git a/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength1.java b/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength1.java
new file mode 100755
index 0000000..8319359
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength1.java
@@ -0,0 +1,29 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring.scores;
+
+import java.util.Vector;
+
+import de.rki.ng4.surankco.scoring.PSLHit;
+
+public class NormedContigLength1 extends Score{
+
+	private static final String SCORE_NAME = "NormedContigLength1";
+	
+	@Override
+	public String getScoreName() {
+		return SCORE_NAME;
+	}
+	
+	@Override
+	public double calculate(PSLHit hit) {
+
+		int contigSize = hit.getQuerySize();
+		Vector<Boolean> matchSequence = hit.getExtendedMatchSequence();
+		
+		return (double) contigSize / (double) matchSequence.size();
+	}
+}
diff --git a/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength2.java b/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength2.java
new file mode 100755
index 0000000..ef4f78b
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/scores/NormedContigLength2.java
@@ -0,0 +1,28 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring.scores;
+
+import de.rki.ng4.surankco.scoring.PSLHit;
+
+
+public class NormedContigLength2 extends Score{
+
+	private static final String SCORE_NAME = "NormedContigLength2";
+	
+	@Override
+	public String getScoreName() {
+		return SCORE_NAME;
+	}
+	
+	@Override
+	public double calculate(PSLHit hit) {
+
+		int contigSize = hit.getQuerySize();
+		int referenceSize = hit.getTargetSize();
+		
+		return (double) contigSize / (double) referenceSize;
+	}
+}
diff --git a/src/de/rki/ng4/surankco/scoring/scores/Score.java b/src/de/rki/ng4/surankco/scoring/scores/Score.java
new file mode 100755
index 0000000..b89b9aa
--- /dev/null
+++ b/src/de/rki/ng4/surankco/scoring/scores/Score.java
@@ -0,0 +1,15 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.scoring.scores;
+
+import de.rki.ng4.surankco.scoring.PSLHit;
+
+public abstract class Score {
+
+	public abstract String getScoreName();
+	public abstract double calculate(PSLHit hit);
+
+}
diff --git a/src/de/rki/ng4/surankco/tools/Statistic.java b/src/de/rki/ng4/surankco/tools/Statistic.java
new file mode 100755
index 0000000..d5e4154
--- /dev/null
+++ b/src/de/rki/ng4/surankco/tools/Statistic.java
@@ -0,0 +1,111 @@
+/* Copyright (c) 2014,
+ * Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+ * All rights reserved. For details, please note the license.txt.
+ */
+
+package de.rki.ng4.surankco.tools;
+
+import java.util.Collections;
+import java.util.Vector;
+
+public class Statistic {
+
+	public static <N extends Number> double mad(Vector<N> data){
+		final double scalefactor = 1.4826;
+		double median = median(data);
+		Vector<Double> diffs = new Vector<Double>();
+		for (N num : data){ diffs.add(Math.abs(num.doubleValue()-median)); }
+		return median(diffs) * scalefactor;
+	}
+	
+	public static <N extends Number> Vector<Double> mads(Vector<Vector<N>> data){
+		Vector<Double> mads = new Vector<Double>();
+		for (Vector<N> vn : data){ mads.add(mad(vn)); }	
+		return mads;
+	}
+	
+	public static <N extends Number> double max(Vector<N> data){
+		double max = Double.MIN_VALUE;
+		for (N num : data){ max = Math.max(max, num.doubleValue()); }
+		return max;
+	}
+	
+	public static <N extends Number> Vector<Double> maxs(Vector<Vector<N>> data){
+		Vector<Double> maxs = new Vector<Double>();
+		for (Vector<N> vn : data){ maxs.add(max(vn)); }	
+		return maxs;
+	}
+	
+	public static <N extends Number> double mean(Vector<N> data){
+		return sum(data)/data.size();
+	}
+	
+	public static  <N extends Number> Vector<Double> means(Vector<Vector<N>> data){
+		Vector<Double> means = new Vector<Double>();
+		for (Vector<N> vn : data){ means.add(mean(vn)); }	
+		return means;
+	}
+	
+	public static <N extends Number> double median(Vector<N> data){
+		Vector<Double> copy = new Vector<Double>();
+		for (N num : data){ copy.add(num.doubleValue()); }
+		Collections.sort(copy);
+		int n = copy.size();
+		if (n % 2 == 1){ return copy.get(((n+1)/2) -1); }
+		else{ return 0.5 * (copy.get((n/2) -1) + copy.get(((n/2)+1) -1)); }
+	}
+	
+	public static <N extends Number> Vector<Double> medians(Vector<Vector<N>> data){
+		Vector<Double> medians = new Vector<Double>();
+		for (Vector<N> vn : data){ medians.add(median(vn)); }	
+		return medians;
+	}
+	
+	public static <N extends Number> double min(Vector<N> data){
+		double min = Double.MAX_VALUE;
+		for (N num : data){ min = Math.min(min, num.doubleValue()); }
+		return min;
+	}
+	
+	public static <N extends Number> Vector<Double> mins(Vector<Vector<N>> data){
+		Vector<Double> mins = new Vector<Double>();
+		for (Vector<N> vn : data){ mins.add(min(vn)); }	
+		return mins;
+	}
+	
+	public static <N extends Number> double sd(Vector<N> data){
+		return Math.sqrt(var(data));
+	}
+	
+	public static <N extends Number> Vector<Double> sds(Vector<Vector<N>> data){
+		Vector<Double> sds = new Vector<Double>();
+		for (Vector<N> vn : data){ sds.add(sd(vn)); }	
+		return sds;
+	}
+	
+	public static <N extends Number> double sum(Vector<N> data){
+		double sum = 0;
+		for (N num : data){ sum += num.doubleValue(); }
+		return sum;
+	}
+	
+	public static <N extends Number> Vector<Double> sums(Vector<Vector<N>> data){
+		Vector<Double> sums = new Vector<Double>();
+		for (Vector<N> vn : data){ sums.add(sum(vn)); }	
+		return sums;
+	}
+	
+	public static <N extends Number> double var(Vector<N> data){
+		double var = 0;
+		double mean = mean(data);
+		for (N num : data){ var += Math.pow((num.doubleValue()-mean), 2); }
+		return var/(data.size()-1);
+	}
+	
+	public static <N extends Number> Vector<Double> vars(Vector<Vector<N>> data){
+		Vector<Double> vars = new Vector<Double>();
+		for (Vector<N> vn : data){ vars.add(var(vn)); }	
+		return vars;
+	}
+
+}
diff --git a/surankco-feature b/surankco-feature
new file mode 100755
index 0000000..433a6a5
--- /dev/null
+++ b/surankco-feature
@@ -0,0 +1,111 @@
+#!/usr/bin/Rscript
+
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# surankco-feature: feature generation from contigs (ACE format) and 
+#                   corresponding reads (QUAL or FASTQ format)
+
+
+# get script path
+args <- commandArgs(trailingOnly = FALSE)
+script.arg <- "--file="
+script.name <- sub(script.arg, "", args[grep(script.arg, args)])
+script.path <- dirname(script.name)
+
+
+# testing/debugging
+# args <- c("--directory=data", "--split.regex=\\.\\d+-")
+# script.path <- getwd()
+DEBUGGING <- FALSE
+
+
+# sources and libraries
+source(paste(script.path, '/r/parameter.R', sep=""))
+source(paste(script.path, '/r/feature.R', sep=""))
+source(paste(script.path, '/r/functions.R', sep=""))
+source(paste(script.path, '/r/coverage.R', sep=""))
+source(paste(script.path, '/r/covcurv.R', sep=""))
+source(paste(script.path, '/r/import.R', sep=""))
+loadPackages(c("optparse","parallel"), quietly=TRUE)
+
+
+# parsing parameter
+cat("prepare files\n")
+parameters <- parseSurankcoFeature()
+files <- parameters$files
+
+if (DEBUGGING){
+  print(args)
+  print(parameters)
+  print(files)
+}
+
+
+input.file.assembly <- files[ ,parameters$assembly.format]
+input.file.quality <- files[ ,parameters$read.quality.format]
+output.file.features <- paste(rownames(files), ".feature.tmp", sep="")
+output.file.coverage <- paste(rownames(files), ".coverage.tmp", sep="")
+output.file.ccoverage <- paste(rownames(files), ".ccoverage.tmp", sep="")
+
+# extract features in java and prepare/export data for R
+# multicore?
+cat("extract features (part 1): ")
+for (i in 1:nrow(files)){
+  cat(paste0(i, " "))
+  java.call <- paste("java",
+                     paste("-Xms", parameters$memory, "G", sep=""), 
+                     paste("-Xmx", parameters$memory, "G", sep=""), 
+                     "-Dcom.ibm.tools.attach.enable=no -jar",
+                     paste(script.path, "surankco.jar", sep="/"),
+                     input.file.assembly[i],    # args[0] Assembly File
+                     input.file.quality[i],     # args[1] Fasta/Qual File
+                     output.file.features[i],   # args[2] Tmp Feature Output File
+                     output.file.coverage[i],   # args[3] Tmp Coverage Output File
+                     output.file.ccoverage[i],  # args[4] Tmp CCoverage Output File
+                     parameters$fastq.version,  # args[5] Fastq version
+                     parameters$split.regex,    # args[6] Read name split regex
+                     parameters$kmer.features,  # args[7] kmer on/off
+                     parameters$contig.size.filter) # args[8] minimum contig size
+  
+  if(DEBUGGING){
+    print(java.call)
+  }
+  
+  if(code <- system(java.call, ignore.stdout=FALSE)){
+    complainAndStop("java not successfully executed", code)
+  }
+}
+cat("\n")
+
+cat("extract features (part 2)\n")
+# import java features and prepared data (coverages)
+features.raw <- importFeatures(input.file.assembly,
+                               input.file.quality,
+                               output.file.features,
+                               output.file.coverage,
+                               output.file.ccoverage)
+
+# extract features in R
+features.raw <- attachEGS(features.raw, parameters$expected.genome.size)
+features.final <- extractFeatures(features.raw, 
+                                  parameters$threads)
+#parameters$expected.genome.size)  
+
+# export features as csv
+cat("export features\n")
+final.file.features <- paste(rownames(files), ".features.txt", sep="")
+for (i in 1:nrow(files)){
+  write.table(features.final[[i]], file=final.file.features[i],
+              sep="\t", dec = ".", col.names=TRUE, row.names=FALSE)
+}
+
+# unlink tmp files
+if (!DEBUGGING){
+  cat("remove temporary files\n")
+  unlink(c(output.file.features,output.file.coverage,output.file.ccoverage))
+}
+
+# done
+cat("surankco-feature calculations done\n")
diff --git a/surankco-prediction b/surankco-prediction
new file mode 100755
index 0000000..72982e9
--- /dev/null
+++ b/surankco-prediction
@@ -0,0 +1,78 @@
+#!/usr/bin/Rscript
+
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# surankco-prediction:  prediction of scores and ranking of contigs using 
+#                       trained random forests (from surankco-training) and 
+#                       contig features (from surankco-feature)
+
+
+# get script path
+args <- commandArgs(trailingOnly = FALSE)
+script.arg <- "--file="
+script.name <- sub(script.arg, "", args[grep(script.arg, args)])
+script.path <- dirname(script.name)
+
+
+# testing/debugging
+# args <- c("--features=data/adenoABC.features.txt", "-r", "surankco_RFs.RData")
+# script.path <- getwd()
+DEBUGGING <- FALSE
+
+
+# sources and libraries
+source(paste(script.path, '/r/parameter.R', sep=""))
+source(paste(script.path, '/r/import.R', sep=""))
+source(paste(script.path, '/r/rf.R', sep=""))
+source(paste(script.path, '/r/voting.R', sep=""))
+# ...
+loadPackages(c("optparse","randomForest"), quietly=TRUE)
+
+
+# parsing parameter
+cat("prepare files\n")
+parameters <- parseSurankcoPrediction()
+
+if (DEBUGGING){
+  print(args)
+  print(parameters)
+  print(parameters$files.features)
+  print(parameters$files.rf)
+}
+
+# import feature files and RFs
+cat("import features and RFs\n")
+features <- read.table(file=parameters$files.features, sep="\t", 
+                       dec = ".", header=TRUE, 
+                       colClasses=get.colClasses(parameters$files.features, 3))
+rf.loaded <- load(parameters$files.rf)
+if (!all(c("rfs") %in% rf.loaded)){
+  print("error")
+}
+
+# prepare datacat
+cat("prepare data\n")
+features <- dataFilter(features)
+metadata <- features[ ,1:3]
+features <- features[ ,-c(1:3)]
+
+# prediction
+cat("predict scores\n")
+prediction <- scorePrediction(rfs=rfs, input=features, type="response")
+weighting <- scorePrediction(rfs=rfs, input=features, type="prob")
+
+# voting and ranking
+cat("vote and rank\n")
+final.scores <- weightedVotingSum(scoreList=prediction, weightList=weighting)
+results <- cbind(metadata, SurankcoContigScore=final.scores)
+results <- results[order(results$SurankcoContigScore, decreasing=TRUE),]
+
+# export rankings
+cat("export ranking\n")
+write.table(results, file=parameters$output.filename,
+            sep="\t", dec = ".", col.names=TRUE, row.names=FALSE)
+
+# done
+cat("surankco-prediction calculations done\n")
\ No newline at end of file
diff --git a/surankco-score b/surankco-score
new file mode 100755
index 0000000..fbb01f1
--- /dev/null
+++ b/surankco-score
@@ -0,0 +1,151 @@
+#!/usr/bin/Rscript
+
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# surankco-score: alignment of contigs (FASTA format) and reference genomes 
+#                 (FASTA format) and score calculation
+
+
+# get script path
+args <- commandArgs(trailingOnly = FALSE)
+script.arg <- "--file="
+script.name <- sub(script.arg, "", args[grep(script.arg, args)])
+script.path <- dirname(script.name)
+
+
+# testing/debugging
+# args <- c("--directory=data")
+# script.path <- getwd()
+DEBUGGING <- FALSE
+
+
+# sources and libraries
+source(paste(script.path, '/r/parameter.R', sep=""))
+source(paste(script.path, '/r/scores.R', sep=""))
+source(paste(script.path, '/r/expofits.R', sep=""))
+source(paste(script.path, '/r/import.R', sep=""))
+loadPackages(c("optparse","MASS"), quietly=TRUE)
+
+
+# parsing parameter
+cat("prepare files\n")
+parameters <- parseSurankcoScore()
+files <- parameters$files
+
+if (DEBUGGING){
+  print(args)
+  print(parameters)
+  print(files)
+}
+
+files$scores <- vector(mode="character", length=nrow(files))
+
+# external processes
+cat("align and score contigs: ")
+for (i in 1:nrow(files)){
+  cat(paste0(i, " "))
+  input.file.assembly <- files[i, parameters$assembly.suffix]
+  input.file.reference <- files[i, parameters$reference.suffix]
+  output.file.psl <- paste(rownames(files)[i], ".psl.tmp", sep="")
+  output.file.filter <- paste(rownames(files)[i], ".filter.tmp", sep="")
+  output.file.pretty <- paste(rownames(files)[i], ".pretty.tmp", sep="")
+  output.file.scores <- paste(rownames(files)[i], ".scores.tmp", sep="")
+  files$scores[i] <- output.file.scores
+  
+  # build alignments
+  blat.call <- paste(paste("blat", sep=""),
+                     input.file.reference, input.file.assembly,
+                     output.file.psl, "-minIdentity=0")
+  
+  if (DEBUGGING){
+    print(blat.call)
+  }
+  
+  if(code <- system(blat.call, ignore.stdout=TRUE)){
+    complainAndStop("blat not successfully executed", code)
+  }
+  
+  # filter alignments
+  filter.call <- paste(paste(script.path, "/r/pslMatchFilter", sep=""), 
+                       output.file.psl, output.file.filter)
+  
+  if (DEBUGGING){
+    print(filter.call)
+  }
+  
+  if(code <- system(filter.call, ignore.stdout=FALSE)){
+    complainAndStop("pslMatchFilter not successfully executed", code)
+  }
+  
+  # extract alignments from psl
+  pretty.call <- paste(paste("pslPretty", sep=""), 
+                       output.file.filter, input.file.reference,
+                       input.file.assembly, output.file.pretty, "-long")
+  
+  if (DEBUGGING){
+    print(pretty.call)
+  }
+  
+  if(code <- system(pretty.call, ignore.stdout=FALSE)){
+    complainAndStop("pslPretty not successfully executed", code)
+  }
+  
+  # calculate scores in Java and export data for R
+  java.call <- paste("java", 
+                     paste("-Xms", parameters$memory, "G", sep=""), 
+                     paste("-Xmx", parameters$memory, "G", sep=""), 
+                     "-cp",
+                     paste(script.path, "surankco.jar", sep="/"),
+                     "de.rki.ng4.surankco.scoring.Main",
+                     output.file.pretty,
+                     output.file.scores)
+  
+  if (DEBUGGING){
+    print(java.call)
+  }
+  
+  if(code <- system(java.call, ignore.stdout=FALSE)){
+    complainAndStop("java not successfully executed", code)
+  }
+  
+  if (!DEBUGGING){
+    unlink(c(output.file.psl, output.file.filter, output.file.pretty))
+  }
+}
+cat("\n")
+
+# import Java score
+cat("finalize scores\n")
+scores.raw <- importScores(files$scores, 
+                           files[ ,parameters$assembly.suffix], 
+                           files[ ,parameters$reference.suffix])
+
+# post process scores
+# remove redundant and highly correlating scores
+# (???could be moved to training/prediction to export a full set of score???)
+scores.final <- correctScores(scores.raw)
+
+# calculate fittings
+# print score distributions and threshold suggestions
+cat("export histograms\n")
+expoFit(do.call(rbind, scores.final), TRUE, parameters$pdf.histograms)
+
+# export score as csv
+cat("export scores\n")
+for (i in 1:nrow(files)){
+  output.file.scores <- paste(rownames(files)[i], ".scores.txt", sep="")
+  
+  write.table(scores.final[[i]], file=output.file.scores, 
+              sep="\t", dec = ".", col.names=TRUE, row.names=FALSE)
+}
+
+if (!DEBUGGING){
+  # unlink tmp filescat
+  cat("remove temporary files\n")
+  unlink(files$scores)
+}
+
+# done
+cat("surankco-score calculations done\n")
diff --git a/surankco-training b/surankco-training
new file mode 100755
index 0000000..f8071ad
--- /dev/null
+++ b/surankco-training
@@ -0,0 +1,93 @@
+#!/usr/bin/Rscript
+
+# Copyright (c) 2014,
+# Mathias Kuhring, KuhringM at rki.de, Robert Koch Institute, Germany, 
+# All rights reserved. For details, please note the license.txt.
+
+# surankco-training: training of random forest using contig features (from 
+#                    surankco-feature) and contig scores (surankco-score)
+
+
+# get script path
+args <- commandArgs(trailingOnly = FALSE)
+script.arg <- "--file="
+script.name <- sub(script.arg, "", args[grep(script.arg, args)])
+script.path <- dirname(script.name)
+
+
+# testing/debugging
+# args <- c("--directory=data")
+# script.path <- getwd()
+DEBUGGING <- FALSE
+
+
+# sources and libraries
+source(paste(script.path, '/r/parameter.R', sep=""))
+source(paste(script.path, '/r/import.R', sep=""))
+source(paste(script.path, '/r/rf.R', sep=""))
+source(paste(script.path, '/r/scores.R', sep=""))
+source(paste(script.path, '/r/expofits.R', sep=""))
+loadPackages(c("optparse","MASS","randomForest"), quietly=TRUE)
+
+
+# parsing parameter
+cat("prepare files\n")
+parameters <- parseSurankcoTraining()
+files <- parameters$files
+
+
+if (DEBUGGING){
+  print(args)
+  print(parameters)
+  print(files)
+}
+
+
+# import feature and score files
+cat("import features and scores\n")
+features <- readSurankcoFeatures(files$features.txt)
+scores <- readSurankcoFeatures(files$scores.txt)
+
+
+# merge and preparate data
+cat("prepare and merge data\n")
+training <- selectContigs(features,scores)
+training <- dataFilter(training)
+
+firstFeature <- which(colnames(training)=="Length")
+firstScore <- which(colnames(training)=="NormedMatchCount1")
+lastScore <- ncol(training)
+
+input <- training[,firstFeature:(firstScore-1)]
+targets <- training[,firstScore:lastScore]
+
+cat("divide into classes\n")
+if (DEBUGGING){ 
+  cat("before:\n")
+  summary(targets) 
+}
+# set thresholds/classes (maybe with fitting)
+if ("manual.thresholds" %in% names(parameters)){
+  targets <- manualClasses(targets, parameters$manual.thresholds)
+}else{
+  targets <- expoClasses(targets, parameters$exponential.quantile)
+}
+
+# training
+if (DEBUGGING){ # show always?
+  cat("after:\n")
+  summary(targets) 
+  
+  write.table(cbind(training$ContigID,targets), file="classification.tmp", 
+              sep="\t", dec = ".", col.names=TRUE, row.names=FALSE)
+} 
+cat("train random forests\n")
+rfs <- rfTraining(input, targets)
+
+# export the RFs
+cat("export random forests\n")
+save(rfs, file=parameters$output.filename)
+#save(rfs, file="surankco_RFs.RData")
+
+# done
+cat("surankco-training calculations done\n")

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/surankco.git



More information about the debian-med-commit mailing list