[med-svn] [giira] 01/01: Imported Upstream version 0.0.20131015
Andreas Tille
tille at debian.org
Fri Feb 7 23:05:27 UTC 2014
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository giira.
commit 979142cff004fb74fd7ce5b9922cafababa6d78a
Author: Andreas Tille <tille at debian.org>
Date: Sat Feb 8 00:04:22 2014 +0100
Imported Upstream version 0.0.20131015
---
README.txt | 169 ++
scripts/bwa_aln.py | 41 +
scripts/bwa_index.py | 28 +
scripts/bwa_samse.py | 42 +
scripts/bwa_sw.py | 42 +
scripts/callCat.py | 27 +
scripts/callCat_RnaFile.py | 27 +
scripts/filterGenes.py | 56 +
scripts/getMeanCov.py | 108 +
scripts/lpCall.py | 17 +
scripts/sortReaSam.py | 24 +
src/geneFinder/BWA_Call.java | 167 ++
src/geneFinder/CalculateScores.java | 173 ++
src/geneFinder/CleanAfterAmbiOpti.java | 382 ++++
src/geneFinder/DefineAlternativeTranscripts.java | 423 ++++
src/geneFinder/ExtractGeneCandidates.java | 2641 ++++++++++++++++++++++
src/geneFinder/FindExonsOfGene.java | 404 ++++
src/geneFinder/FrameSearch.java | 582 +++++
src/geneFinder/GeneFinder.java | 241 ++
src/geneFinder/Giira.java | 238 ++
src/geneFinder/HelperFunctions_GeneSearch.java | 590 +++++
src/geneFinder/IntronExonSearch.java | 1152 ++++++++++
src/geneFinder/LocalTwinResolve.java | 64 +
src/geneFinder/MergeClusters.java | 283 +++
src/geneFinder/Operon_LP.java | 425 ++++
src/geneFinder/OptimizeAmbis.java | 502 ++++
src/geneFinder/PrepareMapping_GF.java | 56 +
src/geneFinder/ProkaryoteExtraction.java | 623 +++++
src/geneFinder/Prokaryote_Specials.java | 589 +++++
src/geneFinder/ReadInParameters_GeneFinder.java | 485 ++++
src/geneFinder/SamParser.java | 1248 ++++++++++
src/geneFinder/TopHat_Call.java | 103 +
src/geneFinder/WriteOutput.java | 480 ++++
src/types/Contig.java | 27 +
src/types/Gene.java | 74 +
src/types/Rna.java | 30 +
36 files changed, 12563 insertions(+)
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..fdca0c5
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,169 @@
+---------------------------------
+GIIRA
+---------------------------------
+
+GIIRA is a stand-alone java program to predict genes based on RNA-Seq reads without requiring
+any a-priori knowledge.
+
+---------------------------------
+Copyright (c) 2013,
+Franziska Zickmann,
+ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+Distributed under the GNU Lesser General Public License, version 3.0.
+
+When using GIIRA, please cite the following manuscript:
+
+GIIRA - RNA-Seq Driven Gene Finding Incorporating Ambiguous Reads
+Franziska Zickmann; Martin S. Lindner; Bernhard Y. Renard
+Bioinformatics 2013; doi: 10.1093/bioinformatics/btt577
+
+---------------------------------
+INSTALLATION
+---------------------------------
+
+GIIRA is designed to run on a linux system with the following minimum requirements for installed software:
+
+- Python (http://www.python.org/)
+- Java (http://www.java.com)
+- either the CPLEX Optimizer (http://www-01.ibm.com/software/integration/optimization/cplex-optimizer/)
+ or the GLPK solver (http://www.gnu.org/software/glpk/glpk.html)
+
+If you want to use the GLPK solver for the optimization, make sure that the executable "glpsol" is installed in
+a directory included in your path. For CPLEX, the path of the file "cplex.jar" and the cplex Djava.library.path
+have to be passed as parameters in each GIIRA run (refer to parameter description below).
+
+To install GIIRA, download the compressed zip folder from https://sourceforge.net/projects/giira/ and unpack the package with:
+
+> unzip GIIRA.zip
+
+This creates a folder named "GIIRA" in your current directory. This folder includes the executable GIIRA.jar.
+
+To receive the help message of GIIRA, type:
+
+> java -jar GIIRA/GIIRA.jar --help
+
+Note that GIIRA needs several helper scripts to call external programs, these scripts are included
+in the directory GIIRA/scripts. To run GIIRA it is necessary that this folder is always in the same directory as
+the file GIIRA.jar.
+
+
+---------------------------------
+RUN GIIRA - EXAMPLE
+---------------------------------
+
+In the following example we assume that the file GIIRA.jar is contained in the directory foo/.
+Further, if you have CPLEX installed on your system, we assume that the path to the file "cplex.jar" and
+to the cplex library "Djava.library.path" is foo_CPLEX/.
+
+GIIRA can either be provided with the raw reads and reference and then calls an external mapper to perform the necessary
+alignment, or it is presented an already existing alignment. It is important that mappings that are provided to GIIRA are
+in SAM-Format(http://samtools.sourceforge.net). The SAM file has to be sorted by read names, which can be performed using
+samtools sort and the -n option (see samtools manual).
+
+In the following we show a simple example run with the testdata that are included in the download package in the directory GIIRA/example/.
+
+In this example, we apply GIIRA to a set of 500000 reads and chromosome IV of Saccharomyces cerevisiae as a reference genome.
+Both the reads and the reference are provided in the example, as well as a SAM file with an existing alignment.
+
+To run GIIRA with the already existing SAM file:
+
+1. Create a directory for the results, e.g. "GIIRA_example"
+2a. to call GIIRA when you have CPLEX installed on your system, type:
+
+> java -jar foo/GIIRA.jar -libPath foo_CPLEX -cp foo_CPLEX/cplex.jar -iG GIIRA/example/Scer_chr4.fasta -haveSam GIIRA/example/scer_example_mapping.sam -out GIIRA_example/
+
+2b. Alternatively, you can call GIIRA without CPLEX (using GLPK):
+
+> java -jar foo/GIIRA.jar -iG GIIRA/example/Scer_chr4.fasta -haveSam GIIRA/example/scer_example_mapping.sam -out GIIRA_example/ -opti glpk
+
+---------------------------------
+
+You can also apply GIIRA to the unmapped reads, using either TopHat2(http://tophat.cbcb.umd.edu/) or BWA(http://bio-bwa.sourceforge.net/) to obtain the read mapping
+(either must be installed on your system to run this example, note that in this description we use TopHat2):
+
+1. Create a directory for the results, e.g. "GIIRA_example"
+2a. to call GIIRA when you have CPLEX installed on your system, type:
+
+> java -jar foo/GIIRA.jar -libPath foo_CPLEX -cp foo_CPLEX/cplex.jar -iG GIIRA/example/Scer_chr4.fasta -iR GIIRA/example/scer_example_reads.fastq -out GIIRA_example/
+
+2b. Alternatively, you can call GIIRA without CPLEX (using GLPK):
+
+> java -jar foo/GIIRA.jar -iG GIIRA/example/Scer_chr4.fasta -iR GIIRA/example/scer_example_reads.fastq -out GIIRA_example/ -opti glpk
+
+
+---------------------------------
+PARAMETERS OF GIIRA
+---------------------------------
+
+General information:
+
+1) If you use the CPLEX optimizer to solve the linear program, please provide the absolute path to the cplex library Djava.library.path as well as to the file cplex.jar
+(included in the directory of your CPLEX installation).
+
+> java -jar GIIRA.jar -cp PATH_TO_CPLEX/cplex.jar -libPath PATH_TO_CPLEX/Djava.library.path
+
+2) Depending on the size of your dataset, you might have to assign more memory to the GIIRA run to avoid an out of memory error.
+ To do so, set a higher Xmx value when calling GIIRA, e.g. 3GB (="3000m"):
+
+> java -Xmx3000m -jar GIIRA.jar
+
+options:
+
+ -h : help text and exit
+
+ -iG [pathToGenomes] : specify path to directory with genome files in fasta format (it is also possible to address one fasta file directly)
+
+ -iR [pathToRna] : specify path to directory with rna read files in fastq format (it is also possible to address one fastq file directly)
+
+ -out [pathToResults] : specify the directory that shall contain the results files
+
+ -outName [outputName] : specify desired name for output files, DEFAULT: genes
+
+ -haveSam [samfileName]: if a sam file already exists, provide the name, else a mapping is performed. NOTE: the sam file has to be sorted according to read names!
+ (this can be achieved by using the samtools (http://samtools.sourceforge.net/) sort command with option "-n")
+
+ -nT [numberThreads] : specify the maximal number of threads that are allowed to be used, DEFAULT: 1
+
+ -mT [tophat/bwa/bwasw] : specify desired tool for the read mapping, DEFAULT: tophat
+
+ -opti [cplex/glpk] : specify the desired optimization method, either using CPLEX optimizer (cplex, DEFAULT) or glpk solver (glpk)
+
+ -libPath [PATH] : if cplex is the desired optimizer, specify the absolute path to the cplex library Djava.library.path
+
+ -cp [PATH] : if cplex is the desired optimizer, specify the absolute path to the cplex jar file cplex.jar
+
+ -mem [int] : specify the amount of memory that cplex is allowed to use (Note: this parameter should be set for large sets of reads with high ambiguity,
+ e.g. when the number of ambiguous mappings is above 10 million. Specify the amount in MB, e.g. -mem 10000 means 10GB of memory are allowed)
+
+ -maxReportedHits [int] : if using BWA as mapping tool, specify the maximal number of reported hits; DEFAULT: 2 (if the number of hits of an ambiguous
+ read exceeds this threshold, it is not reported.)
+
+ -prokaryote [y/n] : if set to true (y), no spliced reads are accepted and structural genes are resolved. DEFAULT: n (Note: if structural genes shall be
+ resolved, it is necessary to apply CPLEX as the optimizer. To predict genes on prokaryotes without installed CPLEX, do not set this
+ parameter to receive the prediction of coding regions.)
+
+ -minCov [double] : specify the minimum required coverage of the gene candidate extraction; DEFAULT: -1 (If -1, it is estimated from the mapping. Otherwise,
+ it is recommended to choose minCov very small, e.g. = 1 to achieve maximum sensitivity.)
+
+ -maxCov [double] : optional maximal coverage threshold, can also be estimated from mapping (DEFAULT) (Note: this parameter should be set by the user if
+ coverages above a certain threshold are not desired. Reads mapping to regions with a coverage higher than maxCov are excluded from the analysis.)
+
+ -endCov [double] : if the coverage falls below this value, the currently open candidate gene is closed. This value can be estimated from the minimum coverage (-1); DEFAULT: -1
+ (If this parameter is set by the user, it is recommended to choose endCov small to guarantee higher sensitivity)
+
+ -dispCov [0/1] : if set to 1 (or if minCov is not specified), the minimum coverage and maximum coverage are automatically estimated from the mapping, DEFAULT: 0
+
+ -interval [int] : specify the minimal size of an interval between near candidate genes, if "-1" it equals the read length. DEFAULT: -1 (Note: this parameter directly
+ affects how often nearby candidate regions are merged to one candidate, if it is set to 0, only overlapping regions are merged. If the dataset has an
+ overall low coverage, it can be helpful to set a bigger value for interval because then coverage gaps are covered more frequently.)
+
+ -splLim [double] : specify the minimal coverage that is required to accept a splice site, if (-1) the threshold is equal to minCov, DEFAULT: -1
+
+ -rL [int] : specify read length, otherwise this information is extracted from the SAM file (DEFAULT)
+
+ -noAmbiOpti : if specified, ambiguous hits are not included in the analysis (and no optimizer is necessary)
+
+ -settingMapper [(list of parameters)] : A comma-separated list of the desired parameters for TopHat or BWA. Please provide
+ for each parameter a pair of indicator and value, separated by an equality sign.
+ Note that parameters intended for the 3 different parts (indexing, aln, samse) of BWA have to be separated by an underscore ("_")
+ Example: -settingMapper [-a=is_-t=5,-N_-n=5]
\ No newline at end of file
diff --git a/scripts/bwa_aln.py b/scripts/bwa_aln.py
new file mode 100644
index 0000000..6d7ef6b
--- /dev/null
+++ b/scripts/bwa_aln.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Oct 31 16:29:55 2011
+helper script to call bwa aln
+Copyright (c) 2013,
+Franziska Zickmann,
+ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+Distributed under the GNU Lesser General Public License, version 3.0
+"""
+#!/usr/bin/env python
+import os
+import optparse
+import subprocess
+
+parser = optparse.OptionParser()
+
+args = parser.parse_args()
+
+paras = args[1][0]
+orfFile = args[1][1]
+rnaFile = args[1][2]
+outFile = args[1][3]
+
+callPara = " "
+fnull = open(os.devnull, 'w')
+
+if paras == 0:
+ syscall = "bwa aln %s %s > %s" %(orfFile,rnaFile,outFile)
+ os.system(syscall)
+ result = subprocess.call(syscall, shell = True, stdout = fnull, stderr = fnull)
+ fnull.close()
+else:
+ stringAr = paras[1:(len(paras)-1)].split("_")
+ for data in stringAr:
+ callPara=callPara+data+" "
+ syscall = "bwa aln%s%s %s > %s" %(callPara,orfFile,rnaFile,outFile)
+ os.system(syscall)
+ result = subprocess.call(syscall, shell = True, stdout = fnull, stderr = fnull)
+ fnull.close()
+
+
diff --git a/scripts/bwa_index.py b/scripts/bwa_index.py
new file mode 100644
index 0000000..14f7093
--- /dev/null
+++ b/scripts/bwa_index.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Feb 03 2012
+helper script to call bwa index
+Copyright (c) 2013,
+Franziska Zickmann,
+ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+Distributed under the GNU Lesser General Public License, version 3.0
+"""
+#!/usr/bin/env python
+import os
+import optparse
+import subprocess
+
+parser = optparse.OptionParser()
+
+args = parser.parse_args()
+
+orfFile = args[1][0]
+print orfFile
+para = args[1][1]
+print para
+fnull = open(os.devnull, 'w')
+
+syscall = "bwa index -a %s %s" %(para, orfFile)
+print syscall
+result = subprocess.call(syscall, shell = True, stdout = fnull, stderr = fnull)
+fnull.close()
diff --git a/scripts/bwa_samse.py b/scripts/bwa_samse.py
new file mode 100644
index 0000000..eb0a225
--- /dev/null
+++ b/scripts/bwa_samse.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Oct 31 16:29:55 2011
+helper script to call bwa samse
+Copyright (c) 2013,
+Franziska Zickmann,
+ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+Distributed under the GNU Lesser General Public License, version 3.0
+"""
+#!/usr/bin/env python
+import os
+import optparse
+import subprocess
+
+parser = optparse.OptionParser()
+
+args = parser.parse_args()
+
+paras = args[1][0]
+orfFile = args[1][1]
+alnFile = args[1][2]
+rnaFile = args[1][3]
+outFile = args[1][4]
+
+callPara = " "
+fnull = open(os.devnull, 'w')
+
+if paras == 0:
+ syscall = "bwa samse %s %s %s > %s" %(orfFile,alnFile,rnaFile,outFile)
+ os.system(syscall)
+ result = subprocess.call(syscall, shell = True, stdout = fnull, stderr = fnull)
+ fnull.close()
+else:
+ stringAr = paras[1:(len(paras)-1)].split("_")
+ for data in stringAr:
+ callPara=callPara+data+" "
+ syscall = "bwa samse%s%s %s %s> %s" %(callPara,orfFile,alnFile,rnaFile,outFile)
+ os.system(syscall)
+ print syscall
+ result = subprocess.call(syscall, shell = True, stdout = fnull, stderr = fnull)
+ fnull.close()
+
diff --git a/scripts/bwa_sw.py b/scripts/bwa_sw.py
new file mode 100644
index 0000000..bd0c258
--- /dev/null
+++ b/scripts/bwa_sw.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Oct 31 16:29:55 2011
+helper script to call bwa bwasw
+Copyright (c) 2013,
+Franziska Zickmann,
+ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+Distributed under the GNU Lesser General Public License, version 3.0
+"""
+#!/usr/bin/env python
+import os
+import optparse
+import subprocess
+
+parser = optparse.OptionParser()
+
+args = parser.parse_args()
+
+paras = args[1][0]
+orfFile = args[1][1]
+rnaFile = args[1][2]
+outFile = args[1][3]
+
+callPara = " "
+fnull = open(os.devnull, 'w')
+
+if paras == 0:
+ syscall = "bwa bwasw %s %s > %s" %(orfFile,rnaFile,outFile)
+ #os.system(syscall)
+ result = subprocess.call(syscall, shell = True, stdout = fnull, stderr = fnull)
+ fnull.close()
+else:
+ stringAr = paras[1:(len(paras)-1)].split("_")
+ for data in stringAr:
+ callPara=callPara+data+" "
+ syscall = "bwa bwasw%s %s %s> %s" %(callPara,orfFile,rnaFile,outFile)
+ #os.system(syscall)
+ print syscall
+ result = subprocess.call(syscall, shell = True, stdout = fnull, stderr = fnull)
+ fnull.close()
+
+
diff --git a/scripts/callCat.py b/scripts/callCat.py
new file mode 100644
index 0000000..349897f
--- /dev/null
+++ b/scripts/callCat.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+"""
+Copyright (c) 2013,
+Franziska Zickmann,
+ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+Distributed under the GNU Lesser General Public License, version 3.0
+"""
+#!/usr/bin/env python
+import os
+import optparse
+
+parser = optparse.OptionParser()
+
+args = parser.parse_args()
+
+fastaFiles = args[1][0]
+output = args[1][1]
+
+fileArr = fastaFiles.split("&&")
+
+fileNames = ""
+
+for file in fileArr:
+ fileNames = "%s %s" % (fileNames,file)
+
+syscall = "cat %s > %s" %(fileNames,output)
+os.system(syscall)
diff --git a/scripts/callCat_RnaFile.py b/scripts/callCat_RnaFile.py
new file mode 100644
index 0000000..3766ccd
--- /dev/null
+++ b/scripts/callCat_RnaFile.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+"""
+Copyright (c) 2013,
+Franziska Zickmann,
+ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+Distributed under the GNU Lesser General Public License, version 3.0
+"""
+#!/usr/bin/env python
+import os
+import optparse
+
+parser = optparse.OptionParser()
+
+args = parser.parse_args()
+
+fastqFiles = args[1][0]
+output = args[1][1]
+
+fileArr = fastqFiles.split("&&")
+
+fileNames = ""
+
+for file in fileArr:
+ fileNames = "%s %s" % (fileNames,file)
+
+syscall = "cat %s > %s" %(fileNames,output)
+os.system(syscall)
diff --git a/scripts/filterGenes.py b/scripts/filterGenes.py
new file mode 100644
index 0000000..f06f91c
--- /dev/null
+++ b/scripts/filterGenes.py
@@ -0,0 +1,56 @@
+'''
+filters a given GIIRA GTF file for all genes with sufficient support
+Copyright (c) 2013,
+Franziska Zickmann,
+ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+Distributed under the GNU Lesser General Public License, version 3.0
+'''
+
+import sys
+import numpy
+
+if len(sys.argv) <= 4:
+ print "Script to filter a given GIIRA GTF file for all genes with sufficient support."
+ print "Usage: python filterGenes.py [Path_TO_GTF] [PATH_OUTFILE] [WANT_AMBI_FILTER: (y/n)] [WANT_COVERAGE_FILTER: (y/n)] [WANT_UNIQUE_COVERAGE_FILTER: (y/n)]"
+ print "AMBI-FILTER = tag \"alsoUniqueSupport\" in GIIRA output"
+ print "COVERAGE-FILTER = tag \"coverageSupport\" in GIIRA output"
+ print "UNIQUE_COVERAGE-FILTER = tag \"hasEnoughUniques\" in GIIRA output"
+ print "Example: python filterGenes.py myGenes.gtf myGenes_filtered.gtf y y n"
+ print "This applies a filtering for genes only supported by ambiguous reads or lacking sufficient overall coverage."
+ sys.exit(1)
+if "-h" in sys.argv[1]:
+ print "Script to filter a given GIIRA GTF file for all genes with sufficient support."
+ print "Usage: python filterGenes.py [Path_TO_GTF] [PATH_OUTFILE] [WANT_AMBI_FILTER: (y/n)] [WANT_COVERAGE_FILTER: (y/n)] [WANT_UNIQUE_COVERAGE_FILTER: (y/n)]"
+ print "AMBI-FILTER = tag \"alsoUniqueSupport\" in GIIRA output"
+ print "COVERAGE-FILTER = tag \"coverageSupport\" in GIIRA output"
+ print "UNIQUE_COVERAGE-FILTER = tag \"hasEnoughUniques\" in GIIRA output"
+ print "Example: python filterGenes.py myGenes.gtf myGenes_filtered.gtf y y y"
+ print "This applies a filtering for genes only supported by ambiguous reads or lacking sufficient overall coverage."
+ sys.exit(1)
+
+pathToGTF = sys.argv[1]
+pathOutFile = sys.argv[2]
+wantMultiFilter = sys.argv[3]
+wantCovFilter = sys.argv[4]
+wantUniqueCovfilter = sys.argv[5]
+
+infile = open(pathToGTF, 'r')
+outfile = open(pathOutFile, 'w')
+
+for line in infile:
+ arr = line.rstrip().split("\t")
+ arrTag = arr[8].split(";")
+ wantLine = True
+
+ if "y" in wantMultiFilter:
+ if ": n" in arrTag[len(arrTag)-4]:
+ wantLine = False
+ if "y" in wantCovFilter:
+ if ": n" in arrTag[len(arrTag)-3]:
+ wantLine = False
+ if "y" in wantUniqueCovfilter:
+ if ": n" in arrTag[len(arrTag)-2]:
+ wantLine = False
+
+ if wantLine:
+ outfile.write(line);
\ No newline at end of file
diff --git a/scripts/getMeanCov.py b/scripts/getMeanCov.py
new file mode 100644
index 0000000..7a49600
--- /dev/null
+++ b/scripts/getMeanCov.py
@@ -0,0 +1,108 @@
+"""
+Copyright (c) 2013,
+Franziska Zickmann,
+ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+Distributed under the GNU Lesser General Public License, version 3.0
+"""
+
+import pysam
+import matplotlib.pyplot as plt
+import numpy as np
+from math import *
+import scipy.stats as stats
+import sys
+
+# calculate coverage without x-coverage bases
+def covWOzero(cov,x):
+ cov2 = np.array([])
+ count = 0
+ for i in cov:
+ if i!= x:
+ count += 1
+ cov2.resize(count)
+ cov2[count-1]=i
+ return cov2
+
+# only computes the maxCov without the need to refine the coverage map
+def computeMaxCov(cov,maxCov,it):
+ if it > 100:
+ return cov
+ # first remove all maxCov entries
+ while (np.max(cov)) == maxCov:
+ cov = np.delete(cov,(len(cov)-1))
+
+ mean = np.mean(cov)
+ median = np.median(cov)
+ if (median * 10.0) < mean :
+ maxCov = np.max(cov)
+ cov = computeMaxCov(cov,maxCov,(it+1))
+ maxCov = np.max(cov)
+
+ return cov
+
+nameIn = sys.argv[1] # name and path sam file
+nameOut = sys.argv[2]
+
+""" extract a genome coverage profile from a sam file. """
+sf = pysam.Samfile(nameIn,'r')
+
+cov = np.zeros((sum(sf.lengths),))
+start_pos = np.cumsum(sf.lengths)-sf.lengths[0]
+read_length = 0
+num_reads = 0
+for read in sf:
+ if not read.is_unmapped:
+ r_start = start_pos[read.tid] + read.pos # start position
+ r_end = start_pos[read.tid] + read.pos + read.qlen # end
+ cov[r_start:r_end] += 1
+ num_reads += 1
+ read_length += r_end-r_start
+
+
+#print "length including zero: %s" %(len(cov))
+
+# calculate coverage
+
+covWOZRef = covWOzero(cov,0)
+
+#print "length without zero: %s" %(len(covWOZRef))
+
+mean_cov_wozRef = np.mean(covWOZRef)
+percentileQuart_cov_wozRef = np.percentile(covWOZRef,25)
+median_cov_wozRef = np.median(covWOZRef)
+percentileUpQuart_cov_wozRef = np.percentile(covWOZRef,75)
+
+print "average: %s" %(mean_cov_wozRef)
+print "median: %s" %(median_cov_wozRef)
+print "25-quart: %s" %(percentileQuart_cov_wozRef)
+print "75-quart: %s" %(percentileUpQuart_cov_wozRef)
+
+maxCov = -1
+if (median_cov_wozRef * 10.0) < mean_cov_wozRef:
+ sys.setrecursionlimit(100000)
+ covSorted = np.sort(covWOZRef)
+ print "finished sorting!"
+ maxCov = np.max(covSorted)
+ itNum = 1
+ while True:
+ covSorted = computeMaxCov(covSorted,maxCov,1)
+ maxCov = np.max(covSorted)
+ mean = np.mean(covSorted)
+ median = np.median(covSorted)
+ print "iter %s max: %s, median: %s, average: %s" %(itNum,maxCov,median,mean)
+ itNum = itNum + 1
+ if (median * 10.0) >= mean:
+ break
+
+
+print "maximum threshold: %s" %(maxCov)
+
+outfile = open(nameOut,'w')
+outfile.write(str(min(percentileQuart_cov_wozRef,(mean_cov_wozRef/5.0))) + "\n");
+outfile.write(str(maxCov) + "\n")
+outfile.write("25-qua: " + str(percentileQuart_cov_wozRef) + "\n")
+outfile.write("median: " + str(median_cov_wozRef) + "\n")
+outfile.write("average: " + str(mean_cov_wozRef) + "\n")
+outfile.close()
+
+
diff --git a/scripts/lpCall.py b/scripts/lpCall.py
new file mode 100644
index 0000000..1a53620
--- /dev/null
+++ b/scripts/lpCall.py
@@ -0,0 +1,17 @@
+'''
+Created on 4 November 2013
+
+calls the glpsol lp solver to assign the rnas
+
+@author: zickmannf
+'''
+
+import sys
+import numpy
+import os
+
+inputFile = sys.argv[1]
+outputFile = sys.argv[2]
+
+syscall = "glpsol --lp %s --output %s" %(inputFile,outputFile)
+os.system(syscall)
diff --git a/scripts/sortReaSam.py b/scripts/sortReaSam.py
new file mode 100644
index 0000000..0aa8a32
--- /dev/null
+++ b/scripts/sortReaSam.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+"""
+Copyright (c) 2013,
+Franziska Zickmann,
+ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+Distributed under the GNU Lesser General Public License, version 3.0
+"""
+#!/usr/bin/env python
+import os
+import sys
+
+pathOut = sys.argv[1]
+
+syscall1 = "samtools view -b -S %sresultsRun/reassignedReads.sam > %sresultsRun/reassignedReads.bam" %(pathOut,pathOut)
+print syscall1
+os.system(syscall1)
+
+syscall2 = "samtools sort -n %sresultsRun/reassignedReads.bam %sresultsRun/reassignedReads_sorted" %(pathOut,pathOut)
+print syscall2
+os.system(syscall2)
+
+syscall3 = "samtools view -h -o %sresultsRun/reassignedReads_sorted.sam %sresultsRun/reassignedReads_sorted.bam" %(pathOut,pathOut)
+print syscall3
+os.system(syscall3)
\ No newline at end of file
diff --git a/src/geneFinder/BWA_Call.java b/src/geneFinder/BWA_Call.java
new file mode 100755
index 0000000..a9dc084
--- /dev/null
+++ b/src/geneFinder/BWA_Call.java
@@ -0,0 +1,167 @@
+package geneFinder;
+
+/**
+ * call BWA to map the rna reads against the reference genome
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ */
+
+
+import java.io.File;
+import java.io.IOException;
+
+
+public class BWA_Call {
+
+ /*
+ * map the reads against all ORFs using bwa
+ * creates a sam output file
+ * note: allows only one setting for the indexing (-a XX) and various ones in the aln step
+ */
+
+ public void callBWA(File refFile, String nameOutSamFile) {
+
+ File rnaFile = null;
+
+ if(GeneFinder.rnaFilesWithNames.keySet().size() > 1){
+
+ // more than one read file, so concatenate them to one big file
+
+ rnaFile = new File(GeneFinder.pathOut+"resultsRun/mergedReadFiles.fastq");
+ callConcatenateReadFiles(); // perform the concatenation
+
+ }else{
+ for(File readFile : GeneFinder.rnaFilesWithNames.keySet()){
+ rnaFile = readFile;
+ }
+ }
+
+ // now that we have the read file, do the alignment
+
+ System.out.println("Start to do the alignment with BWA...");
+ WriteOutput.writeToLogFile("Start to do the alignment with BWA... \n");
+
+ long timebef = System.currentTimeMillis();
+ long timeAfter = 0;
+
+
+ String nameIndexTool = "is"; // these are the variables for the different options given for each part of the BWA process
+ String optionString_aln = "";
+ String optionString_samse = "";
+
+ if(GeneFinder.settingMapper != null){
+ String[] returnArr = getMapperOptions_BWA(nameIndexTool,optionString_aln,optionString_samse); // get the options
+ nameIndexTool = returnArr[0];
+ optionString_aln = returnArr[1];
+ optionString_samse = returnArr[2];
+ }else{
+ optionString_aln = "0"; // "0" is the indicator that no options are provided
+ optionString_samse = "0";
+ }
+
+ System.out.print("Indexing.... "); // first step
+ String firstExe = "python " + GeneFinder.pathToHelpFiles+"bwa_index.py " + refFile + " " + nameIndexTool; // first index reference file
+ Giira.callAndHandleOutput(firstExe);
+
+ System.out.println("\nPerform alignment.... ");
+
+ if(!GeneFinder.useBWAsw){
+ String saiOut = GeneFinder.pathOut+"resultsRun/aln_sa.sai";
+
+ String secondExe = "python "+GeneFinder.pathToHelpFiles+"bwa_aln.py " + optionString_aln + " " + refFile + " " + rnaFile + " " + saiOut; // perform alignment
+ Giira.callAndHandleOutput(secondExe);
+
+ System.out.println("\nWrite to SAM format....");
+
+ String thirdExe = "python "+GeneFinder.pathToHelpFiles+"bwa_samse.py " + optionString_samse + " " + refFile + " " + saiOut + " " + rnaFile + " " + nameOutSamFile; //output: sam file
+ Giira.callAndHandleOutput(thirdExe);
+
+ }else{
+ String secondExe = "python "+GeneFinder.pathToHelpFiles+"bwa_sw.py " + optionString_samse + " " + refFile + " " + rnaFile + " " + nameOutSamFile; //output: sam file
+ Giira.callAndHandleOutput(secondExe);
+ }
+
+ System.out.println("\nDone.");
+ timeAfter = System.currentTimeMillis();
+ System.out.println("Time required for the alignment: "+ (double) (timeAfter-timebef)/1000.0 +"s.");
+ WriteOutput.writeToLogFile(" Done.\n Time required for the alignment: "+ (double) (timeAfter-timebef)/1000.0 +"s.\n");
+
+
+
+ }
+
+ /*
+ * if more than one read file is provided, concatenate them for BWA to provide one big merged read file
+ */
+
+ public static void callConcatenateReadFiles(){
+
+ System.out.println("Concatenate read files.");
+ Runtime prepRef = Runtime.getRuntime();
+ Process firstExe;
+
+ String allRnaNames = new String(); // necessary for python helper script
+
+ for(File rnaFileTemp : GeneFinder.rnaFilesWithNames.keySet()){ // generate the name
+ String name = GeneFinder.rnaFilesWithNames.get(rnaFileTemp);
+ allRnaNames += name+"&&";
+ }
+
+ try { // call python helper script
+ String exe = "python "+GeneFinder.pathToHelpFiles+"callCat_RnaFile.py " + allRnaNames + " " + GeneFinder.pathOut+"resultsRun/mergedReadFiles.fastq";
+ firstExe = prepRef.exec(exe);
+ firstExe.waitFor();
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /*
+ * if options are provided by the user, extract those options
+ */
+
+ public static String[] getMapperOptions_BWA(String nameIndexTool, String optionString_aln, String optionString_samse){
+
+ String[] splitParts = GeneFinder.settingMapper.split("_");
+ for(int i=0;i<splitParts.length;i++){ // each part is for one of the parts of BWA (indexing, aln, samse)
+ String[] paraSplit = splitParts[i].split(",");
+ for(String para : paraSplit){
+ if(i==0){
+ String[] paraArr = para.split("=");
+ nameIndexTool = paraArr[1];
+ }else if(i==1){ // aln part, add option here
+ String[] paraArr = para.split("=");
+ if(paraArr.length == 1){ // this means an option that only consists of one tag
+ optionString_aln = optionString_aln+paraArr[0]+"_";
+ } else{
+ optionString_aln = optionString_aln+paraArr[0]+"_"+paraArr[1]+"_";
+ }
+ }else{ // samse part, add option here
+ String[] paraArr = para.split("=");
+ if(paraArr.length == 1){ // this means an option that only consists of one tag
+ optionString_samse = optionString_samse+paraArr[0]+"_";
+ } else{
+ optionString_samse = optionString_samse+paraArr[0]+"_"+paraArr[1]+"_";
+ }
+ }
+ }
+
+ }
+
+ // now prepare the option string in a way that it can be passed to the python helper script
+
+ if(splitParts[1].length() > 1){
+ optionString_aln="["+optionString_aln.substring(0, (optionString_aln.length()-1))+"]";
+ }
+ if(splitParts[2].length() > 1){
+ optionString_samse="["+optionString_samse.substring(0, (optionString_samse.length()-1))+"]";
+ }
+
+ return new String[] {nameIndexTool,optionString_aln,optionString_samse};
+ }
+
+}
diff --git a/src/geneFinder/CalculateScores.java b/src/geneFinder/CalculateScores.java
new file mode 100755
index 0000000..b71c1dd
--- /dev/null
+++ b/src/geneFinder/CalculateScores.java
@@ -0,0 +1,173 @@
+package geneFinder;
+
+
+/**
+ * based on the assigned reads, assign a score to each identified gene
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+import java.util.Vector;
+
+import types.*;
+
+public class CalculateScores {
+
+ public static double minScoreClust; // smallest positive gene score seen by scoreCalculation; necessary for the normalization later on
+ public static double maxScoreClust; // largest gene score seen by scoreCalculation; reset together with minScoreClust in assignGeneScores
+
+ /*
+ * depending on the number and quality of assigned reads calculate a score for each cluster
+ * multiple and uniques are distinguished via certain weighting
+ */
+
+ public static double[] assignGeneScores(boolean befOpti){
+ // Scores every gene (and its twin, if present) on all mapped contigs and returns {minScoreClust, maxScoreClust}.
+ minScoreClust = Double.MAX_VALUE;
+ // Double.MIN_VALUE is the smallest POSITIVE double, not the most negative one, so it is no valid start value for a maximum
+ maxScoreClust = -Double.MAX_VALUE;
+
+ for(String contigName : GeneFinder.mappedContigs.keySet()){
+
+ Contig thisContig = GeneFinder.mappedContigs.get(contigName);
+ // twins that are kept as additional genes; they are appended after the loop, because allGenes is being iterated
+ Vector<Gene> twinsAlternativeSplicing = new Vector<Gene>();
+
+ for(Gene cluster : thisContig.allGenes){
+
+ scoreCalculation(cluster, contigName, befOpti);
+
+ if(cluster.twinNode != null){
+ scoreCalculation(cluster.twinNode, contigName, befOpti);
+
+ // make sure that only one of the twins is included
+
+ if((cluster.twinNode.score != 0) && (cluster.score != 0)){
+ // also include twin node in contig genes
+ twinsAlternativeSplicing.add(cluster.twinNode);
+ cluster.twinNode = null;
+ } else if(cluster.twinNode.score == 0){
+ cluster.twinNode = null;
+ } else{
+ // NOTE(review): this only rebinds the loop variable, allGenes still holds the zero-score gene - confirm this is intended
+ cluster = cluster.twinNode;
+ cluster.twinNode = null;
+ }
+ }
+ }
+
+ thisContig.allGenes.addAll(twinsAlternativeSplicing);
+ }
+
+ return new double[] {minScoreClust,maxScoreClust};
+ }
+
+ /*
+ * does the calculation
+ */
+
+ public static void scoreCalculation(Gene cluster, String contigName, boolean befOpti){
+ // Recomputes the exons of the gene, sets cluster.score to the summed read weights divided by the exon length,
+ // recounts cluster.numOfMultis and updates minScoreClust/maxScoreClust.
+ // first determine exonlength
+ cluster.exonsOfGene.clear();
+ cluster.exonLength = 0;
+ FindExonsOfGene.findExonsForGene(cluster); // TODO: also check if exons are there!
+ OptimizeAmbis.sumUpExonLengths(cluster, cluster.exonsOfGene);
+
+ double exonLength = cluster.exonLength;
+
+ if(exonLength <= 0){
+ System.err.println("Exon length <= 0 for gene " + cluster.geneID);
+ }
+ double scoreRnaPart = 0.0;
+ cluster.numOfMultis = 0;
+
+ for(String rnaKey : cluster.idTOassociatedRnas.keySet()){
+ Rna rna = ((Rna) cluster.idTOassociatedRnas.get(rnaKey)[0]);
+ double uniqueFactor = 1.0;
+
+ if(rna.isMulti != 0 || rna.isSharedBy.size() > 0){
+ uniqueFactor = (1.0)/(rna.hitNum); // a read with several hits only contributes 1/hitNum
+ if(rna.isMulti == 1){
+ cluster.numOfMultis++;
+ if(GeneFinder.iteration == 1 && !befOpti){
+ prepareRnasForIteration(rna, cluster,contigName);
+ }
+ }
+ }
+
+ scoreRnaPart += (rna.quality * GeneFinder.readLength * uniqueFactor); // * mapQual was excluded, because multi hits might have always mapQual = 0
+ }
+
+ // dividing by an exon length <= 0 yields Infinity/NaN (or a negative score) and would distort the
+ // min/max values used for normalization, so such a gene gets the score 0 instead
+ if(exonLength > 0){
+ cluster.score = scoreRnaPart/exonLength;
+ }else{
+ cluster.score = 0.0;
+ }
+ if(cluster.score > 0 && cluster.score < minScoreClust){
+ minScoreClust = cluster.score;
+ }
+ if(cluster.score > maxScoreClust ){
+ maxScoreClust = cluster.score;
+ }
+ }
+
+ /*
+ * calculate score for alternative isoforms, depending on their supporting reads (as fraction of all reads supporting this gene)
+ * note: will be called during output writing
+ * Careful: it is necessary to make sure that this method is only called for non-fake splice keys!!
+ */
+
+ public static double calculateIsoformScore(Gene cluster, int spliceKey, int positionInVector){
+ // share of the gene's reads that support this isoform; multiplied with 1000 to avoid too small numbers
+ int supportingReads = ((Vector<Vector<Rna>>)cluster.possibleIntrons.get(spliceKey)[2]).get(positionInVector).size();
+ int allReads = cluster.idTOassociatedRnas.keySet().size();
+ return ((double) supportingReads * 1000.0)/((double) allReads);
+ }
+
+ /*
+ * to initialize iteration
+ */
+
+ public static void prepareRnasForIteration(Rna rna, Gene gene, String contigName){
+ // Clears rna.isSharedBy and writes one SAM line with NH:i:1 for the read to reassignedReads.sam.
+ // With several stored hits, all hits outside this gene are dropped and the remaining hit with the smallest position is written.
+ Contig thisContig = GeneFinder.mappedContigs.get(contigName);
+ rna.isSharedBy.clear();
+ int numHits = rna.contigsMappedOn.size();
+
+ if(numHits == 1){
+ Object[] onlyHit = rna.contigsMappedOn.get(0);
+ String samLine = rna.rnaID + "\t0\t"+ contigName + "\t" + ((Integer)onlyHit[1]) + "\t0\t" + ((String)onlyHit[2]) + "\t*\t*\t*\t*\t*\tNH:i:1\n";
+ WriteOutput.writeToOtherFile(GeneFinder.pathOut+"resultsRun/reassignedReads.sam",samLine);
+ return;
+ }
+
+ if(numHits < 1){
+ return; // nothing mapped, nothing to write
+ }
+
+ int smallestPos = Integer.MAX_VALUE;
+ String smallestPosLine = "";
+
+ for(int idx = numHits-1; idx >= 0; idx--){ // backwards, because hits are removed inside the loop
+ Object[] hit = rna.contigsMappedOn.get(idx);
+ int hitPos = (Integer) hit[1];
+ if((hitPos <= gene.startPos) || (hitPos >= gene.stopPos) || !(((Contig) hit[0]).equals(thisContig))){
+ rna.contigsMappedOn.removeElementAt(idx);
+ }else if(smallestPos > hitPos){
+ smallestPosLine = rna.rnaID + "\t0\t"+ contigName + "\t" + hitPos + "\t0\t" + ((String)hit[2]) + "\t*\t*\t*\t*\t*\tNH:i:1\n";
+ smallestPos = hitPos;
+ }
+ }
+
+ // if no hit was inside the gene an empty string is passed on
+ WriteOutput.writeToOtherFile(GeneFinder.pathOut+"resultsRun/reassignedReads.sam",smallestPosLine);
+ }
+}
diff --git a/src/geneFinder/CleanAfterAmbiOpti.java b/src/geneFinder/CleanAfterAmbiOpti.java
new file mode 100755
index 0000000..639babc
--- /dev/null
+++ b/src/geneFinder/CleanAfterAmbiOpti.java
@@ -0,0 +1,382 @@
+package geneFinder;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Vector;
+
+import types.Gene;
+import types.Rna;
+
+/**
+ * after the optimization, assign ambiguous reads to their final position + erase all genes and isoforms not any longer supported by reads
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ */
+
+public class CleanAfterAmbiOpti {
+
+ /*
+ * method that parses the solution file of cplex
+ */
+
+ public static void parse_solution_and_clean_CPLEX(Map<String,Object[]> multiRnas, int fVar_counter){
+ // Reads the <variables> part of solutionCPLEX_it<iteration>.sol, skips the first fVar_counter variables and matches every further variable by position to the (read, candidate gene) pairs of multiRnas; reads whose variable is 0 are erased from that candidate.
+ long timeBefClean = System.currentTimeMillis();
+
+ Gene currentCluster = null; // to avoid searching for a candidate that we already found
+ int processedNum = 0; // for progress reporting
+
+ String rnaKey = ""; // Initialization to avoid long run times
+ int countRna = 0;
+
+ Iterator<String> it = multiRnas.keySet().iterator();
+
+ if(it.hasNext()){
+ rnaKey = it.next();
+ }
+
+ int counterVars = 0;
+ try{
+ BufferedReader br = new BufferedReader(new FileReader(GeneFinder.pathOut+"resultsRun/solutionCPLEX_it" + GeneFinder.iteration + ".sol"));
+
+ System.out.println("Parsing solution of cplex and clean ambiguous reads... ");
+ System.out.print("Processed: ");
+
+ String line = "";
+
+ while((line = br.readLine()) != null){ // go through file until we reach the part where we find the variables
+ if(line.contains("<variables>")){
+ break;
+ }
+ }
+
+ while(((line = br.readLine()) != null) && (!line.contains("</variables>"))){ // make sure to stop when variable part is over
+
+ String[] lineSplit1 = line.split(" "); // split at blanks; index 5 is used below to extract the value
+
+ counterVars++;
+ if(counterVars > fVar_counter){
+
+ String[] valueSplit = lineSplit1[5].split("\""); // to extract the value
+
+ int score = 0;
+
+ if(!(valueSplit[1].equals("0") || valueSplit[1].equals("1"))){ // sometimes cplex does not round
+ double sol = Double.parseDouble(valueSplit[1]);
+ if(sol<=0.5){ // if 2 variables are both 0.5, then we cannot take both! so remove both because no real support!
+ score = 0;
+ }else{
+ score = 1;
+ }
+ }else{
+ score = Integer.parseInt(valueSplit[1]); // directly grab the score, without rounding
+ }
+
+ // now search for the right gene using the list of genes associated to each ambiguous read (more complicated than storing a map, but also more memory efficient)
+ // countRna is the index into the gene list of the current read; once it reaches the list size the next read is taken
+ if(countRna == (((Vector<Gene>) multiRnas.get(rnaKey)[1]).size())){
+ if(it.hasNext()){
+ rnaKey = it.next();
+ countRna = 0;
+ }else{ // NOTE(review): more variables than (read, gene) pairs - only reported, processing continues with the old rnaKey and countRna
+ System.out.println("crash at: " + line);
+ System.out.println("fCount: " + fVar_counter);
+ System.out.println("var Count: " + counterVars);
+ }
+ }
+
+ if(score == 0){ // read was assigned elsewhere, so delete it from the candidate of this variable
+
+ // get variable name by search in binaryVars
+
+ if(currentCluster == null){ // to avoid exceptions in searchAndErase fkt.
+ currentCluster = ((Vector<Gene>) multiRnas.get(rnaKey)[1]).get(0);
+ }
+
+ currentCluster = searchAndErase_CPLEX(multiRnas, rnaKey, currentCluster, countRna); // performs the deleting task
+
+ if(currentCluster == null){
+ break; // this means we did not find the candidate, so something is wrong
+ }
+ }
+
+ countRna++;
+
+ processedNum++;
+
+ if(processedNum % 100000 == 0){
+ System.out.print(processedNum + "; ");
+ }
+ }
+ }
+
+ br.close();
+ } catch (IOException e) { // NOTE(review): any read error (e.g. missing solution file) ends the program with exit status 0
+ System.out.println("Create solution file:");
+ System.exit(0);
+ }
+
+ // log messages
+ System.out.println(processedNum + ".");
+ System.out.println("Done.");
+ long timeAfterClean = System.currentTimeMillis();
+
+ System.out.println("Time needed to parse solution and to clean up after ambiguous reads assignment: "+(double) (timeAfterClean-timeBefClean)/1000.0 +"s.");
+ WriteOutput.writeToLogFile("Time needed to parse solution and to clean up after ambiguous reads assignment: "+(double) (timeAfterClean-timeBefClean)/1000.0 +"s.");
+ Runtime r=Runtime.getRuntime();
+ r.gc();
+ r.gc(); // to keep the memory requirements down as much as possible
+
+ }
+
+ /*
+ * method that parses the solution file of glpk
+ */
+
+ public static void parse_solution_and_clean_GLPK(Map<String,Object[]> multiRnas){
+
+ long timeBefClean = System.currentTimeMillis();
+
+ Gene currentCluster = null; // to avoid searching twice for the same candidate
+ int processedNum = 0; // for progress reporting
+
+ try{
+ BufferedReader br = new BufferedReader(new FileReader(GeneFinder.pathOut+"resultsRun/solutionGLPK_out_it" + GeneFinder.iteration + ".out"));
+
+ System.out.println("Parsing solution of glpk and clean ambiguous reads... ");
+ System.out.print("Processed: ");
+
+ String line = "";
+
+ while((line = br.readLine()) != null){ // proceed through file until we reach the variable part
+ if(line.contains("x__") && !line.contains("__f")){
+ break;
+ }
+ }
+
+ while((line != null) && line.contains("x__")){
+
+ String[] lineSplit1 = line.split(" "); // position 1 contains name, value is contained in next line
+
+ if(line.contains("x__") && (!line.contains("__f"))){
+
+ // we arrived at a "real" ambiguous read constraint
+
+ line = br.readLine();
+
+ int score = Integer.parseInt(line.substring(line.indexOf("*")+1,line.indexOf("*")+20).trim()); // the first integer after the "*" is the score we need
+
+ // now search for the right gene using the list of genes associated to each ambiguous read (more complicated than storing a map, but also more memory efficient)
+
+ if(score == 0){ // read was assigned elsewhere, so delete it from the candidate of this variable
+ String[] nameSep = lineSplit1[lineSplit1.length-1].split("__");
+ String varName_read = "x__"+nameSep[1];
+
+ if(currentCluster == null){ // are there any exceptions possible?
+ currentCluster = ((Vector<Gene>) multiRnas.get(varName_read)[1]).get(0);
+ }
+
+ int idGene = Integer.parseInt(nameSep[2]);
+ currentCluster = searchAndErase(multiRnas, varName_read, idGene, currentCluster); // performs the deleting task
+
+ if(currentCluster == null){
+ break; // this means we did not find the candidate, so something is wrong
+ }
+ }
+
+ processedNum++;
+
+ if(processedNum % 50000 == 0){
+
+ System.out.print(processedNum + "; ");
+ Runtime r=Runtime.getRuntime();
+ r.gc();
+ r.gc(); // to keep the memory requirements down as much as possible
+ }
+ }
+
+ line = br.readLine();
+ }
+
+ br.close();
+ } catch (IOException e) {
+ System.out.println("Create solution file:");
+ System.exit(0);
+ }
+
+ // log messages:
+
+ System.out.println(processedNum + ".");
+ System.out.println("Done.");
+ long timeAfterClean = System.currentTimeMillis();
+
+ Runtime r=Runtime.getRuntime();
+ r.gc();
+ r.gc(); // to keep the memory requirements down as much as possible
+ System.out.println("Time needed to parse solution and to clean up after ambiguous reads assignment: "+(double) (timeAfterClean-timeBefClean)/1000.0 +"s.");
+ WriteOutput.writeToLogFile("Time needed to parse solution and to clean up after ambiguous reads assignment: "+(double) (timeAfterClean-timeBefClean)/1000.0 +"s.");
+
+ }
+
+ /*
+ * search the gene this variable is assigned to and erase the rna from all lists
+ */
+
+ public static Gene searchAndErase(Map<String,Object[]> multiRnas, String varName_read, int idGene, Gene currentCluster){
+ // Finds the candidate with id idGene among the genes stored for this read (re-using currentCluster if it already
+ // has that id) and deletes the read from the candidate's read map, intron support and fussy exon support.
+ // Returns the candidate, or null if it could not be found.
+
+ if(idGene != currentCluster.geneID){ // otherwise we do not need the time consuming search
+
+ Gene match = null;
+ Vector<Gene> candidates = ((Vector<Gene>) multiRnas.get(varName_read)[1]);
+
+ for(Gene candidate : candidates){
+
+ if(candidate.geneID == idGene){
+ match = candidate;
+ // the found candidate leaves the list (speed up: it is not looked at again for this read), its twin takes its place
+ if(candidate.twinNode != null){
+ candidates.add(candidate.twinNode);
+ }
+ candidates.removeElement(candidate);
+ break;
+ }
+
+ if((candidate.twinNode != null) && (candidate.twinNode.geneID == (idGene))){ // if the candidate does not have the correct id, then maybe its twin
+ match = candidate.twinNode;
+ break;
+ }
+ }
+
+ currentCluster = match;
+ }
+
+ if(currentCluster == null){
+ System.out.println("Did not find the node!"); // This would mean that something is wrong
+ return null;
+ }
+
+ // this read must be deleted from the list of associated reads of this candidate
+
+ String readId = varName_read.substring(3); // the reads are stored without the first three characters of the variable name
+ if(!currentCluster.idTOassociatedRnas.containsKey(readId)){
+ System.out.println("Did not find the read "+ readId + " in the candidate " + currentCluster.geneID + "!");
+ return currentCluster;
+ }
+
+ Object[] readInfo = currentCluster.idTOassociatedRnas.get(readId); // grab the necessary information from map
+ Rna rna = ((Rna) readInfo[0]);
+ int[] spliceSupport = ((int[]) readInfo[1]);
+ int fussyExonSupport = ((Integer) readInfo[2]);
+
+ if(spliceSupport[0] != -1 && (currentCluster.possibleIntrons.get(spliceSupport[0])[0] != null)){ // check if this was a read supporting a split
+
+ Object[] siteInfo = currentCluster.possibleIntrons.get(spliceSupport[0]);
+ Vector<Vector<Rna>> supporters = ((Vector<Vector<Rna>>) siteInfo[1]);
+
+ for(int pos = 0; pos < supporters.size(); ++pos){
+
+ int[] intron = ((Vector<int[]>) siteInfo[0]).get(pos);
+ if(intron[1] != spliceSupport[1]){
+ continue; // not the intron this read supports
+ }
+
+ supporters.get(pos).removeElement(rna);
+
+ if(supporters.get(pos).size() == 0){
+ // no support left: mark intron as intron to be erased, note that exons are only erased if no other support
+ currentCluster.eraseIntrons_temp.add(intron);
+ }
+
+ break;
+ }
+ }
+
+ if(fussyExonSupport != -1 && currentCluster.possibleFussyExons.containsKey(fussyExonSupport)){ // also erase read from fussyExon support list
+ currentCluster.possibleFussyExons.get(fussyExonSupport).removeElement(rna);
+ if(currentCluster.possibleFussyExons.get(fussyExonSupport).size() == 0){
+ // erase and also update in possibleIntrons
+ currentCluster.possibleFussyExons.remove(fussyExonSupport);
+
+ if(currentCluster.possibleIntrons.containsKey(fussyExonSupport)){
+ currentCluster.possibleIntrons.get(fussyExonSupport)[2] = -1;
+ }
+ }
+ }
+
+ currentCluster.idTOassociatedRnas.remove(readId);
+
+ return currentCluster;
+ }
+
+ /*
+ * search the gene this variable is assigned to and erase the rna from all lists
+ */
+
+ public static Gene searchAndErase_CPLEX(Map<String,Object[]> multiRnas, String varName_read, Gene currentCluster, int posInVec){
+ // Takes the candidate at index posInVec of the gene list stored for this read (the passed currentCluster is
+ // overwritten) and deletes the read from the candidate's read map, intron support and fussy exon support.
+ currentCluster = ((Vector<Gene>) multiRnas.get(varName_read)[1]).get(posInVec);
+
+ // this read must be deleted from the list of associated reads of this candidate
+ String readId = varName_read.substring(3); // the reads are stored without the first three characters of the variable name
+ if(!currentCluster.idTOassociatedRnas.containsKey(readId)){
+ System.out.println("Did not find the read "+ readId + " in the candidate " + currentCluster.geneID + "!");
+ return currentCluster;
+ }
+
+ Object[] readInfo = currentCluster.idTOassociatedRnas.get(readId); // grab the necessary information from map
+ Rna rna = ((Rna) readInfo[0]);
+ int[] spliceSupport = ((int[]) readInfo[1]);
+ int fussyExonSupport = ((Integer) readInfo[2]);
+
+ if(spliceSupport[0] != -1 && (currentCluster.possibleIntrons.get(spliceSupport[0])[0] != null)){ // check if this was a read supporting a split
+
+ Object[] siteInfo = currentCluster.possibleIntrons.get(spliceSupport[0]);
+ Vector<Vector<Rna>> supporters = ((Vector<Vector<Rna>>) siteInfo[1]);
+
+ for(int pos = 0; pos < supporters.size(); ++pos){
+
+ int[] intron = ((Vector<int[]>) siteInfo[0]).get(pos);
+ if(intron[1] != spliceSupport[1]){
+ continue; // not the intron this read supports
+ }
+
+ supporters.get(pos).removeElement(rna);
+
+ if(supporters.get(pos).size() == 0){
+ // no support left: mark intron as intron to be erased, note that exons are only erased if no other support
+ currentCluster.eraseIntrons_temp.add(intron);
+ }
+
+ break;
+ }
+ }
+
+ if(fussyExonSupport != -1 && currentCluster.possibleFussyExons.containsKey(fussyExonSupport)){ // also erase read from fussyExon support list
+ currentCluster.possibleFussyExons.get(fussyExonSupport).removeElement(rna);
+ if(currentCluster.possibleFussyExons.get(fussyExonSupport).size() == 0){
+ // erase and also update in possibleIntrons
+ currentCluster.possibleFussyExons.remove(fussyExonSupport);
+
+ if(currentCluster.possibleIntrons.containsKey(fussyExonSupport)){
+ currentCluster.possibleIntrons.get(fussyExonSupport)[2] = -1;
+ }
+ }
+ }
+
+ currentCluster.idTOassociatedRnas.remove(readId);
+
+ return currentCluster;
+ }
+
+}
diff --git a/src/geneFinder/DefineAlternativeTranscripts.java b/src/geneFinder/DefineAlternativeTranscripts.java
new file mode 100755
index 0000000..0c8fd6d
--- /dev/null
+++ b/src/geneFinder/DefineAlternativeTranscripts.java
@@ -0,0 +1,423 @@
+package geneFinder;
+
+import java.util.*;
+
+import types.*;
+
+/**
+ * as a final step, define alternative transcripts for all genes with contradicting splicing events
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class DefineAlternativeTranscripts {
+
+
+ /*
+ * go through all splice sites, if a site has more than one intron or is fuzzy, then create a new transcript, defined by the intron
+ */
+
+ public static boolean searchForTranscripts(Gene gene){
+ // Visits every splice site with more than one intron and sorts its introns into alternative transcripts.
+ // Returns false if at least one intron had to be stored as a new alternative transcript, true otherwise.
+ boolean onlyTranscriptStarts = true;
+
+ for(int split : gene.possibleIntrons.keySet()){
+ Object[] siteInfo = gene.possibleIntrons.get(split);
+ if((siteInfo[0] == null) || (((Vector<int[]>) siteInfo[0]).size() <= 1)){
+ continue; // no competing introns at this site
+ }
+ if(firstTestIfEraseIntron(gene, split) <= 1){
+ continue; // at most one intron survived the erase test
+ }
+
+ Vector<int[]> introns = ((Vector<int[]>) gene.possibleIntrons.get(split)[0]);
+ for(int i = 0; i < introns.size(); ++i){
+ int[] intron = introns.get(i);
+ if(findSuitableTranscriptPartner(gene, intron)){
+ continue; // could be attached to an existing transcript
+ }
+ Vector<int[]> correspondingPartner = new Vector<int[]>();
+ correspondingPartner.add(intron);
+ gene.alternativeTranscripts.add(new Object[] {gene.startPos, gene.startPos, gene.startPos, gene.startPos, gene.startPos, correspondingPartner});
+ onlyTranscriptStarts = false;
+ }
+ }
+ return onlyTranscriptStarts;
+ }
+
+ /*
+ * before different transcripts are assigned, check if there are indeed more than 1 left
+ */
+
+ public static int firstTestIfEraseIntron(Gene gene, int split){
+ // Removes all introns of this splice site that are marked in gene.eraseIntrons_temp (together with their
+ // entries in the parallel vector at position 1) and returns how many introns are left.
+
+ int idx = ((Vector<int[]>) gene.possibleIntrons.get(split)[0]).size() - 1;
+ while(idx >= 0){ // backwards, so a removal does not shift the indices still to visit
+ int[] candidate = ((Vector<int[]>) gene.possibleIntrons.get(split)[0]).get(idx);
+ if(gene.eraseIntrons_temp.contains(candidate)){
+ FindExonsOfGene.searchCorrespondingExons(gene, candidate);
+ Object[] siteInfo = gene.possibleIntrons.get(split); // looked up only after the helper call
+ ((Vector<int[]>) siteInfo[0]).removeElementAt(idx);
+ ((Vector<int[]>) siteInfo[1]).removeElementAt(idx);
+ }
+ idx--;
+ }
+
+ return ((Vector<int[]>) gene.possibleIntrons.get(split)[0]).size();
+ }
+
+ /*
+ * we have an alternative splicing event, so now check whether we can assign it to an existing transcript
+ */
+
+ public static boolean findSuitableTranscriptPartner(Gene gene, int[] intron){
+ // Attaches the intron to the first alternative transcript whose positions 0 and 1 (and all recorded partners) lie
+ // before the intron start. Returns true if such a transcript was found, false if none fits.
+ for(int posi = 0; posi < gene.alternativeTranscripts.size(); ++posi){
+
+ Object[] transcriptPartner = gene.alternativeTranscripts.get(posi);
+
+ if((((Integer) transcriptPartner[0]).intValue() < intron[0]) && (((Integer) transcriptPartner[1]).intValue() < intron[0])){
+ boolean suitable = true;
+ if(transcriptPartner.length >= 6){
+ if((((Integer) transcriptPartner[4]).intValue() == -1) || ((transcriptPartner.length == 7) && (((Integer) transcriptPartner[6]).intValue() != 0))){ // because at the moment these ones are erased anyway or are indicating isoform ends
+ // only this transcript is unusable: no break here, it would leave the outer loop and hide all later transcripts
+ suitable = false;
+ }else{
+ for(int[] partner : ( (Vector<int[]>) transcriptPartner[5])){
+ if(!((partner[0] < intron[0]) && (partner[1] < intron[0]))){
+ suitable = false;
+ break;
+ }
+ }
+ }
+ }
+
+ if(suitable){
+ if(transcriptPartner.length >= 6){
+ ((Vector<int[]>) transcriptPartner[5]).add(intron);
+ }else{
+ Vector<int[]> partner_temp = new Vector<int[]>();
+ partner_temp.add(intron);
+ if(transcriptPartner.length >= 5){
+ gene.alternativeTranscripts.setElementAt(new Object[] {transcriptPartner[0], transcriptPartner[1], transcriptPartner[2], transcriptPartner[3], transcriptPartner[4], partner_temp},posi);
+ }else{
+ gene.alternativeTranscripts.setElementAt(new Object[] {transcriptPartner[0], transcriptPartner[1], transcriptPartner[2], transcriptPartner[3], transcriptPartner[0], partner_temp},posi);
+ }
+
+ }
+
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
+ /*
+ * during output writing, it is important which exon has to be assigned to which transcript
+ */
+
+ public static Object[] assignExonsToTranscripts(Gene gene, Object[] altTranscript){
+ // Collects the exons that belong to the given alternative transcript; returns {Vector<int[]> exons, minExonStart, maxExonEnd}.
+ Vector<int[]> exons = new Vector<int[]>();
+
+ int maxExonEnd = -1;
+ int minExonStart = Integer.MAX_VALUE;
+ // branch 1: positions 0 and 1 of the transcript are equal (presumably a transcript start rather than an intron - TODO confirm)
+ if(((Integer) altTranscript[0]).intValue() == ((Integer) altTranscript[1]).intValue()){
+ if(gene.onRevStrand && (((Integer) altTranscript[3]).intValue() != -1)){
+ minExonStart = (Integer) altTranscript[3];
+ }else if(!gene.onRevStrand && (((Integer)altTranscript[2]).intValue() != -1)){
+ minExonStart = (Integer) altTranscript[2];
+ }else if(((Integer) altTranscript[2] == -1) && (((Integer)altTranscript[3]).intValue() == -1)){
+ minExonStart = (Integer) altTranscript[0];
+ }else{
+ minExonStart = Math.max(((Integer) altTranscript[2]).intValue(),((Integer) altTranscript[3]).intValue());
+ }
+
+ int intronEndForFirstExon = -1;
+ boolean goOn = true;
+
+ if(altTranscript.length == 6){
+ intronEndForFirstExon = ((Integer) altTranscript[4]).intValue();
+ }else if(altTranscript.length == 7){
+ int valueSeven = ((Integer) altTranscript[6]);
+ if(valueSeven == Integer.MAX_VALUE){
+ maxExonEnd = ((Integer) altTranscript[4]).intValue() + GeneFinder.readLength;
+ int[] newExon = {minExonStart,maxExonEnd};
+ exons.add(newExon);
+ goOn = false;
+ }
+ }
+
+ if(goOn){
+
+ boolean foundFirstExon = false;
+
+ for(int[] exon : gene.exonsOfGene){
+ if(exon[0] >= ((Integer)altTranscript[0]).intValue()){
+ // is not contained in intron, so its part of this transcript
+
+ if(checkCorrespondingPartner(altTranscript,exon)){
+ if(exon[0] == intronEndForFirstExon){
+ foundFirstExon = true;
+ int startNew = (Integer)altTranscript[0];
+ if(startNew != minExonStart){ // in effect startNew always ends up as minExonStart
+ startNew = minExonStart;
+ }
+ int[] newExon = {startNew,exon[1]};
+ if(FindExonsOfGene.checkIfExonIsContained(exons, newExon) == 0){
+ exons.add(newExon);
+ altTranscript = addExonAsFakeTranscriptPartner(altTranscript,newExon);
+ }
+ if(maxExonEnd < exon[1]){
+ maxExonEnd = exon[1];
+ }
+ }else{
+ if(FindExonsOfGene.checkIfExonIsContained(exons, exon) == 0){
+ exons.add(exon);
+ altTranscript = addExonAsFakeTranscriptPartner(altTranscript,exon);
+ }
+
+ if(maxExonEnd < exon[1]){
+ maxExonEnd = exon[1];
+ }
+ }
+ }
+
+ }
+ }
+
+ if(!foundFirstExon && altTranscript.length >= 6 && intronEndForFirstExon != -1){
+
+ int startNew = (Integer)altTranscript[0];
+ if(startNew != minExonStart){
+ startNew = minExonStart;
+ }
+
+ int[] newExon = {startNew,intronEndForFirstExon};
+ if(FindExonsOfGene.checkIfExonIsContained(exons, newExon) == 0){
+ exons.add(newExon);
+ altTranscript = addExonAsFakeTranscriptPartner(altTranscript,newExon);
+ }
+
+ if(maxExonEnd <intronEndForFirstExon){
+ maxExonEnd = intronEndForFirstExon;
+ }
+ }
+ }
+ }else{
+ // branch 2: positions 0 and 1 differ; exons lying inside the span between them are excluded below
+ boolean exonAfterEnd = false;
+ boolean haveIsoformEndFussy = false;
+ if(altTranscript.length == 7){
+ // this is a transcript because of a fussy exon, so treat first two positions not as an intron
+ exons.add(new int[] {(Integer)altTranscript[0],((Integer)altTranscript[1])+1});
+ if(maxExonEnd < (((Integer)altTranscript[1])+1)){
+ maxExonEnd = ((Integer)altTranscript[1])+1;
+ }
+ if(minExonStart > ((Integer)altTranscript[0])){
+ minExonStart = (Integer)altTranscript[0];
+ }
+
+ int endIso = (Integer) altTranscript[6];
+ if(endIso == 1){
+ haveIsoformEndFussy = true;
+ }
+ }
+
+ for(int[] exon : gene.exonsOfGene){
+ if(((exon[0] < ((Integer)altTranscript[0]).intValue()) && (exon[1] <= ((Integer)altTranscript[0]).intValue())) || (exon[0] >= ((Integer)altTranscript[1]).intValue() && exon[1] > ((Integer)altTranscript[1]).intValue())){
+ // is not contained in intron, so its part of this transcript
+
+ if((haveIsoformEndFussy && ((exon[0] >= ((Integer)altTranscript[1]).intValue() && exon[1] > ((Integer)altTranscript[1]).intValue())))){
+ exonAfterEnd = true;
+ }else{
+ if(checkCorrespondingPartner(altTranscript,exon)){
+
+ if(FindExonsOfGene.checkIfExonIsContained(exons, exon) == 0){
+ exons.add(exon);
+ altTranscript = addExonAsFakeTranscriptPartner(altTranscript,exon);
+ }
+
+ if(maxExonEnd < exon[1]){
+ maxExonEnd = exon[1];
+ }
+ if(minExonStart > exon[0]){
+ minExonStart = exon[0];
+ }
+
+ }
+ }
+
+ }
+ }
+
+ if(haveIsoformEndFussy && !exonAfterEnd){
+ //the last exon has to assigned stopPos of gene
+ int[] thisExon = {(Integer)altTranscript[0],((Integer)altTranscript[1])+1};
+ for(int[] exonTmp : exons){
+ if((thisExon[0] == exonTmp[0]) && (thisExon[1] == exonTmp[1])){
+ exons.removeElement(exonTmp); // removal during iteration is followed directly by break
+ break;
+ }
+ }
+
+ exons.add(new int[] {(Integer)altTranscript[0],gene.stopPos+1});
+ maxExonEnd = gene.stopPos+1;
+ }
+
+ if((Integer) altTranscript[1] == Integer.MAX_VALUE){
+ // this is a transcript end
+ int start = (Integer)altTranscript[4];
+ if(start == Integer.MIN_VALUE){
+ start = gene.startPos;
+ minExonStart = start;
+ }else if(start < minExonStart){
+ minExonStart = start;
+ }
+
+ exons.add(new int[] {start,((Integer)altTranscript[0])});
+ maxExonEnd = ((Integer)altTranscript[0]);
+ }
+ }
+
+
+ return new Object[] {exons,minExonStart,maxExonEnd};
+ }
+
+ /*
+ * the exon looks as if it can be assigned to the transcript, but check the corresponding partner alternatives first
+ */
+
+ public static boolean checkCorrespondingPartner(Object[] transcriptAlt, int[] exon){
+ // An exon fits as long as it overlaps none of the intervals stored at position 5 (touching borders are allowed).
+ if(transcriptAlt.length < 6){
+ return true; // no partner list stored yet
+ }
+ for(int[] partner : ((Vector<int[]>)transcriptAlt[5])){
+ if((exon[1] > partner[0]) && (exon[0] < partner[1])){ // exon and partner interval overlap
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /*
+ * when you add an exon, then make sure that no overlapping exon can be assigned to this transcript by adding it as a fake "transcript" partner
+ */
+
+ public static Object[] addExonAsFakeTranscriptPartner(Object[] transcriptAlt, int[] exon){
+
+ if(transcriptAlt.length >= 6){
+ ((Vector<int[]>) transcriptAlt[5]).add(exon);
+ }else{
+ Vector<int[]> partner_temp = new Vector<int[]>();
+ partner_temp.add(exon);
+ if(transcriptAlt.length >= 5){
+ transcriptAlt = new Object[] {transcriptAlt[0], transcriptAlt[1], transcriptAlt[2], transcriptAlt[3], transcriptAlt[4], partner_temp};
+
+ }else{
+ transcriptAlt = new Object[] {transcriptAlt[0], transcriptAlt[1], transcriptAlt[2], transcriptAlt[3], transcriptAlt[0], partner_temp};
+
+ }
+
+ }
+
+ return transcriptAlt;
+ }
+
+ /*
+ * during final gene candidate extraction, check if there are alternative transcripts that are outside the extraction gene
+ * (might happen due to non supported large introns, where we first start a new transcript but later define the end to be the last position of the "normal" transcript)
+ */
+
+ public static void finalIni_alternativeStartsCheck(Gene gene){
+ // Removes every alternative transcript that starts behind gene.stopPos, has fewer than 6 entries
+ // or carries -1 at position 4.
+
+ int posi = gene.alternativeTranscripts.size()-1;
+ while(posi >= 0){ // backwards, so a removal does not shift the indices still to visit
+ Object[] altTrans = gene.alternativeTranscripts.get(posi);
+ if((((Integer)altTrans[0]) > gene.stopPos) || (altTrans.length < 6) || (((Integer)altTrans[4]).intValue() == -1)){
+ gene.alternativeTranscripts.removeElementAt(posi);
+ }
+ posi--;
+ }
+ }
+
+ /*
+ * final check if by chance we included two identical transcripts
+ */
+
+ public static void eraseEqualTranscripts(Gene gene){
+ // Marks every transcript that agrees with an earlier one in all entries (the lists at position 5 are compared
+ // by their intervals, independent of order) and removes the marked transcripts afterwards.
+
+ Vector<Integer> duplicates = new Vector<Integer>();
+ int numTranscripts = gene.alternativeTranscripts.size();
+
+ for(int i = 0; i < numTranscripts-1; ++i){
+
+ Object[] alt_i = gene.alternativeTranscripts.get(i);
+
+ for(int j = i+1; j < numTranscripts; ++j){
+
+ Object[] alt_j = gene.alternativeTranscripts.get(j);
+ if(alt_i.length != alt_j.length){
+ continue; // different layout, cannot be twins
+ }
+
+ int numSame = 0;
+ for(int pos = 0; pos < alt_i.length; ++pos){
+ if(pos != 5){
+ if(((Integer)alt_i[pos]).intValue() == ((Integer)alt_j[pos]).intValue()){
+ numSame++;
+ }
+ continue;
+ }
+
+ // position 5 holds the partner intervals
+ Vector<int[]> partners_i = ((Vector<int[]>) alt_i[pos]);
+ Vector<int[]> partners_j = ((Vector<int[]>) alt_j[pos]);
+ int samePartner = 0;
+ for(int[] exon : partners_i){
+ for(int[] exon_j : partners_j){
+ if((exon[0] == exon_j[0]) && (exon[1] == exon_j[1])){
+ samePartner++;
+ break;
+ }
+ }
+ }
+
+ if((samePartner == partners_i.size()) && (samePartner == partners_j.size())){
+ numSame++;
+ }
+ }
+
+ if((numSame == alt_i.length) && !duplicates.contains(j)){
+ duplicates.add(j);
+ }
+ }
+ }
+
+
+
+ // remove from the back, so the stored indices stay valid
+ for(int i = numTranscripts-1; i >= 0; i--){
+ if(duplicates.contains(i)){
+ gene.alternativeTranscripts.removeElementAt(i);
+ System.out.println("Removed twin transcript! Gene: " + gene.geneID);
+ }
+ }
+
+ }
+}
diff --git a/src/geneFinder/ExtractGeneCandidates.java b/src/geneFinder/ExtractGeneCandidates.java
new file mode 100755
index 0000000..a663c7c
--- /dev/null
+++ b/src/geneFinder/ExtractGeneCandidates.java
@@ -0,0 +1,2641 @@
+package geneFinder;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.TreeMap;
+import java.util.Vector;
+
+import types.*;
+
+
+/**
+ * extracts high-coverage clusters as potential genes, includes alternative splicing
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ */
+
+public class ExtractGeneCandidates {
+
+
+ // variables for log file
+
+ // All four counters are reset to 0 at the beginning of searchClusters.
+ // NOTE(review): judging by the names they count merged clusters, clusters for which
+ // no start / no stop codon was found at the first attempt, and clusters without a
+ // usable reading frame - confirm against the code that increments them.
+ public static int numMergedClusters;
+ public static int numFoundNoStart_firstTime;
+ public static int numFoundNoStop_firstTime;
+ public static int numNoFrameFound;
+
+ /*
+ * method to analyze the mapping -> extract clusters of high coverage and assign start and stop codons
+ */
+
+ /**
+  * Reads the reference sequence of every mapped contig and starts the cluster search on it.
+  *
+  * The reference file is nameRefFile + ".fa" when GeneFinder.useTopHat is set and
+  * nameRefFile + ".fasta" otherwise. For each key of GeneFinder.mappedContigs the file is
+  * scanned for the first header line whose text after '>' starts with the contig name; the
+  * following lines up to the next header, an empty line or the end of the file form the
+  * contig sequence. The program exits if no sequence is found. After searchClusters
+  * returns, the per-position maps of the contig are emptied to free memory.
+  *
+  * I/O errors are printed as a stack trace and the affected contig is skipped.
+  *
+  * @param nameRefFile path of the reference file without its extension
+  * @return the id handed back by the last searchClusters call (1 if none was made)
+  */
+ public int initializeClusterSearch(String nameRefFile){
+
+     File refFile;
+     if(GeneFinder.useTopHat){
+         refFile = new File(nameRefFile+".fa");
+     }else{
+         refFile = new File(nameRefFile+".fasta");
+     }
+
+     int id = 1;
+
+     for(String contigName : GeneFinder.mappedContigs.keySet()){
+         Contig thisContig = GeneFinder.mappedContigs.get(contigName);
+         StringBuffer contigSeq = new StringBuffer();
+
+         try{
+             BufferedReader br = new BufferedReader(new FileReader(refFile));
+
+             // the reader is only needed while the sequence is extracted, so it is closed
+             // right afterwards (previously it was never closed, leaking one handle per contig)
+             try{
+                 String line;
+
+                 while((line = br.readLine()) != null){
+                     if(line.startsWith(">")){
+                         // test if correct contig
+                         // NOTE(review): this is a prefix match, so a contig name that is a prefix
+                         // of another header (e.g. "chr1" / "chr10") picks whichever comes first
+                         if(line.substring(1).startsWith(contigName)){
+                             // found right one, now extract sequence
+                             while(((line = br.readLine()) != null) && (line.length() != 0) && (!(line.startsWith(">")))){
+                                 // NOTE(review): only the first character decides whether the line is
+                                 // converted; lines that start upper-case are appended unchanged
+                                 if(Character.isLowerCase(line.charAt(0))){
+                                     for(int i = 0;i<line.length();i++){
+                                         contigSeq.append(Character.toUpperCase(line.charAt(i)));
+                                     }
+                                 }else{
+                                     contigSeq.append(line);
+                                 }
+                             }
+                             break;
+                         }
+                     }
+                 }
+             }finally{
+                 br.close();
+             }
+
+             if(contigSeq.length() == 0){
+                 // contig was not found in the reference file
+                 System.out.println("Error, could not find contig " + contigName);
+                 System.exit(0);
+             }
+
+             // now that we have the sequence, search for areas with high coverage
+             Runtime r = Runtime.getRuntime();
+
+             id = searchClusters(thisContig, id, contigSeq);
+
+             // used memory before the cleanup, only needed for the report below
+             double memBef_2 = (r.totalMemory()-r.freeMemory());
+
+             // drop the mapping data of this contig, it is not needed any longer
+             thisContig.positionTOmappingRnas.clear();
+             thisContig.positionTOmappingRnas = new TreeMap<Integer,Vector<Rna>>();
+             thisContig.splicePositions.clear();
+             thisContig.splicePositions = new TreeMap<Integer,Integer>();
+             thisContig.positionTOdiff.clear();
+             thisContig.positionTOdiff = new TreeMap<Integer,Integer>();
+             contigSeq = null;
+             r.gc();
+             r.gc();
+
+             double memAft_2 = (r.totalMemory()-r.freeMemory());
+             if(!GeneFinder.secondPart){
+                 System.out.println("Memory freed = " + (((memBef_2-memAft_2)/1000.0)/1000.0) + "MB");
+                 System.out.println();
+             }
+
+         } catch (FileNotFoundException e) {
+             e.printStackTrace();
+         } catch (IOException e) {
+             e.printStackTrace();
+         }
+     }
+
+     return id;
+
+ }
+
+ /*
+ * searches potential genes on forward and reverse strand, extracts the specific regions respecting reading frame (if possible)
+ */
+
+ public int searchClusters(Contig thisContig, int id, StringBuffer contigSeq){
+
+ if(GeneFinder.noAmbiOpti){
+ HelperFunctions_GeneSearch.removeAmbiHits(thisContig);
+ }
+
+ Iterator<Integer> positionIt = thisContig.positionTOmappingRnas.keySet().iterator();
+
+ if(GeneFinder.iteration == 2 && !GeneFinder.secondPart){
+ System.out.print("Iteration 2. ");
+ WriteOutput.writeToLogFile("Iteration 2. ");
+ }
+ if(!GeneFinder.secondPart){
+ System.out.println("Contig: = " + thisContig.contigName);
+ WriteOutput.writeToLogFile("Contig: = " + thisContig.contigName + "\n\n");
+ }
+
+ int spliceKey = -1; // this is important to always know the next spliceSite without checking all rnas each time
+ if(!thisContig.splicePositions.isEmpty()){
+ spliceKey = thisContig.splicePositions.firstKey();
+ }
+
+ int startPos = 0;
+ int currentPos = 0;
+ int nextPos = 0;
+
+ // variables for log file
+
+ numMergedClusters = 0;
+ numFoundNoStart_firstTime = 0;
+ numFoundNoStop_firstTime = 0;
+ numNoFrameFound = 0;
+
+ if(positionIt.hasNext()){
+ currentPos = positionIt.next();
+ while((spliceKey != -1) && (currentPos > spliceKey) && (thisContig.splicePositions.higherKey(spliceKey) != null)){
+ spliceKey = thisContig.splicePositions.higherKey(spliceKey); //update the key until it appears AFTER the current position
+ }
+ }
+
+ // initialize the coverage handling:
+ Object[] coverageVecAndPos = new Object[3];
+ Vector<Integer> coverageVec = new Vector<Integer>(); // vector is preferred over array because more flexible
+ // first the coverage is zero at all positions:
+ for(int arrPos=0;arrPos<GeneFinder.readLength;++arrPos){
+ coverageVec.add(0);
+ }
+
+ coverageVecAndPos[0] = coverageVec;
+ coverageVecAndPos[1] = currentPos;
+ coverageVecAndPos[2] = -1;
+
+ if(GeneFinder.spliceLim == -1){
+ GeneFinder.spliceLim = GeneFinder.minCoverage;
+ }
+ if(GeneFinder.endCoverage == -1){
+
+ GeneFinder.endCoverage = (1.0/3.0)*GeneFinder.minCoverage - 0.001;
+
+ if(!GeneFinder.secondPart){
+ System.out.println("End coverage estimated from required minimum coverage: " + GeneFinder.endCoverage);
+ WriteOutput.writeToLogFile("End coverage estimated from required minimum coverage: " + GeneFinder.endCoverage + "\n\n");
+ }
+
+ }
+
+ if(GeneFinder.maxCov == -1){
+ GeneFinder.maxCov = Double.MAX_VALUE; // so we always accept the coverage
+ }
+
+ boolean noMoreCluster = false;
+ boolean startedNewCluster = false; // this boolean ensures that also the last cluster is completed once it has been started due to sufficient coverage (otherwise, if map is empty, cluster is not extracted)
+ boolean doNotCountTwice = false; // if true we do not perform the coverageVec update for the first current position (when starting a new cluster) because this has already been done with "nextPos"
+
+ int numIdentifiedClusters = 0; // if = 1, this ends the while loop
+
+ Vector<Gene> temporaryGenes = new Vector<Gene>(); // contains all candidate genes that have been derived by regarding isoforms completely spanned by introns
+
+ Object[] proceedFromIsoStartStuff = new Object[2]; // 1: int , 2: reads
+ proceedFromIsoStartStuff[0] = -1;
+ proceedFromIsoStartStuff[1] = new Vector<Rna>();
+
+ do{
+
+ Gene cluster = new Gene();
+ coverageVecAndPos[2] = -1;
+
+ do{
+
+ startedNewCluster = false;
+
+ cluster.possibleIntrons.clear();
+
+ int currentCompete = -1; // stores the current alternative interval, is only -1,-1 if no alternatives exist for current position
+ int currentCompeteStart = -1; // defines the first split determining the currentCompete, necessary to not extract exons, that overlap with currentCompete region
+
+ Vector<Rna> rnasThatDoNotSupportAnySplit = new Vector<Rna>(); // contains all rnas that do not support any splits
+ int localExonEnd = -1;
+ int localSpliceSite = -1;
+ int searchTranscriptStart = -1; // indicates whether a transcript starts within an intron
+ int inTranscriptMode = -1; // indicates if we currently also respect a newly opened transcript
+ int inNormalTranscript = -1; // indicates whether we closed a new transcript within an intron or with an overlap to the normal transcript
+ int endToPassForclosedGenes = -1; // when we have to close a gene within a currentCompete interval, it is important to grab the last included position as the end
+ boolean chooseAlternativeEndInstead = false;
+
+ int acceptReads = 0;
+ Vector<Rna> isoformRnas = new Vector<Rna>();
+ boolean goToIni = false;
+
+ TreeMap<Integer,Vector<Integer>> posiCovMap = new TreeMap<Integer,Vector<Integer>>(); // for each intron, the coverage add after begin and end ist stored
+
+ int diff = 0;
+ int considerSpliceSiteForCovUpdate = -1; // with alternative splicing, all positions after splice site would could twice, so take it into account
+
+ if(!doNotCountTwice){
+
+ if(!(spliceKey < 0) && ((currentPos+GeneFinder.readLength)-spliceKey >= 0) ){ // note,old: && (thisContig.splicePositions.get(spliceKey) >= GeneFinder.spliceLim)
+
+ // find splits associated with this site, only take introns into account that are supported by a sufficient number of reads
+
+ double limit = 1; // formerly geneFinder.spliceLim
+ Object[] returnObject = IntronExonSearch.findIntrons_RespectAlternative(cluster, thisContig, spliceKey, currentPos, currentCompete, posiCovMap,limit);
+
+ currentCompete = (Integer) returnObject[0];
+ if(currentCompete != -1){
+ if(spliceKey < currentCompete){
+ currentCompeteStart = spliceKey;
+ }
+ }else{
+ currentCompeteStart = -1;
+ }
+
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnObject[1];
+ rnasThatDoNotSupportAnySplit.addAll((Vector<Rna>) returnObject[2]);
+
+ if(thisContig.splicePositions.get(spliceKey) >= GeneFinder.spliceLim){
+ considerSpliceSiteForCovUpdate = spliceKey;
+ }else{
+ Vector<Integer> otherSpliceSites = ((Vector<Integer>) returnObject[3]);
+ if(!otherSpliceSites.isEmpty()){
+ int minSite = Integer.MAX_VALUE;
+ for(int site : otherSpliceSites){
+ if(site < minSite){
+ minSite = site;
+ }
+ }
+ considerSpliceSiteForCovUpdate = minSite;
+ }
+ }
+
+ }
+
+ int covPlus = thisContig.positionTOmappingRnas.get(currentPos).size();
+
+ Object[] returnValues = updateCoverageInterval_respectAlternatives(thisContig,covPlus,currentPos,coverageVecAndPos,posiCovMap,considerSpliceSiteForCovUpdate);
+
+ coverageVecAndPos = (Object[]) returnValues[0];
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnValues[1];
+
+ if(thisContig.positionTOdiff.keySet().contains(currentPos)){
+ diff = thisContig.positionTOdiff.get(currentPos); // if there occurred insertions or deletions before this positions add/subtract the difference
+ }
+
+ }else{
+ if(!(spliceKey < 0) && ((currentPos+GeneFinder.readLength)-spliceKey >= 0)){ // note: old : && (thisContig.splicePositions.get(spliceKey) >= GeneFinder.spliceLim)
+
+ // we have a "valid" splice site within the next covered interval, so regard this in the update of the coverage vector
+ double limit = 1;
+ Object[] returnObject = IntronExonSearch.findIntrons_RespectAlternative(cluster, thisContig, spliceKey, currentPos, currentCompete, posiCovMap,limit);
+
+ currentCompete = (Integer) returnObject[0];
+
+ if(currentCompete != -1){
+ if(spliceKey < currentCompete){
+ currentCompeteStart = spliceKey;
+ }
+ }else{
+ currentCompeteStart = -1;
+ }
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnObject[1];
+ rnasThatDoNotSupportAnySplit.addAll((Vector<Rna>) returnObject[2]);
+
+ if(thisContig.splicePositions.get(spliceKey) >= GeneFinder.spliceLim){
+ considerSpliceSiteForCovUpdate = spliceKey;
+ }else{
+ Vector<Integer> otherSpliceSites = ((Vector<Integer>) returnObject[3]);
+ if(!otherSpliceSites.isEmpty()){
+ int minSite = Integer.MAX_VALUE;
+ for(int site : otherSpliceSites){
+ if(site < minSite){
+ minSite = site;
+ }
+ }
+ considerSpliceSiteForCovUpdate = minSite;
+ }
+ }
+
+ }
+ }
+
+ int[] normalExonAndDifferenceToIt = {-1,-1,-1}; // if an alternative transcript start merges with the normal exon, make sure that the difference between alternative and transcript is < readLength, so remember diff here
+
+ boolean startWithGene = false;
+ if(((Integer)proceedFromIsoStartStuff[0]).intValue() != -1){
+ startWithGene = true;
+ goToIni = false;
+ }else if((((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff >= GeneFinder.minCoverage) && (((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff < GeneFinder.maxCov)){
+ startWithGene = true;
+ }
+
+ if(startWithGene){
+
+ double averageExonCov = 0; // is updated with every new read start and reflects the average coverage for the current exon
+ int currentExonLength = 0; // length from start of the currentExon until current position
+ int numReadsExon = 0; // reads mapping this exon (intron read-ending are handled differently - average of posiCovMap added)
+
+ if(((Integer)proceedFromIsoStartStuff[0]).intValue() != -1){
+ startPos = ((Integer)proceedFromIsoStartStuff[0]).intValue();
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,(Vector<Rna>)proceedFromIsoStartStuff[1]);
+
+ numReadsExon += ((Vector<Rna>)proceedFromIsoStartStuff[1]).size();
+
+ proceedFromIsoStartStuff[0] = -1;
+ ((Vector<Rna>)proceedFromIsoStartStuff[1]).clear();
+ }else{
+ startPos = (Integer)coverageVecAndPos[2]; // potential start of a new cluster begins at currentPos - bases covered by already present rnas
+ if((Integer)coverageVecAndPos[2] == -1){
+ startPos = currentPos;
+ }
+
+ int pos_temp = (Integer)coverageVecAndPos[2];
+
+ if((Integer)coverageVecAndPos[2] != -1){
+ do{
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,thisContig.positionTOmappingRnas.get(pos_temp));
+ numReadsExon += thisContig.positionTOmappingRnas.get(pos_temp).size();
+
+ if(thisContig.positionTOmappingRnas.higherKey(pos_temp) != null){
+ pos_temp = thisContig.positionTOmappingRnas.higherKey(pos_temp);
+ }else{
+ pos_temp = currentPos;
+ }
+ }while((pos_temp != currentPos) && !(pos_temp > currentPos));
+ }
+ }
+
+ int posStartExon = startPos;
+ currentExonLength = (currentPos - startPos) + GeneFinder.readLength;
+
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,thisContig.positionTOmappingRnas.get(currentPos));
+ numReadsExon += thisContig.positionTOmappingRnas.get(currentPos).size();
+
+ averageExonCov = (double)((double)(numReadsExon * GeneFinder.readLength)/(double)currentExonLength);
+
+ startedNewCluster = true;
+
+ Gene fakeGene = new Gene(); // this is the pseudo gene used to account for different isoforms
+
+ while(positionIt.hasNext()){
+
+ nextPos = positionIt.next();
+ acceptReads = -1;
+
+ while(!(spliceKey < 0) && (nextPos > spliceKey)){
+
+ if((localExonEnd != -1) && (localSpliceSite != -1)){
+ // update fussy exon for last local splice site, take the split as the end
+ if(cluster.possibleIntrons.containsKey(localSpliceSite)){
+ cluster.possibleIntrons.get(localSpliceSite)[2] = spliceKey;
+ cluster.possibleIntrons.get(localSpliceSite)[3] = 0;
+ localExonEnd = -1;
+ }else{
+ System.err.println("Splice site not included!");
+ }
+ }
+
+
+ double limit = GeneFinder.spliceLim; // according to average exon coverage specify a threshold that has to be exceeded
+ int rivalNum = 0; // number of competitors at this site (they "share" the overall coverage)
+
+ if(cluster.possibleFussyExons.containsKey(spliceKey)){
+ rivalNum++;
+ }
+ if((cluster.possibleIntrons.containsKey(spliceKey)) && (cluster.possibleIntrons.get(spliceKey)[0] != null)){
+ int intronNum = ((Vector<int[]>) cluster.possibleIntrons.get(spliceKey)[0]).size();
+ rivalNum += intronNum;
+ }
+
+ if(rivalNum > 0){
+ limit = ((double)(averageExonCov/(double)rivalNum))*0.1;
+ }
+
+ Object[] returnCheck = IntronExonSearch.checkIntronsAfterSpliceSiteSwitch(cluster,spliceKey,currentCompete, posiCovMap,nextPos,limit);
+
+ boolean spliceSiteNotContained = (Boolean) returnCheck[3];
+ if(!spliceSiteNotContained){
+ currentCompete = (Integer) returnCheck[0]; // if splice site not contained, this can mean that it was simply not above the limit or that it has been included in fake gene
+ }
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnCheck[1];
+ boolean switchedFromFussyToExon = (Boolean) returnCheck[2];
+
+ if(currentCompete != -1){
+ if(!switchedFromFussyToExon && spliceKey < currentCompete && (currentPos > currentCompeteStart) && (thisContig.splicePositions.get(spliceKey) >= GeneFinder.spliceLim)){
+ currentCompeteStart = spliceKey;
+ }
+ }else{
+ currentCompeteStart = -1;
+ }
+
+ if(!switchedFromFussyToExon && (rnasThatDoNotSupportAnySplit.size() != 0)){
+ Object[] returnVal = checkIfNonSplitRnasAreSufficient(cluster, rnasThatDoNotSupportAnySplit, spliceKey, currentPos,posiCovMap,limit);
+ localExonEnd = (Integer) returnVal[0];
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnVal[1];
+ localSpliceSite = (Integer) returnVal[2]; // is only set to local splice site if the non split rnas are kept
+ }else{
+ localSpliceSite = -1;
+ localExonEnd = -1;
+ }
+
+ rnasThatDoNotSupportAnySplit.clear();
+
+ if((thisContig.splicePositions.higherKey(spliceKey) == null)){
+ spliceKey = -1;
+ }else{
+ spliceKey = thisContig.splicePositions.higherKey(spliceKey);
+ }
+
+ considerSpliceSiteForCovUpdate = -1;
+ if(inNormalTranscript != -1){
+ int[] intronEndFound = HelperFunctions_GeneSearch.checkIfNextPosMergesWithPreviousIsoform(cluster,nextPos,inNormalTranscript);
+ if((intronEndFound[0] != -1) && (intronEndFound[1] == 1)){
+ inNormalTranscript = -1;
+ }
+ }
+ }
+
+ if(!(spliceKey < 0 ) && ((nextPos+GeneFinder.readLength)-spliceKey > 0)){
+
+ acceptReads = 1; // indicates that we deal with a split, so make sure that all rnas are included in normal candidate if we proceed
+ if((inTranscriptMode != -1) || (searchTranscriptStart != -1) || (inNormalTranscript != -1)){
+ // test in general if we merged with normal transcript, if not, break
+
+ int[] intronEndFound = HelperFunctions_GeneSearch.checkIfNextPosMergesWithPreviousIsoform(cluster,nextPos,inTranscriptMode);
+
+ if(intronEndFound[0] == -1){
+
+ // check if split is leading to a normal exon
+
+ Vector<Integer> theseEnds = IntronExonSearch.findIntronEnds(spliceKey,nextPos,thisContig);
+
+ acceptReads = -1; // if -1 we do not accept
+
+ for(int thisEnd : theseEnds){
+ int[] intronEndMerges = HelperFunctions_GeneSearch.checkIfIntronEndsInNormalTranscript(cluster,thisEnd,inTranscriptMode);
+
+ if(intronEndMerges[0] != -1){
+ acceptReads = 1;
+ break;
+ }
+ }
+
+
+ if(acceptReads == -1){
+
+ if(inTranscriptMode != -1){
+
+ // make sure that the fake gene gets a chance in next extraction
+
+ proceedFromIsoStartStuff[0] = inTranscriptMode;
+ ((Vector<Rna>)proceedFromIsoStartStuff[1]).addAll(isoformRnas);
+ fakeGene = new Gene();
+ isoformRnas.clear();
+ }
+
+ // we do not want to proceed with this candidate, but rather close it at endToPassForClosedGenes
+
+ if(endToPassForclosedGenes != -1){
+ chooseAlternativeEndInstead = true;
+ }
+
+ goToIni = true;
+ }
+
+ }
+ }
+
+ if(!goToIni){
+ // find splits associated with this site, only take introns into account that are supported by a sufficient number of reads
+
+ Object[] returnObject = new Object[4];
+
+ double limit = 1;
+ returnObject = IntronExonSearch.findIntrons_RespectAlternative(cluster, thisContig, spliceKey, nextPos, currentCompete, posiCovMap,limit);
+ currentCompete = (Integer) returnObject[0];
+ rnasThatDoNotSupportAnySplit.addAll((Vector<Rna>) returnObject[2]);
+
+ if(currentCompete != -1){
+ if(spliceKey < currentCompete && (currentPos > currentCompeteStart)){
+ currentCompeteStart = spliceKey;
+ }
+ }else{
+ currentCompeteStart = -1;
+ }
+
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnObject[1];
+
+ if(thisContig.splicePositions.get(spliceKey) >= GeneFinder.spliceLim){
+ considerSpliceSiteForCovUpdate = spliceKey;
+ }else{
+ Vector<Integer> otherSpliceSites = ((Vector<Integer>) returnObject[3]);
+ if(!otherSpliceSites.isEmpty()){
+ int minSite = Integer.MAX_VALUE;
+ for(int site : otherSpliceSites){
+ if(site < minSite){
+ minSite = site;
+ }
+ }
+ considerSpliceSiteForCovUpdate = minSite;
+ }
+ }
+ }
+
+ }
+
+ int covPlusNext = thisContig.positionTOmappingRnas.get(nextPos).size();
+
+ Object[] returnValues = updateCoverageInterval_respectAlternatives(thisContig,covPlusNext,nextPos,coverageVecAndPos,posiCovMap,considerSpliceSiteForCovUpdate);
+
+ coverageVecAndPos = (Object[]) returnValues[0];
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnValues[1];
+
+ if(thisContig.positionTOdiff.keySet().contains(nextPos)){
+ diff = thisContig.positionTOdiff.get(nextPos); // if there occurred insertions or deletions before this positions add/subtract the difference
+ }else{
+ diff = 0;
+ }
+
+ if(!goToIni && ((nextPos - currentPos) <= (GeneFinder.readLength)) && (((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff > GeneFinder.endCoverage) && (((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff < GeneFinder.maxCov)){
+
+ if((inTranscriptMode != -1) || (searchTranscriptStart != -1)){
+ int[] intronEndFound = HelperFunctions_GeneSearch.checkIfNextPosExceedsIntronEnd(cluster,nextPos,inTranscriptMode);
+
+ if(searchTranscriptStart != -1){
+ if((intronEndFound[0] == -1) && (((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff >= GeneFinder.minCoverage)){
+ // start new isoform now
+
+
+ inTranscriptMode = searchTranscriptStart;
+
+ fakeGene = new Gene();
+ fakeGene.startPos = searchTranscriptStart;
+ if(inNormalTranscript != -1){
+ if(searchTranscriptStart > inNormalTranscript){
+ inNormalTranscript = searchTranscriptStart;
+ }
+ }else{
+ inNormalTranscript = searchTranscriptStart;
+ }
+
+ searchTranscriptStart = -1;
+
+ }else if(intronEndFound[0] != -1){
+ // already passed the "normal" transcript, so forget about alternative one
+ searchTranscriptStart = -1;
+ isoformRnas.clear();
+ fakeGene = new Gene();
+ if(intronEndFound[1] == 1){
+ inNormalTranscript = -1;
+ }
+ }
+ }
+ if((inTranscriptMode != -1) && (intronEndFound[0] != -1)){
+
+ if((normalExonAndDifferenceToIt[0] == intronEndFound[0]) && (normalExonAndDifferenceToIt[1] == -1)){
+
+ int[] transcriptStarts = FrameSearch.lookForStartOfIsoform(inTranscriptMode,contigSeq);
+ cluster.alternativeTranscripts.add(new Object[] {inTranscriptMode,inTranscriptMode, transcriptStarts[0],transcriptStarts[1]});
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,isoformRnas);
+
+ isoformRnas.clear();
+ fakeGene = new Gene();
+
+ IntronExonSearch.replaceEntryInAltTransWithUpdatedOne_endTranscript(cluster, inTranscriptMode, currentPos, normalExonAndDifferenceToIt[2]);
+ normalExonAndDifferenceToIt[0] = -1;
+ normalExonAndDifferenceToIt[1] = -1;
+ normalExonAndDifferenceToIt[2] = -1;
+ }else{
+
+ // this time we do not need to extract a fake gene
+ int[] transcriptStarts = FrameSearch.lookForStartOfIsoform(inTranscriptMode,contigSeq);
+ cluster.alternativeTranscripts.add(new Object[] {inTranscriptMode,inTranscriptMode, transcriptStarts[0],transcriptStarts[1]});
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,isoformRnas);
+
+ isoformRnas.clear();
+ fakeGene = new Gene();
+
+ IntronExonSearch.replaceEntryInAltTransWithUpdatedOne(cluster, inTranscriptMode, currentPos, intronEndFound[0],nextPos);
+ }
+
+ inTranscriptMode = -1;
+ searchTranscriptStart = -1;
+ if(intronEndFound[1] == 1){
+ inNormalTranscript = -1;
+ }
+ }
+
+ }
+
+ if(inNormalTranscript != -1){
+ int[] intronEndFound = HelperFunctions_GeneSearch.checkIfNextPosMergesWithPreviousIsoform(cluster,nextPos,inNormalTranscript);
+ if((intronEndFound[0] != -1) && (intronEndFound[1] == 1)){
+ inNormalTranscript = -1;
+ isoformRnas.clear();
+ fakeGene = new Gene();
+ }
+ }
+
+ if((searchTranscriptStart == -1) && (inTranscriptMode == -1) && (inNormalTranscript == -1)){
+ endToPassForclosedGenes = nextPos;
+ }
+
+ currentPos = nextPos;
+
+ if(currentPos >= currentCompete){
+ currentCompete = -1;
+ currentCompeteStart = -1;
+ }
+
+ if(((searchTranscriptStart != -1) || (inTranscriptMode != -1) || (inNormalTranscript != -1)) && (acceptReads != 1)){
+
+ isoformRnas.addAll(thisContig.positionTOmappingRnas.get(currentPos));
+ }else{
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,thisContig.positionTOmappingRnas.get(currentPos));
+ }
+
+ currentExonLength = (currentPos - posStartExon) + GeneFinder.readLength;
+
+ numReadsExon += thisContig.positionTOmappingRnas.get(currentPos).size();
+
+ averageExonCov = (double)((double)(numReadsExon * GeneFinder.readLength)/(double)currentExonLength);
+
+
+ if(localExonEnd != -1){
+ localExonEnd = currentPos + GeneFinder.readLength;
+ }
+
+ }else{
+ boolean closeGene = false;
+
+ if(goToIni){
+ currentCompete = -1;
+ currentCompeteStart = -1;
+ if(endToPassForclosedGenes != -1){
+ chooseAlternativeEndInstead = true;
+ }
+
+ closeGene = true;
+ }else{
+
+ int basesToAddForOverlap = 0;
+ int positionPosiCovMap = HelperFunctions_GeneSearch.findIntronNearNextPos(cluster,nextPos);
+
+ if(posiCovMap.containsKey(positionPosiCovMap)){
+ basesToAddForOverlap = posiCovMap.get(positionPosiCovMap).size();
+ }
+
+ if(inTranscriptMode != -1){
+ int[] intronEndFound = HelperFunctions_GeneSearch.checkIfNextPosExceedsIntronEnd(cluster,nextPos,inTranscriptMode);
+ if(intronEndFound[0] != -1){
+ if(((nextPos - currentPos) <= (GeneFinder.readLength))){
+ // put isoform to normal gene candidate
+ int[] transcriptStarts = FrameSearch.lookForStartOfIsoform(inTranscriptMode,contigSeq);
+ cluster.alternativeTranscripts.add(new Object[] {inTranscriptMode,inTranscriptMode, transcriptStarts[0],transcriptStarts[1]});
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,isoformRnas);
+
+ isoformRnas.clear();
+ fakeGene = new Gene();
+
+ IntronExonSearch.replaceEntryInAltTransWithUpdatedOne(cluster, inTranscriptMode, currentPos, intronEndFound[0],nextPos);
+ }else{
+ // put isoform as fake gene aside
+ fakeGene.startPos = inTranscriptMode;
+ HelperFunctions_GeneSearch.addRnasFromVector(fakeGene,isoformRnas);
+ fakeGene.geneID = id++;
+ fakeGene.stopPos = currentPos + GeneFinder.readLength;
+ temporaryGenes.add(fakeGene);
+
+ fakeGene = new Gene();
+ isoformRnas.clear();
+
+ }
+
+ inTranscriptMode = -1;
+ searchTranscriptStart = -1;
+
+ if(intronEndFound[1] == 1){
+ inNormalTranscript = -1;
+ }
+ }else if(positionPosiCovMap != -1){
+ normalExonAndDifferenceToIt[0] = positionPosiCovMap;
+ if(((nextPos - currentPos) <= (GeneFinder.readLength))){
+ normalExonAndDifferenceToIt[1] = 0;
+ }else{
+ normalExonAndDifferenceToIt[1] = -1;
+ normalExonAndDifferenceToIt[2] = currentPos; // indicates the end that we want
+ }
+
+ }
+ }
+
+ if((currentCompeteStart != -1 && ((currentCompeteStart - currentPos) < GeneFinder.readLength)) && ((currentCompete + basesToAddForOverlap + 1) >= nextPos) && (((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff < GeneFinder.maxCov)){ // old: ((currentCompete+GeneFinder.readLength+1) >= nextPos)
+ // +1 because currentCompete defines the intron end
+ // have to go on because we are still within the cluster, so just decide if this part shall be included or not
+
+ int goOn = 0;
+ int spliceStartAfterCurrentPos = -1;
+ if((nextPos - currentPos) > (GeneFinder.readLength)){
+
+ int[] returnValuesCheckGoOn = new int[3];
+
+ if((inTranscriptMode != -1) || (searchTranscriptStart != -1) || (inNormalTranscript != -1)){
+ returnValuesCheckGoOn = IntronExonSearch.checkIfIntronSupported_fakeGene(cluster, currentPos, nextPos, posiCovMap, inTranscriptMode,searchTranscriptStart,endToPassForclosedGenes,inNormalTranscript);
+ goOn = returnValuesCheckGoOn[0];
+
+ if(goOn == -1){
+ // check for isos that should be closed or not
+ goOn = 0;
+ }
+ }else{
+ returnValuesCheckGoOn = IntronExonSearch.checkIfIntronSupported(cluster, currentPos, nextPos, posiCovMap, inTranscriptMode,searchTranscriptStart,endToPassForclosedGenes,inNormalTranscript);
+ goOn = returnValuesCheckGoOn[0];
+ if(goOn == 2){
+ // alternative transcript erased, adapt modi
+ goOn = 0;
+ searchTranscriptStart = -1;
+ inTranscriptMode = -1;
+ inNormalTranscript = -1;
+
+ }
+ if(goOn == 3){
+ goOn = -1;
+ }
+
+ }
+
+ endToPassForclosedGenes = returnValuesCheckGoOn[1];
+ spliceStartAfterCurrentPos = returnValuesCheckGoOn[2];
+
+ }
+
+ if(inNormalTranscript != -1){ // TODO: should not be necessary any longer! wrong here? because we already check with inTranscriptMode?
+ int[] intronEndFound = HelperFunctions_GeneSearch.checkIfNextPosExceedsIntronEnd(cluster,nextPos,inNormalTranscript);
+ if((intronEndFound[0] != -1) && (intronEndFound[1] == 1)){
+
+ if((nextPos - currentPos) > (GeneFinder.readLength)){
+
+ fakeGene = new Gene();
+ isoformRnas.clear();
+
+ inNormalTranscript = -1;
+ }else{
+ int[] transcriptStarts = FrameSearch.lookForStartOfIsoform(inTranscriptMode,contigSeq);
+ cluster.alternativeTranscripts.add(new Object[] {inTranscriptMode,inTranscriptMode, transcriptStarts[0],transcriptStarts[1]});
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,isoformRnas);
+
+ isoformRnas.clear();
+ fakeGene = new Gene();
+
+ IntronExonSearch.replaceEntryInAltTransWithUpdatedOne(cluster, inTranscriptMode, currentPos, intronEndFound[0],nextPos);
+ inNormalTranscript = -1;
+ }
+
+ }
+ }
+
+ if(positionPosiCovMap == -1 && ((nextPos - currentPos) > (GeneFinder.readLength)) && (nextPos < currentCompete)){
+ // indicates a potential transcript start
+
+ if(searchTranscriptStart != -1){
+ isoformRnas.clear();
+ fakeGene = new Gene();
+ }
+ if(!((inTranscriptMode != -1) && ((nextPos - currentPos) <= (2*GeneFinder.readLength)))){
+ // for alternatives, we do accept greater differences due to coverage inconsistencies
+
+ if(inTranscriptMode != -1){
+ // extract fake gene as a new potential candidate
+ fakeGene.startPos = inTranscriptMode;
+ HelperFunctions_GeneSearch.addRnasFromVector(fakeGene,isoformRnas);
+ fakeGene.geneID = id++;
+ fakeGene.stopPos = currentPos + GeneFinder.readLength;
+ temporaryGenes.add(fakeGene);
+
+ fakeGene = new Gene();
+ isoformRnas.clear();
+ }
+
+ searchTranscriptStart = nextPos;
+ inTranscriptMode = -1;
+
+ }
+ }
+
+ if(goOn != -1){
+
+ if((nextPos - currentPos) > (GeneFinder.readLength)){
+
+ int intronEnd = HelperFunctions_GeneSearch.findIntronNearNextPos(cluster, nextPos);
+
+ if(intronEnd == -1){
+ if((nextPos - currentPos) > (2*GeneFinder.readLength)){
+ posStartExon = nextPos;
+ numReadsExon = 0;
+ averageExonCov = 0;
+ }
+ }else{
+
+ if(intronEnd > nextPos){
+ intronEnd = nextPos;
+ }
+
+ posStartExon = intronEnd;
+ numReadsExon = 0;
+ averageExonCov = 0;
+ }
+
+ }
+
+ if(goOn == 0){
+ if(((searchTranscriptStart != -1) || (inTranscriptMode != -1) || (inNormalTranscript != -1)) && (acceptReads != 1)){
+ isoformRnas.addAll(thisContig.positionTOmappingRnas.get(currentPos));
+ }else{
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,thisContig.positionTOmappingRnas.get(nextPos));
+ }
+
+ currentExonLength = (nextPos - posStartExon) + GeneFinder.readLength;
+
+ numReadsExon += thisContig.positionTOmappingRnas.get(nextPos).size();
+
+ averageExonCov = (double)((double)(numReadsExon * GeneFinder.readLength)/(double)currentExonLength);
+
+ }
+
+ if(((nextPos - currentPos) > (GeneFinder.readLength)) || ((((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff < GeneFinder.endCoverage))){
+
+ if(rnasThatDoNotSupportAnySplit.size() > 0){
+ double limit = GeneFinder.spliceLim;
+ Object[] returnVal = checkIfNonSplitRnasAreSufficient(cluster, rnasThatDoNotSupportAnySplit, spliceKey, currentPos, posiCovMap, limit);
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnVal[1];
+ rnasThatDoNotSupportAnySplit.clear();
+ }
+
+ }
+
+ if(((nextPos - currentPos) > (GeneFinder.readLength)) && localExonEnd != -1 && localSpliceSite != -1 && (cluster.possibleIntrons.get(localSpliceSite)[0] != null)){
+ cluster.possibleIntrons.get(localSpliceSite)[2] = currentPos + GeneFinder.readLength;
+ if(spliceStartAfterCurrentPos == -1){
+ cluster.possibleIntrons.get(localSpliceSite)[3] = -2;
+ }
+ localExonEnd = -1;
+ localSpliceSite = -1;
+ }
+
+ if((nextPos - currentPos) > (GeneFinder.readLength)){
+ for(int position : posiCovMap.keySet()){
+ if(currentPos < (position-GeneFinder.readLength) && nextPos > (position+GeneFinder.readLength)){
+ // had no chance to look at it, so create fake intron to also regard this exon and close
+ IntronExonSearch.searchCompleteIntronForEndingAndMakeNewIsoform(cluster, position,posiCovMap);
+ }
+ }
+ }
+
+ currentPos = nextPos;
+
+ if(currentPos >= currentCompete){
+ currentCompete = -1;
+ currentCompeteStart = -1;
+ }
+ }else{
+
+ currentCompete = -1;
+ currentCompeteStart = -1;
+ if(endToPassForclosedGenes != -1){
+ chooseAlternativeEndInstead = true;
+ }
+
+ closeGene = true;
+ }
+
+ }else{
+ closeGene = true;
+ }
+ }
+
+ if(closeGene){
+
+ int basesToAdd = GeneFinder.readLength;
+
+ if((localExonEnd != -1) && (localSpliceSite != -1)){
+ // update fussy exon for last local splice site, take the split as the end
+ if(cluster.possibleIntrons.containsKey(localSpliceSite)){
+ cluster.possibleIntrons.get(localSpliceSite)[2] = (currentPos+basesToAdd);
+ cluster.possibleIntrons.get(localSpliceSite)[3] = -2;
+ }else{
+ System.err.println("Splice site not included!");
+ }
+ }
+
+ if(chooseAlternativeEndInstead){
+ currentPos = endToPassForclosedGenes;
+ }
+
+ if(((currentCompeteStart != -1 && ((currentCompeteStart - currentPos) < GeneFinder.readLength))) && currentCompete > currentPos){
+
+ if((nextPos - currentPos) > (GeneFinder.readLength)){
+ for(int position : posiCovMap.keySet()){
+ if(currentPos < (position-GeneFinder.readLength) && nextPos > (position+GeneFinder.readLength) && (position != currentCompete)){
+ // had no chance to look at it, so create fake intron to also regard this exon and close
+ IntronExonSearch.searchCompleteIntronForEndingAndMakeNewIsoform(cluster, position,posiCovMap);
+ }
+ }
+ }
+
+ if((currentCompeteStart != -1) && ((currentCompeteStart - currentPos) < GeneFinder.readLength) && !((currentCompete + basesToAdd) >= nextPos)){
+ IntronExonSearch.checkIfIntronSupported_forGeneIni(cluster, currentPos, nextPos, posiCovMap, inTranscriptMode,searchTranscriptStart,endToPassForclosedGenes);
+ }
+
+ currentPos = currentCompete + 1; // nextPos is bigger than split end, but we have to consider the split anyway
+
+ // look new number up in posiCovMap
+
+ if(posiCovMap.containsKey(currentCompete)){
+ basesToAdd = posiCovMap.get(currentCompete).size();
+ }
+ }
+
+
+
+ // extract this cluster, interval [startPos,currentPos+readLength]
+
+ id = clustIni(cluster, thisContig, contigSeq, startPos, id, currentPos,basesToAdd);
+
+ numIdentifiedClusters++;
+ currentPos = nextPos; // now nextPos is potential new start
+
+ doNotCountTwice = true;
+ break;
+ }
+ }
+ }
+
+ }else{
+ if(positionIt.hasNext()){
+ currentPos = positionIt.next();
+ doNotCountTwice = false;
+
+ cluster.idTOassociatedRnas.clear();
+ cluster.possibleIntrons.clear();
+
+ while(!(spliceKey < 0) && (currentPos > spliceKey)){
+
+ if((thisContig.splicePositions.higherKey(spliceKey) == null)){
+ spliceKey = -1;
+ }else{
+ spliceKey = thisContig.splicePositions.higherKey(spliceKey);
+ }
+ }
+
+ }else{
+ break;
+ }
+ }
+
+ if(!positionIt.hasNext() && (numIdentifiedClusters < 1) && startedNewCluster){ // to grab very last cluster
+
+ int basesToAdd = GeneFinder.readLength;
+
+ if((localExonEnd != -1) && (localSpliceSite != -1)){
+ // update fussy exon for last local splice site, take the split as the end
+ if(cluster.possibleIntrons.containsKey(localSpliceSite)){
+ cluster.possibleIntrons.get(localSpliceSite)[2] = (currentPos+basesToAdd);
+ cluster.possibleIntrons.get(localSpliceSite)[3] = -2;
+ }else{
+ System.err.println("Splice site not included!");
+ }
+ }
+
+ if(chooseAlternativeEndInstead){
+ currentPos = endToPassForclosedGenes;
+ }
+
+ if(((currentCompeteStart != -1 && ((currentCompeteStart - currentPos) < GeneFinder.readLength))) && currentCompete > currentPos){
+
+ if((nextPos - currentPos) > (GeneFinder.readLength)){
+ for(int position : posiCovMap.keySet()){
+ if(currentPos < (position-GeneFinder.readLength) && nextPos > (position+GeneFinder.readLength) && (position != currentCompete)){
+ // had no chance to look at it, so create fake intron to also regard this exon and close
+ IntronExonSearch.searchCompleteIntronForEndingAndMakeNewIsoform(cluster, position,posiCovMap);
+ }
+ }
+ }
+
+ if((currentCompeteStart != -1) && ((currentCompeteStart - currentPos) < GeneFinder.readLength) && !((currentCompete + basesToAdd) >= nextPos)){
+ IntronExonSearch.checkIfIntronSupported_forGeneIni(cluster, currentPos, nextPos, posiCovMap, inTranscriptMode,searchTranscriptStart,endToPassForclosedGenes);
+ }
+
+ currentPos = currentCompete + 1; // nextPos is bigger than split end, but we have to consider the split anyway
+
+ // look new number up in posiCovMap
+
+ if(posiCovMap.containsKey(currentCompete)){
+ basesToAdd = posiCovMap.get(currentCompete).size();
+ }
+ }
+
+ id = clustIni(cluster, thisContig, contigSeq, startPos, id, currentPos,basesToAdd);
+
+ numIdentifiedClusters++;
+ }
+
+ }while(positionIt.hasNext() && (numIdentifiedClusters < 1));
+
+ if(!positionIt.hasNext() && (numIdentifiedClusters < 1)){
+ // reached end of reference sequence, no cluster has been extracted this time, so stop
+ noMoreCluster = true;
+ break;
+ }
+
+ // now that we found the high-coverage area, search for start and stop codons
+
+ int possibleStart_FO = FrameSearch.findPossibleStarts_Forward(cluster, contigSeq, 0, (int) (cluster.startPos+3)); // now the start positions are directly the right ones
+ int possibleStart_RE = FrameSearch.findPossibleStarts_Reverse(cluster,contigSeq,0,(int) (cluster.startPos+3));
+
+ boolean foundNoStart = false;
+ boolean useClusterBef = false;
+ boolean doWithoutStart = false;
+
+ Gene clusterBef = null;
+ if(thisContig.allGenes.size() != 0){
+ clusterBef = thisContig.allGenes.get(thisContig.allGenes.size()-1);
+ }
+
+ if(((possibleStart_FO == -1) && (possibleStart_RE == -1)) || (clusterBef != null && !clusterBef.hasStop_temp) || (clusterBef != null && clusterBef.twinNode != null && !clusterBef.twinNode.hasStop_temp)){
+ if((possibleStart_FO == -1) && (possibleStart_RE == -1)){
+ numFoundNoStart_firstTime++;
+ }
+ if(thisContig.allGenes.size() == 0){
+ doWithoutStart = true;
+ }else{
+ boolean overlaps = false;
+ boolean overlapsTwin = false;
+
+ int tolerance = GeneFinder.interval;
+ if(tolerance == -1){
+ tolerance = GeneFinder.readLength;
+ }
+
+ if((cluster.startPos <= clusterBef.stopPos+1) || (cluster.startPos - clusterBef.stopPos+1 <= tolerance)){
+ overlaps = true;
+ }
+ if(clusterBef.twinNode != null){
+ if((cluster.startPos <= clusterBef.twinNode.stopPos+1) || (cluster.startPos - clusterBef.twinNode.stopPos+1 <= tolerance)){
+ overlapsTwin = true;
+ }
+ }
+
+ // combine current cluster with the one before
+ if(overlaps || overlapsTwin){
+ useClusterBef = true;
+ }else if((clusterBef.hasStop_temp) || (clusterBef.twinNode != null && clusterBef.twinNode.hasStop_temp)){
+ doWithoutStart = true;
+ }else{
+
+ clusterBef.hasStop_temp = true;
+
+ if(GeneFinder.inprogeaCall){
+ clusterBef.sequence = contigSeq.substring(clusterBef.startPos,Math.min(clusterBef.stopPos+1,contigSeq.length()));
+ }
+ clusterBef.realDirectionNotKnown = true;
+ if(clusterBef.twinNode != null){
+ clusterBef.twinNode.hasStop_temp = true;
+ if(GeneFinder.inprogeaCall){
+ clusterBef.twinNode.sequence = contigSeq.substring(clusterBef.twinNode.startPos,Math.min(clusterBef.twinNode.stopPos+1,contigSeq.length()));
+ }
+
+ clusterBef.twinNode.realDirectionNotKnown = true;
+ }
+
+ if((possibleStart_FO == -1) && (possibleStart_RE == -1)){
+ doWithoutStart = true;
+ }
+
+ }
+ }
+
+ }
+
+ int possibleStop_FO = -1;
+ int possibleStop_RE = -1;
+
+ possibleStop_FO = FrameSearch.findPossibleStops_Forward(cluster,contigSeq,0,(int) cluster.stopPos-2);
+ possibleStop_RE = FrameSearch.findPossibleStops_Reverse(cluster,contigSeq,0,(int) cluster.stopPos-2);
+
+ if(!foundNoStart){
+
+ // first have a look, if already forward or reverse direction is excluded due to missing start or stop
+
+ if(useClusterBef){
+
+ numMergedClusters++;
+ boolean overlaps = false;
+ boolean overlapsTwin = false;
+ clusterBef = thisContig.allGenes.get(thisContig.allGenes.size()-1);
+ thisContig.allGenes.remove(thisContig.allGenes.size()-1);
+
+ int[] intronBef = new int[2];
+ intronBef[0] = clusterBef.stopPos+1;
+ intronBef[1] = cluster.startPos;
+
+ int tolerance = GeneFinder.interval;
+ if(tolerance == -1){
+ tolerance = GeneFinder.readLength;
+ }
+ if((cluster.startPos <= clusterBef.stopPos+1) || (cluster.startPos - clusterBef.stopPos+1 <= tolerance)){
+ overlaps = true;
+ }
+
+ if(clusterBef.twinNode == null){
+
+ // easy case, we simply take direction of this cluster as our new direction
+
+ MergeClusters.updateCluster_AfterMerging(cluster,clusterBef,cluster.stopPos-2,contigSeq);
+ handleFrameSearchWithoutTwin(cluster,possibleStop_FO,possibleStop_RE,contigSeq,overlaps);
+
+ if(!overlaps){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef);
+ }
+
+
+ }else{
+ // clusterBef has a twin, now it gets a bit more complicated: first check if we can exclude one direction because we do not find the adequate stop/start
+
+ int[] intronBef_twin = new int[2];
+ intronBef_twin[0] = clusterBef.twinNode.stopPos+1;
+ intronBef_twin[1] = cluster.startPos;
+ if((cluster.startPos <= clusterBef.twinNode.stopPos+1) || (cluster.startPos - clusterBef.twinNode.stopPos+1 <= tolerance)){
+ overlapsTwin = true;
+ }
+
+
+ if(possibleStop_RE == -1 && possibleStop_FO == -1){ // useClustBef, no start found
+
+ // ooops, found nothing at all, so merge with both?
+
+ if((!clusterBef.freeToResolve && !clusterBef.twinNode.freeToResolve) || (overlaps != overlapsTwin)){
+ MergeClusters.findLeadingTwinAndMerge(cluster,clusterBef,false,overlaps,overlapsTwin,intronBef,intronBef_twin,contigSeq);
+
+ }else{
+
+ MergeClusters.mergeClustWithBothBefs(cluster,clusterBef,contigSeq,false);
+
+ if(!overlapsTwin){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster.twinNode,intronBef_twin);;
+ }
+ if(!overlaps){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef);
+ }
+
+ }
+
+ numFoundNoStop_firstTime++;
+ cluster.realDirectionNotKnown = true;
+ cluster.twinNode.realDirectionNotKnown = true;
+
+ } else if(possibleStop_FO != -1 && possibleStop_RE != -1){ // useClustBef, both starts found
+
+ if((!clusterBef.freeToResolve && !clusterBef.twinNode.freeToResolve) || (overlaps != overlapsTwin)){
+
+ MergeClusters.findLeadingTwinAndMerge(cluster,clusterBef,true,overlaps,overlapsTwin,intronBef,intronBef_twin,contigSeq);
+
+ if(cluster.possibleIntrons.keySet().isEmpty()){
+ if(!cluster.onRevStrand){
+ int[] pair_FO = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ if(pair_FO == null){
+
+ cluster.stopPos = possibleStop_FO + 2;
+
+ checkIfAdequateAndRefine(cluster, false, cluster.possibleStarts_Forward, contigSeq);
+
+ numNoFrameFound++;
+
+ }else{
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ }
+ }else{
+ int[] pair_RE = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ if(pair_RE == null){
+
+ cluster.stopPos = possibleStop_RE + 2;
+
+ checkIfAdequateAndRefine(cluster, true, cluster.possibleStarts_Reverse, contigSeq);
+
+ numNoFrameFound++;
+ }else{
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ }
+ }
+ }else{
+ int[] pair_FO = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ int[] pair_RE = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+
+
+ if(pair_FO == null && pair_RE == null){
+
+ // if possible, refer to XS tag
+
+ if(cluster.direcCounter[0] > cluster.direcCounter[1]){
+ cluster.onRevStrand = false;
+ }else if(cluster.direcCounter[0] < cluster.direcCounter[1]){
+ cluster.onRevStrand = true;
+ }
+
+ if(!cluster.onRevStrand){
+ cluster.stopPos = possibleStop_FO + 2;
+ checkIfAdequateAndRefine(cluster, false, cluster.possibleStarts_Forward, contigSeq);
+ }else{
+ cluster.stopPos = possibleStop_RE + 2;
+ checkIfAdequateAndRefine(cluster, true, cluster.possibleStarts_Reverse, contigSeq);
+ }
+
+ }else{
+ // choose smallest interval possible
+ if(pair_FO == null && pair_RE != null){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ }else if(pair_FO != null && pair_RE == null){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ }else{
+
+ // if possible, refer to XS tag
+
+ if(cluster.direcCounter[0] > cluster.direcCounter[1]){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ }else if(cluster.direcCounter[0] < cluster.direcCounter[1]){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ }else{
+ if(pair_FO[2] <= pair_RE[2]){ // extract smallest possible interval
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ }else{
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ }
+ }
+
+ }
+
+ }
+ }
+
+
+ }else{
+
+ // merge with both
+
+ MergeClusters.mergeClustWithBothBefs(cluster,clusterBef,contigSeq,true);
+
+ if(!overlapsTwin){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster.twinNode,intronBef_twin);
+ }
+ if(!overlaps){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef);
+ }
+
+ // if possible, refer to XS tag
+
+ if((cluster.possibleIntrons.keySet().size() > 0) && cluster.direcCounter[0] > cluster.direcCounter[1]){
+ cluster.onRevStrand = false;
+ }else if((cluster.possibleIntrons.keySet().size() > 0) && cluster.direcCounter[0] < cluster.direcCounter[1]){
+ cluster.onRevStrand = true;
+ }
+
+ if(!cluster.onRevStrand){
+
+ int[] pair_FO = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ int[] pair_RE_Twin = FrameSearch.checkAndChooseReadingFrame(cluster.twinNode.possibleStarts_Reverse,cluster.twinNode.possibleStops_Reverse);
+
+ if(pair_FO == null){
+
+ cluster.stopPos = possibleStop_FO + 2;
+
+ checkIfAdequateAndRefine(cluster, false, cluster.possibleStarts_Forward, contigSeq);
+
+ numNoFrameFound++;
+ }else{
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ }
+
+ if(pair_RE_Twin == null){
+
+ cluster.twinNode.stopPos = possibleStop_RE + 2;
+
+ checkIfAdequateAndRefine(cluster.twinNode, true, cluster.twinNode.possibleStarts_Reverse, contigSeq);
+
+ numNoFrameFound++;
+ }else{
+ cluster.twinNode = refineExtractedCluster(cluster.twinNode, contigSeq, pair_RE_Twin[0], pair_RE_Twin[1]+2,true);
+ cluster.twinNode.realDirectionNotKnown = false;
+ }
+ }else{
+
+ int[] pair_RE = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ int[] pair_FO_Twin = FrameSearch.checkAndChooseReadingFrame(cluster.twinNode.possibleStarts_Forward,cluster.twinNode.possibleStops_Forward);
+
+ if(pair_RE == null){
+
+ cluster.stopPos = possibleStop_RE + 2;
+
+ checkIfAdequateAndRefine(cluster, true, cluster.possibleStarts_Reverse, contigSeq);
+
+ numNoFrameFound++;
+
+ }else{
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ }
+
+ if(pair_FO_Twin == null){
+
+ cluster.twinNode.stopPos = possibleStop_FO + 2;
+
+ checkIfAdequateAndRefine(cluster.twinNode, false, cluster.twinNode.possibleStarts_Forward, contigSeq);
+
+ numNoFrameFound++;
+
+ }else{
+ cluster.twinNode = refineExtractedCluster(cluster.twinNode, contigSeq, pair_FO_Twin[0], pair_FO_Twin[1]+2,false);
+ cluster.twinNode.realDirectionNotKnown = false;
+ }
+ }
+ }
+
+ }else if(possibleStop_FO != -1 && possibleStop_RE == -1){ // useClustBef, forward start found
+
+ // take forward twin
+ if((!clusterBef.freeToResolve && !clusterBef.twinNode.freeToResolve) || (overlaps != overlapsTwin)){
+
+ boolean changeDirections = false;
+ boolean changeOverlap = false; // only necessary to give the right argument to changeDirections function
+
+ if(!clusterBef.freeToResolve && !clusterBef.twinNode.freeToResolve){
+
+ if(clusterBef.isMergedTwin){
+
+ if(clusterBef.onRevStrand || clusterBef.realDirectionNotKnown){
+ // make this twin a forward one, other twin becomes a reverse one instead
+ changeDirections = true;
+ }
+
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef,contigSeq,false,true,true);
+
+ changeOverlap = overlaps;
+
+ if(!overlaps){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef);
+ }
+
+ }else{
+ if(clusterBef.twinNode.onRevStrand || clusterBef.twinNode.realDirectionNotKnown){
+ // make this twin a forward one, other twin becomes a reverse one instead
+ changeDirections = true;
+ }
+
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef.twinNode,contigSeq,false,true,true);
+
+ changeOverlap = overlapsTwin;
+
+ if(!overlapsTwin){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef_twin);
+ }
+
+ }
+ }else{
+ if(overlaps){
+ if(clusterBef.onRevStrand || clusterBef.realDirectionNotKnown){
+ // make this twin a forward one, other twin becomes a reverse one instead
+ changeDirections = true;
+ }
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef,contigSeq,false,true,true);
+ changeOverlap = overlaps;
+ }else{
+ if(clusterBef.twinNode.onRevStrand || clusterBef.twinNode.realDirectionNotKnown){
+ // make this twin a forward one, other twin becomes a reverse one instead
+ changeDirections = true;
+ }
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef.twinNode,contigSeq,false,true,true);
+ changeOverlap = overlapsTwin;
+ }
+ }
+
+ int[] pair_FO = null;
+ if(cluster.possibleIntrons.keySet().isEmpty()){
+ pair_FO = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ }else{
+ pair_FO = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ }
+
+ if(pair_FO == null){
+
+ cluster.stopPos = possibleStop_FO + 2;
+
+ checkIfAdequateAndRefine(cluster, false, cluster.possibleStarts_Forward, contigSeq);
+
+ numNoFrameFound++;
+
+ }else{
+
+ if(changeDirections){
+ boolean haveChanged = changeDirectionsIfPossible(cluster,contigSeq,changeOverlap,true);
+ if(!haveChanged){
+ cluster.stopPos = possibleStop_FO + 2;
+ declareClusterAsNotCompleted(cluster,cluster.startPos,contigSeq);
+ numNoFrameFound++;
+ }
+ }else{
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ }
+ }
+
+ }else{
+
+ int[] pair_FO = null;
+ int[] pair_FO_twin = null;
+
+ if(cluster.possibleIntrons.keySet().isEmpty() && clusterBef.possibleIntrons.keySet().isEmpty()){
+ pair_FO = FrameSearch.checkAndChooseReadingFrame(clusterBef.possibleStarts_Forward,cluster.possibleStops_Forward);
+ }else{
+ pair_FO = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(clusterBef.possibleStarts_Forward,cluster.possibleStops_Forward);
+ }
+
+ if(cluster.possibleIntrons.keySet().isEmpty() && clusterBef.twinNode.possibleIntrons.keySet().isEmpty()){
+ pair_FO_twin = FrameSearch.checkAndChooseReadingFrame(clusterBef.twinNode.possibleStarts_Forward,cluster.possibleStops_Forward);
+ }else{
+ pair_FO_twin = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(clusterBef.twinNode.possibleStarts_Forward,cluster.possibleStops_Forward);
+ }
+
+ if(pair_FO != null && pair_FO_twin == null){
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef,contigSeq,false,true,true);
+ if(!overlaps){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef);
+ }
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ } else if(pair_FO == null && pair_FO_twin != null){
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef.twinNode,contigSeq,false,true,true);
+ if(!overlapsTwin){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef_twin);
+ }
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO_twin[0], pair_FO_twin[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ } else if((pair_FO != null && pair_FO_twin != null) || (pair_FO == null && pair_FO_twin == null)){
+
+ // merge with both
+ cluster.stopPos = possibleStop_FO + 2;
+ MergeClusters.mergeClustWithBothBefs(cluster,clusterBef,contigSeq,true);
+
+ if(!overlapsTwin){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster.twinNode,intronBef_twin);
+ }
+
+ if(!overlaps){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef);
+ }
+
+ if(pair_FO == null && pair_FO_twin == null){
+
+ numNoFrameFound++;
+
+ cluster.hasStop_temp = false;
+ cluster.twinNode.hasStop_temp = false;
+ cluster.realDirectionNotKnown = true;
+ cluster.twinNode.realDirectionNotKnown = true;
+
+ }else{
+
+ // merge with present forward, set other to noStop
+
+ if(!cluster.onRevStrand){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+
+ cluster.twinNode.stopPos = possibleStop_FO + 2;
+ declareClusterAsNotCompleted(cluster.twinNode,cluster.twinNode.startPos,contigSeq);
+ numNoFrameFound++;
+
+ }else{
+ cluster.twinNode = refineExtractedCluster(cluster.twinNode, contigSeq,pair_FO_twin[0], pair_FO_twin[1]+2,false);
+ cluster.twinNode.realDirectionNotKnown = false;
+
+ cluster.stopPos = possibleStop_FO + 2;
+ declareClusterAsNotCompleted(cluster,cluster.startPos,contigSeq);
+ numNoFrameFound++;
+ }
+ }
+
+ }
+ }
+
+ }else if(possibleStop_FO == -1 && possibleStop_RE != -1){ // useClustBef, reverse start found
+
+ // take reverse twin
+
+ if((!clusterBef.freeToResolve && !clusterBef.twinNode.freeToResolve) || (overlaps != overlapsTwin)){
+
+ boolean changeDirections = false;
+ boolean changeOverlap = false; // only necessary to give the right argument to changeDirections function
+
+ if(!clusterBef.freeToResolve && !clusterBef.twinNode.freeToResolve){
+ if(clusterBef.isMergedTwin){
+
+ if(!clusterBef.onRevStrand || clusterBef.realDirectionNotKnown){
+ // make this twin a forward one, other twin becomes a reverse one instead
+ changeDirections = true;
+ }
+
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef,contigSeq,false,true,true);
+
+ changeOverlap = overlaps;
+
+ if(!overlaps){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef);
+ }
+
+ }else{
+
+ if(!clusterBef.twinNode.onRevStrand || clusterBef.twinNode.realDirectionNotKnown){
+ // make this twin a forward one, other twin becomes a reverse one instead
+ changeDirections = true;
+ }
+
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef.twinNode,contigSeq,false,true,true);
+
+ changeOverlap = overlapsTwin;
+
+ if(!overlapsTwin){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef_twin);
+ }
+ }
+ }else{
+ if(overlaps){
+ if(!clusterBef.onRevStrand || clusterBef.realDirectionNotKnown){
+ // make this twin a forward one, other twin becomes a reverse one instead
+ changeDirections = true;
+ }
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef,contigSeq,false,true,true);
+ changeOverlap = overlaps;
+ }else{
+ if(!clusterBef.twinNode.onRevStrand || clusterBef.twinNode.realDirectionNotKnown){
+ // make this twin a forward one, other twin becomes a reverse one instead
+ changeDirections = true;
+ }
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef.twinNode,contigSeq,false,true,true);
+ changeOverlap = overlapsTwin;
+ }
+ }
+
+ int[] pair_RE = null;
+
+ if(cluster.possibleIntrons.keySet().isEmpty()){
+ pair_RE = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }else{
+ pair_RE = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }
+
+ if(pair_RE == null){
+
+ cluster.stopPos = possibleStop_RE + 2;
+
+ checkIfAdequateAndRefine(cluster, true, cluster.possibleStarts_Reverse, contigSeq);
+
+ numNoFrameFound++;
+ }else{
+
+ if(changeDirections){
+ boolean haveChanged = changeDirectionsIfPossible(cluster,contigSeq,changeOverlap,false);
+ if(!haveChanged){
+ cluster.stopPos = possibleStop_RE + 2;
+ declareClusterAsNotCompleted(cluster,cluster.startPos,contigSeq);
+ numNoFrameFound++;
+ }
+ }else{
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ }
+ }
+
+ }else{
+ int[] pair_RE = null;
+ int[] pair_RE_twin = null;
+
+ if(cluster.possibleIntrons.keySet().isEmpty() && clusterBef.possibleIntrons.keySet().isEmpty()){
+ pair_RE = FrameSearch.checkAndChooseReadingFrame(clusterBef.possibleStarts_Reverse,cluster.possibleStarts_Reverse);
+ }else{
+ pair_RE = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(clusterBef.possibleStarts_Reverse,cluster.possibleStarts_Reverse);
+ }
+
+ if(cluster.possibleIntrons.keySet().isEmpty() && clusterBef.twinNode.possibleIntrons.keySet().isEmpty()){
+ pair_RE_twin = FrameSearch.checkAndChooseReadingFrame(clusterBef.twinNode.possibleStarts_Reverse,cluster.possibleStarts_Reverse);
+ }else{
+ pair_RE_twin = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(clusterBef.twinNode.possibleStarts_Reverse,cluster.possibleStarts_Reverse);
+ }
+
+ if(pair_RE != null && pair_RE_twin == null){
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef,contigSeq,false,true,true);
+ if(!overlaps){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef);
+ }
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ } else if(pair_RE == null && pair_RE_twin != null){
+ MergeClusters.mergeWithOneIncludeTwin(cluster,clusterBef.twinNode,contigSeq,false,true,true);
+ if(!overlapsTwin){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef_twin);
+ }
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE_twin[0], pair_RE_twin[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ } else if((pair_RE != null && pair_RE_twin != null) || (pair_RE == null && pair_RE_twin == null)){
+
+ // merge with both
+ cluster.stopPos = possibleStop_RE + 2;
+ MergeClusters.mergeClustWithBothBefs(cluster,clusterBef,contigSeq,true);
+
+ if(!overlapsTwin){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster.twinNode,intronBef_twin);
+ }
+
+ if(!overlaps){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef);
+ }
+
+ if(pair_RE == null && pair_RE_twin == null){
+
+ numNoFrameFound++;
+
+ cluster.hasStop_temp = false;
+ cluster.twinNode.hasStop_temp = false;
+ cluster.realDirectionNotKnown = true;
+ cluster.twinNode.realDirectionNotKnown = true;
+
+ }else{
+
+ // merge with present reverse, set other to noStop
+
+ if(cluster.onRevStrand){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+
+ cluster.twinNode.stopPos = possibleStop_RE + 2;
+ declareClusterAsNotCompleted(cluster.twinNode,cluster.twinNode.startPos,contigSeq);
+ numNoFrameFound++;
+
+ }else{
+ cluster.twinNode = refineExtractedCluster(cluster.twinNode, contigSeq,pair_RE_twin[0], pair_RE_twin[1]+2,true);
+ cluster.twinNode.realDirectionNotKnown = false;
+
+ cluster.stopPos = possibleStop_RE +2;
+ declareClusterAsNotCompleted(cluster,cluster.startPos,contigSeq);
+ numNoFrameFound++;
+ }
+ }
+
+ }
+ }
+
+ }
+
+
+ }
+
+ }else if(possibleStart_FO == -1 && possibleStart_RE != -1){ // reverse start found
+
+ // if it is an ORF, a reverse one is more likely, so search only for a possible start position of an reverse gene
+
+ if(possibleStop_RE != -1){
+
+ int[] pair = null;
+
+ if(cluster.possibleIntrons.keySet().isEmpty()){
+ pair = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }else{
+ pair = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }
+
+ if(pair != null){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair[0], pair[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ }else{
+
+ cluster.startPos = possibleStart_RE;
+ cluster.stopPos = possibleStop_RE + 2;
+
+ checkIfAdequateAndRefine(cluster, true, cluster.possibleStarts_Reverse, contigSeq);
+
+ numNoFrameFound++;
+ }
+
+ }else{
+ cluster.onRevStrand = true;
+ declareClusterAsNotCompleted(cluster,possibleStart_RE,contigSeq);
+ numFoundNoStop_firstTime++;
+ }
+
+ }else if(possibleStart_RE == -1 && possibleStart_FO != -1){ // forward start found
+
+ // if it is an ORF, a forward one is more likely, so search only for possible stop positions of a forward gene
+
+ if(possibleStop_FO != -1){
+
+ int[] pair = null;
+
+ if(cluster.possibleIntrons.keySet().isEmpty()){
+ pair = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ }else{
+ pair = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ }
+
+ if(pair != null){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair[0], pair[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ }else{
+
+ cluster.startPos = possibleStart_FO;
+ cluster.stopPos = possibleStop_FO + 2;
+
+ checkIfAdequateAndRefine(cluster, false, cluster.possibleStarts_Forward, contigSeq);
+
+ numNoFrameFound++;
+ }
+
+ }else{
+ declareClusterAsNotCompleted(cluster,possibleStart_FO,contigSeq);
+ numFoundNoStop_firstTime++;
+ }
+
+ }else if(possibleStart_RE != -1 && possibleStart_FO != -1){ // both starts found
+
+ // at the moment both directions are equally likely, so first search for the right frames
+
+ if((possibleStop_FO == -1) && (possibleStop_RE == -1)){
+ declareClusterAsNotCompleted(cluster,Math.max(possibleStart_FO,possibleStart_RE),contigSeq); // use the smallest possible interval!
+ numFoundNoStop_firstTime++;
+
+ }else if((possibleStop_FO != -1) && (possibleStop_RE == -1)){
+
+ int[] pair = null;
+
+ if(cluster.possibleIntrons.keySet().isEmpty()){
+ pair = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ }else{
+ pair = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ }
+
+ if(pair != null){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair[0], pair[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ }else{
+
+ cluster.startPos = possibleStart_FO;
+ cluster.stopPos = possibleStop_FO + 2;
+ checkIfAdequateAndRefine(cluster, false, cluster.possibleStarts_Forward, contigSeq);
+
+ numNoFrameFound++;
+ }
+
+ }else if((possibleStop_FO == -1) && (possibleStop_RE != -1)){
+
+ int[] pair = null;
+
+ if(cluster.possibleIntrons.keySet().isEmpty()){
+ pair = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }else{
+ pair = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }
+
+ if(pair != null){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair[0], pair[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ }else{
+
+ cluster.startPos = possibleStart_RE;
+ cluster.stopPos = possibleStop_RE + 2;
+ checkIfAdequateAndRefine(cluster, true, cluster.possibleStarts_Reverse, contigSeq);
+
+ numNoFrameFound++;
+ }
+
+ }else{
+ // both are still equally likely, so check both of them
+
+ int[] pair_FO = null;
+ int[] pair_RE = null;
+
+ if(cluster.possibleIntrons.keySet().isEmpty()){
+ pair_FO = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ pair_RE = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }else{
+
+ pair_FO = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ pair_RE = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }
+
+ if(pair_FO == null && pair_RE == null){
+
+ // if possible, refer to XS tag
+
+ if(!(cluster.possibleIntrons.keySet().isEmpty()) && cluster.direcCounter[0] > cluster.direcCounter[1]){
+ cluster.startPos = possibleStart_FO;
+ cluster.stopPos = possibleStop_FO + 2;
+ cluster.onRevStrand = false;
+ }else if(!(cluster.possibleIntrons.keySet().isEmpty()) && cluster.direcCounter[0] < cluster.direcCounter[1]){
+ cluster.startPos = possibleStart_RE;
+ cluster.stopPos = possibleStop_RE + 2;
+ cluster.onRevStrand = true;
+ }else{
+ if((possibleStop_FO - possibleStart_FO) <= (possibleStop_RE - possibleStart_RE)){
+ cluster.startPos = possibleStart_FO;
+ cluster.stopPos = possibleStop_FO + 2;
+ cluster.onRevStrand = false;
+ }else{
+ cluster.startPos = possibleStart_RE;
+ cluster.stopPos = possibleStop_RE + 2;
+ cluster.onRevStrand = true;
+ }
+ }
+
+ declareClusterAsNotCompleted(cluster,cluster.startPos,contigSeq);
+ numNoFrameFound++;
+ } else if(pair_FO != null && pair_RE == null){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ } else if(pair_FO == null && pair_RE != null){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ }else if(pair_FO != null && pair_RE != null){
+ // if possible, refer to XS tag
+ if(!(cluster.possibleIntrons.keySet().isEmpty()) && cluster.direcCounter[0] > cluster.direcCounter[1]){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ }else if(!(cluster.possibleIntrons.keySet().isEmpty()) && cluster.direcCounter[0] < cluster.direcCounter[1]){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ }else{
+ // now is the time to create a twin, note: TODO: first a twin was always created -> change in results?
+
+ Gene clusterTwin = new Gene();
+
+ HelperFunctions_GeneSearch.declareAsShared(cluster);
+
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ clusterTwin = refineExtractedCluster(clusterTwin, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+
+ HelperFunctions_GeneSearch.addAssociatedRnas(clusterTwin, cluster.idTOassociatedRnas);
+
+ clusterTwin.geneID = id++;
+ HelperFunctions_GeneSearch.declareAsShared(clusterTwin);
+
+ twinIni(cluster,clusterTwin,thisContig);
+ }
+
+ }
+
+ }
+
+ }
+
+
+ if(doWithoutStart){
+ // we simply use the identified start as a starting point and do not have any forward start codon or reverse stop codon
+
+ possibleStart_FO = cluster.startPos;
+ possibleStart_RE = cluster.startPos;
+
+ if((possibleStop_FO == -1) && (possibleStop_RE == -1)){
+
+ declareClusterAsNotCompleted(cluster,cluster.startPos,contigSeq);
+ numFoundNoStop_firstTime++;
+
+ }else if((possibleStop_FO != -1) && (possibleStop_RE == -1)){
+ cluster = refineExtractedCluster(cluster, contigSeq,cluster.startPos, possibleStop_FO+2,false);
+ cluster.realDirectionNotKnown = true;
+ }else if((possibleStop_FO == -1) && (possibleStop_RE != -1)){
+ cluster = refineExtractedCluster(cluster, contigSeq,cluster.startPos, possibleStop_RE+2,true);
+ cluster.realDirectionNotKnown = true;
+ }else{
+
+ // if possible, refer to XS tag
+ if(!(cluster.possibleIntrons.keySet().isEmpty()) && cluster.direcCounter[0] > cluster.direcCounter[1]){
+ cluster = refineExtractedCluster(cluster, contigSeq, cluster.startPos,possibleStop_FO+2,false);
+ cluster.realDirectionNotKnown = false;
+ }else if(!(cluster.possibleIntrons.keySet().isEmpty()) && cluster.direcCounter[0] < cluster.direcCounter[1]){
+ cluster = refineExtractedCluster(cluster, contigSeq, cluster.startPos, possibleStop_RE+2,true);
+ cluster.realDirectionNotKnown = false;
+ }else{
+ if(possibleStop_FO <= possibleStop_RE){
+ cluster = refineExtractedCluster(cluster, contigSeq, cluster.startPos, possibleStop_FO+2,false);
+ }else{
+ cluster = refineExtractedCluster(cluster, contigSeq, cluster.startPos, possibleStop_RE+2,true);
+ }
+
+ cluster.realDirectionNotKnown = true;
+ }
+
+
+ }
+ }
+
+ // now test for overlap merging:
+
+ if(thisContig.allGenes.size() > 0){
+
+ Gene clusterBeforeInVec = thisContig.allGenes.get(thisContig.allGenes.size()-1);
+
+ if(MergeClusters.checkIfNeedMerge(cluster, clusterBeforeInVec,thisContig,contigSeq)){
+ numMergedClusters++;
+ }else{
+ thisContig.allGenes.add(cluster);
+ }
+
+ }else{
+ thisContig.allGenes.add(cluster);
+ }
+ }
+
+ if(thisContig.allGenes.size() > 1){
+ // test previously added gene, if it has a twin and if yes, if it can be resolved
+ Gene clustBef = thisContig.allGenes.get(thisContig.allGenes.size()-2);
+ if((clustBef.twinNode != null) && (clustBef.freeToResolve)){
+ thisContig.allGenes.setElementAt(LocalTwinResolve.resolveTwins(clustBef),thisContig.allGenes.size()-2);
+ clustBef = thisContig.allGenes.get(thisContig.allGenes.size()-2);
+ }
+ FrameSearch.useDirecInfo(clustBef,contigSeq);
+ if((clustBef.coreSeq <= (2*GeneFinder.readLength)) && ((clustBef.stopPos-clustBef.startPos +1) <= (clustBef.coreSeq + (2*GeneFinder.readLength)))){
+ if(!checkIfOnlyMultiRnas(clustBef)){
+ thisContig.allGenes.removeElement(clustBef);
+ removeGenesRnas(clustBef,thisContig);
+ }
+
+ }
+ }
+
+ if(!positionIt.hasNext()){
+ noMoreCluster = true;
+ break;
+ }
+
+ numIdentifiedClusters--;
+
+ }while(!noMoreCluster);
+
+ // check for the last extracted cluster if it has a proper stop or not and further check for twins
+
+ if(thisContig.allGenes.size() != 0 && !thisContig.allGenes.get(thisContig.allGenes.size()-1).hasStop_temp){
+
+ thisContig.allGenes.get(thisContig.allGenes.size()-1).hasStop_temp = true;
+
+ Gene clustBef = thisContig.allGenes.get(thisContig.allGenes.size()-1);
+ if((clustBef.twinNode != null) && (clustBef.freeToResolve)){
+ thisContig.allGenes.setElementAt(LocalTwinResolve.resolveTwins(clustBef),thisContig.allGenes.size()-1);
+ clustBef = thisContig.allGenes.get(thisContig.allGenes.size()-1);
+ }
+
+ FrameSearch.useDirecInfo(clustBef,contigSeq);
+
+ if((clustBef.coreSeq <= (2*GeneFinder.readLength)) && ((clustBef.stopPos-clustBef.startPos +1) <= (clustBef.coreSeq + (2*GeneFinder.readLength)))){
+ if(!checkIfOnlyMultiRnas(clustBef)){
+ thisContig.allGenes.removeElement(clustBef);
+ removeGenesRnas(clustBef,thisContig);
+ }
+
+ }
+
+ }
+
+ if(!temporaryGenes.isEmpty()){
+ // check also the temporarily extracted candidates, add the coreSeq, check if they are multi-read artifacts and assign a proper start and stop and direction
+ for(int pos = temporaryGenes.size()-1;pos >= 0; pos--){
+ Gene tempGene = temporaryGenes.get(pos);
+
+ if(tempGene.startPos == 0){
+ System.err.println("Fake gene " + tempGene.geneID + " has zero start!");
+ WriteOutput.writeToLogFile("Fake gene " + tempGene.geneID + " has zero start!\n");
+ }
+ tempGene.coreSeq = (tempGene.stopPos - tempGene.startPos);
+
+ if(GeneFinder.inprogeaCall){
+ tempGene.sequence = contigSeq.substring(tempGene.startPos,tempGene.stopPos+1);
+ }
+
+ if((tempGene.coreSeq > (2*GeneFinder.readLength)) && (checkIfOnlyMultiRnas(tempGene))){ // TODO: test if this stricter version yields better results
+
+ //tempGene.sequence = contigSeq.substring(tempGene.startPos,tempGene.stopPos+1);
+ tempGene.onRevStrand = false;
+ FrameSearch.findFrameAndCheckWithNeighbors(tempGene, thisContig, contigSeq);
+ FrameSearch.useDirecInfo(tempGene,contigSeq);
+ thisContig.allGenes.add(tempGene);
+
+ }else{
+ removeGenesRnas(tempGene,thisContig);
+ temporaryGenes.removeElementAt(pos);
+ }
+
+ }
+ }
+
+ if(!GeneFinder.secondPart){
+ String s = "";
+ s += "No more clusters can be found \n";
+ s += "Total identified clusters: " + thisContig.allGenes.size() + "\n";
+ s += "Number mergings: " + numMergedClusters + "\n";
+ s += "no start at first trial: " + numFoundNoStart_firstTime + "\n";
+ s += "no stop at first trial: " + numFoundNoStop_firstTime + "\n";
+ s += "no frame determined at first trial: " + numNoFrameFound + "\n";
+
+ WriteOutput.writeToLogFile(s + "\n");
+ System.out.println(s);
+ }
+
+ return id;
+ }
+
+ /*
+ * Scans the rnas associated with the gene and reports whether at least one of them is
+ * not a multi-hit read (isMulti == 0).
+ * Note: despite the method name, the result is true as soon as one such rna is found and
+ * false if every associated rna is a multi-hit read or if no rna is associated at all.
+ */
+
+ public static boolean checkIfOnlyMultiRnas(Gene gene){
+
+     for(Object[] rnaEntry : gene.idTOassociatedRnas.values()){
+         // index 0 of each association entry holds the Rna object itself
+         Rna candidate = (Rna) rnaEntry[0];
+         if(candidate.isMulti == 0){
+             return true;
+         }
+     }
+
+     return false;
+ }
+
+ /*
+ * For every rna associated with the gene: remove the first alignment of this rna that lies on
+ * thisContig and starts strictly between gene.startPos and (gene.stopPos - readLength), and
+ * decrease the rna's assignedNum by one. At most one alignment per rna is removed.
+ */
+
+ public static void removeGenesRnas(Gene gene, Contig thisContig){
+
+     for(Object[] rnaEntry : gene.idTOassociatedRnas.values()){
+         Rna mappedRna = (Rna) rnaEntry[0];
+
+         for(Object[] alignment : mappedRna.contigsMappedOn){
+
+             int alignStart = ((Integer) alignment[1]).intValue();
+
+             if(!((Contig) alignment[0]).equals(thisContig)){
+                 continue;   // alignment belongs to another contig
+             }
+             if((alignStart <= gene.startPos) || (alignStart >= (gene.stopPos-GeneFinder.readLength))){
+                 continue;   // alignment does not start inside the gene interval
+             }
+
+             // leave the loop right after the removal, the vector must not be iterated further
+             mappedRna.contigsMappedOn.removeElement(alignment);
+             mappedRna.assignedNum = mappedRna.assignedNum - 1;
+             break;
+         }
+     }
+ }
+
+
+ /*
+ * Links cluster and clusterTwin as mutual twins and initializes the twin bookkeeping:
+ * both are marked as freeToResolve and hadTwinBefore, isMergedTwin and realDirectionNotKnown
+ * are set to false for both, and the twin takes over the start/stop candidate arrays (same
+ * array references), the possible introns, fussy exons, alternative transcripts, the not
+ * continued intron ends and the coreSeq of cluster.
+ * thisContig is not used in this method.
+ */
+
+ public static void twinIni(Gene cluster, Gene clusterTwin, Contig thisContig){
+
+     // mutual link
+     cluster.twinNode = clusterTwin;
+     clusterTwin.twinNode = cluster;
+
+     // status flags, always set for both partners
+     cluster.freeToResolve = true;
+     clusterTwin.freeToResolve = true;
+
+     cluster.hadTwinBefore = true;
+     clusterTwin.hadTwinBefore = true;
+
+     cluster.isMergedTwin = false;
+     clusterTwin.isMergedTwin = false;
+
+     // the twin shares the start and stop candidates of the cluster
+     clusterTwin.possibleStarts_Forward = cluster.possibleStarts_Forward;
+     clusterTwin.possibleStarts_Reverse = cluster.possibleStarts_Reverse;
+     clusterTwin.possibleStops_Forward = cluster.possibleStops_Forward;
+     clusterTwin.possibleStops_Reverse = cluster.possibleStops_Reverse;
+
+     cluster.realDirectionNotKnown = false;
+     clusterTwin.realDirectionNotKnown = false;
+
+     // splicing information is added to the twin's own containers
+     clusterTwin.possibleIntrons.putAll(cluster.possibleIntrons);
+     clusterTwin.possibleFussyExons.putAll(cluster.possibleFussyExons);
+
+     clusterTwin.alternativeTranscripts.addAll(cluster.alternativeTranscripts);
+     clusterTwin.intronEndsThatAreNotContinued.addAll(cluster.intronEndsThatAreNotContinued);
+
+     clusterTwin.coreSeq = cluster.coreSeq;
+
+ }
+
+
+ /*
+ * Marks the cluster as not completed (hasStop_temp = false): the given start is taken over,
+ * the direction is flagged as not known and, for inprogea calls only, the sequence is
+ * extracted for the interval [startPos, stopPos].
+ */
+
+ public static void declareClusterAsNotCompleted(Gene cluster, int possibleStart, StringBuffer contigSeq){
+
+     cluster.startPos = possibleStart;
+     cluster.realDirectionNotKnown = true;
+
+     if(GeneFinder.inprogeaCall){
+         // stopPos is the exact stop, substring needs the exclusive end
+         final int seqEndExclusive = cluster.stopPos + 1;
+         cluster.sequence = contigSeq.substring(possibleStart, seqEndExclusive);
+     }
+
+     cluster.hasStop_temp = false;
+
+ }
+
+
+ /*
+ * applied to twin nodes: we try to switch their directions and find suitable reading frames
+ */
+
+ public static boolean changeDirectionsIfPossible(Gene cluster, StringBuffer contigSeq, boolean overlap, boolean wantForward){
+
+ boolean haveChanged = false;
+
+ if(wantForward){
+ int[] pair_FO = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ int[] pair_RE_twin = FrameSearch.checkAndChooseReadingFrame(cluster.twinNode.possibleStarts_Reverse,cluster.twinNode.possibleStops_Reverse);
+
+ if(pair_FO != null && pair_RE_twin != null){
+ haveChanged = true;
+ cluster.onRevStrand = false;
+ cluster.startPos = pair_FO[0];
+ cluster.stopPos = pair_FO[1]+2;
+
+ cluster.twinNode.onRevStrand = true;
+ cluster.twinNode.startPos = pair_RE_twin[0];
+ cluster.twinNode.stopPos = pair_RE_twin[1]+2;
+
+ if(GeneFinder.inprogeaCall){
+ cluster.sequence = contigSeq.substring(cluster.startPos,cluster.stopPos+1);
+ cluster.twinNode.sequence = contigSeq.substring(cluster.twinNode.startPos,cluster.twinNode.stopPos+1);
+ }
+ cluster.realDirectionNotKnown = false;
+ cluster.twinNode.realDirectionNotKnown = false;
+
+ cluster.hasStop_temp = true;
+ cluster.twinNode.hasStop_temp = true;
+ }else if(pair_RE_twin != null && cluster.possibleStarts_Forward[0] >= 0 && !overlap){
+
+ // we can change directions, because start and stop are at least in same direction and twin is ok
+ haveChanged = true;
+ cluster.onRevStrand = false;
+ cluster.startPos = cluster.possibleStarts_Forward[0];
+ cluster.stopPos = cluster.possibleStops_Forward[0]+2;
+
+ cluster.twinNode.onRevStrand = true;
+ cluster.twinNode.startPos = pair_RE_twin[0];
+ cluster.twinNode.stopPos = pair_RE_twin[1]+2;
+
+ if(GeneFinder.inprogeaCall){
+ cluster.sequence = contigSeq.substring(cluster.startPos,cluster.stopPos+1);
+ cluster.twinNode.sequence = contigSeq.substring(cluster.twinNode.startPos,cluster.twinNode.stopPos+1);
+ }
+
+ cluster.realDirectionNotKnown = true;
+ cluster.twinNode.realDirectionNotKnown = false;
+
+ cluster.hasStop_temp = true;
+ cluster.twinNode.hasStop_temp = true;
+ }
+
+ }else{
+ int[] pair_RE = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ int[] pair_FO_twin = FrameSearch.checkAndChooseReadingFrame(cluster.twinNode.possibleStarts_Forward,cluster.twinNode.possibleStops_Forward);
+
+ if(pair_RE != null && pair_FO_twin != null){
+ haveChanged = true;
+ cluster.onRevStrand = true;
+ cluster.startPos = pair_RE[0];
+ cluster.stopPos = pair_RE[1]+2;
+
+ cluster.twinNode.onRevStrand = false;
+ cluster.twinNode.startPos = pair_FO_twin[0];
+ cluster.twinNode.stopPos = pair_FO_twin[1]+2;
+ if(GeneFinder.inprogeaCall){
+ cluster.sequence = contigSeq.substring(cluster.startPos,cluster.stopPos+1);
+ cluster.twinNode.sequence = contigSeq.substring(cluster.twinNode.startPos,cluster.twinNode.stopPos+1);
+ }
+ cluster.realDirectionNotKnown = false;
+ cluster.twinNode.realDirectionNotKnown = false;
+
+ cluster.hasStop_temp = true;
+ cluster.twinNode.hasStop_temp = true;
+ }else if(pair_FO_twin != null && cluster.possibleStarts_Reverse[0] >= 0 && !overlap){
+
+ // we can change directions, because start and stop are at least in same direction and twin is ok
+ haveChanged = true;
+ cluster.onRevStrand = true;
+ cluster.startPos = cluster.possibleStarts_Reverse[0];
+ cluster.stopPos = cluster.possibleStops_Reverse[0]+2;
+
+ cluster.twinNode.onRevStrand = false;
+ cluster.twinNode.startPos = pair_FO_twin[0];
+ cluster.twinNode.stopPos = pair_FO_twin[1]+2;
+ if(GeneFinder.inprogeaCall){
+ cluster.sequence = contigSeq.substring(cluster.startPos,cluster.stopPos+1);
+ cluster.twinNode.sequence = contigSeq.substring(cluster.twinNode.startPos,cluster.twinNode.stopPos+1);
+ }
+ cluster.realDirectionNotKnown = true;
+ cluster.twinNode.realDirectionNotKnown = false;
+
+ cluster.hasStop_temp = true;
+ cluster.twinNode.hasStop_temp = true;
+ }
+ }
+
+ return haveChanged;
+ }
+
+
+
+ /*
+ * updates the coverage entries in the vector representing the current covered interval
+ *
+ * coverageVecAndPos is used as a triple:
+ * [0] Vector<Integer> with the coverage values of the current interval,
+ * [1] Integer, the position of the previous update (overwritten with currentPos at the end),
+ * [2] Integer, the overlap position, -1 means "not set".
+ * covPlus is the coverage to add for currentPos; considerSpliceSite is a splice position or -1 if
+ * no splice site has to be respected (entries at or behind the splice site do not receive covPlus).
+ * Additional coverage values found for currentPos via HelperFunctions_GeneSearch.lookIntoPosiCovMap
+ * are added on top of the interval.
+ *
+ * returns {coverageVecAndPos, posiCovMap, addedValues}
+ */
+
+ public static Object[] updateCoverageInterval_respectAlternatives(Contig thisContig, int covPlus, int currentPos, Object[] coverageVecAndPos, TreeMap<Integer,Vector<Integer>> posiCovMap,int considerSpliceSite){
+
+ boolean addedValues = false;
+
+ int splitDiff = 0; // necessary to consider splice site, do not update with covplus if we exceed splice site
+ if(considerSpliceSite != -1){
+ splitDiff = (considerSpliceSite-currentPos);
+ }
+
+ Vector<Integer> covVecClone = new Vector<Integer>(); // stores all coverage add values derived by posiCovMap
+ Object[] returnObject = HelperFunctions_GeneSearch.lookIntoPosiCovMap(posiCovMap, currentPos);
+
+ addedValues = (Boolean) returnObject[0];
+ covVecClone = (Vector<Integer>) returnObject[1];
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnObject[2];
+
+ if(currentPos - (Integer) coverageVecAndPos[1] > GeneFinder.readLength){ // more than one read length since the previous update: start a fresh interval
+
+ Vector<Integer> coverageVecNew = new Vector<Integer>();
+ // initialize:
+ for(int arrPos=0;arrPos<GeneFinder.readLength;++arrPos){
+ if((considerSpliceSite != -1) && (arrPos >= splitDiff)){
+ coverageVecNew.add(0);
+ }else{
+ coverageVecNew.add(covPlus);
+ }
+ }
+
+ coverageVecAndPos[0] = coverageVecNew;
+ coverageVecAndPos[2] = -1; // overlap position is unset for a fresh interval
+
+
+ }else{
+
+ Vector<Integer> covVec = (Vector<Integer>) coverageVecAndPos[0];
+ for(int pos = 0;pos<(currentPos- (Integer) coverageVecAndPos[1]);++pos){ // shift the interval by the distance moved since the previous update and pad with zeros
+ covVec.remove(covVec.firstElement()); // argument is an Integer, so this is remove(Object): the first occurrence of that value, i.e. the first element, is dropped
+ covVec.add(0);
+ }
+ for(int vecPos=0;vecPos < covVec.size();++vecPos){
+ if(!((considerSpliceSite != -1) && (vecPos >= splitDiff))){
+ covVec.set(vecPos,(covVec.get(vecPos) + covPlus)); // correct?
+ }
+
+ }
+
+ if((((Integer)coverageVecAndPos[2] == -1)) || (currentPos-((Integer) coverageVecAndPos[2]) > (GeneFinder.readLength))){
+ // update overlap-position
+ boolean foundPos = false;
+ int pos_temp = (Integer) coverageVecAndPos[2];
+ if(pos_temp == -1){
+ coverageVecAndPos[2] = (Integer) coverageVecAndPos[1];
+ }else{
+ if(!addedValues){ // if we have introns, then stay with the current start position
+ do{
+ if(thisContig.positionTOmappingRnas.lastKey() != pos_temp){
+ pos_temp = thisContig.positionTOmappingRnas.higherKey(pos_temp);
+ }else{
+ pos_temp = (Integer) coverageVecAndPos[1]; // no higher key left: fall back to the position of the previous update
+ }
+ if((currentPos-(pos_temp) <= (GeneFinder.readLength))){
+ coverageVecAndPos[2] = pos_temp;
+ foundPos = true;
+ }
+ }while(!foundPos || !(pos_temp >= (Integer) coverageVecAndPos[1])); // NOTE(review): because of "||" the loop keeps advancing after a hit until pos_temp >= coverageVecAndPos[1] - confirm this is intended
+ }
+ }
+ }
+ }
+
+ coverageVecAndPos[1] = currentPos;
+
+ if(addedValues){
+ for(int posVec = 0; posVec < GeneFinder.readLength; ++posVec){ // assumes both vectors hold at least readLength entries - TODO confirm
+ ((Vector<Integer>) coverageVecAndPos[0]).setElementAt((((Vector<Integer>)coverageVecAndPos[0]).get(posVec) + covVecClone.get(posVec)),posVec);
+ }
+ }
+
+ Object[] toReturn = {coverageVecAndPos,posiCovMap,addedValues};
+ return toReturn;
+ }
+
+
+
+
+ /*
+ * Assigns the given start and stop to the cluster, marks it as having a stop and sets its
+ * strand; for inprogea calls the sequence of the interval is extracted as well.
+ * Note: stop is the exact stop position (possibleStop + 2), so the exclusive end needed for
+ * the sequence extraction is stop + 1.
+ * Returns the very same cluster object that was passed in.
+ */
+
+ public static Gene refineExtractedCluster(Gene cluster, StringBuffer contigSeq, int start, int stop, boolean isReverse){
+
+     if(GeneFinder.inprogeaCall){
+         final int seqEndExclusive = stop + 1;
+         cluster.sequence = contigSeq.substring(start, seqEndExclusive);
+     }
+
+     cluster.startPos = start;
+     cluster.stopPos = stop;
+     cluster.onRevStrand = isReverse;
+     cluster.hasStop_temp = true;
+
+     return cluster;
+ }
+
+
+ /*
+ * First extraction of a cluster, only the high coverage interval is regarded:
+ * the cluster receives the given id, is put on the forward strand with unknown direction and
+ * spans startPos up to (currentPos + basesToAdd - 1); coreSeq is the difference of stop and start.
+ * Afterwards the alternative start check and the intron support check are performed.
+ * Returns the next free id (id + 1).
+ */
+
+ public static int clustIni(Gene cluster, Contig thisContig, StringBuffer contigSeq, int startPos, int id, int currentPos, int basesToAdd){
+
+     // exclusive end as needed for the sequence extraction = actual stop + 1
+     final int seqEndExclusive = currentPos + basesToAdd;
+
+     if(GeneFinder.inprogeaCall){
+         cluster.sequence = contigSeq.substring(startPos, seqEndExclusive);
+     }
+
+     cluster.geneID = id;
+     cluster.onRevStrand = false;
+     cluster.startPos = startPos;
+     cluster.stopPos = seqEndExclusive - 1;
+
+     cluster.coreSeq = (cluster.stopPos - cluster.startPos);
+
+     cluster.realDirectionNotKnown = true;
+
+     DefineAlternativeTranscripts.finalIni_alternativeStartsCheck(cluster);
+     IntronExonSearch.finalIni_IntronSupportCheck(cluster,thisContig);
+
+     return id + 1;
+ }
+
+
+ /*
+ * Chooses between the first possible reverse start and the first possible forward start and
+ * stores the chosen one in cluster.startPos (negative entries count as "no start"):
+ * - both exist: the reverse start is taken only if it is greater than the forward start
+ * - only one exists: this one is taken
+ * - none exists: startPos is left untouched
+ * return value shall be understood as "isReverse" (true also if no start exists at all)
+ */
+
+ public static boolean checkWhichStart(Gene cluster){
+
+     final int reverseStart = cluster.possibleStarts_Reverse[0];
+     final int forwardStart = cluster.possibleStarts_Forward[0];
+
+     if((reverseStart < 0) && (forwardStart < 0)){
+         // we only call this method if we will never reach here, so whether true or false stands here makes no difference
+         return true;
+     }
+
+     // at least one start exists: a missing forward start or a greater reverse start selects reverse
+     final boolean takeReverse = (forwardStart < 0) || (reverseStart > forwardStart);
+
+     cluster.startPos = takeReverse ? reverseStart : forwardStart;
+
+     return takeReverse;
+ }
+
+ /*
+ * if clusterBef had no twin, then perform this frame search after the merge
+ *
+ * Searches an in-frame start/stop pair in both directions (splicing variant of the search if the
+ * cluster has possible introns) and refines the cluster depending on the outcome:
+ * - a pair in exactly one direction: this pair is taken and the direction counts as known
+ * - a pair in both directions: direcCounter decides if introns exist, otherwise the pair with the
+ * smaller value at index 2 is taken and the direction stays not known
+ * - no pair at all: falls back to possibleStop_FO / possibleStop_RE (-1 = no stop found) and the first
+ * possible start; such a start/stop is only accepted if overlaps is false or introns exist,
+ * otherwise the cluster is declared as not completed
+ * The counters numFoundNoStop_firstTime (no stop in any direction) and numNoFrameFound (stop, but no
+ * frame) are increased in the fallback case.
+ */
+
+ public static void handleFrameSearchWithoutTwin(Gene cluster, int possibleStop_FO, int possibleStop_RE, StringBuffer contigSeq, boolean overlaps){
+
+ // check both directions
+
+ int[] pair_FO = null;
+ int[] pair_RE = null;
+
+ if(cluster.possibleIntrons.keySet().isEmpty()){
+ pair_FO = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ pair_RE = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }else{
+ pair_FO = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ pair_RE = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }
+
+ // stop and clusterBef start should be in frame
+
+ if(pair_FO == null && pair_RE == null){
+
+ boolean foundAdequateStartStop_FO = false;
+ boolean foundAdequateStartStop_RE = false;
+
+ if(possibleStop_FO == -1 && possibleStop_RE == -1){
+ numFoundNoStop_firstTime++;
+
+ // if possible, refer to XS tag
+
+ if((cluster.possibleIntrons.keySet().size() > 0) && cluster.direcCounter[0] > cluster.direcCounter[1]){
+ cluster.onRevStrand = false;
+ }else if((cluster.possibleIntrons.keySet().size() > 0) && cluster.direcCounter[0] < cluster.direcCounter[1]){
+ cluster.onRevStrand = true;
+ }
+
+ }else{
+ if(possibleStop_FO != -1 && possibleStop_RE == -1){
+ cluster.stopPos = possibleStop_FO + 2; // the stop is taken over even if no forward start exists
+
+ if(cluster.possibleStarts_Forward[0] >= 0){
+ cluster.startPos = cluster.possibleStarts_Forward[0];
+
+ if(!overlaps || (cluster.possibleIntrons.keySet().size() > 0)){
+ foundAdequateStartStop_FO = true;
+ }
+
+ }
+
+ }else if(possibleStop_FO == -1 && possibleStop_RE != -1){
+ cluster.stopPos = possibleStop_RE + 2; // the stop is taken over even if no reverse start exists
+
+ if(cluster.possibleStarts_Reverse[0] >= 0){
+ cluster.startPos = cluster.possibleStarts_Reverse[0];
+
+ if(!overlaps || (cluster.possibleIntrons.keySet().size() > 0)){
+ foundAdequateStartStop_RE = true;
+ }
+
+ }
+ }else{
+
+ // if possible, refer to XS tag
+
+ boolean isReverse;
+
+ if(cluster.possibleStarts_Reverse[0] < 0 && cluster.possibleStarts_Forward[0] < 0){
+ cluster.stopPos = Math.min(possibleStop_FO,possibleStop_RE) + 2; // no start in any direction: take the smaller stop, the cluster stays not completed
+ }else{
+ if((cluster.possibleIntrons.keySet().size() > 0) && cluster.direcCounter[0] > cluster.direcCounter[1]){
+ isReverse = false;
+ }else if( (cluster.possibleIntrons.keySet().size() > 0) && cluster.direcCounter[0] < cluster.direcCounter[1]){
+ isReverse = true;
+ }else{
+ isReverse = checkWhichStart(cluster); // also sets cluster.startPos to the chosen start
+ }
+
+ if(isReverse){
+ cluster.stopPos = possibleStop_RE + 2;
+ if(!overlaps || (cluster.possibleIntrons.keySet().size() > 0)){
+ foundAdequateStartStop_RE = true;
+ }
+ }else{
+ cluster.stopPos = possibleStop_FO + 2;
+ if(!overlaps || (cluster.possibleIntrons.keySet().size() > 0)){
+ foundAdequateStartStop_FO = true;
+ }
+ }
+ }
+ }
+ numNoFrameFound++;
+ }
+
+ if(foundAdequateStartStop_FO){
+ cluster = refineExtractedCluster(cluster, contigSeq, cluster.startPos, cluster.stopPos,false);
+ cluster.realDirectionNotKnown = true;
+ }else if(foundAdequateStartStop_RE){
+ cluster = refineExtractedCluster(cluster, contigSeq, cluster.startPos, cluster.stopPos,true);
+ cluster.realDirectionNotKnown = true;
+ }else{
+ declareClusterAsNotCompleted(cluster,cluster.startPos,contigSeq);
+ }
+
+ } else if(pair_FO != null && pair_RE == null){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ cluster.realDirectionNotKnown = false;
+ } else if(pair_FO == null && pair_RE != null){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ cluster.realDirectionNotKnown = false;
+ }else if(pair_FO != null && pair_RE != null){
+
+ // if possible, refer to XS tag
+
+ if((cluster.possibleIntrons.keySet().size() > 0) && cluster.direcCounter[0] > cluster.direcCounter[1]){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false); // NOTE(review): realDirectionNotKnown is left unchanged in this branch - confirm this is intended
+ }else if((cluster.possibleIntrons.keySet().size() > 0) && cluster.direcCounter[0] < cluster.direcCounter[1]){
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true); // NOTE(review): realDirectionNotKnown is left unchanged in this branch - confirm this is intended
+ }else{
+ if(pair_FO[2] <= pair_RE[2]){ // extract smallest possible interval
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ }else{
+ cluster = refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ }
+ cluster.realDirectionNotKnown = true;
+ }
+
+
+ }
+
+ }
+
+
+ /*
+ * method to test whether we find at least start and stop on same strand, though they are not in frame
+ */
+
+ public static void checkIfAdequateAndRefine(Gene cluster, boolean wantReverse, int[] possibleStarts, StringBuffer contigSeq){
+
+ boolean foundAdequateStartStop = false;
+
+ cluster.onRevStrand = wantReverse;
+
+ if(possibleStarts[0] >= 0){
+ cluster.startPos = possibleStarts[0];
+ if(cluster.possibleIntrons.keySet().size() > 0){
+ foundAdequateStartStop = true;
+ }
+ }
+
+ if(foundAdequateStartStop){
+ cluster = refineExtractedCluster(cluster, contigSeq, cluster.startPos, cluster.stopPos,wantReverse);
+ cluster.realDirectionNotKnown = true;
+ }else{
+ declareClusterAsNotCompleted(cluster,cluster.startPos,contigSeq);
+ }
+
+ }
+
+
+ /*
+ * when we have passed a split start, then check if the number of rnas not supporting a split is sufficient
+ *
+ * - fewer than limit non-split rnas: each of them loses its "back-up life" in moreThanOneHitRnas or,
+ * if it has none, its association with the cluster; its first alignment without an entry at index 4
+ * that starts less than one read length before splitStart is removed and posiCovMap is corrected;
+ * the intron entry of splitStart gets -1 at index 2 and a fussy exon at splitStart is dropped
+ * - otherwise: the exon end (currentPos + readLength) is stored at index 2 of the intron entry and
+ * splitStart is stored at index 2 of the association entry of every non-split rna
+ *
+ * returns {exonEnd, posiCovMap, kept}: exonEnd and kept are -1 if the alternative was erased,
+ * otherwise kept is splitStart
+ */
+
+ public static Object[] checkIfNonSplitRnasAreSufficient(Gene cluster, Vector<Rna> nonSplitRnas, int splitStart, int currentPos, TreeMap<Integer,Vector<Integer>> posiCovMap, double limit){
+
+     final boolean tooFewSupporters = nonSplitRnas.size() < limit;
+
+     if(!tooFewSupporters){
+         // keep this alternative
+         int exonEnd = currentPos + GeneFinder.readLength;
+
+         if(cluster.possibleIntrons.containsKey(splitStart)){
+             cluster.possibleIntrons.get(splitStart)[2] = exonEnd;
+         }else{
+             System.err.println("Splice site not included!");
+         }
+
+         // do this update here to save time later
+         for(Rna supporter : nonSplitRnas){
+             cluster.idTOassociatedRnas.get(supporter.rnaID)[2] = splitStart;
+         }
+
+         return new Object[] {exonEnd,posiCovMap,splitStart};
+     }
+
+     // erase
+     for(Rna dropped : nonSplitRnas){
+
+         if(cluster.moreThanOneHitRnas.contains(dropped.rnaID)){
+             // if it is contained here, this serves as one "back-up life" that is now destroyed
+             cluster.moreThanOneHitRnas.remove(dropped.rnaID);
+         }else{
+             cluster.idTOassociatedRnas.remove(dropped.rnaID);
+         }
+
+         for(Object[] alignment : dropped.contigsMappedOn){
+
+             if(alignment[4] != null){
+                 continue;
+             }
+
+             int alignStart = (Integer) alignment[1];
+             int distToSplit = splitStart - alignStart;
+
+             if((distToSplit >= GeneFinder.readLength) || (splitStart <= alignStart)){
+                 continue;   // alignment does not reach over the split start
+             }
+
+             posiCovMap = HelperFunctions_GeneSearch.updatePosiCovMap_AfterSpliceSwitch(posiCovMap,splitStart,(GeneFinder.readLength-distToSplit));
+             dropped.contigsMappedOn.removeElement(alignment);
+             break; // have to be removed soon!
+         }
+
+     }
+
+     if(cluster.possibleIntrons.containsKey(splitStart)){
+         cluster.possibleIntrons.get(splitStart)[2] = -1;
+     }
+
+     if(cluster.possibleFussyExons.containsKey(splitStart)){
+         cluster.possibleFussyExons.remove(splitStart);
+     }
+
+     return new Object[] {-1,posiCovMap,-1};
+ }
+
+
+}
+
+
diff --git a/src/geneFinder/FindExonsOfGene.java b/src/geneFinder/FindExonsOfGene.java
new file mode 100755
index 0000000..614b046
--- /dev/null
+++ b/src/geneFinder/FindExonsOfGene.java
@@ -0,0 +1,404 @@
+package geneFinder;
+
+import types.*;
+import java.util.*;
+
+/**
+ * for each gene all possible exons defined by alternative splicing are listed
+* Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class FindExonsOfGene {
+
+ /*
+ * walks through the possibleIntrons map of the gene in key order and collects all exons in gene.exonsOfGene;
+ * "fussy" exons (entry [2] of a key is not -1) are registered in gene.alternativeTranscripts; returns false only if no exon is stored
+ */
+
+ public static boolean findExonsForGene(Gene gene){
+
+ if(gene.possibleIntrons.keySet().isEmpty()){
+ // no splicing, whole gene is exon
+ gene.exonsOfGene.add(new int[] {gene.startPos,gene.stopPos+1});
+ }else{
+
+ // first add the exon from the gene start up to the smallest key of possibleIntrons
+
+ int firstKey = gene.possibleIntrons.firstKey();
+
+ gene.exonsOfGene.add(new int[] {gene.startPos,firstKey});
+
+ if(((Integer) gene.possibleIntrons.get(firstKey)[2]) != -1){ // add the fussy exon
+
+ Vector<int[]> vecTemp = new Vector<int[]>();
+ // note: the "-1" in [1] is necessary for transcript extraction, it is important to add it later!!!!!
+
+ int endIndicator = 0; // 0 if no end, 1 if it shall be a transcript end
+ if(((Integer) gene.possibleIntrons.get(firstKey)[3]) != -2){
+ endIndicator = 0;
+ }else{
+ // treat this fussy as a transcript end
+ endIndicator = 1;
+ }
+ Object[] newFussy = {gene.startPos,((Integer) gene.possibleIntrons.get(firstKey)[2])-1,gene.startPos,gene.startPos,((Integer) gene.possibleIntrons.get(firstKey)[2]),vecTemp,endIndicator};
+ if(!gene.alternativeTranscripts.isEmpty()){
+ boolean doNotAdd = false;
+ for(Object[] altTrans : gene.alternativeTranscripts){
+ int start = (Integer) altTrans[0];
+ int stop = (Integer) altTrans[1];
+ // an entry with at least 7 fields and the same start and stop counts as the same fussy exon
+ if((altTrans.length >= 7) && (start == gene.startPos) && (stop == ((Integer) gene.possibleIntrons.get(firstKey)[2])-1)){
+ doNotAdd = true;
+ break;
+ }
+ }
+ if(!doNotAdd){
+ gene.alternativeTranscripts.add(newFussy);
+ }
+ }else{
+ gene.alternativeTranscripts.add(newFussy);
+ }
+
+ }
+ // now visit every key: each intron listed in entry [0] leads to the exon that starts at its end (intron[1])
+ for(int key : gene.possibleIntrons.keySet()){
+
+ boolean hadIntron = false;
+ if(gene.possibleIntrons.get(key)[0] != null){
+
+ Vector<int[]> exonSubSet = new Vector<int[]>();
+
+ for(int[] intron : ((Vector<int[]>) gene.possibleIntrons.get(key)[0])){
+ hadIntron = true;
+ if(!gene.intronEndsThatAreNotContinued.isEmpty()){
+ boolean allowed = true;
+ for(int[] forbiddenIntron : gene.intronEndsThatAreNotContinued){
+ if((forbiddenIntron[0] == intron[0]) && (forbiddenIntron[1] == intron[1])){
+ allowed = false;
+ break;
+ }else if((forbiddenIntron[0] != intron[0]) && (forbiddenIntron[1] == intron[1])){
+ // another intron that has the same ending, could be a second transcript, so make a new one or first try to merge it some other alternative
+ allowed = false;
+ if(!gene.intronEndsThatAreNotContinued.contains(intron)){
+ gene.intronEndsThatAreNotContinued.add(intron);
+ // NOTE(review): assuming a standard java.util collection, contains() above compares the int[] by reference, not by content
+ Vector<int[]> vecTemp = new Vector<int[]>();
+ vecTemp.add(intron);
+ int begin = searchTranscriptWithCorrectEnd(gene,forbiddenIntron);
+ if(begin == -1){
+ System.err.println("Intron end not contained");
+ }else{
+ gene.alternativeTranscripts.add(new Object[] {begin,Integer.MAX_VALUE,-1,-1,intron[1],vecTemp});
+ }
+ }
+ // the collection being iterated may just have been extended, so the loop is left right away
+ break;
+ }
+ }
+ if(allowed){
+ exonSubSet = exonStopSearch(gene, key, intron[1], exonSubSet);
+ }
+ }else{
+ exonSubSet = exonStopSearch(gene, key, intron[1], exonSubSet);
+ }
+
+ }
+ if(((Vector<int[]>)gene.possibleIntrons.get(key)[0]).size() > ((double)(Math.pow(GeneFinder.minCoverage,2))/(double)GeneFinder.spliceLim)){
+ testExonsIfLargeIntronKey(gene, key,exonSubSet);
+ }
+ // (above: a key with more than minCoverage^2 / spliceLim introns is handed to testExonsIfLargeIntronKey)
+
+ }
+
+ if(!hadIntron){
+ // this only marks an exon end point, so grab the next start
+ int nextStart = ((Integer) gene.possibleIntrons.get(key)[3]); // very important that [2] is -1!
+ if(nextStart >= 0){
+ // this is the next start
+ Vector<int[]> exonSub = new Vector<int[]>();
+ exonStopSearch(gene, key, nextStart, exonSub);
+
+ }
+ }
+ }
+ }
+ // both branches above add at least one exon, so this guard only triggers if that ever changes
+ if(gene.exonsOfGene.isEmpty()){
+ return false;
+ }
+
+ return true;
+
+ }
+
+ /*
+ * for an intron ending at intronEnd: find the first key of possibleIntrons above "key" that lies behind intronEnd and extract the exon {intronEnd, thatKey}
+ */
+
+ public static Vector<int[]> exonStopSearch(Gene gene, int key, int intronEnd, Vector<int[]> exonSubSet){
+ // without such a key the exon runs up to gene.stopPos+1; newly stored exons are also appended to exonSubSet, which is returned
+ if(gene.possibleIntrons.higherKey(key) != null){
+
+ int higherKey = gene.possibleIntrons.higherKey(key);
+ boolean foundEnd = false;
+ // step through the following keys until one lies behind intronEnd or the keys are exhausted
+ do{
+ if(higherKey > intronEnd){
+ int[] newExon = {intronEnd,higherKey};
+ if(checkIfExonIsContained(gene.exonsOfGene, newExon) == 0){
+ gene.exonsOfGene.add(newExon);
+ exonSubSet.add(newExon);
+ }
+
+ if(((Integer) gene.possibleIntrons.get(higherKey)[2]) != -1){ // add the fussy exon
+
+ int endIndicator = 0; // 0 if no end, 1 if it shall be a transcript end
+ if(((Integer) gene.possibleIntrons.get(higherKey)[3]) != -2){
+ endIndicator = 0;
+ }else{
+ // treat this fussy as a transcript end
+ endIndicator = 1;
+ }
+
+ Vector<int[]> vecTemp = new Vector<int[]>();
+ Object[] newFussy = {intronEnd,((Integer) gene.possibleIntrons.get(higherKey)[2])-1,intronEnd,intronEnd,((Integer) gene.possibleIntrons.get(higherKey)[2]),vecTemp,endIndicator};
+
+ if(!gene.alternativeTranscripts.isEmpty()){
+ boolean doNotAdd = false;
+ for(Object[] altTrans : gene.alternativeTranscripts){
+ int start = (Integer) altTrans[0];
+ int stop = (Integer) altTrans[1];
+ // an entry with at least 7 fields and the same start and stop counts as the same fussy exon
+ if((altTrans.length >= 7) && (start == intronEnd) && (stop == ((Integer) gene.possibleIntrons.get(higherKey)[2])-1)){
+ doNotAdd = true;
+ break;
+ }
+ }
+ if(!doNotAdd){
+ gene.alternativeTranscripts.add(newFussy);
+ }
+ }else{
+ gene.alternativeTranscripts.add(newFussy);
+ }
+
+ }
+ foundEnd = true;
+ break;
+ }
+ // this key is not behind intronEnd: move on to the next key, if there is one
+ if(gene.possibleIntrons.higherKey(higherKey) == null){
+ break;
+ }else{
+ higherKey = gene.possibleIntrons.higherKey(higherKey);
+ }
+ }while(!foundEnd);
+ // no later key lies behind intronEnd: the exon extends to the end of the gene
+ if(!foundEnd){
+ int[] newExon = {intronEnd,(gene.stopPos+1)};
+ if(checkIfExonIsContained(gene.exonsOfGene, newExon) == 0){
+ gene.exonsOfGene.add(newExon);
+ exonSubSet.add(newExon);
+ }
+
+ }
+
+ }else{
+ int[] newExon = {intronEnd,(gene.stopPos+1)};
+ if(checkIfExonIsContained(gene.exonsOfGene, newExon) == 0){
+ gene.exonsOfGene.add(newExon);
+ exonSubSet.add(newExon);
+ }
+
+ }
+
+ return exonSubSet;
+ }
+
+
+ /*
+ * tells whether exonsOld already holds an exon with the same start [0] and end [1]
+ * as newExon; the answer is encoded as int: 1 = contained, 0 = not contained
+ */
+ public static int checkIfExonIsContained(Vector<int[]> exonsOld, int[] newExon){
+     int contained = 0;
+     for(int idx = 0; (contained == 0) && (idx < exonsOld.size()); ++idx){
+         int[] known = exonsOld.get(idx);
+         if((known[0] == newExon[0]) && (known[1] == newExon[1])){
+             contained = 1; // the loop condition ends the scan at the first match
+         }
+     }
+     return contained;
+ }
+
+ /*
+ * if a splice site (key) has an unusually large number of introns, check if the exons found for it (exonSubSet) can be condensed
+ */
+
+ public static void testExonsIfLargeIntronKey(Gene gene, int key, Vector<int[]> exonSubSet){
+ // group the exons of this key by their end position (exon[1])
+ HashMap<Integer,Vector<int[]>> endPosiToNum = new HashMap<Integer,Vector<int[]>>();
+ for(int[] exon : exonSubSet){
+ if(endPosiToNum.containsKey(exon[1])){
+ endPosiToNum.get(exon[1]).add(exon);
+ }else{
+ Vector<int[]> vecTemp = new Vector<int[]>();
+ vecTemp.add(exon);
+ endPosiToNum.put(exon[1],vecTemp);
+ }
+ }
+ // an end position shared by more than minCoverage^2 / spliceLim exons is reduced to a single exon
+ for(int posi : endPosiToNum.keySet()){
+ if(endPosiToNum.get(posi).size() > ((double)(Math.pow(GeneFinder.minCoverage,2))/(double)GeneFinder.spliceLim)){
+ // condense to the smallest exon, i.e. the one with the largest start position
+ int maxStart = -1;
+ for(int i = 0; i< endPosiToNum.get(posi).size();++i){
+ int[] exon = endPosiToNum.get(posi).get(i);
+ if(exon[0] >= maxStart){
+ maxStart = exon[0];
+ }
+ }
+ // replace the whole group by one new exon {maxStart, posi}; removeAll matches the grouped int[] objects themselves
+ gene.exonsOfGene.removeAll(endPosiToNum.get(posi));
+ gene.exonsOfGene.add(new int[] {maxStart,posi});
+ // drop the introns of this key that end before maxStart, together with the entry at the same index in [1]
+ for(int j = ((Vector<int[]>)gene.possibleIntrons.get(key)[0]).size()-1;j >= 0; j--){
+ int[] intron = ((Vector<int[]>)gene.possibleIntrons.get(key)[0]).get(j);
+ if(intron[1] < maxStart){
+ ((Vector<int[]>)gene.possibleIntrons.get(key)[0]).removeElement(intron);
+ ((Vector<Vector<Rna>>)gene.possibleIntrons.get(key)[1]).removeElementAt(j); //TODO: these rnas also have to be removed from the gene!!
+ }
+ }
+ }
+ }
+
+ }
+
+ /*
+ * called after an intron was declared unsupported: every exon that starts at the end
+ * of this intron (intron[1]) is removed from gene.exonsOfGene, unless
+ * searchOtherIntronSupport reports another intron with the same end
+ */
+ public static void searchCorrespondingExons(Gene gene, int[] intron){
+     int idx = gene.exonsOfGene.size();
+     // visit the exons from the last to the first position
+     while(--idx >= 0){
+         int[] candidate = gene.exonsOfGene.get(idx);
+         if(candidate[0] != intron[1]){
+             continue;
+         }
+         boolean supportedElsewhere = searchOtherIntronSupport(gene,intron);
+         if(!supportedElsewhere){ gene.exonsOfGene.removeElement(candidate); }
+     }
+ }
+
+ /*
+ * before an exon is erased, look for another intron supporting it: scans the introns
+ * listed in entry [0] of every key different from intron[0] and reports true as soon as
+ * one of them (a different array object) ends at the same position intron[1]
+ */
+ public static boolean searchOtherIntronSupport(Gene gene, int[] intron){
+     for(int key : gene.possibleIntrons.keySet()){
+         if(key == intron[0]){ continue; }
+         Object listed = gene.possibleIntrons.get(key)[0];
+         if(listed == null){ continue; }
+         for(int[] other : (Vector<int[]>) listed){
+             if((other[1] == intron[1]) && (other != intron)){
+                 return true; // found other support
+             }
+         }
+     }
+     return false;
+ }
+
+ /*
+ * sorts gene.exonsOfGene in place by ascending exon start ([0]); exons with equal
+ * start keep their relative order, exactly as with the former hand-written
+ * bubblesort, because Collections.sort is stable
+ */
+
+ public static void sortExons(Gene gene){
+
+     // the library sort replaces the former O(n^2) bubblesort; only the exon start is compared
+     Collections.sort(gene.exonsOfGene, new Comparator<int[]>(){
+         public int compare(int[] left, int[] right){
+             if(left[0] < right[0]){
+                 return -1;
+             }
+             return (left[0] > right[0]) ? 1 : 0;
+         }
+     });
+ }
+
+ /*
+ * sorts the given exons in place by ascending start position ([0]) and returns the
+ * same vector; exons with equal start keep their relative order (stable sort, as the
+ * former bubblesort did)
+ */
+
+ public static Vector<int[]> sortExonsInVector(Vector<int[]> exonVec){
+
+     // library sort instead of the hand-written O(n^2) bubblesort
+     Collections.sort(exonVec, new Comparator<int[]>(){
+         public int compare(int[] left, int[] right){
+             if(left[0] < right[0]){
+                 return -1;
+             }
+             return (left[0] > right[0]) ? 1 : 0;
+         }
+     });
+
+     return exonVec;
+ }
+
+ /*
+ * checks if the given exon set is already present from previous isoforms, i.e. if one
+ * of the sets in allTranscriptVecs has the same number of exons and the same start [0]
+ * and end [1] at every position
+ */
+
+ public static boolean checkIfExonsAlreadyPresent(Vector<int[]> exonsOfTranscript, Vector<Vector<int[]>> allTranscriptVecs){
+
+     for(Vector<int[]> formerSet : allTranscriptVecs){
+         // sets of different size can never be the same isoform; without this guard an
+         // empty exonsOfTranscript (0 matches == size 0) was reported as already present
+         if(formerSet.size() != exonsOfTranscript.size()){
+             continue;
+         }
+
+         boolean allSame = true;
+         for(int posi = 0; allSame && (posi < formerSet.size()); ++posi){
+             int[] former = formerSet.get(posi);
+             int[] current = exonsOfTranscript.get(posi);
+             allSame = (former[0] == current[0]) && (former[1] == current[1]);
+         }
+
+         if(allSame){
+             return true;
+         }
+     }
+
+     return false;
+
+ }
+
+ /*
+ * searches gene.alternativeTranscripts for the first entry whose [1] is Integer.MAX_VALUE
+ * and whose [4] equals intron[1]; returns the value stored in its [0], or -1 if there is none
+ */
+ public static int searchTranscriptWithCorrectEnd(Gene gene, int[] intron){
+     int begin = -1;
+     for(Object[] transcript : gene.alternativeTranscripts){
+         boolean openEnded = ((Integer) transcript[1]).intValue() == Integer.MAX_VALUE;
+         if(openEnded && (((Integer) transcript[4]).intValue() == intron[1])){
+             begin = ((Integer) transcript[0]).intValue();
+             break;
+         }
+     }
+
+     return begin;
+ }
+}
diff --git a/src/geneFinder/FrameSearch.java b/src/geneFinder/FrameSearch.java
new file mode 100755
index 0000000..d469eaa
--- /dev/null
+++ b/src/geneFinder/FrameSearch.java
@@ -0,0 +1,582 @@
+package geneFinder;
+
+import java.util.Vector;
+
+import types.Contig;
+import types.Gene;
+
+/**
+ * contains methods necessary for the frame determination of candidate gene regions
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class FrameSearch {
+
+ /*
+ * collects candidate start codons of a forward gene: searches the window
+ * [max(0, startPos - readLength), tempStop) for its last "ATG" and stores the hit in
+ * cluster.possibleStarts_Forward unless a stored start already has the same frame
+ * (position modulo 3), so at most one start per frame is remembered
+ *
+ * tempStop is initially (int) (cluster.startPos+3); in the recursive calls it is the
+ * position of the ATG detected last. posAr is the next free slot of the array.
+ * returns the position of the ATG found by this call, or -1 if there is none
+ */
+
+ public static int findPossibleStarts_Forward(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop){
+
+     int windowStart = (int) Math.max(0,(cluster.startPos-GeneFinder.readLength));
+     if(tempStop < windowStart){
+         return -1; // the search has left the window
+     }
+
+     int offset = contigSeq.substring(windowStart,tempStop).lastIndexOf("ATG");
+     if(offset == -1){
+         return -1;
+     }
+
+     int atgPos = windowStart + offset;
+
+     // is the frame of this ATG already covered by one of the stored starts?
+     boolean frameIsNew = true;
+     for(int slot = 0; frameIsNew && (slot < posAr); ++slot){
+         frameIsNew = ((cluster.possibleStarts_Forward[slot] - atgPos) % 3 != 0);
+     }
+
+     if(frameIsNew){
+         cluster.possibleStarts_Forward[posAr++] = atgPos;
+     }
+
+     // continue the search to the left of this hit, whether or not it was stored
+     findPossibleStarts_Forward(cluster,contigSeq,posAr,atgPos);
+
+     return atgPos;
+ }
+
+ /*
+ * collects candidate start positions of a reverse gene: searches the window
+ * [max(0, startPos - readLength), tempStop) for the last occurrence of "TTA", "TCA" or
+ * "CTA" (the stop codons as they appear on the reverse strand) and stores the hit in
+ * cluster.possibleStarts_Reverse unless a stored position already has the same frame
+ * (position modulo 3)
+ *
+ * tempStop is initially (int) (cluster.startPos+3); in the recursive calls it is the
+ * position of the codon detected last. posAr is the next free slot of the array.
+ * returns the position found by this call, or -1 if there is none
+ */
+
+ public static int findPossibleStarts_Reverse(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop){
+
+     int windowStart = (int) Math.max(0,(cluster.startPos-GeneFinder.readLength));
+     if(tempStop < windowStart){
+         return -1; // the search has left the window
+     }
+
+     String window = contigSeq.substring(windowStart, tempStop);
+
+     // the rightmost hit of the three codons wins; lastIndexOf yields -1 for a missing codon
+     String[] codons = {"TTA","TCA","CTA"};
+     int offset = -1;
+     for(String codon : codons){
+         int hit = window.lastIndexOf(codon);
+         if(hit > offset){
+             offset = hit;
+         }
+     }
+     if(offset == -1){
+         return -1;
+     }
+     int codonPos = windowStart + offset;
+
+     // is the frame of this codon already covered by one of the stored positions?
+     boolean frameIsNew = true;
+     int slot = 0;
+     while(frameIsNew && (slot < posAr)){
+         int distance = cluster.possibleStarts_Reverse[slot] - codonPos;
+         if(distance % 3 == 0){
+             frameIsNew = false;
+         }
+         ++slot;
+     }
+
+     if(frameIsNew){
+         cluster.possibleStarts_Reverse[posAr++] = codonPos;
+     }
+
+     // continue the search to the left of this hit, whether or not it was stored
+     findPossibleStarts_Reverse(cluster,contigSeq,posAr,codonPos);
+     return codonPos;
+ }
+
+ /*
+ * collects candidate stop codons of a forward gene: searches the window
+ * [tempStart, min(contigSeq.length(), stopPos - 2 + readLength + 1)) for the first
+ * occurrence of "TAA", "TGA" or "TAG" and stores the hit in cluster.possibleStops_Forward
+ * unless a stored stop already has the same frame (position modulo 3), so at most one
+ * stop per frame is remembered
+ *
+ * tempStart is initially (int) (cluster.stopPos - 2); in the recursive calls it is the
+ * position of the stop codon detected last, plus 3. posAr is the next free slot of the array.
+ * returns the position found by this call, or -1 if there is none
+ */
+
+ public static int findPossibleStops_Forward(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart){
+
+     int windowEnd = (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1);
+     if(tempStart > windowEnd){
+         return -1; // the search has left the window
+     }
+
+     String window = contigSeq.substring(tempStart, windowEnd);
+
+     // the leftmost hit of the three codons wins; indexOf yields -1 for a missing codon
+     String[] codons = {"TAA","TGA","TAG"};
+     int offset = -1;
+     for(String codon : codons){
+         int hit = window.indexOf(codon);
+         if((hit > -1) && ((offset == -1) || (hit < offset))){
+             offset = hit;
+         }
+     }
+     if(offset == -1){
+         return -1;
+     }
+
+     int stopCodonPos = tempStart + offset;
+
+     // is the frame of this codon already covered by one of the stored stops?
+     boolean frameIsNew = true;
+     for(int slot = 0; frameIsNew && (slot < posAr); ++slot){
+         frameIsNew = ((cluster.possibleStops_Forward[slot] - stopCodonPos) % 3 != 0);
+     }
+
+     if(frameIsNew){
+         cluster.possibleStops_Forward[posAr++] = stopCodonPos;
+     }
+
+     // continue the search behind this codon, whether or not it was stored
+     findPossibleStops_Forward(cluster,contigSeq,posAr,stopCodonPos+3);
+
+     return stopCodonPos;
+ }
+
+ /*
+ * collects candidate stop positions of a reverse gene: searches the window
+ * [tempStart, min(contigSeq.length(), stopPos - 2 + readLength + 1)) for its first "CAT"
+ * and stores the hit in cluster.possibleStops_Reverse unless a stored position already
+ * has the same frame (position modulo 3)
+ *
+ * tempStart is initially (int) (cluster.stopPos - 2); in the recursive calls it is the
+ * position of the CAT detected last, plus 3. posAr is the next free slot of the array.
+ * returns the position of the CAT found by this call, or -1 if there is none
+ */
+
+ public static int findPossibleStops_Reverse(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart){
+
+     int windowEnd = (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1);
+     if(tempStart > windowEnd){
+         return -1; // the search has left the window
+     }
+
+     int offset = contigSeq.substring(tempStart, windowEnd).indexOf("CAT");
+     if(offset == -1){
+         return -1;
+     }
+
+     int catPos = tempStart + offset;
+
+     // is the frame of this CAT already covered by one of the stored positions?
+     boolean frameIsNew = true;
+     for(int slot = 0; frameIsNew && (slot < posAr); ++slot){
+         frameIsNew = ((cluster.possibleStops_Reverse[slot] - catPos) % 3 != 0);
+     }
+
+     if(frameIsNew){
+         cluster.possibleStops_Reverse[posAr++] = catPos;
+     }
+
+     // continue the search behind this hit, whether or not it was stored
+     findPossibleStops_Reverse(cluster,contigSeq,posAr,catPos+3);
+
+     return catPos;
+ }
+
+ /*
+ * test if there is one of the possible start-stop codon pairs which is in frame: both
+ * positions non-negative, stop behind start and their distance a multiple of 3.
+ * The pair with the smallest distance is taken; on ties the pair met first wins
+ * (starts are the outer, stops the inner loop).
+ * returns {start, stop, stop-start} or null if no pair qualifies
+ */
+
+ public static int[] checkAndChooseReadingFrame(int[] startPosis, int[] stopPosis){
+
+     int[] best = null;
+
+     for(int s = 0; s < startPosis.length; ++s){
+         int start = startPosis[s];
+         if(start < 0){
+             continue; // unused slot
+         }
+
+         for(int e = 0; e < stopPosis.length; ++e){
+             int stop = stopPosis[e];
+             int dist = stop - start;
+             boolean inFrame = (stop >= 0) && (dist % 3 == 0) && (dist > 0);
+             if(!inFrame){
+                 continue;
+             }
+             // keep the first hit and replace it only by a strictly shorter interval
+             if((best == null) || (best[2] > dist)){
+                 best = new int[] {start,stop,dist};
+             }
+         }
+     }
+
+     return best;
+ }
+
+ /*
+ * especially for spliced genes, because here the in-frame search becomes less important:
+ * accepts every pair of non-negative positions with the stop behind the start, whether
+ * or not it is in frame. The pair with the smallest distance is taken; on ties the
+ * pair met first wins (starts are the outer, stops the inner loop).
+ * returns {start, stop, stop-start} or null if no pair qualifies
+ */
+
+ public static int[] checkAndChooseReadingFrame_SplicingVariant(int[] startPosis, int[] stopPosis){
+
+     int[] best = null;
+
+     for(int s = 0; s < startPosis.length; ++s){
+         int start = startPosis[s];
+         if(start < 0){
+             continue; // unused slot
+         }
+
+         for(int e = 0; e < stopPosis.length; ++e){
+             int stop = stopPosis[e];
+             int dist = stop - start;
+             boolean usable = (stop >= 0) && (dist > 0); // both might be not in frame!
+             if(!usable){
+                 continue;
+             }
+             // keep the first hit and replace it only by a strictly shorter interval
+             if((best == null) || (best[2] > dist)){
+                 best = new int[] {start,stop,dist};
+             }
+         }
+     }
+
+     return best;
+ }
+
+
+ /*
+ * it appears that a new transcript started within an intron, so search for its start in
+ * the window [max(0, oldStart - readLength), min(contigSeq.length(), oldStart + 3))
+ * returns {startFO, startRE}: position of the last "ATG" and of the last "TTA"/"TCA"/"CTA"
+ * in the window, each -1 if nothing was found
+ */
+
+ public static int[] lookForStartOfIsoform(int oldStart, StringBuffer contigSeq){
+
+     int windowStart = (int) Math.max(0,(oldStart-GeneFinder.readLength));
+     // clamp the upper bound like the stop searches do; oldStart+3 may exceed the contig end
+     int windowEnd = (int) Math.min(contigSeq.length(),oldStart+3);
+     String startPart = contigSeq.substring(windowStart,windowEnd);
+
+     int startFO = startPart.lastIndexOf("ATG");
+     if(startFO != -1){
+         startFO = windowStart + startFO;
+     }
+
+     // the rightmost hit of the three reverse codons; lastIndexOf yields -1 for a missing codon
+     int startRE = Math.max(startPart.lastIndexOf("TTA"),Math.max(startPart.lastIndexOf("TCA"),startPart.lastIndexOf("CTA")));
+     if(startRE != -1){
+         startRE = windowStart + startRE;
+     }
+
+     return new int[] {startFO,startRE};
+
+ }
+
+
+ /*
+ * use the information of direc counter to refine the frame: only spliced genes (non-empty possibleIntrons) whose strand disagrees with the majority in direcCounter are changed
+ */
+
+ public static void useDirecInfo(Gene gene, StringBuffer contigSeq){
+ // direcCounter[0] / [1] presumably count the hints for forward / reverse direction (XS tag) - TODO confirm
+ if(!(gene.possibleIntrons.keySet().isEmpty())){
+ if(gene.direcCounter[0] > gene.direcCounter[1]){
+ // try forward
+
+ if(gene.onRevStrand){
+ // the gene is currently marked as reverse although the forward counter is larger
+ int [] pair_FO = checkAndChooseReadingFrame_SplicingVariant(gene.possibleStarts_Forward,gene.possibleStops_Forward);
+ if(pair_FO != null){
+ // a forward start/stop pair exists: refine to it (pair[1]+2 is passed as stop)
+ gene = ExtractGeneCandidates.refineExtractedCluster(gene, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ gene.realDirectionNotKnown = false;
+ }else{
+ if(gene.possibleStarts_Forward[0] != -1 || gene.possibleStops_Forward[0] != -1){
+ // no pair, but a forward start or stop candidate exists: replace only that end
+ int start = -1;
+ int stop = -1;
+
+ if(gene.possibleStarts_Forward[0] != -1){
+ int[] temp = gene.possibleStarts_Forward;
+ start = biggestNonNegativeArrayEntry(temp);
+ stop = gene.stopPos;
+ // new start = biggest start candidate, stop stays gene.stopPos
+ }else{
+ int[] temp = gene.possibleStops_Forward;
+ stop = smallestNonNegativeArrayEntry(temp);
+ start = gene.startPos;
+ // new stop = smallest stop candidate, start stays gene.startPos
+ }
+
+ gene = ExtractGeneCandidates.refineExtractedCluster(gene, contigSeq, start,stop+2,false);
+ gene.realDirectionNotKnown = false;
+ }else{
+ // NOTE(review): formerly commented as "change both start and stop", but the call passes the current gene.startPos and gene.stopPos+2
+ gene = ExtractGeneCandidates.refineExtractedCluster(gene, contigSeq, gene.startPos, gene.stopPos+2,false);
+ gene.realDirectionNotKnown = false;
+ }
+ }
+ }
+
+ }else if(gene.direcCounter[1] > gene.direcCounter[0]){
+ // try reverse
+
+ if(!gene.onRevStrand){
+ int [] pair_RE = checkAndChooseReadingFrame_SplicingVariant(gene.possibleStarts_Reverse,gene.possibleStops_Reverse);
+ if(pair_RE != null){
+ gene = ExtractGeneCandidates.refineExtractedCluster(gene, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ gene.realDirectionNotKnown = false;
+ }else{
+ if(gene.possibleStarts_Reverse[0] != -1 || gene.possibleStops_Reverse[0] != -1){
+ // no pair, but a reverse start or stop candidate exists: replace only that end
+ int start = -1;
+ int stop = -1;
+ if(gene.possibleStarts_Reverse[0] != -1){
+ int[] temp = gene.possibleStarts_Reverse;
+ start = biggestNonNegativeArrayEntry(temp);
+ stop = gene.stopPos;
+ }else{
+ int[] temp = gene.possibleStops_Reverse;
+ stop = smallestNonNegativeArrayEntry(temp);
+ start = gene.startPos;
+ }
+
+ gene = ExtractGeneCandidates.refineExtractedCluster(gene, contigSeq, start, stop+2,true);
+ gene.realDirectionNotKnown = false;
+ }else{
+ // NOTE(review): formerly commented as "change both start and stop", but the call passes the current gene.startPos and gene.stopPos+2
+ gene = ExtractGeneCandidates.refineExtractedCluster(gene, contigSeq, gene.startPos, gene.stopPos+2,true);
+ gene.realDirectionNotKnown = false;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /*
+ * small function that returns the smallest entry of the array that is not negative
+ * (negative values such as -1 mark unused slots); returns -1 if there is no such
+ * entry. Works for arrays of any length, including empty ones, and does not modify
+ * the argument.
+ */
+
+ public static int smallestNonNegativeArrayEntry(int[] arrayOrg){
+
+     // one pass over the entries replaces the former copy + sort + lookup at the fixed
+     // indices 0..2, which failed for arrays with fewer than three entries and could
+     // miss a valid entry in arrays with more than three
+     int smallest = -1;
+
+     for(int entry : arrayOrg){
+         if(entry < 0){
+             continue; // unused slot
+         }
+         if((smallest == -1) || (entry < smallest)){
+             smallest = entry;
+         }
+     }
+
+     return smallest;
+ }
+
+ /*
+ * small function that returns the biggest entry of the array that is not negative
+ * (negative values such as -1 mark unused slots); returns -1 if there is no such
+ * entry. Works for arrays of any length, including empty ones, and does not modify
+ * the argument.
+ */
+
+ public static int biggestNonNegativeArrayEntry(int[] arrayOrg){
+
+     // one pass over the entries replaces the former copy + sort + lookup at the fixed
+     // indices 2..0, which failed for arrays with fewer than three entries and returned
+     // the third-smallest instead of the biggest entry for longer arrays
+     int biggest = -1;
+
+     for(int entry : arrayOrg){
+         // negative entries can never exceed the initial -1, so unused slots are
+         // skipped without an extra check
+         if(entry > biggest){
+             biggest = entry;
+         }
+     }
+
+     return biggest;
+
+ }
+
+ /*
+ * manages the frame search: collects start and stop candidates for both directions and decides from what was found which direction and which start/stop the cluster gets
+ */
+
+ public static void findFrameAndCheckWithNeighbors(Gene cluster, Contig thisContig, StringBuffer contigSeq){
+ // note: if neither a forward nor a reverse start is found, none of the branches below applies and the cluster is left as it is
+ // now that we found the high-coverage area, search for start and stop codons
+
+ int possibleStart_FO = FrameSearch.findPossibleStarts_Forward(cluster, contigSeq, 0, (int) (cluster.startPos+3)); // now the start positions are directly the right ones
+ int possibleStart_RE = FrameSearch.findPossibleStarts_Reverse(cluster,contigSeq,0,(int) (cluster.startPos+3));
+
+ int possibleStop_FO = -1;
+ int possibleStop_RE = -1;
+
+ possibleStop_FO = FrameSearch.findPossibleStops_Forward(cluster,contigSeq,0,(int) cluster.stopPos-2);
+ possibleStop_RE = FrameSearch.findPossibleStops_Reverse(cluster,contigSeq,0,(int) cluster.stopPos-2);
+
+ // first have a look, if already forward or reverse direction is excluded due to missing start or stop
+
+ if(possibleStart_FO == -1 && possibleStart_RE != -1){ // reverse start found
+
+ // if it is an ORF, a reverse one is more likely, so search only for a possible start position of an reverse gene
+
+ if(possibleStop_RE != -1){
+
+ testFrames(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse,true,cluster,contigSeq);
+
+ }else{
+ cluster.onRevStrand = true;
+ ExtractGeneCandidates.declareClusterAsNotCompleted(cluster,possibleStart_RE,contigSeq);
+ }
+
+ }else if(possibleStart_RE == -1 && possibleStart_FO != -1){ // forward start found
+
+ // if it is an ORF, a forward one is more likely, so search only for possible stop positions of a forward gene
+
+ if(possibleStop_FO != -1){
+
+ testFrames(cluster.possibleStarts_Forward,cluster.possibleStops_Forward,false,cluster,contigSeq);
+
+ }else{
+ cluster.onRevStrand = false;
+ ExtractGeneCandidates.declareClusterAsNotCompleted(cluster,possibleStart_FO,contigSeq);
+ }
+
+ }else if(possibleStart_RE != -1 && possibleStart_FO != -1){ // both starts found
+
+ // at the moment both directions are equally likely, so first search for the right frames
+
+ if((possibleStop_FO == -1) && (possibleStop_RE == -1)){
+ ExtractGeneCandidates.declareClusterAsNotCompleted(cluster,Math.max(possibleStart_FO,possibleStart_RE),contigSeq); // use the smallest possible interval
+ }else if((possibleStop_FO != -1) && (possibleStop_RE == -1)){
+ testFrames(cluster.possibleStarts_Forward,cluster.possibleStops_Forward,false,cluster,contigSeq);
+ }else if((possibleStop_FO == -1) && (possibleStop_RE != -1)){
+ testFrames(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse,true,cluster,contigSeq);
+ }else{
+ // both are still equally likely, so check both of them
+
+ int[] pair_FO = null;
+ int[] pair_RE = null;
+ // clusters without possible introns need an in-frame pair, spliced clusters accept any start/stop pair
+ if(cluster.possibleIntrons.keySet().isEmpty()){
+ pair_FO = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ pair_RE = FrameSearch.checkAndChooseReadingFrame(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }else{
+
+ pair_FO = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Forward,cluster.possibleStops_Forward);
+ pair_RE = FrameSearch.checkAndChooseReadingFrame_SplicingVariant(cluster.possibleStarts_Reverse,cluster.possibleStops_Reverse);
+ }
+
+ if(pair_FO == null && pair_RE == null){
+
+ // no pair in either direction: take the direction whose first start and stop hits span the shorter interval (forward on ties)
+
+ if((possibleStop_FO - possibleStart_FO) <= (possibleStop_RE - possibleStart_RE)){
+ cluster.startPos = possibleStart_FO;
+ cluster.stopPos = possibleStop_FO + 2;
+ cluster.onRevStrand = false;
+ }else{
+ cluster.startPos = possibleStart_RE;
+ cluster.stopPos = possibleStop_RE + 2;
+ cluster.onRevStrand = true;
+ }
+
+
+ } else if(pair_FO != null && pair_RE == null){
+ cluster = ExtractGeneCandidates.refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ } else if(pair_FO == null && pair_RE != null){
+ cluster = ExtractGeneCandidates.refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ }else if(pair_FO != null && pair_RE != null){
+ // if possible, refer to XS tag: for spliced clusters the larger entry of direcCounter decides
+ if(!(cluster.possibleIntrons.keySet().isEmpty()) && cluster.direcCounter[0] > cluster.direcCounter[1]){
+ cluster = ExtractGeneCandidates.refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ }else if(!(cluster.possibleIntrons.keySet().isEmpty()) && cluster.direcCounter[0] < cluster.direcCounter[1]){
+ cluster = ExtractGeneCandidates.refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ }else{
+ // take longest interval, because this region is likely to be overlapping with other regions
+ // (forward is taken when both intervals have the same length)
+ if(pair_FO[2] >= pair_RE[2]){
+ cluster = ExtractGeneCandidates.refineExtractedCluster(cluster, contigSeq, pair_FO[0], pair_FO[1]+2,false);
+ }else{
+ cluster = ExtractGeneCandidates.refineExtractedCluster(cluster, contigSeq, pair_RE[0], pair_RE[1]+2,true);
+ }
+
+ }
+
+ }
+
+ }
+
+ }
+
+ }
+
+ /*
+ * chooses the reading frame of the cluster from the given start and stop candidates:
+ * clusters without possible introns need a pair that is in frame, spliced clusters
+ * accept any start/stop pair (see the two checkAndChooseReadingFrame variants).
+ * With a pair the cluster is refined to it; otherwise the first start and stop
+ * candidate are taken and ExtractGeneCandidates.checkIfAdequateAndRefine is called.
+ */
+
+ public static void testFrames(int[] starts, int[] stops, boolean isReverse, Gene cluster, StringBuffer contigSeq){
+
+     boolean unspliced = cluster.possibleIntrons.keySet().isEmpty();
+     int[] pair = unspliced
+         ? FrameSearch.checkAndChooseReadingFrame(starts,stops)
+         : FrameSearch.checkAndChooseReadingFrame_SplicingVariant(starts,stops);
+
+     if(pair != null){
+         // +2: presumably moves from the first to the last base of the stop codon - TODO confirm
+         ExtractGeneCandidates.refineExtractedCluster(cluster, contigSeq, pair[0], pair[1]+2,isReverse);
+         return;
+     }
+
+     // no usable pair: fall back to the first candidates
+     cluster.startPos = starts[0];
+     cluster.stopPos = stops[0] + 2;
+     ExtractGeneCandidates.checkIfAdequateAndRefine(cluster, isReverse, starts, contigSeq);
+ }
+}
diff --git a/src/geneFinder/GeneFinder.java b/src/geneFinder/GeneFinder.java
new file mode 100755
index 0000000..bf48ac4
--- /dev/null
+++ b/src/geneFinder/GeneFinder.java
@@ -0,0 +1,241 @@
+package geneFinder;
+
+
+/**
+ * manages gene finding
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+
+import java.io.*;
+import java.util.HashMap;
+import java.util.Map;
+
+import types.*;
+
+
+public class GeneFinder {
+
+ public static String pathToGenomeFiles; // path to directory with genome.fasta
+ public static String pathOut; // path to directory for resultsFiles; used as a plain prefix (pathOut+"resultsRun/..."), so it has to end with a path separator
+ public static String pathToHelpFiles; // path to directory with all help scripts
+
+ public static File logFile; // file to store all run information
+ public static String outputName; // basis name for all output files
+ public static String haveSam; // if a sam file already exists, haveSam contains the name; otherwise manager() stores the default sam path of the chosen mapper here
+
+ public static Map<File,String> genomeFilesWithNames = new HashMap<File,String>(); // genome files with their names (filled outside of this class)
+ public static Map<File,String> rnaFilesWithNames = new HashMap<File,String>(); // rna files with their names (filled outside of this class)
+
+ public static boolean useTopHat; // indicator for mapping tool: true - TopHat, false - BWA
+ public static String settingMapper; // setting for the mapping tool, differs slightly depending on which tool was chosen
+ public static int maxReportedHitsBWA; // specify number of multiple hits maximal reported in tag of BWA
+ public static boolean useBWAsw; // use longer reads and therefore choose BWAsw as mapping tool
+ public static boolean useCPLEX; // presumably: CPLEX is used as optimization solver - TODO confirm where it is set and read
+ public static boolean useGLPK; // presumably: GLPK is used as optimization solver - TODO confirm where it is set and read
+ public static int memForCplex = -1; // if necessary, memory is set by user
+ public static boolean optiSolve; // if we have separate runs to reduce the memory consumption, this parameter turns off the optimization solver
+
+ public static int readLength; // read length of the rna reads
+ public static double minCoverage; // coverage threshold for the beginning of a gene
+ public static double endCoverage; // coverage threshold for the end of a gene
+
+ public static double maxCov; // threshold in case average cov and median cov differ too much
+
+ public static int interval; // minimal interval length
+ public static boolean dispCov; // decide whether the coverage histogram is computed
+ public static double spliceLim; // coverage threshold for splice site acceptance
+
+ public static boolean isProkaryote; // if set to true, it forbids the extraction of spliced reads; also selects the prokaryote extraction and output in geneFinder_managing()
+
+ public static boolean noAmbiOpti; // do not use ambiguous reads and perform no optimization
+ public static int numberThreads; // indicates the maximal number of threads used during the run
+
+ public static Map<String,Contig> mappedContigs = new HashMap<String,Contig>(); // contigs by name (filled outside of this class); returned by manager() if inprogeaCall is set
+ public static Map<Contig,String> contigTOname = new HashMap<Contig,String>(); // presumably the reverse lookup of mappedContigs - TODO confirm
+
+ public static boolean splitRunAndOpti; // indicates if the optimization and GIIRA shall be run separately, to reduce the memory consumption
+ public static int iteration; // NOTE(review): manager() compares this with 2, so it looks like the number of the current iteration rather than a count - confirm
+ public static double ambiProportion; // NOTE(review): not documented and not used in this class - confirm the meaning where it is read
+
+ public static boolean secondPart; // indicates whether we print out messages or not, required to avoid double messaging when in splitted analysis
+ public static boolean useSequential; // indicate if we have a second chrom sorted Sam file which is parsed in a sequential manner
+ public static String haveSam_ChromSorted; // if a sam file already exists, haveSam_ChromSorted contains the name (needed at the moment for sequential stuff)
+
+ public static boolean inprogeaCall; // indicates whether called from inprogea framework or independently
+
+ /** Command line entry point: hands the arguments to manager. */
+ public static void main(String[] args){
+     // the object array returned by manager is not needed here
+     GeneFinder.manager(args);
+ }
+
+ /**
+  * Manages the first part: reads the parameters, prepares the reference, makes sure an alignment (SAM)
+  * file exists, parses it and then starts the actual gene finding via geneFinder_managing.
+  *
+  * If haveSam is not set, the default result file of the chosen mapper below pathOut/resultsRun/ is used
+  * (and stored in haveSam); the mapper is only called when that file cannot be opened.
+  *
+  * @param args command line arguments, handed to ReadInParameters_GeneFinder.readIn_GF
+  * @return an array holding mappedContigs and readLength if inprogeaCall is set, otherwise null
+  */
+ public static Object[] manager(String[] args){
+
+     ReadInParameters_GeneFinder.readIn_GF(args);
+
+     final long startMillis = System.currentTimeMillis();
+
+     PrepareMapping_GF refPreparer = new PrepareMapping_GF();
+     final String nameRef = refPreparer.prepareRefFile_GF();
+
+     // optional second SAM file, only used in sequential mode
+     File chromSortedSam = null;
+
+     if(GeneFinder.useSequential){
+         chromSortedSam = new File(haveSam_ChromSorted);
+     }
+
+     final File alignmentSam;
+
+     if(haveSam != null){
+
+         alignmentSam = new File(haveSam);
+
+     }else{
+
+         // default output file of the chosen mapping tool
+         final String defaultSam;
+
+         if(useTopHat){
+             defaultSam = pathOut+"resultsRun/accepted_hits.sam";
+         }else{
+             defaultSam = pathOut+"resultsRun/aln_BWA.sam";
+         }
+
+         alignmentSam = new File(defaultSam);
+         GeneFinder.haveSam = defaultSam;
+
+         // first test if not already created in earlier rounds: map only if the file cannot be opened
+
+         try{
+
+             new BufferedReader(new FileReader(alignmentSam)).close();
+
+         }catch (IOException e) {
+
+             if(useTopHat){
+                 TopHat_Call tophatStart = new TopHat_Call();
+                 tophatStart.callTopHat(nameRef);
+             }else{
+                 BWA_Call bwaStart = new BWA_Call();
+                 bwaStart.callBWA(new File(nameRef+".fasta"), defaultSam);
+             }
+
+         }
+
+     }
+
+     // second iteration only
+     if(iteration == 2){
+         WriteOutput.sortReassignSamFile();
+     }
+
+     SamParser samParser = new SamParser();
+
+     samParser.samFileParser(alignmentSam,chromSortedSam,nameRef);
+
+     // second iteration only
+     if(iteration == 2){
+         WriteOutput.removeReassignSamFile();
+     }
+
+     // candidate extraction, optional optimization, scoring and output
+     geneFinder_managing(nameRef, "");
+
+     // report the overall runtime on standard out and in the log file
+     final double seconds = (double) (System.currentTimeMillis()-startMillis)/1000.0;
+     final String summary = "Gene identification finished in " + seconds +"s.";
+
+     System.out.println(summary);
+     WriteOutput.writeToLogFile(summary);
+
+     // a result is only handed back when called from the inprogea framework
+     if(!inprogeaCall){
+         return null;
+     }
+
+     return new Object[] {mappedContigs,readLength};
+ }
+
+ /**
+  * Coordinates everything necessary exclusively for the gene finder: candidate extraction, the
+  * optimization for ambiguous reads (skipped if noAmbiOpti is set), scoring and writing of the output.
+  *
+  * @param nameRef name of the reference, handed to the cluster search
+  * @param namePartOut name part that is handed to the output writer
+  */
+ public static void geneFinder_managing(String nameRef, String namePartOut){
+     final long extractionStart = System.currentTimeMillis();
+
+     // prokaryotes have their own extraction class
+     if(isProkaryote){
+         ProkaryoteExtraction prokaryoteSearch = new ProkaryoteExtraction();
+         prokaryoteSearch.initializeClusterSearch(nameRef);
+     }else{
+         ExtractGeneCandidates candidateSearch = new ExtractGeneCandidates();
+         candidateSearch.initializeClusterSearch(nameRef);
+     }
+
+     final double seconds = (double) (System.currentTimeMillis()-extractionStart)/1000.0;
+
+     // the timing message is left out in the second part of a split run
+     if(!GeneFinder.secondPart){
+         final String timing = "Time required for candidate extraction: " + seconds +"s.\n";
+         System.out.println(timing);
+         WriteOutput.writeToLogFile(timing);
+     }
+
+     if(!GeneFinder.noAmbiOpti){
+         OptimizeAmbis.maxFlowLP();
+     }
+
+     final double[] minMax = CalculateScores.assignGeneScores(false);
+     if(isProkaryote){
+         WriteOutput.writeGeneFinderOutput_Prokaryote(minMax,namePartOut);
+     }else{
+         WriteOutput.writeGeneFinderOutput(minMax,namePartOut);
+     }
+ }
+
+ /**
+  * Debug helper: reads the hard-coded FASTA file, skips header lines (starting with ">") and returns the
+  * remaining lines joined into one string, which is also printed to standard out. An I/O error only
+  * prints the stack trace; whatever has been read so far is returned. The reader is closed in all cases.
+  */
+ public static String readInFasta(){
+     StringBuilder seq = new StringBuilder(); // avoids repeated string concatenation in the loop
+     BufferedReader br = null;
+     try {
+         br = new BufferedReader(new FileReader(new File("/home/franziska/paper_giira/EcoliSim/testAlign_30_6/extract4.fasta")));
+         String line;
+         while((line = br.readLine()) != null){
+             if(!line.startsWith(">")){
+                 seq.append(line);
+             }
+         }
+     } catch (IOException e) { // also covers FileNotFoundException
+         e.printStackTrace();
+     } finally {
+         if(br != null){
+             try{ br.close(); }catch(IOException ignored){ /* nothing left to do */ }
+         }
+     }
+     System.out.println(seq);
+     return seq.toString();
+ }
+
+}
diff --git a/src/geneFinder/Giira.java b/src/geneFinder/Giira.java
new file mode 100755
index 0000000..c381279
--- /dev/null
+++ b/src/geneFinder/Giira.java
@@ -0,0 +1,238 @@
+package geneFinder;
+
+/**
+ * controls the jar call
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ */
+
+import java.io.*;
+import java.net.URLDecoder;
+import java.util.Arrays;
+import java.util.Map;
+
+import types.Contig;
+
+public class Giira {
+
+ public static int iterationNum; // number of iterations that main() runs; parseForGlobalParas sets it to 1 or 2
+ public static int optiMethod; // 0 - cplex // 1 - glpk
+ public static String libPath; // library path for cplex optimization
+ public static boolean splitRunAndOpti; // indicates if the optimization and giira shall be run separately, to reduce the memory consumption
+ public static String classPath; // the class path provided in the GIIRA call (-cp); main() falls back to java.class.path if it is empty
+
+ /**
+  * Entry point of the jar: determines memory, own location and class path, then starts geneFinder.GeneFinder
+  * in separate java processes, once per iteration (or three calls per iteration if splitRunAndOpti is set).
+  * The time needed for all calls is printed at the end.
+  *
+  * @param args GIIRA command line; passed on completely to each GeneFinder call
+  */
+ public static void main(String[] args) {
+
+     if(args.length == 0 || args[0].equals("-h") || args[0].equals("--help")){
+         ReadInParameters_GeneFinder.printHelp_GF();
+     }
+
+     // Maximal memory in whole MB (10^6 bytes) for the -Xmx option. Integer arithmetic is used on purpose:
+     // a double of 10^7 or more is printed in scientific notation ("1.2E7"), and cutting that string at
+     // the dot would leave only the leading digit.
+     final String memMB = String.valueOf(Runtime.getRuntime().maxMemory() / 1000000L);
+
+     // get path
+     final String path = Giira.class.getProtectionDomain().getCodeSource().getLocation().getPath();
+
+     final long timeBef = System.currentTimeMillis();
+
+     try {
+         final String decodedPath = URLDecoder.decode(path, "UTF-8");
+         // NOTE(review): cuts off the last 9 characters, presumably the name of the jar file - confirm
+         final String scriptPath = decodedPath.substring(0,decodedPath.length()-9);
+
+         classPath = "";
+
+         parseForGlobalParas(args);
+
+         // fall back to the class path of this JVM if none was given with -cp
+         if(classPath.length() == 0){
+             classPath = System.getProperty("java.class.path");
+         }
+
+         final StringBuilder argBuilder = new StringBuilder();
+         for(String part : args){
+             argBuilder.append(part).append(" ");
+         }
+         final String argString = argBuilder.toString();
+
+         // only calls that run the cplex optimization get the library path
+         final boolean useCplexLib = (optiMethod == 0);
+
+         for(int itNum = 1;itNum<=iterationNum;++itNum){
+
+             if(splitRunAndOpti){
+                 // perform two runs, with optimization in between
+
+                 // run 1
+                 System.out.println("Call part 1:");
+                 callAndHandleOutput(buildCall(memMB, false, decodedPath, argString, itNum, " -solverOn n -splitRunAndOpti n", scriptPath));
+
+                 // optimization
+                 final String optiCall = buildCall(memMB, useCplexLib, decodedPath, argString, itNum, " -splitRunAndOpti y -secondPart y", scriptPath);
+                 System.out.println("Optimization: " + optiCall);
+                 callAndHandleOutput(optiCall);
+
+                 // run 2
+                 System.out.println("Call part 2:");
+                 callAndHandleOutput(buildCall(memMB, false, decodedPath, argString, itNum, " -solverOn n -splitRunAndOpti n -secondPart y", scriptPath));
+
+             }else{
+
+                 System.out.println("Call: ");
+                 callAndHandleOutput(buildCall(memMB, useCplexLib, decodedPath, argString, itNum, "", scriptPath));
+
+             }
+
+         }
+     } catch (IOException e) {
+         System.out.println("IO Exception.");
+     }
+
+     final long timeAft = System.currentTimeMillis();
+     System.out.println("Finished GIIRA in " + (double) (timeAft-timeBef)/1000.0 +"s.");
+
+ }
+
+ /*
+  * assembles one java call of geneFinder.GeneFinder; extraFlags are placed between "-iter" and "-scripts"
+  */
+ private static String buildCall(String memMB, boolean withLibPath, String decodedPath, String argString, int itNum, String extraFlags, String scriptPath){
+
+     final StringBuilder call = new StringBuilder("java -Xmx").append(memMB).append("m ");
+
+     if(withLibPath){
+         call.append("-Djava.library.path=").append(libPath).append(" ");
+     }
+
+     call.append("-cp ").append(classPath).append(":").append(decodedPath);
+     call.append(" geneFinder.GeneFinder ").append(argString);
+     call.append("-iter ").append(itNum).append(extraFlags);
+     call.append(" -scripts ").append(scriptPath).append("scripts/");
+
+     return call.toString();
+ }
+
+ /*
+ * extract the global parameters for GIIRA
+ */
+
+ public static void parseForGlobalParas(String[] args){
+
+ boolean foundLib = false;
+ boolean foundOpti = false;
+ boolean foundSplitIndicator = false;
+ boolean foundIteration = false;
+ boolean foundCPLEX = false;
+
+ String parameter = Arrays.toString(args);
+ if(!parameter.isEmpty() && args.length > 0){
+ System.out.println();
+ for(int i = 0; i<args.length;i++){
+ String arg = args[i];
+ if(arg.equals("-iter")){
+ foundIteration = true;
+ int iteration = Integer.parseInt(args[i+1]);
+ if(iteration == 1){
+ iterationNum = 1;
+ } else{
+ iterationNum = 2;
+ }
+ }
+ if(arg.equals("-libPath")){
+ foundLib = true;
+ libPath = args[i+1];
+ } else if(arg.equals("-opti")){
+ foundOpti = true;
+ String optimizer = args[i+1];
+
+ if(optimizer.equals("glpk")){
+ optiMethod = 1;
+ } else{
+ optiMethod = 0;
+ }
+ } else if(arg.equals("-splitRunAndOpti")){
+ foundSplitIndicator = true;
+
+ if(args[i+1].equals("y")){
+ splitRunAndOpti = true;
+ } else{
+ splitRunAndOpti = false;
+ }
+ } else if(arg.equals("-cp")){
+ foundCPLEX = true;
+ classPath = args[i+1];
+ }
+ }
+
+ if(!foundLib){
+ if(!foundOpti || (foundOpti && optiMethod == 0)){
+ System.out.println("Cplex is chosen as optimizer, but no Djava.library.path is provided.");
+ System.out.println("Either choose glpk as the optimizer (-opti glpk), or specify the Djava.library.path for cplex with -libPath [PATH].");
+ System.exit(0);
+ }
+ }
+ if(!foundCPLEX){
+ if(!foundOpti || (foundOpti && optiMethod == 0)){
+ System.out.println("Cplex is chosen as optimizer, but no path to cplex.jar is provided.");
+ System.out.println("Either choose glpk as the optimizer (-opti glpk), or specify the path to both files with -cp PATH_TO_CPLEX/cplex.jar");
+ System.exit(0);
+ }
+ }
+ if(!foundSplitIndicator){
+ splitRunAndOpti = false;
+ }
+ if(!foundIteration){
+ iterationNum = 1;
+ }
+
+ }
+ }
+
+ /**
+  * Calls GIIRA: runs the given command line as a separate process, prints everything the process writes
+  * and waits until it has finished.
+  *
+  * The error stream of the process is merged into its output stream. Reading both streams one after the
+  * other can block forever as soon as the process fills the buffer of the stream that is not read yet.
+  * Error lines are therefore printed interleaved with the normal output.
+  *
+  * @param sysCall the command line, split into its arguments at whitespace
+  */
+ public static void callAndHandleOutput(String sysCall){
+     System.out.println(sysCall);
+     System.out.println();
+
+     BufferedReader processOut = null;
+     try{
+         ProcessBuilder builder = new ProcessBuilder(sysCall.trim().split("\\s+"));
+         builder.redirectErrorStream(true);
+         Process exe = builder.start();
+         processOut = new BufferedReader(new InputStreamReader(exe.getInputStream()));
+         String line;
+         while((line = processOut.readLine()) != null){
+             System.out.println(line);
+         }
+         exe.waitFor();
+     } catch (InterruptedException e) {
+         System.out.println("Interrupted Exception.");
+         Thread.currentThread().interrupt(); // keep the interrupt visible for the caller
+     } catch (IOException e) {
+         System.out.println("IO Exception.");
+     } finally {
+         if(processOut != null){
+             try{ processOut.close(); }catch(IOException ignored){ /* nothing left to do */ }
+         }
+     }
+ }
+}
diff --git a/src/geneFinder/HelperFunctions_GeneSearch.java b/src/geneFinder/HelperFunctions_GeneSearch.java
new file mode 100755
index 0000000..6685cbd
--- /dev/null
+++ b/src/geneFinder/HelperFunctions_GeneSearch.java
@@ -0,0 +1,590 @@
+package geneFinder;
+
+import java.util.HashMap;
+import java.util.TreeMap;
+import java.util.Vector;
+
+import types.Contig;
+import types.Gene;
+import types.Rna;
+
+/**
+ * contains methods for various kinds of tasks required during cluster extraction, intron and exon search and frame determination
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class HelperFunctions_GeneSearch {
+
+ /**
+  * If ambiguous hits are not taken into account, this removes them from the mapping rnas of the contig.
+  * Only rnas with isMulti == 0 stay at their position. For every removed rna, the support of the splice
+  * site belonging to a hit at this position on this contig is reduced by one.
+  *
+  * @param thisContig contig whose positionTOmappingRnas and splicePositions are updated in place
+  */
+ public static void removeAmbiHits(Contig thisContig){
+     for(int currentPos : thisContig.positionTOmappingRnas.keySet()){
+         Vector<Rna> uniqueRnas = new Vector<Rna>();
+         for(Rna rna : thisContig.positionTOmappingRnas.get(currentPos)){
+
+             if(rna.isMulti == 0){
+                 uniqueRnas.add(rna);
+                 continue;
+             }
+
+             // search splice sites and remove support, go through all info entries in case this read has mapped more than once on the same position
+             for(Object[] info : rna.contigsMappedOn){
+                 boolean mappedHere = ((Integer) info[1] == currentPos) && (((Contig) info[0]).equals(thisContig));
+                 if(!mappedHere || info[4] == null){
+                     continue;
+                 }
+                 int splitStart = ((int[]) info[4])[1];
+                 if(thisContig.splicePositions.containsKey(splitStart)){
+                     thisContig.splicePositions.put(splitStart,thisContig.splicePositions.get(splitStart)-1);
+                 }
+             }
+         }
+
+         thisContig.positionTOmappingRnas.put(currentPos,uniqueRnas);
+     }
+ }
+
+ /**
+  * Removes all hits from positions they are not any longer assigned to. Rnas with isMulti == 2 only stay
+  * at a position if the position of their first contigsMappedOn entry equals it; all other rnas are kept.
+  *
+  * @param thisContig contig whose positionTOmappingRnas is updated in place
+  */
+ public static void removeReassignedHits(Contig thisContig){
+     for(int currentPos : thisContig.positionTOmappingRnas.keySet()){
+         Vector<Rna> keptRnas = new Vector<Rna>();
+
+         for(Rna rna : thisContig.positionTOmappingRnas.get(currentPos)){
+             boolean stillAssignedHere = true;
+             if(rna.isMulti == 2){
+                 stillAssignedHere = false;
+                 if(rna.contigsMappedOn.size() > 0){
+                     Object[] firstHit = rna.contigsMappedOn.get(0);
+                     stillAssignedHere = (((Integer)firstHit[1]).intValue() == currentPos);
+                 }
+             }
+
+             if(stillAssignedHere){
+                 keptRnas.add(rna);
+             }
+         }
+
+         thisContig.positionTOmappingRnas.put(currentPos,keptRnas);
+     }
+ }
+
+ /**
+  * Updates the posiCovMap that stores coverage-add-vectors for intron starts and endings after a read has
+  * been erased: the first lengthThatIsLeft entries stored for posi are reduced by one. Entries that would
+  * drop to zero are removed from the vector, and an emptied vector is removed from the map.
+  *
+  * @param posiCovMap map from position to coverage-add-vector, changed in place
+  * @param posi position whose vector is updated; nothing happens if it is not a key of the map
+  * @param lengthThatIsLeft number of leading vector entries to reduce
+  * @return the given posiCovMap
+  */
+ public static TreeMap<Integer,Vector<Integer>> updatePosiCovMap_AfterSpliceSwitch(TreeMap<Integer,Vector<Integer>> posiCovMap, int posi, int lengthThatIsLeft){
+     if(!posiCovMap.containsKey(posi)){
+         return posiCovMap;
+     }
+     Vector<Integer> covs = posiCovMap.get(posi);
+     // walk from back to front, so that removing an entry does not shift the entries still to be visited
+     for(int idx = Math.min(lengthThatIsLeft, covs.size()) - 1; idx >= 0; --idx){
+         int reduced = covs.get(idx) - 1;
+         if(reduced == 0){
+             covs.removeElementAt(idx);
+         }else{
+             covs.setElementAt(reduced,idx);
+         }
+     }
+     if(covs.size() == 0){
+         posiCovMap.remove(posi);
+     }
+
+     return posiCovMap;
+ }
+
+
+ /**
+  * Updates the posiCovMap that stores coverage-add-vectors for intron starts and endings: the first lengthThatIsLeft
+  * entries stored for posi are raised by one; missing entries, or a missing vector, are created with the value 1.
+  * @param posiCovMap map from position to coverage-add-vector, changed in place
+  * @param posi position whose vector is updated or created
+  * @param lengthThatIsLeft number of leading vector entries to raise
+  * @return the given posiCovMap
+  */
+ public static TreeMap<Integer,Vector<Integer>> updatePosiCovMap(TreeMap<Integer,Vector<Integer>> posiCovMap, int posi, int lengthThatIsLeft){
+     if(!posiCovMap.containsKey(posi)){
+         Vector<Integer> fresh = new Vector<Integer>();
+         for(int i = 0; i < lengthThatIsLeft; ++i){
+             fresh.add(1);
+         }
+         posiCovMap.put(posi,fresh);
+         return posiCovMap;
+     }
+     Vector<Integer> covs = posiCovMap.get(posi);
+     for(int i = 0; i < lengthThatIsLeft; ++i){
+         if(i < covs.size()){
+             covs.setElementAt(covs.get(i) + 1,i);
+         }else{
+             covs.add(1);
+         }
+     }
+     return posiCovMap;
+ }
+
+
+ /**
+  * Help function to update the original coverageVec according to entries stored in posiCovMap: for a window of GeneFinder.readLength positions starting at currentPos it sums up the coverage values stored for map positions that are at most readLength away, and sets every value it has used to 0 in posiCovMap.
+  * Returns an array of three elements: whether a value above 0 was added, the vector of summed values (readLength entries), and the given posiCovMap.
+  */
+ public static Object[] lookIntoPosiCovMap(TreeMap<Integer,Vector<Integer>> posiCovMap, int currentPos){
+ // the result window: readLength entries, all starting at 0
+ Vector<Integer> covVecClone = new Vector<Integer>(); // stores all coverage add values derived by posiCovMap
+
+ for(int arrPos=0;arrPos<GeneFinder.readLength;++arrPos){
+ covVecClone.add(0);
+ }
+
+ boolean addedValues = false;
+
+ //Vector<Integer> visitedPositions = new Vector<Integer>(); // positions that have been visited can be removed in map!
+
+ if(posiCovMap.keySet().size() != 0){
+ for(int position : posiCovMap.keySet()){
+ if(Math.abs((position-currentPos)) <= GeneFinder.readLength){
+ // add this coverages to cov vec
+ Vector<Integer> tmpCovs = posiCovMap.get(position);
+ // NOTE(review): in the first branch posInClone is not limited to readLength (the second branch uses threshold for that); a stored vector with more than readLength entries left would index beyond covVecClone - confirm this cannot happen
+ if(position <= currentPos){
+ int posInClone = 0;
+ for(int posTmp = (currentPos - position); posTmp < tmpCovs.size(); ++posTmp){
+
+ if(tmpCovs.get(posTmp) > 0){
+ addedValues = true;
+ }
+ covVecClone.setElementAt((covVecClone.get(posInClone) + tmpCovs.get(posTmp)),posInClone++);
+ posiCovMap.get(position).setElementAt(0,posTmp);
+ }
+ }else{ // the stored vector starts behind currentPos: it is added with an offset and cut at the end of the window
+ int posInClone = (position - currentPos);
+ int threshold = Math.min(tmpCovs.size(), (GeneFinder.readLength - (position - currentPos)));
+ for(int posTmp = 0; posTmp < threshold; ++posTmp){
+ if(tmpCovs.get(posTmp) > 0){
+ addedValues = true;
+ }
+ covVecClone.setElementAt((covVecClone.get(posInClone) + tmpCovs.get(posTmp)),posInClone++);
+ posiCovMap.get(position).setElementAt(0,posTmp);
+ }
+ }
+ // the keys of the TreeMap are visited in ascending order, so behind the window nothing else can be in range
+ }else if(position > (currentPos + GeneFinder.readLength)){
+ break;
+ }
+
+ }
+ }
+
+ Object[] returnObject = {addedValues,covVecClone, posiCovMap};
+
+ return returnObject;
+ }
+
+ /**
+  * Adds the given rnas to the rnas associated with the cluster. An rna whose id is already contained is
+  * not added again, its id is recorded in moreThanOneHitRnas instead. assignedNum of every newly added
+  * rna is raised by one.
+  *
+  * @param cluster gene candidate that receives the rnas
+  * @param newRnas rnas to add
+  */
+ public static void addRnasFromVector(Gene cluster, Vector<Rna> newRnas){
+     for(Rna candidate : newRnas){
+         if(cluster.idTOassociatedRnas.containsKey(candidate.rnaID)){
+             cluster.moreThanOneHitRnas.add(candidate.rnaID);
+             continue;
+         }
+         cluster.idTOassociatedRnas.put(candidate.rnaID, new Object[] {candidate,new int[] {-1,-1},-1});
+         candidate.assignedNum++;
+     }
+ }
+
+ /*
+ * method that rnas to map, without caring for duplications
+ */
+
+ public static void addRnasFromVector_NotRespectDuplis(Gene cluster, Vector<Rna> newRnas){
+
+ for(Rna potRna : newRnas){
+
+ cluster.idTOassociatedRnas.put(potRna.rnaID, new Object[] {potRna,new int[] {-1,-1},-1});
+
+ }
+
+ }
+
+
+
+ /*
+ * method to ensure that only reads that are not already included will be assigned to the vector
+ */
+
+ public static void addAssociatedRnas(Gene cluster, HashMap<String, Object[]> newRnas){
+
+ for(String potRna : newRnas.keySet()){
+ if(!cluster.idTOassociatedRnas.containsKey(potRna)){
+ cluster.idTOassociatedRnas.put(potRna, newRnas.get(potRna));
+ ((Rna) newRnas.get(potRna)[0]).assignedNum++;
+ }else{
+ cluster.moreThanOneHitRnas.add(potRna);
+ }
+ }
+
+ }
+
+ /**
+  * When twin clusters are extracted, all their rnas are declared as "shared": the id of the cluster is
+  * added to isSharedBy of every rna associated with it.
+  * @param cluster cluster whose associated rnas are marked
+  */
+ public static void declareAsShared(Gene cluster){
+     for(Object[] rnaEntry : cluster.idTOassociatedRnas.values()){
+         Rna sharedRna = (Rna) rnaEntry[0];
+         sharedRna.isSharedBy.add(cluster.geneID);
+     }
+ }
+
+ /**
+  * When two clusters are merged, this takes care to correctly assign the shared rnas: for every rna of
+  * cluster that is marked as shared by clusterBef, that id is removed and the id of cluster is added if missing.
+  * @param cluster the merged cluster whose associated rnas are checked
+  * @param clusterBef the cluster whose id is taken out of isSharedBy
+  */
+ public static void declareShared_AfterMerge(Gene cluster, Gene clusterBef){
+     for(Object[] rnaEntry : cluster.idTOassociatedRnas.values()){
+         Rna rna = (Rna) rnaEntry[0];
+         if(!rna.isSharedBy.contains(clusterBef.geneID)){
+             continue;
+         }
+         rna.isSharedBy.removeElement(clusterBef.geneID);
+         if(!rna.isSharedBy.contains(cluster.geneID)){
+             rna.isSharedBy.add(cluster.geneID);
+         }
+     }
+ }
+
+ /*
+ * also update the shared rna information for the twin node
+ * note: the modulo operation tests whether this shared rna really is shared between cluster and twin or between cluster and another overlapping cluster
+ */
+
+ public static void declareShared_For_FutureTwin(Gene cluster, Gene futureTwin){
+
+ for(String rnaKey : cluster.idTOassociatedRnas.keySet()){
+ Rna rna = ((Rna) cluster.idTOassociatedRnas.get(rnaKey)[0]);
+ if(rna.isSharedBy.contains(cluster.geneID) && !rna.isSharedBy.contains(futureTwin.geneID)){
+
+ rna.isSharedBy.add(futureTwin.geneID);
+
+ }
+ }
+ }
+
+
+ /**
+  * Checks whether there exists an intron end near nextPos (at most GeneFinder.readLength away), such that
+  * the number of bases that define the overlapping region with nextPos can be extracted.
+  * @param cluster gene candidate whose possibleIntrons are searched
+  * @param nextPos position to compare the intron ends with
+  * @return the first intron end found in range, or -1 if there is none
+  */
+ public static Integer findIntronNearNextPos(Gene cluster, int nextPos){
+     for(int key : cluster.possibleIntrons.keySet()){
+         if(cluster.possibleIntrons.get(key)[0] != null){
+             for(int[] intron : ( (Vector<int[]>) cluster.possibleIntrons.get(key)[0])){
+                 if(Math.abs(nextPos - intron[1]) > GeneFinder.readLength){
+                     continue;
+                 }
+                 if(intron[1] != -1){
+                     return intron[1];
+                 }
+                 break; // an end of -1 equals the "not found" marker, so the search goes on with the next key
+             }
+         }
+     }
+     return -1;
+ }
+
+
+ /**
+  * Checks if nextPos is within readLength range of an intron end (exceeding it). If transcriptMode is not -1, it also determines the smallest splice site key that is not smaller than transcriptMode; of both positions the one that occurs first is returned.
+  * Returns {position, flag} with flag 1 for "in normal transcript" and 0 otherwise, or {-1,0} if neither a key nor an intron end was found.
+  * NOTE(review): the flags are set to 0 when a new minimum is found for a key >= transcriptMode and are never set back to 1 by a later minimum - confirm that this is intended.
+  */
+ public static int[] checkIfNextPosExceedsIntronEnd(Gene cluster, int nextPos, int transcriptMode){
+
+ int minEndKey = Integer.MAX_VALUE;
+ int minEndIntron = Integer.MAX_VALUE;
+ int keyThreshold = Integer.MAX_VALUE;
+
+ int inNormalTranscriptKey = 1; // 1 indicates "yes", 0 indicates "no"
+ int inNormalTranscriptIntron = 1; // 1 indicates "yes", 0 indicates "no"
+
+ if(transcriptMode != -1){
+ keyThreshold = transcriptMode;
+ }
+ // keyThreshold equals transcriptMode whenever a key is accepted below, so a found minEndKey always comes with inNormalTranscriptKey == 0
+ for(int key : cluster.possibleIntrons.keySet()){
+ if(cluster.possibleIntrons.get(key)[0] != null){ //&& key<=keyThreshold){ tryOut
+ for(int[] intron : ( (Vector<int[]>) cluster.possibleIntrons.get(key)[0])){
+ if((key >= transcriptMode) && (transcriptMode != -1)){
+ if(key<minEndKey){
+ minEndKey = key;
+ if(key >= keyThreshold){
+ inNormalTranscriptKey = 0;
+ }
+ }
+
+ }
+ if((nextPos - intron[1] <= GeneFinder.readLength) && (nextPos >= intron[1])){
+ if(intron[1] < minEndIntron){
+ minEndIntron = intron[1];
+ if(key >= keyThreshold){
+ inNormalTranscriptIntron = 0;
+ }
+ }
+
+ }
+ }
+
+ }
+
+ }
+ if((minEndKey != Integer.MAX_VALUE) || (minEndIntron != Integer.MAX_VALUE)){ // something was found: take what occurs first
+ if(minEndKey <= minEndIntron){
+ return new int[] {minEndKey,inNormalTranscriptKey};
+ }else{
+ return new int[] {minEndIntron,inNormalTranscriptIntron};
+ }
+ }
+
+ return new int[] {-1,0};
+ }
+
+ /**
+  * Checks if nextPos is within readLength range of an intron end (exceeding it) of a previous transcript. The search is the same as in checkIfNextPosExceedsIntronEnd; only the choice of the returned position differs and takes the "in normal transcript" flags into account.
+  * Returns {position, flag} with flag 1 for "in normal transcript" and 0 otherwise, or {-1,0} if nothing suitable was found.
+  * NOTE(review): a found minEndKey always has the flag 0 (keyThreshold equals transcriptMode when a key is accepted), so the branches that test inNormalTranscriptKey == 1 together with a found key look unreachable - confirm.
+  */
+ public static int[] checkIfNextPosMergesWithPreviousIsoform(Gene cluster, int nextPos, int transcriptMode){
+
+ int minEndKey = Integer.MAX_VALUE;
+ int minEndIntron = Integer.MAX_VALUE;
+ int keyThreshold = Integer.MAX_VALUE;
+
+ int inNormalTranscriptKey = 1; // 1 indicates "yes", 0 indicates "no"
+ int inNormalTranscriptIntron = 1; // 1 indicates "yes", 0 indicates "no"
+
+ if(transcriptMode != -1){
+ keyThreshold = transcriptMode;
+ }
+
+ for(int key : cluster.possibleIntrons.keySet()){
+ if(cluster.possibleIntrons.get(key)[0] != null){ //&& key<=keyThreshold){ tryOut
+ for(int[] intron : ( (Vector<int[]>) cluster.possibleIntrons.get(key)[0])){
+ if((key >= transcriptMode) && (transcriptMode != -1)){
+ if(key<minEndKey){
+ minEndKey = key;
+ if(key >= keyThreshold){
+ inNormalTranscriptKey = 0;
+ }
+ }
+
+ }
+ if((nextPos - intron[1] <= GeneFinder.readLength) && (nextPos >= intron[1])){
+ if(intron[1] < minEndIntron){
+ minEndIntron = intron[1];
+ if(key >= keyThreshold){
+ inNormalTranscriptIntron = 0;
+ }
+ }
+
+ }
+ }
+
+ }
+
+ }
+ if((minEndKey != Integer.MAX_VALUE) || (minEndIntron != Integer.MAX_VALUE)){
+ // if both flags are 1 or both are 0, the smaller position is taken; if exactly one flag is 1, see the inner else branch
+ if(inNormalTranscriptKey == 1 || inNormalTranscriptIntron == 1){
+ if(inNormalTranscriptKey == 1 && inNormalTranscriptIntron == 1){
+ if(minEndKey <= minEndIntron){
+ return new int[] {minEndKey,inNormalTranscriptKey};
+ }else{
+ return new int[] {minEndIntron,inNormalTranscriptIntron};
+ }
+ }else{ // NOTE(review): if only a key was found (no intron end in range), neither condition below holds and the method ends with {-1,0} - confirm that this is intended
+ if(inNormalTranscriptKey == 1 && (minEndKey != Integer.MAX_VALUE)){
+ return new int[] {minEndKey,inNormalTranscriptKey};
+ }else if((minEndIntron != Integer.MAX_VALUE)){
+ return new int[] {minEndIntron,inNormalTranscriptIntron};
+ }
+ }
+ }else{
+ if(minEndKey <= minEndIntron){
+ return new int[] {minEndKey,inNormalTranscriptKey};
+ }else{
+ return new int[] {minEndIntron,inNormalTranscriptIntron};
+ }
+ }
+ }
+
+ return new int[] {-1,0};
+ }
+
+ /**
+  * Checks if the intron end leads to a previous transcript: searches the smallest intron end of the
+  * cluster that is at most GeneFinder.readLength away from intronEnd.
+  *
+  * @param cluster gene candidate whose possibleIntrons are searched
+  * @param intronEnd intron end to compare with
+  * @param transcriptMode key threshold; -1 means that no threshold is used
+  * @return {smallest intron end in range, flag} or {-1,0} if there is none. The flag is 1 ("in normal
+  *         transcript") unless a new minimum was found for a key not smaller than transcriptMode
+  */
+ public static int[] checkIfIntronEndsInNormalTranscript(Gene cluster, int intronEnd, int transcriptMode){
+
+     final int keyThreshold = (transcriptMode != -1) ? transcriptMode : Integer.MAX_VALUE;
+
+     int smallestEnd = Integer.MAX_VALUE;
+     int inNormalTranscript = 1; // 1 indicates "yes", 0 indicates "no"
+
+     for(int key : cluster.possibleIntrons.keySet()){
+
+         if(cluster.possibleIntrons.get(key)[0] == null){
+             continue;
+         }
+
+         for(int[] intron : ( (Vector<int[]>) cluster.possibleIntrons.get(key)[0])){
+             boolean inRange = Math.abs(intronEnd - intron[1]) <= GeneFinder.readLength;
+             if(inRange && (intron[1] < smallestEnd)){
+                 smallestEnd = intron[1];
+                 if(key >= keyThreshold){
+                     inNormalTranscript = 0;
+                 }
+             }
+         }
+     }
+
+     if(smallestEnd == Integer.MAX_VALUE){
+         return new int[] {-1,0};
+     }
+
+     return new int[] {smallestEnd,inNormalTranscript};
+ }
+
+
+
+ /*
+ * find the intron that defined currentCompete
+ * return 1 means we go on with the gene
+ */
+
+ public static int findIntronOfCurrentCompete(Gene gene, int currentCompete){
+
+ for(int key : gene.possibleIntrons.keySet()){
+ if(gene.possibleIntrons.get(key)[0] != null){
+ for(int[] intron : ((Vector<int[]>)gene.possibleIntrons.get(key)[0])){
+ if(intron[1] == currentCompete){
+ // found intron
+ if(((Vector<int[]>)gene.possibleIntrons.get(key)[0]).size() == 1){
+ return 1;
+ }else{
+ return -1;
+ }
+ }
+ }
+ }
+ }
+
+ return 1;
+ }
+
+ /**
+  * Tests if also other splits support the currentCompete, such that it is better to go on and leave the
+  * candidate open. Counts all introns that end at currentCompete or behind nextPos.
+  * @param gene gene candidate whose possibleIntrons are searched
+  * @param currentCompete intron end that is competing
+  * @param nextPos introns ending behind this position are counted as well
+  * @return true if at least two such introns exist
+  */
+ public static boolean checkForOtherSupport(Gene gene, int currentCompete, int nextPos){
+     int supportingIntrons = 0;
+     for(int key : gene.possibleIntrons.keySet()){
+         if(gene.possibleIntrons.get(key)[0] == null){
+             continue;
+         }
+         for(int[] intron : ((Vector<int[]>)gene.possibleIntrons.get(key)[0])){
+             boolean endsAtCompete = (intron[1] == currentCompete);
+             if(endsAtCompete || (intron[1] > nextPos)){
+                 supportingIntrons++;
+             }
+         }
+     }
+     return supportingIntrons >= 2;
+ }
+
+ /*
+ * add rnas from the interval start-currentPos
+ */
+
+ public static void addRnasFromInterval(Gene cluster, Contig thisContig, int currentPos, int formerPosi){
+
+ int pos_temp = formerPosi;
+
+ if(pos_temp != -1){
+ do{
+
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,thisContig.positionTOmappingRnas.get(pos_temp));
+
+ if(thisContig.positionTOmappingRnas.higherKey(pos_temp) != null){
+ pos_temp = thisContig.positionTOmappingRnas.higherKey(pos_temp);
+ }else{
+ pos_temp = currentPos;
+ }
+ }while((pos_temp != currentPos) && !(pos_temp > currentPos));
+ }
+ }
+
+ /*
+ * add rnas from the interval start-currentPos not respecting duplication
+ */
+
+ public static void addRnasFromInterval_notRespectDuplis(Gene cluster, Contig thisContig, int currentPos, int formerPosi){
+
+ int pos_temp = formerPosi;
+
+ if(pos_temp != -1){
+ do{
+
+ HelperFunctions_GeneSearch.addRnasFromVector_NotRespectDuplis(cluster,thisContig.positionTOmappingRnas.get(pos_temp));
+
+ if(thisContig.positionTOmappingRnas.higherKey(pos_temp) != null){
+ pos_temp = thisContig.positionTOmappingRnas.higherKey(pos_temp);
+ }else{
+ pos_temp = currentPos;
+ }
+ }while((pos_temp != currentPos) && !(pos_temp > currentPos));
+ }
+ }
+}
diff --git a/src/geneFinder/IntronExonSearch.java b/src/geneFinder/IntronExonSearch.java
new file mode 100755
index 0000000..bb9899f
--- /dev/null
+++ b/src/geneFinder/IntronExonSearch.java
@@ -0,0 +1,1152 @@
+package geneFinder;
+
+import java.util.TreeMap;
+import java.util.Vector;
+
+import types.*;
+
+/**
+ * class that contains methods to extract and verify introns and define the corresponding exons
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class IntronExonSearch {
+
+ /*
+ * final intron check of a cluster: every intron that ends behind cluster.stopPos is erased together with its connected rnas (read mapping and direction count included); for kept introns idTOassociatedRnas is updated. Splice sites left without introns are deleted at the end, for the others a positive fussy field (index 2) is reset and its rnas are released. NOTE(review): no support threshold is evaluated here and thisContig is never used - confirm that this is intended.
+ */
+
+ public static void finalIni_IntronSupportCheck(Gene cluster, Contig thisContig){
+
+ Vector<Integer> keysToErase = new Vector<Integer>(); // splice sites that end up without introns; removed after the iteration over the key set
+
+ for(int spliceSite : cluster.possibleIntrons.keySet()){
+
+ if(cluster.possibleIntrons.get(spliceSite)[0] != null){
+ for(int i = ((Vector<int[]>)cluster.possibleIntrons.get(spliceSite)[0]).size() -1; i>= 0; --i){ // backwards, because elements are removed by index
+ if((((Vector<int[]>)cluster.possibleIntrons.get(spliceSite)[0]).get(i)[1] > cluster.stopPos)){
+ // intron ends behind the stop position of the cluster, so erase it and also delete its rnas from the cluster
+
+ int[] badIntron = ((Vector<int[]>)cluster.possibleIntrons.get(spliceSite)[0]).get(i);
+
+ for(Rna rna : ((Vector<Vector<Rna>>)cluster.possibleIntrons.get(spliceSite)[1]).get(i)){ // rna vector at index i belongs to intron at index i
+ if(cluster.moreThanOneHitRnas.contains(rna.rnaID)){
+ // if it is contained here, this serves as one "back-up life" that is now destroyed
+ cluster.moreThanOneHitRnas.remove(rna.rnaID);
+ }else{
+ cluster.idTOassociatedRnas.remove(rna.rnaID);
+ }
+
+ if((((Vector<int[]>)cluster.possibleIntrons.get(spliceSite)[0]).get(i)[1] > cluster.stopPos)){ // NOTE(review): same test as the enclosing if, so always true here
+ // remove direcCounter
+ for(Object[] info : rna.contigsMappedOn){
+ if(info[4] != null){
+ if((((int[])info[4])[1] == badIntron[0]) && ((((int[])info[4])[0]) + (((int[])info[4])[1]) == badIntron[1])){ // mapping whose split start and split start + size match the bad intron
+ if(info[6] != null){
+ if(((String)info[6]).equals("+")){
+ cluster.direcCounter[0]--;
+ }else{
+ cluster.direcCounter[1]--;
+ }
+ }
+
+ rna.contigsMappedOn.removeElement(info);
+ rna.assignedNum = rna.assignedNum -1;
+
+ break; // required: the vector was modified while iterating over it
+ }
+ }
+ }
+
+ }
+
+ }
+ ((Vector<int[]>)cluster.possibleIntrons.get(spliceSite)[0]).removeElementAt(i);
+ ((Vector<Vector<Rna>>)cluster.possibleIntrons.get(spliceSite)[1]).removeElementAt(i);
+
+
+ if(( (Vector<int[]>) cluster.possibleIntrons.get(spliceSite)[0]).size() == 0){ // the site has just lost its last intron
+ if((badIntron[1] > cluster.stopPos) && (spliceSite < cluster.stopPos)){
+ //checkSplitRemovalSituation(cluster, spliceSite,keysToErase);
+ searchLastIntronEndBeforeCurrentPos_closeGene(cluster, spliceSite,keysToErase);
+ }
+ }
+
+
+ }else{
+
+ // update idTOassociatedRnas, saves time later
+
+ for(Rna rna : ((Vector<Vector<Rna>>)cluster.possibleIntrons.get(spliceSite)[1]).get(i)){
+ cluster.idTOassociatedRnas.get(rna.rnaID)[1] = new int[] {spliceSite,((Vector<int[]>) cluster.possibleIntrons.get(spliceSite)[0]).get(i)[1]}; // NOTE(review): assumes every supporting rna has an entry in idTOassociatedRnas
+ }
+ }
+
+ }
+
+ if(( (Vector<int[]>) cluster.possibleIntrons.get(spliceSite)[0]).size() == 0){
+ keysToErase.add(spliceSite);
+ }else{
+ // check if fussy indicated and if it is suitable (we only accept a fussy end if it is followed by an intron or if it marks a transcript end)
+
+ if(((Integer)cluster.possibleIntrons.get(spliceSite)[2]).intValue() > 0){
+ if((((Integer)cluster.possibleIntrons.get(spliceSite)[2]).intValue() != -2) && (((Integer)cluster.possibleIntrons.get(spliceSite)[2]).intValue() != 0)){ // NOTE(review): always true after the "> 0" test above
+ // remove fussy
+
+ cluster.possibleIntrons.get(spliceSite)[2] = -1;
+ cluster.possibleIntrons.get(spliceSite)[3] = -1;
+
+ // remove from fussy list
+ if(cluster.possibleFussyExons.containsKey(spliceSite)){
+ for(Rna rna : cluster.possibleFussyExons.get(spliceSite)){
+
+ if(cluster.moreThanOneHitRnas.contains(rna.rnaID)){
+ // if it is contained here, this serves as one "back-up life" that is now destroyed
+ cluster.moreThanOneHitRnas.remove(rna.rnaID);
+
+ if(cluster.idTOassociatedRnas.containsKey(rna.rnaID)){
+ if(((Integer)cluster.idTOassociatedRnas.get(rna.rnaID)[2]).intValue() != -1){
+ cluster.idTOassociatedRnas.get(rna.rnaID)[2] = -1;
+ }
+
+ }
+ }else{
+ if(cluster.idTOassociatedRnas.containsKey(rna.rnaID)){
+ cluster.idTOassociatedRnas.remove(rna.rnaID);
+ rna.assignedNum = rna.assignedNum -1;
+ }
+ }
+
+ }
+
+ cluster.possibleFussyExons.remove(spliceSite);
+ }
+ }
+ }
+ }
+
+ }
+
+ }
+
+ for(int key : keysToErase){ // deferred removal, the key set must not be modified while it is iterated above
+ cluster.possibleIntrons.remove(key);
+ }
+ }
+
+
+ /*
+ * if we are in currentCompete but there is a large gap between currentPos and nextPos, search for a split that supports bridging the gap. Without support the result either tells the caller to close the gene (3) or the open alternative transcript is closed/deleted via replaceEntryInAltTransWithUpdatedOne.
+ * returns {foundSupport, endToPassForclosedGenes, foundCurrentPosSupport}; inNormalTranscript is not evaluated (it only appears in commented-out code)
+ */
+
+ public static int[] checkIfIntronSupported(Gene gene, int currentPos, int nextPos, TreeMap<Integer,Vector<Integer>> posiCovMap, int inTranscriptMode, int searchTranscriptStart, int endToPassForclosedGenes, int inNormalTranscript){
+
+ int foundSupport = -1; // becomes 0 once a split supports the gap
+ int foundCurrentPosSupport = -1; // becomes 0 once a split or fussy end is found at or behind currentPos
+ for(int split : gene.possibleIntrons.keySet()){
+ if(gene.possibleIntrons.get(split)[0] != null){
+ for(int[] intron : ( (Vector<int[]>) gene.possibleIntrons.get(split)[0])){
+ if(currentPos<=intron[0]){ // an intron starts at or behind currentPos
+ foundSupport = 0;
+ foundCurrentPosSupport = 0;
+ break; // leaves only the inner loop; the outer loop ends through the check of both flags below
+ }else{
+ if((Math.abs(nextPos - intron[1]) <= GeneFinder.readLength)){ // && nextPos <= (intron[1] + GeneFinder.readLength)
+ foundSupport = 0;
+ }
+
+ int fussyPossible = (Integer)(gene.possibleIntrons.get(split)[2]);
+ if(currentPos <= fussyPossible){
+ foundCurrentPosSupport = 0;
+ }
+ }
+
+ }
+ }
+ if((foundCurrentPosSupport == 0) && (foundSupport == 0)){
+ break;
+ }
+ }
+
+ if(foundSupport == -1){
+
+ if(inTranscriptMode == -1){
+ // close gene here
+ if((searchTranscriptStart == -1)){// && (inNormalTranscript == -1)){
+
+ foundSupport = 3;
+
+ }else{
+ foundSupport = 0;
+ }
+
+ }else{
+ //delete transcript or close it
+
+ foundSupport = replaceEntryInAltTransWithUpdatedOne(gene, inTranscriptMode, currentPos, -1, nextPos); // -1 indicates that closed within an intron
+
+ }
+ }else{
+ for(int position : posiCovMap.keySet()){
+ if(currentPos < (position-GeneFinder.readLength) && nextPos > (position+GeneFinder.readLength)){ // position lies inside the gap, more than one read length away from both ends
+ // had no chance to look at it, so create fake intron to also regard this exon and close
+ searchCompleteIntronForEndingAndMakeNewIsoform(gene, position,posiCovMap);
+ endToPassForclosedGenes = position+posiCovMap.get(position).size();
+ }
+ }
+
+ if(foundCurrentPosSupport == -1){
+ // we have no split near currentPos, so indicate this via new transcript end (define end by a fake intron starting at currentPos + read length)
+
+ searchLastIntronEndBeforeCurrentPos(gene,currentPos,searchTranscriptStart,inTranscriptMode); // this intronEnd is not allowed to be paired with the next higher key, but instead it is defined as a new exon,
+ // so indicate this for ExonSearch
+ }
+ }
+
+ return new int[] {foundSupport,endToPassForclosedGenes,foundCurrentPosSupport};
+ }
+
+ /*
+ * variant of checkIfIntronSupported for fake genes: the same search for a supporting split is performed, but if no support is found nothing is closed or deleted here, foundSupport is simply returned as -1.
+ * returns {foundSupport, endToPassForclosedGenes, foundCurrentPosSupport}; inNormalTranscript is not evaluated
+ */
+
+ public static int[] checkIfIntronSupported_fakeGene(Gene gene, int currentPos, int nextPos, TreeMap<Integer,Vector<Integer>> posiCovMap, int inTranscriptMode, int searchTranscriptStart, int endToPassForclosedGenes, int inNormalTranscript){
+
+ int foundSupport = -1; // becomes 0 once a split supports the gap
+ int foundCurrentPosSupport = -1; // becomes 0 once a split or fussy end is found at or behind currentPos
+ for(int split : gene.possibleIntrons.keySet()){
+ if(gene.possibleIntrons.get(split)[0] != null){
+ for(int[] intron : ( (Vector<int[]>) gene.possibleIntrons.get(split)[0])){
+ if(currentPos<=intron[0]){ // an intron starts at or behind currentPos
+ foundSupport = 0;
+ foundCurrentPosSupport = 0;
+ break; // leaves only the inner loop; the outer loop ends through the check of both flags below
+ }else{
+ if(nextPos <= (intron[1] + GeneFinder.readLength) && (Math.abs(nextPos - intron[1]) <= GeneFinder.readLength)){ // NOTE(review): the first condition is implied by the second one
+ foundSupport = 0;
+ }
+
+ int fussyPossible = (Integer)(gene.possibleIntrons.get(split)[2]);
+ if(currentPos <= fussyPossible){
+ foundCurrentPosSupport = 0;
+ }
+ }
+
+ }
+ }
+ if((foundCurrentPosSupport == 0) && (foundSupport == 0)){
+ break;
+ }
+ }
+
+ if(foundSupport != -1){
+ for(int position : posiCovMap.keySet()){
+ if(currentPos < (position-GeneFinder.readLength) && nextPos > (position+GeneFinder.readLength)){ // position lies inside the gap, more than one read length away from both ends
+ // had no chance to look at it, so create fake intron to also regard this exon and close
+ searchCompleteIntronForEndingAndMakeNewIsoform(gene, position,posiCovMap);
+ endToPassForclosedGenes = position+posiCovMap.get(position).size();
+ }
+ }
+
+ if(foundCurrentPosSupport == -1){
+ // we have no split near currentPos, so indicate this via new transcript end (define end by a fake intron starting at currentPos + read length)
+
+ searchLastIntronEndBeforeCurrentPos(gene,currentPos,searchTranscriptStart,inTranscriptMode); // this intronEnd is not allowed to be paired with the next higher key, but instead it is defined as a new exon,
+ // so indicate this for ExonSearch
+ }
+ }
+
+ return new int[] {foundSupport,endToPassForclosedGenes,foundCurrentPosSupport};
+ }
+
+ /*
+ * check each support during the candidate initialization: only if neither a supporting split nor a split/fussy end at or behind currentPos exists, isoforms are created for the positions inside the gap (when no transcript start is searched) and a new transcript end is registered; nothing is returned
+ */
+
+ public static void checkIfIntronSupported_forGeneIni(Gene gene, int currentPos, int nextPos, TreeMap<Integer,Vector<Integer>> posiCovMap, int inTranscriptMode, int searchTranscriptStart, int endToPassForclosedGenes){
+
+ int foundSupport = -1; // becomes 0 once a split supports the gap
+ int foundCurrentPosSupport = -1; // becomes 0 once a split or fussy end is found at or behind currentPos
+ for(int split : gene.possibleIntrons.keySet()){
+ if(gene.possibleIntrons.get(split)[0] != null){
+ for(int[] intron : ( (Vector<int[]>) gene.possibleIntrons.get(split)[0])){
+ if(currentPos<=intron[0]){ // an intron starts at or behind currentPos
+ foundSupport = 0;
+ foundCurrentPosSupport = 0;
+ break; // leaves only the inner loop; the outer loop ends through the check of both flags below
+ }else{
+ if(nextPos <= (intron[1] + GeneFinder.readLength) && (Math.abs(nextPos - intron[1]) <= GeneFinder.readLength)){ // NOTE(review): the first condition is implied by the second one
+ foundSupport = 0;
+ }
+
+ int fussyPossible = (Integer)(gene.possibleIntrons.get(split)[2]);
+ if(currentPos <= fussyPossible){
+ foundCurrentPosSupport = 0;
+ }
+ }
+
+ }
+ }
+ if((foundCurrentPosSupport == 0) && (foundSupport == 0)){
+ break;
+ }
+ }
+
+ if(foundSupport == -1 && foundCurrentPosSupport == -1){
+
+ if(searchTranscriptStart == -1){
+ for(int position : posiCovMap.keySet()){
+ if(currentPos < (position-GeneFinder.readLength) && nextPos > (position+GeneFinder.readLength)){ // position lies inside the gap, more than one read length away from both ends
+ // had no chance to look at it, so create fake intron to also regard this exon and close
+ searchCompleteIntronForEndingAndMakeNewIsoform(gene, position,posiCovMap);
+ endToPassForclosedGenes = position+posiCovMap.get(position).size(); // NOTE(review): dead store, the parameter is neither read afterwards nor returned
+ }
+ }
+ }
+
+ if(foundCurrentPosSupport == -1){ // NOTE(review): always true inside the enclosing if
+ // we have no split near currentPos, so indicate this via new transcript end (define end by a fake intron starting at currentPos + read length)
+
+ searchLastIntronEndBeforeCurrentPos(gene,currentPos,searchTranscriptStart,inTranscriptMode); // this intronEnd is not allowed to be paired with the next higher key, but instead it is defined as a new exon,
+ // so indicate this for ExonSearch
+ }
+
+ }
+
+ }
+
+ /*
+ * look up every intron whose end coordinate equals "ending"; each of them is marked as "not continued" and an alternative
+ * transcript starting at ending + (number of coverage entries stored for ending) is registered, unless such an isoform is already contained
+ */
+
+ public static void searchCompleteIntronForEndingAndMakeNewIsoform(Gene gene, int ending, TreeMap<Integer,Vector<Integer>> posiCovMap){
+     for(Object[] siteEntry : gene.possibleIntrons.values()){
+         if(siteEntry[0] == null){
+             continue; // splice site without an intron list
+         }
+         for(int[] candidate : (Vector<int[]>) siteEntry[0]){
+             if(ending != candidate[1]){
+                 continue; // intron ends somewhere else
+             }
+             int isoformStart = ending + posiCovMap.get(ending).size();
+             if(testIfAlternativeIsoformIsNotContained(gene, isoformStart, ending)){
+                 gene.intronEndsThatAreNotContinued.add(candidate);
+                 gene.alternativeTranscripts.add(new Object[] {isoformStart,Integer.MAX_VALUE,-1,-1,ending,new Vector<int[]>()});
+             }
+         }
+     }
+ }
+
+ /*
+ * find the intron with the largest end coordinate that does not exceed currentPos. This intron end is not allowed to be paired with the next higher key,
+ * instead it is defined as a new exon: it is marked as "not continued" and an open alternative transcript is added (returns 0). Returns -1 if nothing was added.
+ */
+
+ public static int searchLastIntronEndBeforeCurrentPos(Gene gene, int currentPos, int searchTranscriptStart, int inTranscriptMode){
+
+     int[] latestIntron = {-1,-1}; // placeholder until a real intron is found
+
+     for(Object[] siteEntry : gene.possibleIntrons.values()){
+         if(siteEntry[0] == null){
+             continue;
+         }
+         for(int[] candidate : (Vector<int[]>) siteEntry[0]){
+             if((candidate[1] <= currentPos) && (candidate[1] > latestIntron[1])){
+                 latestIntron = candidate;
+             }
+         }
+     }
+
+     // only extract if an intron was found, no transcript start is searched and we are not inside an alternative transcript
+     boolean mayExtract = (latestIntron[1] != -1) && (searchTranscriptStart == -1) && (inTranscriptMode == -1);
+     if(!mayExtract || !checkIfExtractOk(gene, latestIntron, currentPos)){
+         return -1;
+     }
+
+     // the new transcript entry starts one read length behind currentPos and remembers the intron end
+     gene.intronEndsThatAreNotContinued.add(latestIntron);
+     gene.alternativeTranscripts.add(new Object[] {(currentPos+GeneFinder.readLength),Integer.MAX_VALUE,-1,-1,latestIntron[1],new Vector<int[]>()});
+
+     return 0;
+ }
+
+ /*
+ * if we have identified a possible intron end, check if this end is really feasible or if there is a contradicting splice site
+ */
+
+ public static boolean checkIfExtractOk(Gene gene, int[] maxEnd, int currentPos){
+
+ if(gene.possibleIntrons.higherKey(maxEnd[0]) == null){
+ return true;
+ }else{
+
+ int higherKey = maxEnd[0];
+
+ do{
+ higherKey = gene.possibleIntrons.higherKey(higherKey);
+
+ if((higherKey > maxEnd[1]) && (higherKey < currentPos)){
+ return false;
+ }
+
+ }while(gene.possibleIntrons.higherKey(higherKey) != null);
+ }
+
+ return true;
+
+ }
+
+ /*
+ * handle the first alternative transcript whose start (field 0) equals inTranscriptMode: it is replaced by a six-field copy carrying indicatorIfWithinIntron as stop (returns 0), or, if indicatorIfWithinIntron is -1 and the gap to nextPos exceeds two read lengths, deleted (returns 2); returns -1 if no such transcript exists
+ */
+
+ public static int replaceEntryInAltTransWithUpdatedOne(Gene gene, int inTranscriptMode, int currentPos, int indicatorIfWithinIntron, int nextPos){
+
+     for(int idx = 0; idx < gene.alternativeTranscripts.size(); ++idx){
+         Object[] oldEntry = gene.alternativeTranscripts.get(idx);
+
+         if(((Integer) oldEntry[0]) != inTranscriptMode){
+             continue; // not the transcript we are looking for
+         }
+
+         boolean closedInIntron = (indicatorIfWithinIntron == -1);
+         boolean gapTooLarge = ((nextPos - currentPos) > (2*GeneFinder.readLength));
+
+         if(closedInIntron && gapTooLarge){
+             // no support for this transcript, so delete it
+             gene.alternativeTranscripts.removeElementAt(idx);
+             return 2;
+         }
+
+         // otherwise keep the transcript and store the closing information
+         Object[] closedEntry = new Object[6];
+         for(int field = 0; field < 4; ++field){
+             closedEntry[field] = oldEntry[field];
+         }
+         closedEntry[4] = indicatorIfWithinIntron;
+         closedEntry[5] = new Vector<int[]>();
+
+         gene.alternativeTranscripts.setElementAt(closedEntry, idx);
+         return 0; // to include the rnas
+     }
+
+     return -1; // no transcript starts at inTranscriptMode
+ }
+
+ /*
+ * replace the first alternative transcript starting at inTranscriptMode with a seven-field entry (stop = indicatorIfWithinIntron, field 6 = Integer.MAX_VALUE) and return 0; entries with more than six fields, or with six fields while indicatorIfWithinIntron is -1, stay untouched (returns -1, also if no transcript matches)
+ */
+
+ public static int replaceEntryInAltTransWithUpdatedOne_endTranscript(Gene gene, int inTranscriptMode, int currentPos, int indicatorIfWithinIntron){
+
+     for(int idx = 0; idx < gene.alternativeTranscripts.size(); ++idx){
+         Object[] oldEntry = gene.alternativeTranscripts.get(idx);
+         if(((Integer) oldEntry[0]) != inTranscriptMode){
+             continue;
+         }
+
+         // only the first transcript starting at inTranscriptMode is considered
+         boolean shortEntry = (oldEntry.length < 6);
+         boolean sixFieldsAndRealStop = (oldEntry.length == 6) && (indicatorIfWithinIntron != -1);
+         if(!shortEntry && !sixFieldsAndRealStop){
+             return -1;
+         }
+
+         Object[] endEntry = new Object[7];
+         for(int field = 0; field < 4; ++field){
+             endEntry[field] = oldEntry[field];
+         }
+         endEntry[4] = indicatorIfWithinIntron;
+         endEntry[5] = new Vector<int[]>();
+         endEntry[6] = Integer.MAX_VALUE;
+
+         gene.alternativeTranscripts.setElementAt(endEntry, idx);
+         return 0; // to include the rnas
+     }
+
+     return -1;
+ }
+
+
+ /*
+ * close-gene version of searchLastIntronEndBeforeCurrentPos: splitTooBig is a splice site whose introns were erased. If an intron ends at or before splitTooBig and checkIfExtractOk_closeGene agrees, a new alternative transcript is registered to avoid big exons (returns 1).
+ * if no intron ends at or before splitTooBig and no splice site with an intron list lies before it, the gene start is moved to the left-most alternative transcript start before splitTooBig (returns 0).
+ * returns -1 in all other cases; keysToErase is not used
+ */
+
+ public static int searchLastIntronEndBeforeCurrentPos_closeGene(Gene gene, int splitTooBig, Vector<Integer> keysToErase){
+
+ int[] maxEnd = {-1,-1}; // intron with the largest end that does not exceed splitTooBig
+
+ for(int split : gene.possibleIntrons.keySet()){
+ if(gene.possibleIntrons.get(split)[0] != null){
+ for(int[] intron : ( (Vector<int[]>) gene.possibleIntrons.get(split)[0])){
+ if((splitTooBig>= intron[1]) && (maxEnd[1] < intron[1])){
+ maxEnd = intron;
+ }
+ }
+ }
+ }
+
+ if(maxEnd[1] != -1){
+ int indicator = checkIfExtractOk_closeGene(gene, maxEnd, splitTooBig);
+
+ if(indicator == 1){ // simply extract the alternative transcript
+
+ if(testIfAlternativeIsoformIsNotContained(gene,splitTooBig,maxEnd[1])){
+ Vector<int[]> vecTemp = new Vector<int[]>();
+ gene.intronEndsThatAreNotContinued.add(maxEnd);
+ gene.alternativeTranscripts.add(new Object[] {splitTooBig,Integer.MAX_VALUE,-1,-1,maxEnd[1],vecTemp});
+ }
+
+ return 1; // also returned if the isoform was already contained
+ }
+
+ }else{
+ // check if splitTooBig is the only site or the first site, if yes, assign to gene start
+ boolean assignToGeneStart = true;
+
+ for(int split : gene.possibleIntrons.keySet()){
+ if(gene.possibleIntrons.get(split)[0] != null){
+ if(split < splitTooBig){
+ assignToGeneStart = false;
+ break;
+ }
+ }
+ }
+
+ if(assignToGeneStart){
+
+ if(gene.alternativeTranscripts.size() > 0){
+ // search for the alternative start with the left-most position
+ int start = Integer.MAX_VALUE;
+ int bestPos = -1;
+ for(int posTrans = 0; posTrans < gene.alternativeTranscripts.size();++posTrans){
+
+ Object[] altTrans = gene.alternativeTranscripts.get(posTrans);
+ int startThis = -1;
+
+ if(((Integer)altTrans[1]).intValue() == Integer.MAX_VALUE){ // field 1 == MAX_VALUE: field 4 is taken as the start of this transcript
+ startThis = ((Integer)altTrans[4]).intValue();
+ }else{
+ startThis = ((Integer)altTrans[0]).intValue();
+ }
+
+ if((startThis < start) && (startThis < splitTooBig)){
+ gene.startPos = startThis;
+ start = startThis;
+ bestPos = posTrans;
+ }
+ }
+
+ if(bestPos != -1){
+ // defined an alternative transcript as the new start, so this transcript is regarded as normal and can be erased
+ gene.alternativeTranscripts.removeElementAt(bestPos);
+ }
+ }
+
+ return 0;
+ }
+
+ }
+
+ return -1;
+
+ }
+
+ /*
+ * check the situation after the introns of splice site splitTooBig have been removed: if no splice site with an intron list lies before splitTooBig,
+ * the gene start is moved to the left-most alternative transcript start before splitTooBig and that transcript is erased.
+ * always returns 0; keysToErase is not used. NOTE(review): within the visible code this method is only referenced from a commented-out call.
+ */
+
+ public static int checkSplitRemovalSituation(Gene gene, int splitTooBig, Vector<Integer> keysToErase){
+
+ boolean assignToGeneStart = true; // stays true if splitTooBig is the only or the first site with introns
+
+ for(int split : gene.possibleIntrons.keySet()){
+ if(gene.possibleIntrons.get(split)[0] != null){
+ if(split < splitTooBig){
+ assignToGeneStart = false;
+ break;
+ }
+ }
+ }
+
+ if(assignToGeneStart){
+
+ if(gene.alternativeTranscripts.size() > 0){
+ // search for the alternative start with the left-most position
+ int start = Integer.MAX_VALUE;
+ int bestPos = -1;
+ for(int posTrans = 0; posTrans < gene.alternativeTranscripts.size();++posTrans){
+
+ Object[] altTrans = gene.alternativeTranscripts.get(posTrans);
+ int startThis = -1;
+
+ if(((Integer)altTrans[1]).intValue() == Integer.MAX_VALUE){ // field 1 == MAX_VALUE: field 4 is taken as the start of this transcript
+ startThis = ((Integer)altTrans[4]).intValue();
+ }else{
+ startThis = ((Integer)altTrans[0]).intValue();
+ }
+
+ if((startThis < start) && (startThis < splitTooBig)){
+ gene.startPos = startThis;
+ start = startThis;
+ bestPos = posTrans;
+ }
+ }
+
+ if(bestPos != -1){
+ // defined an alternative transcript as the new start, so this transcript is regarded as normal and can be erased
+ gene.alternativeTranscripts.removeElementAt(bestPos);
+ }
+ }
+ }
+ return 0;
+ }
+
+ /*
+ * returns true if no stored alternative transcript with at least six fields has start (field 0) == split and field 4 == end, i.e. the new isoform is not contained yet
+ */
+
+ public static boolean testIfAlternativeIsoformIsNotContained(Gene gene, int split, int end){
+
+     for(Object[] known : gene.alternativeTranscripts){
+         if(known.length < 6){
+             continue; // only entries with at least six fields are compared
+         }
+         int knownStart = (Integer) known[0];
+         int knownEnd = (Integer) known[4];
+         boolean sameIsoform = (knownStart == split) && (knownEnd == end);
+         if(sameIsoform){
+             return false;
+         }
+     }
+     return true;
+ }
+
+ /*
+ * if we have identified a possible intron end, check if this end is really feasible or if there is a contradicting splice site
+ */
+
+ public static int checkIfExtractOk_closeGene(Gene gene, int[] maxEnd, int currentPos){
+
+ if(gene.possibleIntrons.higherKey(maxEnd[0]) == null){
+ return 1;
+ }else{
+
+ int higherKey = maxEnd[0];
+
+ do{
+ higherKey = gene.possibleIntrons.higherKey(higherKey);
+
+ if((higherKey > maxEnd[1]) && (higherKey < currentPos)){
+ return -1;
+ }else if((higherKey > currentPos) && (higherKey < gene.stopPos)){
+ return -1;
+ }
+
+ }while(gene.possibleIntrons.higherKey(higherKey) != null);
+ }
+
+ return 1;
+
+ }
+
+ /*
+ * make sure the given intron is registered at its splice site (intron[0]); if it is missing it is added together with an empty vector of supporting rnas
+ * note: most of the times these introns only emerge due to the merging of two clusters, so their supporting rna vector is empty!
+ */
+
+ public static void checkIfIntronIsIncluded_AndAdd(Gene cluster, int[] intron){
+
+     if(!cluster.possibleIntrons.containsKey(intron[0])){
+
+         // unknown splice site: create a complete entry {introns, rna vectors, -1, -1}
+         Vector<int[]> newIntrons = new Vector<int[]>();
+         newIntrons.add(intron);
+
+         Vector<Vector<Rna>> newSupport = new Vector<Vector<Rna>>();
+         newSupport.add(new Vector<Rna>());
+
+         cluster.possibleIntrons.put(intron[0], new Object[] {newIntrons,newSupport,-1,-1});
+         return;
+     }
+
+     // known splice site
+     Object[] siteEntry = cluster.possibleIntrons.get(intron[0]);
+
+     if(siteEntry[0] == null){
+         return; // a site without intron list stays untouched
+     }
+
+     Vector<int[]> knownIntrons = (Vector<int[]>) siteEntry[0];
+
+     for(int[] known : knownIntrons){
+         if(known[1] == intron[1]){
+             return; // already included
+         }
+     }
+
+     // same splice site, but new intron end: append the intron and an empty support vector
+     knownIntrons.add(intron);
+     ((Vector<Vector<Rna>>) siteEntry[1]).add(new Vector<Rna>());
+ }
+
+ /*
+ * go through all rnas mapped at currentPos and register their split mappings as introns of the cluster, provided the splice site is listed in thisContig.splicePositions with a value of at least "limit"; mappings with rejected splice sites are removed from the rna. Rnas without split are remembered as "fussy exon" support for splitStart.
+ * posiCovMap contains positions that belong to different introns and indicates how the coverage map has to be updated at this position. Returns {currentCompete, posiCovMap, rnasThatDoNotSupportAnySplit, otherSpliceSites}.
+ */
+
+ public static Object[] findIntrons_RespectAlternative(Gene cluster, Contig thisContig, int splitStart, int currentPos, int currentCompete, TreeMap<Integer,Vector<Integer>> posiCovMap, double limit){
+
+ Vector<int[]> listOfIntrons = new Vector<int[]>(); // contains all introns supported by this splice site (note: first take all possible introns, remove introns without sufficient support later)
+ Vector<Rna> rnasThatDoNotFit = new Vector<Rna>(); // store all rnas that have to be erased from this position
+ Vector<Rna> rnasThatDoNotSupportAnySplit = new Vector<Rna>(); // store all rnas that are not split reads
+
+ Vector<Integer> otherSpliceSites = new Vector<Integer>(); // if the currentSite does not have enough support, but others in reach, store them to regard them in coverage update
+
+ boolean hadSiteBefore = false; // true if the cluster already holds an intron list for splitStart
+
+ if(cluster.possibleIntrons.containsKey(splitStart) && cluster.possibleIntrons.get(splitStart)[0] != null){
+ listOfIntrons = (Vector<int[]>) cluster.possibleIntrons.get(splitStart)[0];
+ // simply add up all rna vectors if new support is found
+ hadSiteBefore = true;
+ }
+
+ for(Rna rna : thisContig.positionTOmappingRnas.get(currentPos)){
+
+ int occurenceCounter = 0; // number of mappings of this rna at currentPos on this contig
+
+ Vector<Object[]> badInfo = new Vector<Object[]>(); // if a splice site is not accepted, remove this info and remove this read from the current position
+
+ for(Object[] info : rna.contigsMappedOn){
+ if(((Integer) info[1] == currentPos) && (((Contig) info[0]).equals(thisContig))){ // info[0]: contig, info[1]: mapping position
+
+ occurenceCounter++;
+
+ if(info[4] != null){ // info[4]: split information, used below as {intron size, split start}
+ if(((int[])info[4])[1] == splitStart){
+ if(thisContig.splicePositions.get(splitStart) >= limit){ // NOTE(review): assumes splitStart is a key of splicePositions
+ Object[] returnObject = findRightIntronAndAddRna(cluster, currentPos, ((int[])info[4])[1] , hadSiteBefore, listOfIntrons, posiCovMap, rna, info);
+
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnObject[0];
+ boolean updateCompete = (Boolean) returnObject[1];
+ listOfIntrons = (Vector<int[]>) returnObject[2];
+
+ if((currentCompete == -1) || updateCompete){
+ currentCompete = Math.max(currentCompete,(((int[])info[4])[0] + splitStart)); // extend to the end of this intron
+ }
+
+ if(info[6] != null){ // info[6]: direction, "+" is counted in direcCounter[0], everything else in direcCounter[1]
+ if(((String)info[6]).equals("+")){
+ cluster.direcCounter[0]++;
+ }else{
+ cluster.direcCounter[1]++;
+ }
+ }
+ }else{
+ badInfo.add(info);
+ }
+ }else{
+ if(thisContig.splicePositions.get(((int[])info[4])[1]) >= limit){ // the split belongs to another splice site with sufficient value
+ otherSpliceSites.add(((int[])info[4])[1]);
+ Vector<int[]> listOfIntrons_alternative = new Vector<int[]>();
+
+ boolean hadSiteBefore_alternative = false;
+
+ if(cluster.possibleIntrons.containsKey(((int[])info[4])[1]) && cluster.possibleIntrons.get(((int[])info[4])[1])[0] != null){
+ listOfIntrons_alternative = (Vector<int[]>) cluster.possibleIntrons.get(((int[])info[4])[1])[0];
+ // simply add up all rna vectors if new support is found
+ hadSiteBefore_alternative = true;
+ }
+
+ Object[] returnObject = findRightIntronAndAddRna(cluster, currentPos, ((int[])info[4])[1] , hadSiteBefore_alternative, listOfIntrons_alternative, posiCovMap, rna, info);
+
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnObject[0];
+
+ listOfIntrons_alternative = (Vector<int[]>) returnObject[2]; // TODO: was listOfIntrons before!
+
+ if(currentCompete == -1){ // at the moment no open alternative interval, so open a new one
+
+ if(cluster.possibleIntrons.containsKey(splitStart) && cluster.possibleIntrons.get(splitStart)[0] != null && !((Vector<int[]>)cluster.possibleIntrons.get(splitStart)[0]).isEmpty()){
+ currentCompete = Math.max(((Vector<int[]>) cluster.possibleIntrons.get(splitStart)[0]).get(0)[1],(((int[])info[4])[0] + ((int[])info[4])[1]));
+ }else{
+ currentCompete = ((int[])info[4])[0] + ((int[])info[4])[1];
+ }
+
+ }else{
+ currentCompete = Math.max(currentCompete,(((int[])info[4])[0] + ((int[])info[4])[1]));
+ }
+
+ if(info[6] != null){
+ if(((String)info[6]).equals("+")){
+ cluster.direcCounter[0]++;
+ }else{
+ cluster.direcCounter[1]++;
+ }
+ }
+
+ }else{
+ // else we will not accept this site, so delete this entry for the mappings of the current read
+ badInfo.add(info);
+ }
+
+ }
+
+ }else{
+ if(thisContig.splicePositions.get(splitStart) >= limit){ // unsplit mapping next to an accepted splice site
+ if(!rnasThatDoNotSupportAnySplit.contains(rna)){
+ rnasThatDoNotSupportAnySplit.add(rna);
+ HelperFunctions_GeneSearch.updatePosiCovMap(posiCovMap, splitStart, (GeneFinder.readLength - (splitStart-currentPos)));
+ }
+ }
+
+ }
+
+ }
+ }
+
+ if(badInfo.size() > 0){
+
+ for(Object[] info : badInfo){ // removal is done here, outside the iteration over contigsMappedOn
+ rna.contigsMappedOn.removeElement(info);
+ rna.assignedNum--;
+ if(rna.contigsMappedOn.size() == 0){
+ rnasThatDoNotFit.add(rna);
+ }
+
+ }
+
+ if(occurenceCounter <= badInfo.size() && !(rnasThatDoNotFit.contains(rna))){ // every mapping at this position was rejected
+ rnasThatDoNotFit.add(rna);
+ }
+
+ }
+
+ if(occurenceCounter > 1){
+ for(int i = (occurenceCounter - badInfo.size()); i > 1;i--){ // if there are x occurrences, we include x-1 in moreThanOnehitRnas and 1 in idTomaapingRnas
+ cluster.moreThanOneHitRnas.add(rna.rnaID);
+ }
+ }
+
+
+ }
+
+ if(rnasThatDoNotFit.size() > 0){
+ thisContig.positionTOmappingRnas.get(currentPos).removeAll(rnasThatDoNotFit);
+ }
+
+ if(rnasThatDoNotSupportAnySplit.size() != 0){
+ if(cluster.possibleIntrons.containsKey(splitStart)){
+ cluster.possibleIntrons.get(splitStart)[2] = (currentPos + GeneFinder.readLength); // field 2: fussy value of this splice site
+ }else{
+ Vector<int[]> intronVec_tmp = new Vector<int[]>();
+ Vector<Vector<Rna>> rnaTmp = new Vector<Vector<Rna>>();
+ Object[] objectTmp = {intronVec_tmp,rnaTmp,(currentPos + GeneFinder.readLength),-1};
+ cluster.possibleIntrons.put(splitStart,objectTmp);
+ }
+
+ if(cluster.possibleFussyExons.containsKey(splitStart)){
+ cluster.possibleFussyExons.get(splitStart).addAll(rnasThatDoNotSupportAnySplit);
+ }else{
+ cluster.possibleFussyExons.put(splitStart,rnasThatDoNotSupportAnySplit);
+ }
+
+ }
+
+ Object[] returnObject ={currentCompete,posiCovMap,rnasThatDoNotSupportAnySplit,otherSpliceSites};
+
+ return returnObject;
+ }
+
+ /*
+ * help function: search listOfIntrons for the intron ending at (intron size from info[4] + splitStart) and add the rna as its support; if there is no such intron, a new one is registered at splitStart. posiCovMap is updated at the intron end. Returns {posiCovMap, updateCompete (true only if a new intron was created), listOfIntrons}.
+ */
+
+ public static Object[] findRightIntronAndAddRna(Gene cluster, int currentPos, int splitStart, boolean hadSiteBefore, Vector<int[]> listOfIntrons, TreeMap<Integer,Vector<Integer>> posiCovMap, Rna rna, Object[] info){
+
+
+ boolean foundIntron = false;
+
+ for(int pos = 0; pos < listOfIntrons.size(); ++pos){
+ int[] intron = listOfIntrons.get(pos);
+ if(intron[1] == (((int[])info[4])[0] + splitStart)){
+ boolean updateMap = true; // set to false if the rna already supports this intron
+ // already have this intron, so add to list of supporting rnas
+ if(hadSiteBefore){
+ if(!(((Vector<Vector<Rna>>) cluster.possibleIntrons.get(splitStart)[1]).get(pos).contains(rna))){ // rna vector at index pos belongs to intron at index pos
+ ((Vector<Vector<Rna>>) cluster.possibleIntrons.get(splitStart)[1]).get(pos).add(rna);
+ }else{
+ updateMap = false;
+ }
+
+ }else{
+ updatePossibleIntronsOfCluster(cluster, splitStart, rna,((int[])info[4])[0]); // creates the entry for splitStart with this intron and rna
+ listOfIntrons = (Vector<int[]>) cluster.possibleIntrons.get(splitStart)[0];
+ hadSiteBefore = true;
+ }
+ foundIntron = true;
+ if(updateMap){
+ HelperFunctions_GeneSearch.updatePosiCovMap(posiCovMap,(((int[])info[4])[0] + splitStart), (GeneFinder.readLength - (splitStart-currentPos)));
+ }
+ break; // break to get out of this intron for-loop
+ }
+ }
+ boolean updateCompete = false;
+
+ if(!foundIntron){
+
+ if(hadSiteBefore){
+ int[] intron_tmp = {splitStart,((int[])info[4])[0] + splitStart};
+ ((Vector<int[]>) cluster.possibleIntrons.get(splitStart)[0]).add(intron_tmp);
+ Vector<Rna> rnaTmp = new Vector<Rna>();
+ rnaTmp.add(rna);
+ ((Vector<Vector<Rna>>) cluster.possibleIntrons.get(splitStart)[1]).add(rnaTmp);
+ updateCompete = true;
+ }else{
+ updatePossibleIntronsOfCluster(cluster, splitStart, rna,((int[])info[4])[0]);
+ hadSiteBefore = true;
+ updateCompete = true;
+ }
+
+ listOfIntrons = (Vector<int[]>) cluster.possibleIntrons.get(splitStart)[0]; // hand the intron list now stored in the cluster back to the caller
+
+ HelperFunctions_GeneSearch.updatePosiCovMap(posiCovMap,(((int[])info[4])[0] + splitStart), (GeneFinder.readLength - (splitStart-currentPos)));
+ }
+
+ return new Object[] {posiCovMap,updateCompete,listOfIntrons};
+ }
+
+ /*
+ * (re)initialise the possible-intron entry of splice site splitStart, needed for alternative splicing: afterwards it holds exactly one intron {splitStart, splitStart + sizeIntron} supported by the given rna, fields 2 and 3 are -1
+ */
+
+ public static void updatePossibleIntronsOfCluster(Gene cluster, int splitStart, Rna rna, int sizeIntron){
+
+     // the single intron of this site
+     Vector<int[]> introns = new Vector<int[]>();
+     introns.add(new int[] {splitStart, sizeIntron + splitStart});
+
+     // its support: one rna vector per intron
+     Vector<Rna> support = new Vector<Rna>();
+     support.add(rna);
+
+     Vector<Vector<Rna>> supportPerIntron = new Vector<Vector<Rna>>();
+     supportPerIntron.add(support);
+
+     // fields 2 and 3 start out as -1
+     Object[] siteEntry = {introns, supportPerIntron, -1, -1};
+
+     // an entry that already exists for splitStart is replaced
+     cluster.possibleIntrons.put(splitStart, siteEntry);
+
+ }
+
+
+ /*
+ * after a splice site change, check if the introns of the last splice site have enough support
+ *
+ * The introns stored for spliceSite in cluster.possibleIntrons are visited from the last to the first one.
+ * An intron whose vector of supporting rnas has fewer than "limit" entries is removed together with that vector.
+ * For every rna of such an intron, the rna id is removed from cluster.moreThanOneHitRnas if it is contained there,
+ * otherwise from cluster.idTOassociatedRnas; the split mapping that matches the intron is removed from
+ * rna.contigsMappedOn, rna.assignedNum is decreased and posiCovMap is updated by the helper function.
+ * If a removed intron ends at currentMaxInterval, currentMaxInterval is replaced by the value collected in maxInterval.
+ * If no intron of the splice site is left, the splice site is removed from cluster.possibleIntrons and, if its
+ * third slot is not -1, also from cluster.possibleFussyExons.
+ *
+ * returns: {currentMaxInterval (Integer), posiCovMap, switchedFromFussyToExon (Boolean), notContained (Boolean)}
+ */
+
+ public static Object[] checkIntronsAfterSpliceSiteSwitch(Gene cluster, int spliceSite, int currentMaxInterval, TreeMap<Integer,Vector<Integer>> posiCovMap, int nextPos, double limit){
+
+ int maxInterval = -1;
+ boolean changeCurrentMaxInterval = false;
+ boolean foundAtLeastOneIntron = false; // if set to true, this indicates that this splice site can stay in possible introns
+ boolean switchedFromFussyToExon = false;
+ boolean notContained = false; // indicates whether this spliceSite is contained in cluster or not
+
+ if(cluster.possibleIntrons.containsKey(spliceSite)){
+ for(int position = ((Vector<int[]>)cluster.possibleIntrons.get(spliceSite)[0]).size() - 1; position >= 0; --position){ // last to first, entries are removed by index below
+ if(((Vector<Vector<Rna>>)cluster.possibleIntrons.get(spliceSite)[1]).get(position).size() < limit){
+
+ // intron has not enough support, so erase and also delete rnas from cluster
+
+ int[] badIntron = ((Vector<int[]>)cluster.possibleIntrons.get(spliceSite)[0]).get(position); // {start, end} of the intron
+ ((Vector<int[]>)cluster.possibleIntrons.get(spliceSite)[0]).removeElementAt(position);
+
+ if(badIntron[1] == currentMaxInterval){ // the removed intron ends at the current maximum
+ int localMax = checkIfOtherSpliceSitesSupportCurrentmax(cluster,currentMaxInterval,nextPos);
+ if(localMax != -1){
+ maxInterval = localMax;
+ }
+ changeCurrentMaxInterval = true;
+ }
+
+ for(Rna rna : ((Vector<Vector<Rna>>)cluster.possibleIntrons.get(spliceSite)[1]).get(position)){
+
+ if(cluster.moreThanOneHitRnas.contains(rna.rnaID)){
+ // if it is contained here, this serves as one "back-up life" that is now destroyed
+ cluster.moreThanOneHitRnas.remove(rna.rnaID);
+ }else{
+ cluster.idTOassociatedRnas.remove(rna.rnaID);
+ }
+
+ for(Object[] info : rna.contigsMappedOn){
+ if(info[4] != null){
+ if((((int[])info[4])[1] == badIntron[0]) && ((((int[])info[4])[0]) + (((int[])info[4])[1]) == badIntron[1])){ // info[4] is read as {intron size, split start}
+
+ int diffToSplitStart = (badIntron[0] - (Integer) info[1]);
+
+ posiCovMap = HelperFunctions_GeneSearch.updatePosiCovMap_AfterSpliceSwitch(posiCovMap,badIntron[1],(GeneFinder.readLength-diffToSplitStart));
+
+ rna.contigsMappedOn.removeElement(info);
+ rna.assignedNum = rna.assignedNum - 1;
+ break; // the vector that is iterated was just modified, so leave the loop
+ }
+ }
+ }
+
+ }
+
+ ((Vector<Vector<Rna>>)cluster.possibleIntrons.get(spliceSite)[1]).removeElementAt(position);
+
+ }else{
+ maxInterval = Math.max(maxInterval,((Vector<int[]>)cluster.possibleIntrons.get(spliceSite)[0]).get(position)[1]);
+ foundAtLeastOneIntron = true;
+ }
+ }
+ }else{
+ notContained = true;
+ }
+
+ if(changeCurrentMaxInterval){
+ currentMaxInterval = maxInterval; // if all splice sites have been erased, set to -1, but this is as desired
+ }
+
+ if(!foundAtLeastOneIntron && cluster.possibleIntrons.containsKey(spliceSite)){
+ // also remove this splice site from possible intron map, only remember "fussy exon", if exist
+
+ int fussyExon = ((Integer)cluster.possibleIntrons.get(spliceSite)[2]);
+
+ if(fussyExon != -1){
+ // remove from fussy exons, serves as real exon now!
+
+ if(cluster.possibleFussyExons.containsKey(spliceSite)){
+ cluster.possibleFussyExons.remove(spliceSite);
+ }else{
+ System.err.println("Splice site not in fussy exons!");
+ }
+
+ switchedFromFussyToExon = true;
+ }
+
+ cluster.possibleIntrons.remove(spliceSite);
+ }
+
+ Object[] returnObject = {currentMaxInterval,posiCovMap,switchedFromFussyToExon,notContained};
+
+ return returnObject;
+ }
+
+ /*
+ * only change currentCompete interval if there are no other splice sites supporting it
+ *
+ * searches all splice sites that are stored in cluster.possibleIntrons for the largest intron end
+ * that is not smaller than nextPos
+ * returns this intron end, or -1 if there is none; the parameter currentMaxInterval is not evaluated here
+ */
+
+ public static Integer checkIfOtherSpliceSitesSupportCurrentmax(Gene cluster, int currentMaxInterval, int nextPos){
+
+     int largestEnd = -1;
+
+     for(Object[] siteEntry : cluster.possibleIntrons.values()){
+
+         if(siteEntry[0] == null){
+             continue;   // this splice site has no intron vector
+         }
+
+         for(int[] candidate : (Vector<int[]>) siteEntry[0]){
+             if(candidate[1] >= nextPos){
+                 largestEnd = Math.max(largestEnd, candidate[1]);
+             }
+         }
+     }
+
+     return largestEnd;
+ }
+
+ /*
+ * find all intron ends associated with the given spliceSite
+ *
+ * looks at all rnas registered in thisContig.positionTOmappingRnas for nextPos and collects, without duplicates,
+ * the intron end (sum of the two values of the split information) of every mapping that starts at nextPos on this contig
+ * and carries split information; ends are only collected if the count stored for spliceKey in
+ * thisContig.splicePositions reaches GeneFinder.spliceLim
+ */
+
+ public static Vector<Integer> findIntronEnds(int spliceKey, int nextPos,Contig thisContig){
+
+     Vector<Integer> ends = new Vector<Integer>();
+
+     for(Rna rna : thisContig.positionTOmappingRnas.get(nextPos)){
+         for(Object[] mapping : rna.contigsMappedOn){
+
+             boolean startsHere = ((Integer) mapping[1] == nextPos) && (((Contig) mapping[0]).equals(thisContig));
+
+             if(!startsHere || mapping[4] == null){
+                 continue;   // other position, other contig, or no split
+             }
+
+             if(!(thisContig.splicePositions.get(spliceKey) >= GeneFinder.spliceLim)){
+                 continue;   // splice site has not enough support
+             }
+
+             int[] split = (int[]) mapping[4];
+             int end = split[0] + split[1];
+
+             if(!ends.contains(end)){
+                 ends.add(end);
+             }
+         }
+     }
+
+     return ends;
+ }
+
+ /*
+ * manages the splice analysis
+ */
+
+ public static Object[] performSpliceAna(Gene cluster, Contig thisContig, int spliceKey, int currentPos, int nextPos, int currentCompete, int currentCompeteStart, TreeMap<Integer,Vector<Integer>> posiCovMap){
+
+ Object[] returnFromAna = new Object[5]; // currentCompete, currentCompeteStart, posiCovMap, considerSpliceSiteForCovUpdate, rnasThatDoNotSupportSplit
+
+ // find splits associated with this site, only take introns into account that are supported by a sufficient number of reads
+
+ Object[] returnObject = IntronExonSearch.findIntrons_RespectAlternative(cluster, thisContig, spliceKey, nextPos, currentCompete, posiCovMap, GeneFinder.spliceLim);
+
+ returnFromAna[0] = (Integer) returnObject[0];
+
+ if(((Integer)returnFromAna[0]).intValue() != -1){
+ if(spliceKey < ((Integer)returnFromAna[0]).intValue() && (currentPos > currentCompeteStart)){
+ returnFromAna[1] = spliceKey;
+ }else{
+ returnFromAna[1] = currentCompeteStart;
+ }
+ }else{
+ returnFromAna[1] = -1;
+ }
+
+ returnFromAna[2] = (TreeMap<Integer,Vector<Integer>>) returnObject[1];
+
+ returnFromAna[4] = ((Vector<Rna>) returnObject[2]);
+
+ if(thisContig.splicePositions.get(spliceKey) >= GeneFinder.spliceLim){
+ returnFromAna[3] = spliceKey;
+ }else{
+ Vector<Integer> otherSpliceSites = ((Vector<Integer>) returnObject[3]);
+ if(!otherSpliceSites.isEmpty()){
+ int minSite = Integer.MAX_VALUE;
+ for(int site : otherSpliceSites){
+ if(site < minSite){
+ minSite = site;
+ }
+ }
+ returnFromAna[3] = minSite;
+ }else{
+ returnFromAna[3] = -1;
+ }
+ }
+
+ return returnFromAna;
+ }
+}
diff --git a/src/geneFinder/LocalTwinResolve.java b/src/geneFinder/LocalTwinResolve.java
new file mode 100755
index 0000000..d33f110
--- /dev/null
+++ b/src/geneFinder/LocalTwinResolve.java
@@ -0,0 +1,64 @@
+package geneFinder;
+
+import types.*;
+
+/**
+ * contains methods to make sure that extracted twin clusters which can be treated locally (no relationship to neighboring clusters) are
+ * resolved before the optimization of the ambiguous reads starts
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class LocalTwinResolve {
+
+ /*
+ * takes a cluster and its twin and decides which of the two is handed back
+ * note: at the moment only the span (stopPos - startPos + 1) of both is considered; the cluster is chosen
+ * if its span is not larger than the span of its twin, otherwise the twin is chosen
+ * the chosen gene is detached from its twin, its rnas are no longer marked as shared by the two twins,
+ * and its exons and exon length are reset
+ * NOTE(review): the twin with the smaller span is the one that is returned - confirm that this is intended
+ */
+
+ public static Gene resolveTwins(Gene cluster){
+
+     Gene twin = cluster.twinNode;
+
+     cluster.exonLength = (cluster.stopPos-cluster.startPos +1);
+     twin.exonLength = (twin.stopPos-twin.startPos +1);
+
+     Gene chosen = (cluster.exonLength <= twin.exonLength) ? cluster : twin;
+
+     undeclareShared(chosen);   // still needs the twin link, so call it before the link is cut
+     chosen.twinNode = null;
+
+     chosen.exonsOfGene.clear();
+     chosen.exonLength = 0;
+
+     return chosen;
+ }
+
+ /*
+ * when we removed one twin cluster, declare all the rnas of the remaining one as "not shared"
+ * the gene id of the cluster and the gene id of its twin are removed from isSharedBy of every associated rna,
+ * so the twin link of the cluster has to be still in place when this is called
+ */
+
+ public static void undeclareShared(Gene cluster){
+
+     for(Object[] rnaEntry : cluster.idTOassociatedRnas.values()){
+         Rna rna = (Rna) rnaEntry[0];
+         rna.isSharedBy.removeElement(cluster.geneID);
+         rna.isSharedBy.removeElement(cluster.twinNode.geneID);
+     }
+ }
+
+}
diff --git a/src/geneFinder/MergeClusters.java b/src/geneFinder/MergeClusters.java
new file mode 100755
index 0000000..2b422fd
--- /dev/null
+++ b/src/geneFinder/MergeClusters.java
@@ -0,0 +1,283 @@
+package geneFinder;
+
+import types.Contig;
+import types.Gene;
+import types.Rna;
+
+/**
+ * contains methods to correctly merge two extracted raw clusters
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class MergeClusters {
+
+
+ /*
+ * check if merging is necessary and grab pairs according to their exon length
+ *
+ * a merge is wanted if the cluster (or its twin) starts not later than the stop of the cluster before (or its twin)
+ * plus 1 plus the tolerance GeneFinder.interval; in this case the partners are chosen and doOverlapMerge is performed
+ * returns false if no merge is wanted, otherwise the result of doOverlapMerge
+ */
+
+ public static boolean checkIfNeedMerge(Gene cluster, Gene clusterBeforeInVec, Contig thisContig, StringBuffer contigSeq){
+
+     if(!reachesClusterBefore(cluster, clusterBeforeInVec, GeneFinder.interval)){
+         return false;
+     }
+
+     // grab the right pair
+     Gene firstClust = pickMergePartner(cluster);
+     Gene secondClust = pickMergePartner(clusterBeforeInVec);
+
+     // we took the largest possible exon length partners to perform the merge
+     return doOverlapMerge(firstClust, secondClust, thisContig, contigSeq);
+ }
+
+ /*
+ * true if the cluster or its twin starts within the tolerance behind the stop of the cluster before or of its twin
+ * note: if the cluster has a twin, the cluster itself is not compared with the twin of the cluster before
+ */
+
+ private static boolean reachesClusterBefore(Gene cluster, Gene clusterBefore, int tolerance){
+
+     if(cluster.startPos <= (clusterBefore.stopPos+1+tolerance)){
+         return true;
+     }
+
+     if(cluster.twinNode != null){
+         if(cluster.twinNode.startPos <= (clusterBefore.stopPos+1+tolerance)){
+             return true;
+         }
+         return clusterBefore.twinNode != null && (cluster.twinNode.startPos <= (clusterBefore.twinNode.stopPos+1+tolerance));
+     }
+
+     return clusterBefore.twinNode != null && (cluster.startPos <= (clusterBefore.twinNode.stopPos+1+tolerance));
+ }
+
+ /*
+ * if the candidate has a twin and neither of the two is free to resolve, the one marked as merged twin is taken
+ * (the twin if the candidate itself is not marked); in all other cases the candidate itself is taken
+ */
+
+ private static Gene pickMergePartner(Gene candidate){
+
+     boolean twinPairIsFixed = candidate.twinNode != null && !(candidate.freeToResolve || candidate.twinNode.freeToResolve);
+
+     if(!twinPairIsFixed){
+         return candidate;
+     }
+
+     return candidate.isMergedTwin ? candidate : candidate.twinNode;
+ }
+
+ /*
+ * when two clusters overlap after the start and stop codon determination, merge both
+ * new: do not pay attention to the direction, only grab the largest possible exon length!
+ * erase other twin
+ *
+ * the cluster takes over the rnas, start information, direction counts, introns, fussy exons and alternative
+ * transcripts of the cluster before; afterwards the last entry of thisContig.allGenes is replaced by the cluster
+ * always returns true
+ */
+
+ public static boolean doOverlapMerge(Gene cluster, Gene clusterBeforeInVec, Contig thisContig, StringBuffer contigSeq){
+
+     // take over the rnas of the cluster before and update the shared information
+     HelperFunctions_GeneSearch.addAssociatedRnas(cluster, clusterBeforeInVec.idTOassociatedRnas);
+     HelperFunctions_GeneSearch.declareShared_AfterMerge(cluster,clusterBeforeInVec);
+
+     if(clusterBeforeInVec.twinNode != null){
+         removeSharedMarksOfTwin(cluster, clusterBeforeInVec.twinNode);
+     }
+
+     if(cluster.twinNode != null){
+         removeSharedMarksOfTwin(cluster, cluster.twinNode);
+
+         // the twin of the cluster is given up
+         cluster.twinNode = null;
+         cluster.hadTwinBefore = true;
+         cluster.isMergedTwin = true;
+         cluster.freeToResolve = false;
+     }
+
+     // the merged cluster begins where the cluster before begins
+     cluster.possibleStarts_Forward = clusterBeforeInVec.possibleStarts_Forward;
+     cluster.possibleStarts_Reverse = clusterBeforeInVec.possibleStarts_Reverse;
+     cluster.startPos = clusterBeforeInVec.startPos;
+     cluster.hasStop_temp = true;
+
+     for(int direction = 0; direction < 2; ++direction){
+         cluster.direcCounter[direction] += clusterBeforeInVec.direcCounter[direction];
+     }
+
+     cluster.possibleIntrons.putAll(clusterBeforeInVec.possibleIntrons);
+     cluster.possibleFussyExons.putAll(clusterBeforeInVec.possibleFussyExons);
+     cluster.alternativeTranscripts.addAll(clusterBeforeInVec.alternativeTranscripts);
+     cluster.intronEndsThatAreNotContinued.addAll(clusterBeforeInVec.intronEndsThatAreNotContinued);
+
+     ExtractGeneCandidates.handleFrameSearchWithoutTwin(cluster, cluster.possibleStops_Forward[0], cluster.possibleStops_Reverse[0], contigSeq,true);
+
+     if(GeneFinder.isProkaryote || GeneFinder.inprogeaCall){
+         cluster.sequence = contigSeq.substring(cluster.startPos,cluster.stopPos+1);
+     }
+
+     // the merged cluster takes the place of the last gene of the contig
+     thisContig.allGenes.remove(thisContig.allGenes.size()-1);
+     thisContig.allGenes.add(cluster);
+
+     return true;
+ }
+
+ /*
+ * for every rna of the cluster that is marked as shared by the given twin: remove this mark and,
+ * if present, also the mark of the cluster itself
+ */
+
+ private static void removeSharedMarksOfTwin(Gene cluster, Gene twin){
+
+     for(Object[] rnaEntry : cluster.idTOassociatedRnas.values()){
+
+         Rna rna = (Rna) rnaEntry[0];
+
+         if(!rna.isSharedBy.contains(twin.geneID)){
+             continue;
+         }
+
+         rna.isSharedBy.removeElement(twin.geneID);
+         if(rna.isSharedBy.contains(cluster.geneID)){
+             rna.isSharedBy.removeElement(cluster.geneID);
+         }
+     }
+ }
+
+ /*
+ * manages all the steps necessary to perform a merge of two clusters (inkl. twin)
+ */
+
+ public static void mergeWithOneIncludeTwin(Gene cluster, Gene clusterBef, StringBuffer contigSeq,boolean freeToResolve, boolean isLeading, boolean hasStop){
+
+ updateCluster_AfterMerging(cluster,clusterBef,cluster.stopPos-2,contigSeq);
+ HelperFunctions_GeneSearch.declareShared_AfterMerge(cluster,clusterBef);
+ handleTwinsWhenMerge(cluster,clusterBef,freeToResolve,isLeading,hasStop);
+ }
+
+ /*
+ * after we merged the current cluster with the one before, perform the update stuff
+ * deleted: boolean overlaps
+ */
+
+ public static void updateCluster_AfterMerging(Gene cluster, Gene clusterBef, int stopPos, StringBuffer contigSeq){
+
+ HelperFunctions_GeneSearch.addAssociatedRnas(cluster, clusterBef.idTOassociatedRnas);
+
+ cluster.possibleIntrons.putAll(clusterBef.possibleIntrons);
+ cluster.possibleFussyExons.putAll(clusterBef.possibleFussyExons);
+ cluster.alternativeTranscripts.addAll(clusterBef.alternativeTranscripts);
+ cluster.intronEndsThatAreNotContinued.addAll(clusterBef.intronEndsThatAreNotContinued);
+
+ if(GeneFinder.isProkaryote || GeneFinder.inprogeaCall){
+ cluster.sequence = contigSeq.substring(clusterBef.startPos,stopPos + 3);
+ }
+
+ cluster.startPos = clusterBef.startPos;
+ cluster.possibleStarts_Forward = clusterBef.possibleStarts_Forward;
+ cluster.possibleStarts_Reverse = clusterBef.possibleStarts_Reverse;
+
+ cluster.direcCounter[0] += clusterBef.direcCounter[0];
+ cluster.direcCounter[1] += clusterBef.direcCounter[1];
+
+ cluster.stopPos = stopPos+2;
+ cluster.hasStop_temp = true;
+
+ }
+
+ /*
+ * when previous cluster and current cluster are merged, do the necessary updating stuff
+ */
+
+ public static void handleTwinsWhenMerge(Gene cluster, Gene clusterBef, boolean isFreeToResolve, boolean clusterIsLeading, boolean hasStop){
+
+ cluster.freeToResolve = isFreeToResolve;
+ cluster.onRevStrand = clusterBef.onRevStrand;
+
+ cluster.twinNode = clusterBef.twinNode;
+ cluster.twinNode.twinNode = cluster;
+
+ cluster.twinNode.freeToResolve = isFreeToResolve;
+
+ cluster.hasStop_temp = hasStop;
+ cluster.isMergedTwin = clusterIsLeading;
+ cluster.twinNode.isMergedTwin = false;
+
+ cluster.hadTwinBefore = true;
+ cluster.twinNode.hadTwinBefore = true;
+ }
+
+ /*
+ * we have a leading twin, so find out which one and merge
+ * is necessary for the cluster extraction merge, if the cluster before has a twin and either we found no stop or a stop in both directions
+ */
+
+ public static void findLeadingTwinAndMerge(Gene cluster, Gene clusterBef, boolean has_Stop, boolean overlaps, boolean overlapsTwin, int[] intronBef, int[] intronBef_twin, StringBuffer contigSeq){
+
+ if(!clusterBef.freeToResolve && !clusterBef.twinNode.freeToResolve){
+ if(clusterBef.isMergedTwin){
+ // only merge with clusterBef
+ mergeWithOneIncludeTwin(cluster,clusterBef,contigSeq,false,true,has_Stop);
+ if(!overlaps){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef);
+ }
+ }else if(clusterBef.twinNode.isMergedTwin){
+ // only merge with twin
+ mergeWithOneIncludeTwin(cluster,clusterBef.twinNode,contigSeq,false,true,has_Stop);
+ if(!overlapsTwin){
+ IntronExonSearch.checkIfIntronIsIncluded_AndAdd(cluster,intronBef_twin);
+ }
+ }
+ }else{
+ if(overlaps){
+ mergeWithOneIncludeTwin(cluster,clusterBef,contigSeq,false,true,has_Stop);
+
+ }else{
+ mergeWithOneIncludeTwin(cluster,clusterBef.twinNode,contigSeq,false,true,has_Stop);
+ }
+ }
+ }
+
+ /*
+ * we have to merge the current cluster that shall be extracted with both twins before
+ */
+
+ public static void mergeClustWithBothBefs(Gene cluster, Gene clusterBef, StringBuffer contigSeq, boolean hasStop){
+
+ clusterBef.twinNode.possibleIntrons.putAll(cluster.possibleIntrons);
+ clusterBef.twinNode.possibleFussyExons.putAll(cluster.possibleFussyExons);
+ clusterBef.twinNode.alternativeTranscripts.addAll(cluster.alternativeTranscripts);
+ clusterBef.twinNode.intronEndsThatAreNotContinued.addAll(cluster.intronEndsThatAreNotContinued);
+ clusterBef.twinNode.direcCounter[0] += cluster.direcCounter[0];
+ clusterBef.twinNode.direcCounter[1] += cluster.direcCounter[1];
+
+ HelperFunctions_GeneSearch.declareAsShared(cluster);
+ HelperFunctions_GeneSearch.declareShared_For_FutureTwin(cluster, clusterBef.twinNode);
+
+ HelperFunctions_GeneSearch.addAssociatedRnas(clusterBef.twinNode,cluster.idTOassociatedRnas);
+
+ clusterBef.twinNode.possibleStops_Forward = cluster.possibleStops_Forward;
+ clusterBef.twinNode.possibleStops_Reverse = cluster.possibleStops_Reverse;
+
+ mergeWithOneIncludeTwin(cluster,clusterBef,contigSeq,true,false,hasStop);
+
+ if(GeneFinder.isProkaryote || GeneFinder.inprogeaCall){
+ cluster.twinNode.sequence = contigSeq.substring(cluster.twinNode.startPos,cluster.stopPos+1);
+ }
+
+ cluster.twinNode.stopPos = cluster.stopPos;
+
+ cluster.twinNode.hasStop_temp = hasStop;
+
+ }
+}
diff --git a/src/geneFinder/Operon_LP.java b/src/geneFinder/Operon_LP.java
new file mode 100755
index 0000000..bb668c9
--- /dev/null
+++ b/src/geneFinder/Operon_LP.java
@@ -0,0 +1,425 @@
+package geneFinder;
+
+/**
+ * identify the set of ORFs optimizing the alignment scoring metric
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+import ilog.concert.IloException;
+import ilog.cplex.IloCplex;
+
+import java.io.*;
+import java.util.Vector;
+
+public class Operon_LP {
+
+
+ /*
+ * writes the linear program for the ORF selection to "resultsRun/input_operonLP.lp" below GeneFinder.pathOut
+ * and, if GeneFinder.useCPLEX is set, solves it with CPLEX and parses the solution
+ *
+ * inputSeq:     only its length is used (for the penalty of each ORF)
+ * cov:          coverage per position; maximal runs of equal coverage larger than 1 are treated as overlap regions
+ * allORFs:      candidate ORFs as {start, stop}, both inclusive; the index in this vector names the ORF in the lp
+ * maxLength:    currently not used, the maximal ORF length is always derived from allORFs
+ * existingORFs: indices of ORFs that are forced into the solution, may be null
+ *
+ * returns: {score (Double), indices of the chosen ORFs (Vector<Integer>), maximal ORF length (Integer)}
+ * the score stays at -Double.MAX_VALUE and the vector stays empty if no optimization was performed
+ *
+ * the lp file is closed in any case now, also if writing fails; an empty cov array no longer leads to an exception
+ */
+
+ public static Object[] writeOperon_LP(String inputSeq, int[] cov, Vector<int[]> allORFs, int maxLength, Vector<Integer> existingORFs){
+
+     int maxLengthOrf = 0;
+
+     for(int[] orf : allORFs){
+         if(((orf[1]-orf[0]+1)) > maxLengthOrf){
+             maxLengthOrf = orf[1]-orf[0]+1;
+         }
+     }
+
+     StringBuffer binaryVars = new StringBuffer(""); // will include all variables
+
+     boolean doNoOpti = false;
+     int seqL = inputSeq.length();
+
+     BufferedWriter bw = null;
+
+     try {
+
+         bw = new BufferedWriter(new FileWriter(GeneFinder.pathOut+"resultsRun/input_operonLP.lp")); // name of lp file
+
+         bw.write("Maximize\n");
+
+         StringBuffer targetFunction = new StringBuffer(""); // will include the target function
+
+         // before write out, search overlaps
+         // each overlap entry is {start, stop (inclusive), coverage} and later gets the indices of the involved ORFs appended
+
+         int formerCov = (cov.length > 0) ? cov[0] : 0;
+         int beginPos = 0;
+
+         Vector<Vector<Integer>> overlaps = new Vector<Vector<Integer>>();
+
+         // NOTE(review): a run with coverage > 1 that reaches the last position of cov is never written out,
+         // because a run is only closed when the coverage changes - confirm whether this is intended
+         for(int pos = 0; pos<cov.length;++pos){
+
+             if(formerCov != cov[pos]){
+                 // change, so check if write out necessary
+                 if(formerCov>1){
+                     Vector<Integer> temp = new Vector<Integer>();
+                     temp.add(beginPos);
+                     temp.add(pos-1); // inclusively
+                     temp.add(formerCov);
+
+                     overlaps.add(temp);
+
+                 }
+
+                 beginPos = pos;
+                 formerCov = cov[pos];
+             }
+
+         }
+
+         StringBuffer match = new StringBuffer("");
+         StringBuffer openPenalty = new StringBuffer("");
+         StringBuffer bounds = new StringBuffer("");
+
+         int constNum = 0; // running number of the constraints
+
+         for(int i = 0;i<allORFs.size();++i){
+             targetFunction.append(" + x__"+i+ " + p__" + i + "\n");
+             binaryVars.append(" y__" + i + "\n");
+
+             // x__i carries the length of the ORF if it is chosen (y__i = 1)
+             double matchScore = (double) (allORFs.get(i)[1]-allORFs.get(i)[0]+1.0);
+
+             // p__i carries the penalty for opening the ORF: (sequence length / ORF length) * (maximal ORF length / ORF length)
+             double penScore = ((double) seqL)/((double) (allORFs.get(i)[1]-allORFs.get(i)[0]+1.0)) * (((double) maxLengthOrf)/((double) (allORFs.get(i)[1]-allORFs.get(i)[0]+1.0)));
+
+             match.append(" c" + (constNum++) + ": x__"+i+ " - " + matchScore + " y__" + i + " = 0 \n");
+
+             if(existingORFs != null && existingORFs.contains(i)){
+                 match.append(" y__" + i + " = 1 \n");
+             }
+
+             if(penScore >= 0){
+                 openPenalty.append(" c" + (constNum++) + ": p__"+i+ " + " + penScore + " y__" + i + " = 0 \n");
+             }else{
+                 penScore = -penScore;
+                 openPenalty.append(" c" + (constNum++) + ": p__"+i+ " - " + penScore + " y__" + i + " = 0 \n");
+             }
+
+             bounds.append("-inf <= x__" + i + " <= +inf \n");
+             bounds.append("-inf <= p__" + i + " <= +inf \n");
+
+             if(overlaps.size() > 0){
+                 // search if this orf overlaps
+
+                 for(int pos = 0; pos < overlaps.size();++pos){
+                     int startO = overlaps.get(pos).get(0);
+                     int stopO = overlaps.get(pos).get(1);
+
+                     if(((startO >= allORFs.get(i)[0]) && (stopO <= allORFs.get(i)[1])) || ((startO <= allORFs.get(i)[1]) && (stopO >= allORFs.get(i)[1])) || ((startO <= allORFs.get(i)[0]) && (stopO >= allORFs.get(i)[0]))){
+                         // found overlap, so add position
+                         overlaps.get(pos).add(i);
+                     }
+                 }
+
+             }
+
+         }
+
+         // check if overlap groups are correct
+
+         if(overlaps.size() > 0){
+             overlaps = checkOverlaps(overlaps,allORFs);
+         }
+
+         // now add all overlaps
+
+         StringBuffer overlapPen = new StringBuffer("");
+
+         for(int pos = 0; pos < overlaps.size();++pos){
+
+             targetFunction.append(" + ov__"+pos+ "\n"); // var is correlated to position in overlaps vector
+
+             double ovScore = (double) overlaps.get(pos).get(1) - overlaps.get(pos).get(0) + 1.0;
+
+             // note: should be multiplied with -1.0, but instead we keep it positive for the lp format, to make ov_i negative
+             StringBuffer orfNum = new StringBuffer("[ " + ovScore + " ");
+
+
+             int numO = 0;
+             if((overlaps.get(pos).size()-3) > 2){
+
+                 // more than two ORFs in this region: one quadratic constraint per pair of ORFs
+
+                 StringBuffer allPartOrfs = new StringBuffer("");
+
+                 for(int ovP = 3; ovP < overlaps.get(pos).size()-1;++ovP){
+                     for(int ovP_2 = ovP+1; ovP_2 < overlaps.get(pos).size();++ovP_2){
+                         orfNum.append("y__" + overlaps.get(pos).get(ovP) + " * y__" + overlaps.get(pos).get(ovP_2) + " ] <= 0");
+                         overlapPen.append(" c" + (constNum++) + ": ov__"+ pos + "__" + numO + " + " + orfNum.toString() + " \n");
+                         bounds.append("-inf <= ov__" + pos + "__" + numO + " <= 0 \n");
+
+                         allPartOrfs.append(" - " + "ov__" + pos + "__" + numO);
+                         numO++;
+                         orfNum = new StringBuffer("[ " + ovScore + " ");
+                     }
+                 }
+
+                 overlapPen.append(" c" + (constNum++) + ": ov__"+ pos + allPartOrfs.toString() + " <= 0 \n");
+                 bounds.append("-inf <= ov__" + pos + " <= 0 \n");
+
+             }else{
+                 for(int orfs = 3;orfs < overlaps.get(pos).size();++orfs){
+                     orfNum.append("y__" + overlaps.get(pos).get(orfs) + " * ");
+                 }
+
+                 // the last two characters ("* ") are cut off again
+                 overlapPen.append(" c" + (constNum++) + ": ov__"+ pos + " + " + orfNum.substring(0,orfNum.length()-2) + "] <= 0 \n");
+                 bounds.append("-inf <= ov__" + pos + " <= 0 \n");
+             }
+
+
+         }
+
+         if(allORFs.size() == 0){
+             doNoOpti = true;
+         }else{
+             // write out the lp
+             if(targetFunction.length() > 3){
+                 bw.write(targetFunction.substring(2));
+                 targetFunction = new StringBuffer("");
+             }
+
+             // now constraints
+
+             bw.write("Subject To \n");
+
+             bw.write(match.toString() + openPenalty.toString() + overlapPen.toString() + "\n");
+
+             //now declare normal variables as binary
+
+             bw.write("Bounds \n");
+
+             bw.write(bounds.toString());
+
+             bw.write("Binary \n");
+
+             bw.write(binaryVars.toString());
+
+             bw.write("END");
+         }
+
+     } catch (IOException e) {
+         e.printStackTrace();
+     } finally {
+         // close the lp file also if writing failed
+         if(bw != null){
+             try {
+                 bw.close();
+             } catch (IOException closeException) {
+                 closeException.printStackTrace();
+             }
+         }
+     }
+
+     double score = -Double.MAX_VALUE;
+     Vector<Integer> chosenORFs = new Vector<Integer>();
+
+     if(!doNoOpti){ // now solve the lp
+
+         if(GeneFinder.useCPLEX){ // use cplex
+
+             solveOperonLPWithCPLEX();
+
+             // parse solution file
+
+             Object[] returnArr = parseOperonLP_CPLEX(allORFs,maxLengthOrf,seqL);
+             score = (Double) returnArr[0];
+             chosenORFs = (Vector<Integer>) returnArr[1];
+
+         }
+     }
+
+     return new Object[] {score,chosenORFs,maxLengthOrf};
+
+ }
+
+
+ /*
+ * solves the operon lp with the CPLEX optimizer
+ * reads "resultsRun/input_operonLP.lp" and writes "resultsRun/solution_operonLP.sol", both below GeneFinder.pathOut
+ * a CPLEX exception is only reported on standard error
+ */
+
+ public static void solveOperonLPWithCPLEX(){
+
+     try {
+         IloCplex solver = new IloCplex();
+         solver.setOut(null);
+
+         String lpFile = GeneFinder.pathOut+"resultsRun/input_operonLP.lp";
+         solver.importModel(lpFile);
+         solver.setParam(IloCplex.IntParam.Threads,GeneFinder.numberThreads);
+
+         solver.solve();
+
+         String solutionFile = GeneFinder.pathOut+"resultsRun/solution_operonLP.sol";
+         solver.writeSolution(solutionFile);
+
+     } catch (IloException solverProblem) {
+         solverProblem.printStackTrace();
+     }
+
+ }
+
+ /*
+ * method that parses the solution file of cplex ("resultsRun/solution_operonLP.sol" below GeneFinder.pathOut)
+ *
+ * allOrfs:      candidate ORFs as {start, stop}, the index is the number used in the variable names y__<index>
+ * maxLengthOrf: maximal ORF length that was used for the penalty in the lp
+ * seqL:         length of the sequence that was used for the penalty in the lp
+ *
+ * returns: {objective (Double), indices of the chosen ORFs (Vector<Integer>)}
+ * for each chosen ORF the penalty of the lp is replaced in the objective by (sequence length / ORF length)
+ * the objective stays at -Double.MAX_VALUE if no objective value could be read
+ *
+ * a read error is now reported on standard error (it was silently ignored before) and the file is closed in any case;
+ * only lines of y variables are split, so other lines in the variable section cannot lead to an index error any more
+ */
+
+ public static Object[] parseOperonLP_CPLEX(Vector<int[]> allOrfs,int maxLengthOrf, int seqL){
+
+     Vector<Integer> chosenOrfs = new Vector<Integer>();
+     double objective = -Double.MAX_VALUE;
+
+     BufferedReader br = null;
+
+     try{
+         br = new BufferedReader(new FileReader(GeneFinder.pathOut+"resultsRun/solution_operonLP.sol"));
+
+         String line = "";
+
+         while((line = br.readLine()) != null){ // go through file until we reach the part where we find the variables
+
+             if(line.contains("objectiveValue")){
+                 String[] objectiveArr = line.split("\"");
+                 objective = Double.parseDouble(objectiveArr[1]);
+             }
+
+             if(line.contains("<variables>")){
+                 break;
+             }
+         }
+
+         while(((line = br.readLine()) != null) && (!line.contains("</variables>"))){ // make sure to stop when variable part is over
+
+             if(!line.contains("y__")){
+                 continue; // only the binary ORF variables are of interest
+             }
+
+             String[] lineSplit1 = line.split(" "); // position 3 contains name, 5 contains value
+
+             String[] valueSplit = lineSplit1[5].split("\""); // to extract the value
+
+             int score = 0;
+
+             if(!(valueSplit[1].equals("0") || valueSplit[1].equals("1"))){ // sometimes cplex does not round
+                 double sol = Double.parseDouble(valueSplit[1]);
+                 if(sol<=0.5){ // if 2 variables are both 0.5, then we cannot take both! so remove both because no real support!
+                     score = 0;
+                 }else{
+                     score = 1;
+                 }
+             }else{
+                 score = Integer.parseInt(valueSplit[1]); // directly grab the score, without rounding
+             }
+
+             if(score == 1){
+                 String[] name = lineSplit1[3].split("__");
+                 String[] name2 = name[1].split("\"");
+                 int pos = Integer.parseInt(name2[0]);
+                 chosenOrfs.add(pos);
+
+                 // recompute penalty
+
+                 double orfLength = (double) (allOrfs.get(pos)[1]-allOrfs.get(pos)[0]+1.0);
+
+                 double penOld = ((double) seqL)/orfLength * (((double) maxLengthOrf)/orfLength);
+                 double penNew = ((double) seqL)/orfLength;
+
+                 objective = objective - penOld + penNew;
+
+             }
+
+         }
+
+     } catch (IOException e) {
+         e.printStackTrace();
+     } finally {
+         // close the solution file also if reading failed
+         if(br != null){
+             try {
+                 br.close();
+             } catch (IOException closeException) {
+                 closeException.printStackTrace();
+             }
+         }
+     }
+
+     return new Object[] {objective,chosenOrfs};
+ }
+
+ /*
+ * check overlaps
+ *
+ * Each entry of overlaps has the layout {start, stop, coverage, orfIndex, orfIndex, ...}, the ORF indices refer to allORFs.
+ * Only entries that list more than two ORFs are inspected. If a listed ORF covers the region only partly
+ * (it starts before the region and ends before the stop of the region, or it starts after the start of the region
+ * and ends after the region), a narrower region is created from it. If at least one narrower region was created,
+ * every narrower region gets the listed ORFs that span it completely, is appended to the vector,
+ * and the inspected entry is removed.
+ * The vector that is passed in is modified and also returned.
+ */
+
+ public static Vector<Vector<Integer>> checkOverlaps(Vector<Vector<Integer>> overlaps, Vector<int[]> allORFs){
+
+ for(int pos = overlaps.size() - 1; pos >= 0;pos--){ // backwards, so the entries appended below are not visited
+
+ if((overlaps.get(pos).size()-3) > 2){ // more than two ORFs listed
+ int startO = overlaps.get(pos).get(0);
+ int stopO = overlaps.get(pos).get(1);
+
+ Vector<Vector<Integer>> newORFs = new Vector<Vector<Integer>>(); // contains new intervals if those are necessary
+
+ for(int i = 3;i<overlaps.get(pos).size();++i){
+ int start = allORFs.get(overlaps.get(pos).get(i))[0];
+ int stop = allORFs.get(overlaps.get(pos).get(i))[1];
+
+ if(start < startO && stop < stopO){ // ORF ends inside the region
+ Vector<Integer> newOV = new Vector<Integer>();
+ newOV.add(startO);
+ newOV.add(stop);
+ newOV.add(overlaps.get(pos).get(2));
+
+ newORFs.add(newOV);
+
+ }else if(start > startO && stop > stopO){ // ORF starts inside the region
+ Vector<Integer> newOV = new Vector<Integer>();
+ newOV.add(start);
+ newOV.add(stopO);
+ newOV.add(overlaps.get(pos).get(2));
+
+ newORFs.add(newOV);
+ }
+ }
+
+ if(!newORFs.isEmpty()){
+ for(int posNew = 0; posNew < newORFs.size();++posNew){
+ int startOnew = newORFs.get(posNew).get(0);
+ int stopOnew = newORFs.get(posNew).get(1);
+
+ for(int i = 3;i<overlaps.get(pos).size();++i){
+
+ if(((startOnew >= allORFs.get(overlaps.get(pos).get(i))[0]) && (stopOnew <= allORFs.get(overlaps.get(pos).get(i))[1]))){
+ // found overlap, so add position
+ newORFs.get(posNew).add(overlaps.get(pos).get(i));
+ }
+ }
+
+ overlaps.add(newORFs.get(posNew));
+ }
+
+ overlaps.removeElementAt(pos); // the inspected entry is replaced by its narrower regions
+ }
+ }
+
+ }
+
+ return overlaps;
+ }
+
+ /*
+ * derive the coverage from a set of given ORFs
+ */
+
+ /*public static int[] calculateCoverage(Vector<int[]> orfs, int[] cov){
+
+ for(int[] orf : orfs){
+ for(int i=orf[0];i<=orf[1];++i){
+ cov[i]++;
+ }
+ }
+
+ return cov;
+ }*/
+}
diff --git a/src/geneFinder/OptimizeAmbis.java b/src/geneFinder/OptimizeAmbis.java
new file mode 100755
index 0000000..05c1e16
--- /dev/null
+++ b/src/geneFinder/OptimizeAmbis.java
@@ -0,0 +1,502 @@
+package geneFinder;
+
+import ilog.concert.IloException;
+import ilog.cplex.IloCplex;
+
+import java.io.*;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Vector;
+
+import types.*;
+
+
+/**
+* manage the assignment of ambiguous hits to their final position, using a maximum flow formulation
+* current methods: GLPK solver, CPLEX linear program
+* Copyright (c) 2013,
+* Franziska Zickmann,
+* ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+* Distributed under the GNU Lesser General Public License, version 3.0
+*
+*/
+
+public class OptimizeAmbis {
+
+ public static Map<String,Object[]> multiRnas = new HashMap<String,Object[]>(); // store all ambiguous rnas; key is the read variable "x__rnaID", first entry: Vector<String> of "x__rnaID__geneID" variables, second entry: Vector<Gene> of all candidates connected to this read (filled by initialize_MaxFlow)
+ public static Map<Gene,Vector<String>> multiRnas_eachCluster = new HashMap<Gene,Vector<String>>(); // stores all variables that belong to one gene
+ public static Map<Gene,Object[]> clustNeighbors = new HashMap<Gene,Object[]>(); // first entry: gene vector (the candidate itself plus all candidates competing for one of its reads), second entry: Vector<String> of the "...__f" variables of this candidate's multiRnas, note: if too memory intensive, sacrifice running time and use associated rnas
+
+
+ /**
+  * Sets up the maximum flow formulation for the assignment of ambiguous reads,
+  * writes it as an lp file and runs the configured solver on it.
+  *
+  * Steps:
+  * 1) for every gene of every mapped contig (and its twin candidate, if present) the exons are
+  *    determined, total and unique coverage are calculated and one "__f" variable per
+  *    read-to-gene connection of reads with assignedNum > 1 is added to the target function;
+  * 2) objective, constraints and the binary section are written to
+  *    resultsRun/input_itN.lp below GeneFinder.pathOut (N = GeneFinder.iteration);
+  * 3) unless no ambiguous or shared reads were found (then GeneFinder.noAmbiOpti is set),
+  *    the lp is solved with CPLEX or GLPK and the solution is parsed by CleanAfterAmbiOpti.
+  *
+  * Genes for which FindExonsOfGene.findExonsForGene returns false are removed from their contig.
+  * IOExceptions during writing are printed and not propagated.
+  */
+ public static void maxFlowLP(){
+
+     multiRnas.clear();
+     multiRnas_eachCluster.clear();
+     clustNeighbors.clear();
+
+     long timeBef = System.currentTimeMillis();
+
+     // the following 3 variables are required for output messages
+     int numMulti = 0; // count all ambiguous reads
+     int numShared = 0; // count all reads of twin candidate genes
+     int processCounter = 0; // for progress report
+
+     int fVar_counter = 0; // necessary to parse cplex solution
+
+     StringBuffer binaryVars = new StringBuffer(""); // will include all variables
+
+     boolean noMultis = false;
+
+     BufferedWriter bw = null; // declared outside of the try block so that it can be closed in the finally block
+
+     try {
+
+         bw = new BufferedWriter(new FileWriter(GeneFinder.pathOut+"resultsRun/input_it" + GeneFinder.iteration + ".lp")); // name of lp file
+
+         bw.write("Maximize\n");
+
+         // will include the target function; each term has the form " + var\n".
+         // The very first chunk is written with substring(2), which drops the leading " +";
+         // after each flush the buffer is reset to " " so that substring(2) keeps the "+" of the next chunk.
+         StringBuffer targetFunction = new StringBuffer("");
+
+         for(String contigName : GeneFinder.mappedContigs.keySet()){
+
+             Contig contig = GeneFinder.mappedContigs.get(contigName);
+
+             // iterate backwards because genes may be removed from the vector
+             for(int posGene = contig.allGenes.size() -1; posGene >= 0; posGene--){
+
+                 Gene cluster = contig.allGenes.get(posGene);
+
+                 boolean hasMultis = false; // hasMultis and hasTwin indicate if we have to deal with another gene candidate
+                 boolean hasTwin = false;
+
+                 cluster.totalCov = 0.0;
+
+                 if(!FindExonsOfGene.findExonsForGene(cluster)){ // define the exons as they are at the moment, necessary for exon length calculation
+                     contig.allGenes.remove(cluster);
+                 }else{
+                     sumUpExonLengths(cluster, cluster.exonsOfGene);
+
+                     if(cluster.twinNode != null){
+                         // we have to resolve the twin as well
+
+                         cluster.twinNode.totalCov = 0.0;
+                         if(!FindExonsOfGene.findExonsForGene(cluster.twinNode)){
+                             cluster.twinNode = null;
+                         }else{
+                             sumUpExonLengths(cluster.twinNode, cluster.twinNode.exonsOfGene);
+
+                             hasTwin = true;
+                         }
+                     }
+
+
+                     int multiNumNode = 0; // count the ambiguous reads of each candidate
+
+                     for(String rnaKey : cluster.idTOassociatedRnas.keySet()){
+
+                         Rna rna = ((Rna)cluster.idTOassociatedRnas.get(rnaKey)[0]);
+
+                         String varName = "x__"+rna.rnaID;
+
+                         if(rna.isMulti == 1 || (rna.isSharedBy.contains(cluster.geneID))){
+
+                             if(rna.isMulti == 1){ // to determine the right unique coverage
+
+                                 multiNumNode++;
+                                 numMulti++;
+
+                                 cluster.totalCov += (double) (((1.0)/((double) rna.hitNum)) * (double)GeneFinder.readLength); // ambiguous reads have fewer weight, according to the number of their hits
+
+                             }else{
+                                 cluster.totalCov += (double) GeneFinder.readLength;
+                             }
+
+                             if(rna.assignedNum > 1){
+                                 hasMultis = true;
+
+                                 // flush the buffer from time to time to keep it small
+                                 if(((multiNumNode % 50) == 0) && (targetFunction.length() > 10)){
+                                     bw.write(targetFunction.substring(2));
+                                     targetFunction = new StringBuffer(" ");
+                                 }
+
+                                 // always include the variable for each readTOgene connection in the target fkt.
+                                 // (formerly the variable was skipped whenever the buffer was flushed, although it was counted in fVar_counter)
+                                 targetFunction.append(" + x__"+rna.rnaID + "__" + cluster.geneID + "__f" + "\n");
+
+                                 fVar_counter++;
+                                 initialize_MaxFlow(cluster,varName); // perform the necessary updates of the maps
+                             }
+
+                         }else{
+                             cluster.totalCov += (double)GeneFinder.readLength;
+                         }
+
+                     }
+
+                     cluster.totalCov = ((double) cluster.totalCov)/((double) cluster.exonLength);
+
+                     if(hasMultis){
+
+                         cluster.numOfMultis = multiNumNode;
+                         cluster.uniqueCov = (((double)((cluster.idTOassociatedRnas.keySet().size() - multiNumNode) * GeneFinder.readLength))/((double)cluster.exonLength));
+
+                     }
+
+                     if(hasTwin){ // now perform the same stuff for the twin candidate
+
+                         for(String rnaKey : cluster.twinNode.idTOassociatedRnas.keySet()){
+
+                             Rna rna = ((Rna)cluster.twinNode.idTOassociatedRnas.get(rnaKey)[0]);
+
+                             String varName = "x__"+rna.rnaID;
+
+                             if(rna.isMulti == 1 || rna.isSharedBy.contains(cluster.twinNode.geneID)){
+
+                                 if(!rna.isSharedBy.contains(cluster.twinNode.geneID)){
+                                     // this is non-shared multiple read, so count it
+                                     numMulti++;
+                                 }
+                                 if(rna.isMulti == 1){
+                                     cluster.twinNode.numOfMultis++;
+
+                                     cluster.twinNode.totalCov += (double) (((1.0)/((double) rna.hitNum)) * (double)GeneFinder.readLength);
+                                 }
+                                 // NOTE(review): unlike for the first candidate, shared reads that are not ambiguous
+                                 // do not contribute to the twin's totalCov here - confirm that this is intended
+
+                                 if(rna.isSharedBy.contains(cluster.twinNode.geneID)){
+                                     numShared++; // at this point, update the shared rna number
+                                 }
+
+                                 if(rna.assignedNum > 1){
+
+                                     // flush the buffer from time to time to keep it small
+                                     if(((cluster.twinNode.numOfMultis % 50) == 0) && (targetFunction.length() > 10)){
+                                         bw.write(targetFunction.substring(2));
+                                         targetFunction = new StringBuffer(" ");
+                                     }
+
+                                     // always include the variable, also directly after a flush
+                                     targetFunction.append(" + x__"+rna.rnaID + "__" + cluster.twinNode.geneID + "__f" + "\n");
+
+                                     fVar_counter++;
+                                     initialize_MaxFlow(cluster.twinNode,varName);
+                                 }
+
+
+                             }else{
+                                 cluster.twinNode.totalCov += (double)GeneFinder.readLength;
+                             }
+                         }
+
+                         cluster.twinNode.uniqueCov = (((double)((cluster.twinNode.idTOassociatedRnas.keySet().size() - cluster.twinNode.numOfMultis) * GeneFinder.readLength))/((double)cluster.twinNode.exonLength));
+                         cluster.twinNode.totalCov = ((double) cluster.twinNode.totalCov)/((double) cluster.twinNode.exonLength);
+                     }
+
+                     processCounter++;
+                     if((processCounter % 2000) == 0){
+
+                         if((processCounter % 10000) == 0){
+                             long timeAfterTmp = System.currentTimeMillis();
+                             if(!GeneFinder.secondPart){
+                                 System.out.println("Processed " + processCounter + " candidates in " + (double) (timeAfterTmp-timeBef)/1000.0 +"s.");
+                             }
+                         }
+
+                         if(!(numMulti == 0 && numShared == 0 && multiRnas.keySet().size() == 0)){
+                             if((targetFunction.length() > 10)){
+                                 bw.write(targetFunction.substring(2));
+                                 targetFunction = new StringBuffer(" ");
+                             }
+                         }
+                     }else{
+                         if((targetFunction.length() > 10) && (fVar_counter % 1000) == 0){
+                             bw.write(targetFunction.substring(2));
+                             targetFunction = new StringBuffer(" ");
+                         }
+                     }
+                 }
+             }
+         }
+
+         // log messages:
+
+         if(!GeneFinder.secondPart){
+             System.out.println("Processed " + processCounter + " candidates.");
+             System.out.println("Number of multiple rnas: " + numMulti);
+             System.out.println("Number of shared rnas: " + numShared);
+             System.out.println("Number constraints: " + multiRnas.keySet().size());
+             WriteOutput.writeToLogFile("Processed " + processCounter + " candidates. \n" + "Number of multiple rnas: " + numMulti + " \nNumber of shared rnas: " + numShared + " \nNumber constraints: " + multiRnas.keySet().size() + "\n");
+
+             long timeAfter1 = System.currentTimeMillis();
+             System.out.println("Time needed for max flow initialization: "+(double) (timeAfter1-timeBef)/1000.0 +"s.");
+             WriteOutput.writeToLogFile("Time needed for max flow initialization: "+(double) (timeAfter1-timeBef)/1000.0 +"s.\n");
+         }
+
+         if(numMulti == 0 && numShared == 0 && multiRnas.keySet().size() == 0){
+             System.out.println("No multiple rnas, no optimization necessary!");
+             WriteOutput.writeToLogFile("No multiple rnas, no optimization necessary! \n");
+             noMultis = true;
+             GeneFinder.noAmbiOpti = true;
+         }else{
+             // write out the rest of the target function
+             if(targetFunction.length() > 3){
+                 bw.write(targetFunction.substring(2));
+                 targetFunction = new StringBuffer("");
+             }
+
+             bw.write("Subject To \n");
+
+             // now constraints
+
+             for(String varName : multiRnas.keySet()){
+
+                 StringBuffer constraint = new StringBuffer(""); // three different types of necessary constraints
+                 StringBuffer constraintVar_f = new StringBuffer("");
+                 StringBuffer constraintVar_Diff = new StringBuffer("");
+
+                 for(String var : (Vector<String>) multiRnas.get(varName)[0]){ // set up the different constraints
+
+                     constraint.append(" + " + var + "\n"); // exactly one of the read's connections has to be chosen
+                     binaryVars.append(" " + var + "\n");
+                     constraintVar_f.append(var+"__f >= 0 \n"); // flow is not negative
+                     constraintVar_Diff.append(var+"__f - " + var + " <= 0 \n"); // flow only if the connection is chosen
+                 }
+
+                 constraint.append(" = 1");
+
+                 bw.write(constraintVar_f.toString() + constraintVar_Diff.toString() + constraint.substring(2) + "\n");
+
+             }
+
+             // now add the weight-constraints
+
+             for(Gene clust : clustNeighbors.keySet()){
+
+                 double covSum = 0.0;
+                 double sum_exonL = clust.exonLength;
+
+                 StringBuffer constraintF = new StringBuffer("");
+
+                 for(Gene neighbor : (Vector<Gene>) clustNeighbors.get(clust)[0]){
+                     covSum += neighbor.uniqueCov;
+                     sum_exonL = sum_exonL + neighbor.exonLength;
+                 }
+
+                 for(String var : (Vector<String>) clustNeighbors.get(clust)[1]){
+                     constraintF.append(" + " + var + "\n");
+                 }
+
+                 // 1) no additional read length factor
+                 // 2) multiple rnas have a penalty according to their number of ambiguous hits, already included in totalCov
+                 // 3) competing candidates have a direct influence on the weight due to covSum
+
+                 double weight = 0;
+
+                 covSum = Math.pow((covSum+1),2);
+                 weight = ((clust.totalCov/covSum) * ((double)clust.exonLength/(double)sum_exonL));
+
+                 constraintF.append(" <= " + Math.pow((weight+1),2)); // to the power of two leads to a more "the winner takes it all fashion"
+
+                 bw.write(constraintF.substring(2) + "\n");
+
+             }
+
+             //now declare normal variables as binary
+
+             bw.write("Binary \n");
+
+             // write the binary section in chunks of 500000 characters; the loop writes all complete
+             // chunks except the last part, which is written afterwards
+             int posBin = 0;
+
+             for(posBin = 0;posBin<binaryVars.length();){
+                 int posBin2 = posBin + 500000;
+                 if(posBin2 < binaryVars.length()){
+                     bw.write(binaryVars.substring(posBin,posBin2)); // extracted: posBin - (posBin2-1)
+                 }
+                 posBin = posBin + 500000;
+             }
+
+             if((posBin-500000) >= 0){
+                 bw.write(binaryVars.substring(posBin-500000));
+             }
+
+             bw.write("END");
+         }
+
+     } catch (IOException e) {
+         e.printStackTrace();
+     } finally {
+         // close the lp file in any case, also if an exception occurred while writing
+         if(bw != null){
+             try {
+                 bw.close();
+             } catch (IOException e) {
+                 e.printStackTrace();
+             }
+         }
+     }
+
+     long timeAfter = System.currentTimeMillis();
+     if(!GeneFinder.secondPart){
+         System.out.println("Time needed to set up lp file: "+ (double) (timeAfter-timeBef)/1000.0 +"s.");
+         WriteOutput.writeToLogFile("Time needed to set up lp file: "+ (double) (timeAfter-timeBef)/1000.0 +"s.\n");
+         System.out.println();
+     }
+
+     if(!noMultis){ // now solve the lp
+
+         long timeBefSolve = System.currentTimeMillis();
+
+         if(GeneFinder.useCPLEX){ // use cplex
+
+             if(GeneFinder.optiSolve){
+                 solveMaxFlowWithCPLEX(numMulti);
+                 long timeAfterSolve = System.currentTimeMillis();
+                 System.out.println("Time needed to solve max flow: "+ (double) (timeAfterSolve-timeBefSolve)/1000.0 +"s.");
+                 WriteOutput.writeToLogFile("Time needed to solve max flow: "+ (double) (timeAfterSolve-timeBefSolve)/1000.0 +"s.\n");
+             }else{ // for debugging of human real
+                 double[] minMax = CalculateScores.assignGeneScores(true);
+                 WriteOutput.writeGeneFinderOutput(minMax,"");
+             }
+
+             // parse solution file
+
+             CleanAfterAmbiOpti.parse_solution_and_clean_CPLEX(multiRnas,fVar_counter);
+
+         }else{ // use glpk
+
+             if(GeneFinder.optiSolve){
+                 solveMaxFlowWithGLPK();
+                 long timeAfterSolve = System.currentTimeMillis();
+                 System.out.println("Time needed to solve max flow: "+ (double) (timeAfterSolve-timeBefSolve)/1000.0 +"s.");
+                 WriteOutput.writeToLogFile("Time needed to solve max flow: "+ (double) (timeAfterSolve-timeBefSolve)/1000.0 +"s.\n");
+             }
+
+             // parse solution file
+             CleanAfterAmbiOpti.parse_solution_and_clean_GLPK(multiRnas);
+         }
+
+     }
+
+ }
+
+
+ /**
+  * Solves the max flow lp with the CPLEX optimizer.
+  *
+  * Imports resultsRun/input_itN.lp (N = GeneFinder.iteration) below GeneFinder.pathOut, solves it
+  * and writes the solution to resultsRun/solutionCPLEX_itN.sol. For at least 10000000 ambiguous
+  * reads additional memory related CPLEX parameters are set. An IloException is printed and not
+  * propagated.
+  *
+  * @param numMulti number of ambiguous reads, only used to decide on the memory related parameters
+  */
+ public static void solveMaxFlowWithCPLEX(int numMulti){
+
+     IloCplex cplex = null; // declared outside of the try block so that it can be released in the finally block
+
+     try {
+         System.out.println("Start cplex solve...");
+         cplex = new IloCplex();
+
+         cplex.importModel(GeneFinder.pathOut+"resultsRun/input_it" + GeneFinder.iteration + ".lp");
+
+         cplex.setParam(IloCplex.IntParam.RootAlg,IloCplex.Algorithm.Network);
+         cplex.setParam(IloCplex.DoubleParam.EpGap,0.01);
+         cplex.setParam(IloCplex.IntParam.Threads,GeneFinder.numberThreads);
+
+         if(numMulti >= 10000000){
+
+             if(GeneFinder.memForCplex != -1){
+                 cplex.setParam(IloCplex.DoubleParam.WorkMem,GeneFinder.memForCplex);
+             }
+
+             cplex.setParam(IloCplex.StringParam.WorkDir,GeneFinder.pathOut+"resultsRun/");
+             System.out.println("Directory: " + cplex.getParam(IloCplex.StringParam.WorkDir));
+
+             cplex.setParam(IloCplex.DoubleParam.PolishTime,1000.0);
+             System.out.println("polish time: " + cplex.getParam(IloCplex.DoubleParam.PolishTime));
+
+             cplex.setParam(IloCplex.IntParam.HeurFreq,-1);
+
+             cplex.setParam(IloCplex.IntParam.NodeFileInd,3);
+             System.out.println(cplex.getParam(IloCplex.IntParam.NodeFileInd));
+
+             cplex.setParam(IloCplex.BooleanParam.MemoryEmphasis,true);
+             System.out.println(cplex.getParam(IloCplex.BooleanParam.MemoryEmphasis));
+
+         }
+
+         cplex.solve();
+
+         cplex.writeSolution(GeneFinder.pathOut+"resultsRun/solutionCPLEX_it" + GeneFinder.iteration + ".sol");
+
+     } catch (IloException e2) {
+         e2.printStackTrace();
+     } finally {
+         // release the solver object, otherwise its resources are held until the JVM exits
+         if(cplex != null){
+             cplex.end();
+         }
+     }
+
+ }
+
+ /*
+ * solves MaxFlow with the glpsol of the GLPK optimizer
+ */
+
+ public static void solveMaxFlowWithGLPK(){
+
+ try {
+ System.out.println("Start glpk solve...");
+
+ Runtime rt = Runtime.getRuntime();
+ Process firstExe = rt.exec("glpsol --lp " + GeneFinder.pathOut + "resultsRun/input_it" + GeneFinder.iteration + ".lp --output " + GeneFinder.pathOut + "resultsRun/solutionGLPK_out_it" + GeneFinder.iteration + ".out");
+ firstExe.waitFor();
+
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+
+ /**
+  * Registers one read-to-gene connection in the maps required for the max flow optimization.
+  *
+  * clustNeighbors: the candidate gets an entry {competing genes (starting with itself), "__f" variables}
+  * and the "__f" variable of this connection is appended.
+  * multiRnas: the read gets an entry {connection variables, connected genes}; if the read is already
+  * known, all genes connected to it and the new candidate become mutual neighbors.
+  *
+  * @param cluster the candidate gene the read is connected to
+  * @param varName the variable name of the read ("x__" followed by the rna id)
+  */
+ public static void initialize_MaxFlow(Gene cluster, String varName){
+
+     String connectionVar = varName + "__" + cluster.geneID;
+     String flowVar = connectionVar + "__f";
+
+     // 1) the variables of the candidate itself
+     if(clustNeighbors.containsKey(cluster)){
+         ((Vector<String>) clustNeighbors.get(cluster)[1]).add(flowVar);
+     }else{
+         Vector<Gene> competitors = new Vector<Gene>();
+         competitors.add(cluster);
+         Vector<String> flowVars = new Vector<String>();
+         flowVars.add(flowVar);
+         clustNeighbors.put(cluster, new Object[] {competitors, flowVars});
+     }
+
+     // 2) the read: seen for the first time, so only create its entry
+     if(!multiRnas.containsKey(varName)){
+         Vector<String> connectionVars = new Vector<String>();
+         connectionVars.add(connectionVar);
+         Vector<Gene> connectedGenes = new Vector<Gene>();
+         connectedGenes.add(cluster);
+         multiRnas.put(varName, new Object[] {connectionVars, connectedGenes});
+         return;
+     }
+
+     // 3) the read is already known: extend its entry and connect all competing candidates
+     Object[] readEntry = multiRnas.get(varName);
+     ((Vector<String>) readEntry[0]).add(connectionVar);
+
+     Vector<Gene> connectedGenes = (Vector<Gene>) readEntry[1];
+     connectedGenes.add(cluster);
+
+     Vector<Gene> ownNeighbors = (Vector<Gene>) clustNeighbors.get(cluster)[0];
+     for(Gene other : connectedGenes){
+
+         if(!ownNeighbors.contains(other)){
+             ownNeighbors.add(other);
+         }
+
+         Vector<Gene> otherNeighbors = (Vector<Gene>) clustNeighbors.get(other)[0];
+         if(!otherNeighbors.contains(cluster)){
+             otherNeighbors.add(cluster);
+         }
+     }
+ }
+
+
+ /*
+ * go through all exons and sum up their lengths do determine the total exon length
+ */
+
+ public static void sumUpExonLengths(Gene gene, Vector<int[]> exons){
+
+ for(int[] exon :exons){
+ gene.exonLength += (exon[1] - exon[0] + 1);
+ }
+ }
+
+}
diff --git a/src/geneFinder/PrepareMapping_GF.java b/src/geneFinder/PrepareMapping_GF.java
new file mode 100755
index 0000000..03b0c8c
--- /dev/null
+++ b/src/geneFinder/PrepareMapping_GF.java
@@ -0,0 +1,56 @@
+package geneFinder;
+
+import java.io.File;
+import java.io.IOException;
+
+
+/**
+ * concatenates different genome files to one reference file
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class PrepareMapping_GF {
+
+ /*
+ * concatenate the reference files to one large fasta
+ */
+
+
+ public String prepareRefFile_GF(){
+
+ String nameRef = GeneFinder.pathOut+"resultsRun/concaRefFile";
+
+ Runtime prepRef = Runtime.getRuntime();
+ Process firstExe;
+ String allRefNames = new String();
+
+ try {
+
+ for(File genomeFile : GeneFinder.genomeFilesWithNames.keySet()){
+ String name = GeneFinder.pathToGenomeFiles+GeneFinder.genomeFilesWithNames.get(genomeFile);
+ allRefNames += name+"&&";
+ }
+
+ if(GeneFinder.useTopHat){
+ nameRef += ".fa";
+ }else{
+ nameRef += ".fasta";
+ }
+
+ String exe = "python "+ GeneFinder.pathToHelpFiles+"callCat.py " + allRefNames + " " + nameRef;
+ firstExe = prepRef.exec(exe);
+ firstExe.waitFor();
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+
+ return (GeneFinder.pathOut+"resultsRun/concaRefFile");
+ }
+
+}
diff --git a/src/geneFinder/ProkaryoteExtraction.java b/src/geneFinder/ProkaryoteExtraction.java
new file mode 100755
index 0000000..77292a6
--- /dev/null
+++ b/src/geneFinder/ProkaryoteExtraction.java
@@ -0,0 +1,623 @@
+package geneFinder;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.TreeMap;
+import java.util.Vector;
+
+import types.*;
+
+
+/**
+ * extracts high-coverage clusters as potential genes of prokaryotes
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ */
+
+public class ProkaryoteExtraction {
+
+ // variables for log file
+
+ public static int numMergedClusters;
+ //public static int numFoundNoStart_firstTime;
+ //public static int numFoundNoStop_firstTime;
+ //public static int numNoFrameFound;
+
+ /*
+ * method to analyze the mapping -> extract clusters of high coverage and assign start and stop codons
+ */
+
+ public int initializeClusterSearch(String nameRefFile){
+
+ File refFile;
+ if(GeneFinder.useTopHat){
+ refFile = new File(nameRefFile+".fa");
+ }else{
+ refFile = new File(nameRefFile+".fasta");
+ }
+
+ int id = 1;
+
+ for(String contigName : GeneFinder.mappedContigs.keySet()){
+ Contig thisContig = GeneFinder.mappedContigs.get(contigName);
+ StringBuffer contigSeq = new StringBuffer();
+
+ try{
+ BufferedReader br = new BufferedReader(new FileReader(refFile));
+ String line;
+
+ while((line = br.readLine()) != null){
+ if(line.startsWith(">")){
+ // test if correct contig
+ if(line.substring(1).startsWith(contigName)){
+ // found right one, now extract sequence
+ while(((line = br.readLine()) != null) && (line.length() != 0) && (!(line.startsWith(">")))){
+ String line2 = "";
+ if(Character.isLowerCase(line.charAt(0))){
+ for(int i = 0;i<line.length();i++){
+ char letter = line.charAt(i);
+ letter = Character.toUpperCase(letter);
+ line2 += letter;
+ }
+ }else{
+ line2 = line;
+ }
+ contigSeq.append(line2);
+ }
+ break;
+ }
+ }
+ }
+
+ if(contigSeq.length() == 0){
+ // oops, did not found contig
+ System.out.println("Error, could not find contig " + contigName);
+ System.exit(0);
+ }
+
+ // now that we have the sequence, search for areas with high coverage
+ Runtime r = Runtime.getRuntime();
+
+ id = searchClusters(thisContig, id, contigSeq);
+
+ double memBef_2 = (r.totalMemory()-r.freeMemory());
+
+ thisContig.positionTOmappingRnas.clear();
+ thisContig.positionTOmappingRnas = new TreeMap<Integer,Vector<Rna>>();
+ thisContig.positionTOdiff.clear();
+ thisContig.positionTOdiff = new TreeMap<Integer,Integer>();
+ contigSeq = null;
+ r.gc();
+ r.gc();
+
+ double memAft_2 = (r.totalMemory()-r.freeMemory());
+ if(!GeneFinder.secondPart){
+ System.out.println("Memory freed = " + (((memBef_2-memAft_2)/1000.0)/1000.0) + "MB");
+ System.out.println();
+ }
+
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ return id;
+
+ }
+
+ /*
+ * searches potential genes on forward and reverse strand, extracts the specific regions respecting reading frame (if possible)
+ *
+ * procedure, as far as visible in this method:
+ * - if GeneFinder.noAmbiOpti is set, the ambiguous hits of the contig are removed first
+ * - the positions of thisContig.positionTOmappingRnas are visited via the iterator of its key set
+ * - a new cluster is started at a position if the first entry of the coverage vector (plus the diff stored in positionTOdiff)
+ *   is at least GeneFinder.minCoverage and smaller than GeneFinder.maxCov
+ * - the cluster is extended as long as the next position is at most one read length away and the coverage stays
+ *   above GeneFinder.endCoverage and below GeneFinder.maxCov, otherwise it is extracted with clustIni
+ * - an extracted cluster is either merged with the cluster before (MergeClusters.checkIfNeedMerge) or added to thisContig.allGenes
+ * - for clusters without operonOrfs, the ORFs are defined with Prokaryote_Specials.define_OrfsInOperon
+ *
+ * GeneFinder.endCoverage and GeneFinder.maxCov are set here if they still have the value -1
+ *
+ * parameters: thisContig = contig to search on, id = id handed to clustIni, contigSeq = sequence of the contig
+ * returns: the id as returned by the last call of clustIni (or the given id if no cluster was extracted)
+ *
+ * NOTE(review): currentCompete, currentCompeteStart and localExonEnd are initialized with -1 and within this method only
+ * ever reset to -1, so all branches that require "currentCompeteStart != -1" cannot be entered here and
+ * chooseAlternativeEndInstead stays false - presumably left over from the variant with splice sites, TODO confirm
+ */
+
+ public int searchClusters(Contig thisContig, int id, StringBuffer contigSeq){
+
+ if(GeneFinder.noAmbiOpti){
+ HelperFunctions_GeneSearch.removeAmbiHits(thisContig);
+ }
+
+ Iterator<Integer> positionIt = thisContig.positionTOmappingRnas.keySet().iterator();
+
+ if(GeneFinder.iteration == 2 && !GeneFinder.secondPart){
+ System.out.print("Iteration 2. ");
+ WriteOutput.writeToLogFile("Iteration 2. ");
+ }
+ if(!GeneFinder.secondPart){
+ System.out.println("Contig: = " + thisContig.contigName);
+ WriteOutput.writeToLogFile("Contig: = " + thisContig.contigName + "\n\n");
+ }
+
+ int startPos = 0;
+ int currentPos = 0;
+ int nextPos = 0;
+
+ // variables for log file
+
+ numMergedClusters = 0;
+
+ if(positionIt.hasNext()){
+ currentPos = positionIt.next();
+ }
+
+ // initialize the coverage handling:
+ Object[] coverageVecAndPos = new Object[3]; // [0]: coverage vector, [1]: position (initially currentPos), [2]: -1 or the position used as start of a new cluster
+ Vector<Integer> coverageVec = new Vector<Integer>(); // vector is preferred over array because more flexible
+ // first the coverage is zero at all positions:
+ for(int arrPos=0;arrPos<GeneFinder.readLength;++arrPos){
+ coverageVec.add(0);
+ }
+
+ coverageVecAndPos[0] = coverageVec;
+ coverageVecAndPos[1] = currentPos;
+ coverageVecAndPos[2] = -1;
+
+ if(GeneFinder.endCoverage == -1){
+ GeneFinder.endCoverage = (1.0/3.0)*GeneFinder.minCoverage - 0.001;
+
+ if(!GeneFinder.secondPart){
+ System.out.println("End coverage estimated from required minimum coverage: " + GeneFinder.endCoverage);
+ WriteOutput.writeToLogFile("End coverage estimated from required minimum coverage: " + GeneFinder.endCoverage + "\n\n");
+ }
+
+ }
+
+ if(GeneFinder.maxCov == -1){
+ GeneFinder.maxCov = Double.MAX_VALUE; // so we always accept the coverage
+ }
+
+ boolean noMoreCluster = false;
+ boolean startedNewCluster = false; // this boolean ensures that also the last cluster is completed once it has been started due to sufficient coverage (otherwise, if map is empty, cluster is not extracted)
+ boolean doNotCountTwice = false; // if true we do not perform the coverageVec update for the first current position (when starting a new cluster) because this has already been done with "nextPos"
+
+ int numIdentifiedClusters = 0; // if = 1, this ends the while loop
+
+ do{ // outer loop: one pass per extracted cluster
+
+ Gene cluster = new Gene();
+ coverageVecAndPos[2] = -1;
+
+ do{ // inner loop: move on until one cluster has been extracted or no position is left
+
+ startedNewCluster = false;
+
+ int currentCompete = -1; // stores the current alternative interval, is only -1,-1 if no alternatives exist for current position
+ int currentCompeteStart = -1; // defines the first split determining the currentCompete, necessary to not extract exons, that overlap with currentCompete region
+
+ int localExonEnd = -1;
+ int endToPassForclosedGenes = -1; // when we have to close a gene within a currentCompete interval, it is important to grab the last included position as the end
+ boolean chooseAlternativeEndInstead = false;
+
+ TreeMap<Integer,Vector<Integer>> posiCovMap = new TreeMap<Integer,Vector<Integer>>(); // for each intron, the coverage add after begin and end is stored
+
+ int diff = 0;
+
+ if(!doNotCountTwice){
+
+ int covPlus = thisContig.positionTOmappingRnas.get(currentPos).size(); // NOTE(review): get() returns null if the map has no entry for currentPos (e.g. empty map, currentPos = 0) - presumably excluded by the caller, TODO confirm
+
+ Object[] returnValues = updateCoverageInterval_respectAlternatives(thisContig,covPlus,currentPos,coverageVecAndPos,posiCovMap,-1);
+
+ coverageVecAndPos = (Object[]) returnValues[0];
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnValues[1];
+
+ if(thisContig.positionTOdiff.keySet().contains(currentPos)){
+ diff = thisContig.positionTOdiff.get(currentPos); // if there occurred insertions or deletions before this positions add/subtract the difference
+ }
+
+ }
+
+ boolean startWithGene = false;
+ if((((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff >= GeneFinder.minCoverage) && (((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff < GeneFinder.maxCov)){ // coverage sufficient (and not too high) to start a cluster
+ startWithGene = true;
+ }
+
+ if(startWithGene){
+ startPos = (Integer)coverageVecAndPos[2]; // potential start of a new cluster begins at currentPos - bases covered by already present rnas
+ if((Integer)coverageVecAndPos[2] == -1){
+ startPos = currentPos;
+ }
+
+ // add also all rnas starting in interval startPos-currentPos
+
+ int pos_temp = (Integer)coverageVecAndPos[2];
+
+ if((Integer)coverageVecAndPos[2] != -1){
+ do{
+ //associatedRnas = addRnas(thisContig.positionTOmappingRnas.get(pos_temp),associatedRnas);
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,thisContig.positionTOmappingRnas.get(pos_temp));
+
+ if(thisContig.positionTOmappingRnas.higherKey(pos_temp) != null){
+ pos_temp = thisContig.positionTOmappingRnas.higherKey(pos_temp);
+ }else{
+ pos_temp = currentPos;
+ }
+ }while((pos_temp != currentPos) && !(pos_temp > currentPos));
+ }
+
+
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,thisContig.positionTOmappingRnas.get(currentPos));
+
+ startedNewCluster = true;
+
+ while(positionIt.hasNext()){
+
+ nextPos = positionIt.next();
+
+ int covPlusNext = thisContig.positionTOmappingRnas.get(nextPos).size();
+
+ Object[] returnValues = updateCoverageInterval_respectAlternatives(thisContig,covPlusNext,nextPos,coverageVecAndPos,posiCovMap,-1);
+
+ coverageVecAndPos = (Object[]) returnValues[0];
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnValues[1];
+
+ if(thisContig.positionTOdiff.keySet().contains(nextPos)){
+ diff = thisContig.positionTOdiff.get(nextPos); // if there occurred insertions or deletions before this positions add/subtract the difference
+ }else{
+ diff = 0;
+ }
+
+ if(((nextPos - currentPos) <= (GeneFinder.readLength)) && (((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff > GeneFinder.endCoverage) && (((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff < GeneFinder.maxCov)){ // next position still belongs to the cluster, so extend it
+
+ endToPassForclosedGenes = nextPos;
+
+ currentPos = nextPos;
+
+ if(currentPos >= currentCompete){
+ currentCompete = -1;
+ currentCompeteStart = -1;
+ }
+
+
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,thisContig.positionTOmappingRnas.get(currentPos));
+
+
+ if(localExonEnd != -1){
+ localExonEnd = currentPos + GeneFinder.readLength;
+ }
+
+ }else{
+ boolean closeGene = false;
+
+ int basesToAddForOverlap = 0;
+ int positionPosiCovMap = HelperFunctions_GeneSearch.findIntronNearNextPos(cluster,nextPos);
+
+ if(posiCovMap.containsKey(positionPosiCovMap)){
+ basesToAddForOverlap = posiCovMap.get(positionPosiCovMap).size();
+ }
+
+ if((currentCompeteStart != -1 && ((currentCompeteStart - currentPos) < GeneFinder.readLength)) && ((currentCompete + basesToAddForOverlap + 1) >= nextPos) && (((Vector<Integer>) coverageVecAndPos[0]).get(0) + diff < GeneFinder.maxCov)){ // currentCompeteStart is never set to a value other than -1 in this method, so the else branch below is the one taken
+ // +1 because currentCompete defines the intron end
+ // have to go on because we are still within the cluster, so just decide if this part shall be included or not
+
+ int goOn = 0;
+
+ if((nextPos - currentPos) > (GeneFinder.readLength)){
+ goOn = -1;
+ }
+
+ if(goOn != -1){
+
+ if(goOn == 0){
+ HelperFunctions_GeneSearch.addRnasFromVector(cluster,thisContig.positionTOmappingRnas.get(nextPos));
+ }
+
+ currentPos = nextPos;
+
+ if(currentPos >= currentCompete){
+ currentCompete = -1;
+ currentCompeteStart = -1;
+ }
+ }else{
+
+ currentCompete = -1;
+ currentCompeteStart = -1;
+ if(endToPassForclosedGenes != -1){
+ chooseAlternativeEndInstead = true;
+ }
+ closeGene = true;
+ }
+
+ }else{
+ closeGene = true;
+ }
+
+
+ if(closeGene){
+
+ int basesToAdd = GeneFinder.readLength;
+
+ if(chooseAlternativeEndInstead){
+ currentPos = endToPassForclosedGenes;
+ }
+
+ if(((currentCompeteStart != -1 && ((currentCompeteStart - currentPos) < GeneFinder.readLength))) && currentCompete > currentPos){
+
+ currentPos = currentCompete + 1; // nextPos is bigger than split end, but we have to consider the split anyway
+
+ // look new number up in posiCovMap
+
+ if(posiCovMap.containsKey(currentCompete)){
+ basesToAdd = posiCovMap.get(currentCompete).size();
+ }
+
+ }
+
+ // extract this cluster, interval [startPos,currentPos+readLength]
+
+ id = clustIni(cluster, thisContig, contigSeq, startPos, id, currentPos,basesToAdd);
+
+ numIdentifiedClusters++;
+ currentPos = nextPos; // now nextPos is potential new start
+
+ doNotCountTwice = true;
+ break;
+ }
+ }
+ }
+
+ }else{
+ if(positionIt.hasNext()){
+ currentPos = positionIt.next();
+ doNotCountTwice = false;
+
+ cluster.idTOassociatedRnas.clear(); // no cluster was started at the last position, so reset the candidate
+ cluster.possibleIntrons.clear();
+
+ }else{
+ break;
+ }
+ }
+
+ if(!positionIt.hasNext() && (numIdentifiedClusters < 1) && startedNewCluster){ // to grab very last cluster
+
+ int basesToAdd = GeneFinder.readLength;
+
+ if(chooseAlternativeEndInstead){
+ currentPos = endToPassForclosedGenes;
+ }
+
+ if(((currentCompeteStart != -1 && ((currentCompeteStart - currentPos) < GeneFinder.readLength))) && currentCompete > currentPos){
+
+ currentPos = currentCompete + 1; // nextPos is bigger than split end, but we have to consider the split anyway
+
+ // look new number up in posiCovMap
+
+ if(posiCovMap.containsKey(currentCompete)){
+ basesToAdd = posiCovMap.get(currentCompete).size();
+ }
+ }
+
+ id = clustIni(cluster, thisContig, contigSeq, startPos, id, currentPos,basesToAdd);
+
+ numIdentifiedClusters++;
+ }
+
+ }while(positionIt.hasNext() && (numIdentifiedClusters < 1));
+
+ if(!positionIt.hasNext() && (numIdentifiedClusters < 1)){
+ // reached end of reference sequence, no cluster has been extracted this time, so stop
+ noMoreCluster = true;
+ break;
+ }
+
+
+ // now test for overlap merging:
+
+ if(thisContig.allGenes.size() > 0){
+
+ Gene clusterBeforeInVec = thisContig.allGenes.get(thisContig.allGenes.size()-1);
+
+ if(MergeClusters.checkIfNeedMerge(cluster, clusterBeforeInVec,thisContig,contigSeq)){ // if true, the cluster is not added to allGenes, only the merge is counted
+ numMergedClusters++;
+ }else{
+ thisContig.allGenes.add(cluster);
+ }
+
+ }else{
+ thisContig.allGenes.add(cluster);
+ }
+
+
+ if(thisContig.allGenes.size() > 1){
+
+ Gene clustBef = thisContig.allGenes.get(thisContig.allGenes.size()-2); // the cluster before the last one cannot be merged anymore, so define its ORFs now
+ if(clustBef.operonOrfs.isEmpty()){
+ Object[] returnArr = Prokaryote_Specials.define_OrfsInOperon(clustBef.sequence,clustBef);
+ clustBef.operonOrfs = (Vector<int[]>) returnArr[0]; // first entry is always 1-d array indicating whether normal gene sequence follows (1) or operon sequence (-1)
+ clustBef.operonDirectionIsForward = (Boolean) returnArr[1];
+ }
+
+ }
+
+ if(!positionIt.hasNext()){
+ noMoreCluster = true;
+ break;
+ }
+
+ numIdentifiedClusters--;
+
+ }while(!noMoreCluster);
+
+ // check for the last extracted cluster if it has a proper stop or not and further check for twins
+
+ if(thisContig.allGenes.size() != 0){
+ Gene clustBef = thisContig.allGenes.get(thisContig.allGenes.size()-1);
+
+ if(clustBef.operonOrfs.isEmpty()){
+ Object[] returnArr = Prokaryote_Specials.define_OrfsInOperon(clustBef.sequence, clustBef);
+ clustBef.operonOrfs = (Vector<int[]>) returnArr[0]; // first entry is always 1-d array indicating whether normal gene sequence follows (1) or operon sequence (-1)
+ clustBef.operonDirectionIsForward = (Boolean) returnArr[1];
+ }
+
+ }
+
+ if(!GeneFinder.secondPart){
+ String s = "";
+ s += "No more clusters can be found \n";
+ s += "Total identified clusters: " + thisContig.allGenes.size() + "\n";
+ s += "Number mergings: " + numMergedClusters + "\n";
+
+ WriteOutput.writeToLogFile(s + "\n");
+ System.out.println(s);
+ }
+
+ return id;
+ }
+
+ /*
+ * check if only multiRnaSupport
+ */
+
+ public static boolean checkIfOnlyMultiRnas(Gene gene){
+
+ for(String rnaString : gene.idTOassociatedRnas.keySet()){
+ Rna rna = ((Rna)gene.idTOassociatedRnas.get(rnaString)[0]);
+ if(rna.isMulti == 0){
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /*
+ * for all rnas mapping to this gene, remove the gene alignment:
+ * per rna, the first mapping entry that lies on thisContig with an alignment position
+ * strictly between gene.startPos and (gene.stopPos - GeneFinder.readLength) is removed
+ * and the rna's assignedNum is decreased by one
+ */
+
+ public static void removeGenesRnas(Gene gene, Contig thisContig){
+
+     for(String key : gene.idTOassociatedRnas.keySet()){
+         Rna currentRna = (Rna) gene.idTOassociatedRnas.get(key)[0];
+
+         for(int idx = 0; idx < currentRna.contigsMappedOn.size(); ++idx){
+
+             Object[] mapping = currentRna.contigsMappedOn.get(idx);
+             int alignPos = ((Integer) mapping[1]).intValue();
+
+             if(!((Contig) mapping[0]).equals(thisContig)){
+                 continue; // mapping belongs to another contig
+             }
+             if(!(alignPos > gene.startPos)){
+                 continue; // starts at or left of the gene start
+             }
+             if(!(alignPos < (gene.stopPos-GeneFinder.readLength))){
+                 continue; // does not lie sufficiently left of the gene stop
+             }
+
+             // only one alignment per rna is removed, so stop after the first hit
+             currentRna.contigsMappedOn.removeElement(mapping);
+             currentRna.assignedNum = currentRna.assignedNum -1;
+             break;
+         }
+
+     }
+ }
+
+ /*
+ * updates the coverage entries in the vector representing the current covered interval
+ *
+ * coverageVecAndPos is read and modified in place: [0] is cast to the coverage vector (Vector<Integer>),
+ * [1] is the position of the previous update, [2] is the "overlap-position" (-1 while unset).
+ * covPlus is added to the entries of the coverage vector; if considerSpliceSite != -1, entries at or beyond
+ * the offset (considerSpliceSite - currentPos) are not increased.
+ * additional coverage values for currentPos are fetched via HelperFunctions_GeneSearch.lookIntoPosiCovMap.
+ * returns {coverageVecAndPos, posiCovMap as returned by the lookup, addedValues}
+ */
+
+ public static Object[] updateCoverageInterval_respectAlternatives(Contig thisContig, int covPlus, int currentPos, Object[] coverageVecAndPos, TreeMap<Integer,Vector<Integer>> posiCovMap,int considerSpliceSite){
+
+ boolean addedValues = false;
+
+ int splitDiff = 0; // necessary to consider splice site, do not update with covplus if we exceed splice site
+ if(considerSpliceSite != -1){
+ splitDiff = (considerSpliceSite-currentPos);
+ }
+
+ Vector<Integer> covVecClone = new Vector<Integer>(); // stores all coverage add values derived by posiCovMap
+ Object[] returnObject = HelperFunctions_GeneSearch.lookIntoPosiCovMap(posiCovMap, currentPos);
+
+ addedValues = (Boolean) returnObject[0];
+ covVecClone = (Vector<Integer>) returnObject[1];
+ posiCovMap = (TreeMap<Integer,Vector<Integer>>) returnObject[2];
+
+ if(currentPos - (Integer) coverageVecAndPos[1] > GeneFinder.readLength){ // distance to the previous update exceeds the read length: build a fresh vector
+
+ Vector<Integer> coverageVecNew = new Vector<Integer>();
+ // initialize:
+ for(int arrPos=0;arrPos<GeneFinder.readLength;++arrPos){
+ if((considerSpliceSite != -1) && (arrPos >= splitDiff)){
+ coverageVecNew.add(0);
+ }else{
+ coverageVecNew.add(covPlus);
+ }
+ }
+
+ coverageVecAndPos[0] = coverageVecNew;
+ coverageVecAndPos[2] = -1; // overlap-position is unset again
+
+
+ }else{
+
+ Vector<Integer> covVec = (Vector<Integer>) coverageVecAndPos[0];
+ for(int pos = 0;pos<(currentPos- (Integer) coverageVecAndPos[1]);++pos){ // shift by the distance moved: drop the front entry, append a zero
+ covVec.remove(covVec.firstElement());
+ covVec.add(0);
+ }
+ for(int vecPos=0;vecPos < covVec.size();++vecPos){
+ if(!((considerSpliceSite != -1) && (vecPos >= splitDiff))){
+ covVec.set(vecPos,(covVec.get(vecPos) + covPlus)); // correct?
+ }
+
+ }
+
+ if((((Integer)coverageVecAndPos[2] == -1)) || (currentPos-((Integer) coverageVecAndPos[2]) > (GeneFinder.readLength))){
+ // update overlap-position
+ boolean foundPos = false;
+ int pos_temp = (Integer) coverageVecAndPos[2];
+ if(pos_temp == -1){
+ coverageVecAndPos[2] = (Integer) coverageVecAndPos[1]; // unset so far: take the position of the previous update
+ }else{
+ if(!addedValues){ // if we have introns, then stay with the current start position
+ do{
+ if(thisContig.positionTOmappingRnas.lastKey() != pos_temp){
+ pos_temp = thisContig.positionTOmappingRnas.higherKey(pos_temp); // advance to the next larger key of positionTOmappingRnas
+ }else{
+ pos_temp = (Integer) coverageVecAndPos[1];
+ }
+ if((currentPos-(pos_temp) <= (GeneFinder.readLength))){
+ coverageVecAndPos[2] = pos_temp;
+ foundPos = true;
+ }
+ }while(!foundPos || !(pos_temp >= (Integer) coverageVecAndPos[1])); // NOTE(review): continues until a position was accepted AND pos_temp reached the previous update position - confirm this always terminates
+ }
+ }
+ }
+ }
+
+ coverageVecAndPos[1] = currentPos;
+
+ if(addedValues){ // add the values delivered by the posiCovMap lookup element-wise
+ for(int posVec = 0; posVec < GeneFinder.readLength; ++posVec){ // assumes both vectors hold at least readLength entries - TODO confirm
+ ((Vector<Integer>) coverageVecAndPos[0]).setElementAt((((Vector<Integer>)coverageVecAndPos[0]).get(posVec) + covVecClone.get(posVec)),posVec);
+ }
+ }
+
+ Object[] toReturn = {coverageVecAndPos,posiCovMap,addedValues};
+ return toReturn;
+ }
+
+ /*
+ * first extraction of each cluster, simply regard the high coverage interval
+ * the interval is widened by one read length on both sides (the left border is not moved below 0),
+ * the sequence is cut from contigSeq (right border limited to the contig length) and the basic cluster fields are set
+ * returns the next free id (id + 1)
+ */
+
+ public static int clustIni(Gene cluster, Contig thisContig, StringBuffer contigSeq, int startPos, int id, int currentPos, int basesToAdd){
+
+     final int extension = basesToAdd + GeneFinder.readLength;
+     final int clusterStart = Math.max(0,(startPos-GeneFinder.readLength));
+     final int exclusiveEnd = currentPos+extension; // position necessary for seq-extraction = actual stop + 1
+
+     cluster.sequence = contigSeq.substring(clusterStart,Math.min(exclusiveEnd,contigSeq.length()));
+
+     cluster.geneID = id;
+     cluster.onRevStrand = false;
+     cluster.startPos = clusterStart;
+     cluster.stopPos = exclusiveEnd-1; // note: stopPos itself is not limited to the contig length
+
+     cluster.coreSeq = (cluster.stopPos-cluster.startPos);
+     cluster.realDirectionNotKnown = true;
+
+     return id + 1;
+ }
+
+}
+
+
diff --git a/src/geneFinder/Prokaryote_Specials.java b/src/geneFinder/Prokaryote_Specials.java
new file mode 100755
index 0000000..ed886ee
--- /dev/null
+++ b/src/geneFinder/Prokaryote_Specials.java
@@ -0,0 +1,589 @@
+package geneFinder;
+
+import java.util.Vector;
+import types.Gene;
+
+/**
+ * class that includes methods especially applied in prokaryotic gene finding
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class Prokaryote_Specials {
+
+
+ public static Vector<Integer> bestCombi; // only store the best possible combination achieved so far
+ public static boolean bestIsForward;
+
+ public static int covered_Local;
+
+ public static int notCounted; // counts how often we rejected an ORF due to the size threshold
+ public static int alreadyCovered; // counts how often we rejected an ORF because it is contained in a bigger one
+
+ public static int[] cov; // necessary for divide and conquer
+
+ /*
+ * search all ORFs and determine the best set
+ *
+ * 1) forward and reverse ORFs are searched separately and each set is scored via Operon_LP.writeOperon_LP,
+ *    whose result is read as {score (Double), indices of chosen ORFs (Vector<Integer>), maxL (Integer)}
+ * 2) for the direction with the better score (forward wins ties) the chosen ORFs are merged with all ORFs
+ *    of the other direction and the LP is run again with the previously chosen ones passed as posChosen
+ * 3) if the second run chose more ORFs than the first one, a final run on the chosen ORFs only is performed
+ *    and the result is arranged by determineBestComposition, otherwise the ORFs of the first run are reported
+ *
+ * resets and uses the static fields bestCombi, bestIsForward, notCounted, alreadyCovered and cov
+ * returns {orfsVec (Vector<int[]>), bestIsForward (Boolean)}
+ */
+
+ public static Object[] define_OrfsInOperon(String inputSeq, Gene gene){
+
+ // first extract all possible ORFs
+
+ //String report = "";
+
+ bestCombi = new Vector<Integer>();
+ bestIsForward = true;
+
+ notCounted = 0;
+ alreadyCovered = 0;
+ cov = new int[inputSeq.length()];
+
+ Vector<int[]> allORFs_FO = searchFO_orfs(inputSeq);
+ //report += notCounted + " below, " + alreadyCovered + " covered, " + allORFs_FO.size() + " left (FO). ";
+
+ Object[] returnArr_FO = Operon_LP.writeOperon_LP(inputSeq,cov,allORFs_FO,-1,null);
+ double scoreFO = (Double) returnArr_FO[0];
+ Vector<Integer> chosenORFs_FO = (Vector<Integer>) returnArr_FO[1];
+ int maxL_FO = (Integer) returnArr_FO[2];
+
+ notCounted = 0;
+ alreadyCovered = 0;
+ cov = new int[inputSeq.length()];
+ Vector<int[]> allORFs_RE = searchRE_orfs(inputSeq);
+
+ //report += notCounted + " below, " + alreadyCovered + " covered, " + allORFs_RE.size() + " left (RE). ";
+
+ Object[] returnArr_RE = Operon_LP.writeOperon_LP(inputSeq,cov,allORFs_RE,-1,null);
+ double scoreRE = (Double) returnArr_RE[0];
+ Vector<Integer> chosenORFs_RE = (Vector<Integer>) returnArr_RE[1];
+ int maxL_RE = (Integer) returnArr_RE[2];
+
+ // now determine the BIC and remember the maximum
+
+ Vector<int[]> orfsVec = new Vector<int[]>();
+
+ if(scoreRE > scoreFO){
+ bestIsForward = false;
+ bestCombi.clear();
+ bestCombi = chosenORFs_RE;
+
+ // second iteration
+
+ Vector<int[]> mergedORFs = new Vector<int[]>();
+ Vector<Integer> posChosen = new Vector<Integer>();
+
+ int posNum = 0;
+
+ for(int pos : chosenORFs_RE){ // chosen reverse ORFs come first, so they occupy the indices 0..n-1 of mergedORFs
+ int[] tmp = {(allORFs_RE.get(pos)[0]), (allORFs_RE.get(pos)[1])};
+ mergedORFs.add(tmp);
+ posChosen.add(posNum++);
+ }
+
+ mergedORFs.addAll(allORFs_FO);
+ cov = new int[inputSeq.length()];
+ calculateCoverage(mergedORFs);
+ Object[] returnArr_RE_2 = Operon_LP.writeOperon_LP(inputSeq,cov,mergedORFs,maxL_RE,posChosen);
+ double scoreRE_2 = (Double) returnArr_RE_2[0]; // not used afterwards
+ Vector<Integer> chosenORFs_RE_2 = (Vector<Integer>) returnArr_RE_2[1];
+
+ if(chosenORFs_RE_2.size() > chosenORFs_RE.size()){
+
+ // start final iteration
+
+ Vector<int[]> mergedORFs_2 = new Vector<int[]>();
+ Vector<Integer> posChosen_2 = new Vector<Integer>();
+
+ int pos2_temp = 0;
+ for(int pos : chosenORFs_RE_2){
+ int[] tmp = {(mergedORFs.get(pos)[0]), (mergedORFs.get(pos)[1])};
+ mergedORFs_2.add(tmp);
+ if(!posChosen.contains(pos)){ // posChosen_2 collects the new indices of ORFs that were NOT chosen in the first run
+ posChosen_2.add(pos2_temp);
+ }
+ pos2_temp++;
+ }
+
+ cov = new int[inputSeq.length()];
+ calculateCoverage(mergedORFs_2);
+
+ Object[] returnArr_RE_3 = Operon_LP.writeOperon_LP(inputSeq,cov,mergedORFs_2,maxL_RE,posChosen_2);
+ double scoreRE_3 = (Double) returnArr_RE_3[0]; // not used afterwards
+ Vector<Integer> chosenORFs_RE_3 = (Vector<Integer>) returnArr_RE_3[1];
+
+ orfsVec = determineBestComposition(mergedORFs_2,chosenORFs_RE_3,posChosen_2,inputSeq,gene,0);
+
+ }else{
+ int[] first = {1}; // marker entry, see callers: 1 = normal gene sequence follows, -1 = operon sequence
+ orfsVec.add(first);
+ for(int pos : bestCombi){
+ int[] tmp = {(gene.startPos + allORFs_RE.get(pos)[0]), (gene.startPos + allORFs_RE.get(pos)[1])};
+ orfsVec.add(tmp);
+ }
+ }
+
+ }else{
+ bestIsForward = true;
+ bestCombi.clear();
+ bestCombi = chosenORFs_FO;
+
+ // second iteration
+
+ Vector<int[]> mergedORFs = new Vector<int[]>();
+ Vector<Integer> posChosen = new Vector<Integer>();
+
+ int posNum = 0;
+
+ for(int pos : chosenORFs_FO){ // chosen forward ORFs come first, so they occupy the indices 0..n-1 of mergedORFs
+ int[] tmp = {(allORFs_FO.get(pos)[0]), (allORFs_FO.get(pos)[1])};
+ mergedORFs.add(tmp);
+ posChosen.add(posNum++);
+ }
+
+ mergedORFs.addAll(allORFs_RE);
+ cov = new int[inputSeq.length()];
+ calculateCoverage(mergedORFs);
+ Object[] returnArr_FO_2 = Operon_LP.writeOperon_LP(inputSeq,cov,mergedORFs,maxL_FO,posChosen);
+ double scoreFO_2 = (Double) returnArr_FO_2[0]; // not used afterwards
+ Vector<Integer> chosenORFs_FO_2 = (Vector<Integer>) returnArr_FO_2[1];
+
+ if(chosenORFs_FO_2.size() > chosenORFs_FO.size()){
+
+ // start final iteration
+
+ Vector<int[]> mergedORFs_2 = new Vector<int[]>();
+ Vector<Integer> posChosen_2 = new Vector<Integer>();
+
+ int pos2_temp = 0;
+ for(int pos : chosenORFs_FO_2){
+ int[] tmp = {(mergedORFs.get(pos)[0]), (mergedORFs.get(pos)[1])};
+ mergedORFs_2.add(tmp);
+ if(!posChosen.contains(pos)){ // posChosen_2 collects the new indices of ORFs that were NOT chosen in the first run
+ posChosen_2.add(pos2_temp);
+ }
+ pos2_temp++;
+ }
+
+ cov = new int[inputSeq.length()];
+ calculateCoverage(mergedORFs_2);
+
+ Object[] returnArr_FO_3 = Operon_LP.writeOperon_LP(inputSeq,cov,mergedORFs_2,maxL_FO,posChosen_2);
+ double scoreFO_3 = (Double) returnArr_FO_3[0]; // not used afterwards
+ Vector<Integer> chosenORFs_FO_3 = (Vector<Integer>) returnArr_FO_3[1];
+
+ orfsVec = determineBestComposition(mergedORFs_2,chosenORFs_FO_3,posChosen_2,inputSeq,gene,1);
+
+ }else{
+ int[] first = {1}; // marker entry, see callers: 1 = normal gene sequence follows, -1 = operon sequence
+ orfsVec.add(first);
+ for(int pos : bestCombi){
+ int[] tmp = {(gene.startPos + allORFs_FO.get(pos)[0]), (gene.startPos + allORFs_FO.get(pos)[1])};
+ orfsVec.add(tmp);
+ }
+ }
+
+ }
+
+ /*if(bestIsForward){
+ for(int pos : bestCombi){
+ int[] tmp = {(gene.startPos + allORFs_FO.get(pos)[0]), (gene.startPos + allORFs_FO.get(pos)[1])};
+ orfsVec.add(tmp);
+ }
+ }else{
+ for(int pos : bestCombi){
+ int[] tmp = {(gene.startPos + allORFs_RE.get(pos)[0]), (gene.startPos + allORFs_RE.get(pos)[1])};
+ orfsVec.add(tmp);
+ }
+ }*/
+
+ //report += "OrfNum: " + orfsVec.size() + ", FO:" + bestIsForward + ", time: " + (double) (timeAft_all-timeBef_all)/1000.0 +"s.";
+
+ return new Object[]{orfsVec,bestIsForward};
+ }
+
+ /*
+ * sort reverse ORFs beginning with left-most start position (via bubblesort)
+ * necessary for combination determination
+ */
+
+ public static Vector<int[]> sort_ORFs(Vector<int[]> allORFs_RE){
+
+ int[] temp;
+ for(int i=1; i<allORFs_RE.size(); ++i) {
+ for(int j=0; j<allORFs_RE.size()-i; ++j) {
+ if((allORFs_RE.get(j)[0]) > (allORFs_RE.get(j+1)[0])) {
+ temp=allORFs_RE.get(j);
+ allORFs_RE.setElementAt(allORFs_RE.get(j+1),j);
+ allORFs_RE.setElementAt(temp,j+1);
+ }
+
+ }
+ }
+
+ return allORFs_RE;
+ }
+
+ /*
+ * searches all ORFs assuming forward direction:
+ * every "ATG" is taken as a start and paired with the nearest downstream in-frame stop codon (TAA, TGA, TAG)
+ * note: ORFs with (stop position - start position) <= 30 are rejected and counted in notCounted,
+ * ORFs completely contained in an already stored ORF are rejected and counted in alreadyCovered
+ * every accepted ORF is stored as {start, last base of the stop codon} and added to the static coverage array cov
+ */
+
+ public static Vector<int[]> searchFO_orfs(String inputSeq){
+
+     Vector<int[]> allORFs_FO = new Vector<int[]>();
+
+     int searchFrom = 0; // position at which the search for the next start codon begins
+
+     while(true){
+
+         // search directly in inputSeq instead of copying the remaining sequence with substring() for every lookup
+         int startPos = inputSeq.indexOf("ATG", searchFrom);
+         if(startPos == -1){
+             break;
+         }
+         searchFrom = startPos + 3;
+
+         int scanFrom = startPos + 3; // position at which the search for the next stop codon begins
+         int stopPos = -1;
+
+         while(stopPos == -1){
+
+             int stopSub[] = {inputSeq.indexOf("TAA", scanFrom),inputSeq.indexOf("TGA", scanFrom),inputSeq.indexOf("TAG", scanFrom)};
+             java.util.Arrays.sort(stopSub);
+
+             // after sorting, the first entry that is not -1 is the nearest stop codon
+             int nearestStop = -1;
+             for(int candidate : stopSub){
+                 if(candidate > -1){
+                     nearestStop = candidate;
+                     break;
+                 }
+             }
+
+             if(nearestStop == -1){
+                 break; // no stop codon left downstream, this start yields no ORF
+             }
+
+             if(((nearestStop-startPos) % 3) == 0){
+                 stopPos = nearestStop;
+             }else{
+                 scanFrom = nearestStop+1; // out of frame, continue behind this position
+             }
+         }
+
+         if(stopPos == -1){
+             continue;
+         }
+
+         if(stopPos-startPos > 30){
+             int[] orf = {startPos,(stopPos+2)};
+             if(!checkIfORFcovered(allORFs_FO,orf)){
+                 allORFs_FO.add(orf);
+                 for(int i=startPos;i<=stopPos+2;++i){
+                     cov[i]++;
+                 }
+             }else{
+                 alreadyCovered++;
+             }
+         }else{
+             notCounted++;
+         }
+     }
+
+     return allORFs_FO;
+ }
+
+ /*
+ * searches all ORFs assuming reverse direction:
+ * every "CAT" is taken as a (reverse complemented) start and paired with the nearest in-frame
+ * reverse complemented stop codon (TTA, TCA, CTA) to its left
+ * note: ORFs with (start position - stop position) <= 30 are rejected and counted in notCounted,
+ * ORFs completely contained in an already stored ORF are rejected and counted in alreadyCovered
+ * every accepted ORF is stored as {first base of the stop codon, last base of the start codon} and added to the static coverage array cov
+ */
+
+ public static Vector<int[]> searchRE_orfs(String inputSeq){
+
+     Vector<int[]> allORFs_RE= new Vector<int[]>();
+
+     int searchBefore = inputSeq.length(); // exclusive right border for the next start codon
+
+     while(true){
+
+         // a codon ending before the border starts at border-3 at the latest;
+         // searching directly in inputSeq avoids copying the prefix with substring() for every lookup
+         int startPos = inputSeq.lastIndexOf("CAT", searchBefore-3);
+         if(startPos == -1){
+             break;
+         }
+         searchBefore = startPos;
+
+         int scanBefore = startPos; // exclusive right border for the next stop codon
+         int stopPos = -1;
+
+         while(stopPos == -1){
+
+             int from = scanBefore-3;
+             // right-most of the three stop codons; -1 only if none of them is present
+             int nearestStop = Math.max(inputSeq.lastIndexOf("TTA", from),Math.max(inputSeq.lastIndexOf("TCA", from),inputSeq.lastIndexOf("CTA", from)));
+
+             if(nearestStop == -1){
+                 break; // no stop codon left upstream, this start yields no ORF
+             }
+
+             if(((startPos-nearestStop) % 3) == 0){
+                 stopPos = nearestStop;
+             }else{
+                 scanBefore = nearestStop+2; // out of frame, continue with codons starting left of this one
+             }
+         }
+
+         if(stopPos == -1){
+             continue;
+         }
+
+         if(startPos-stopPos > 30){
+             int[] orf = {stopPos,(startPos+2)};
+             if(!checkIfORFcovered(allORFs_RE,orf)){
+                 allORFs_RE.add(orf);
+                 for(int i=stopPos;i<=startPos+2;++i){
+                     cov[i]++;
+                 }
+             }else{
+                 alreadyCovered++;
+             }
+         }else{
+             notCounted++;
+         }
+     }
+
+     return allORFs_RE;
+ }
+
+ /*
+ * filter out all orfs that are completely included in bigger ones
+ */
+
+ public static boolean checkIfORFcovered(Vector<int[]> allORFs,int[] thisORF){
+
+ for(int[] orf : allORFs){
+ if((orf[0] <= thisORF[0]) && (orf[1] >= thisORF[1])){
+ return true;
+ }
+ }
+
+ return false;
+
+ }
+
+
+ /*
+ * derive the coverage from a set of given ORFs
+ */
+
+ public static void calculateCoverage(Vector<int[]> orfs){
+
+ for(int[] orf : orfs){
+ for(int i=orf[0];i<=orf[1];++i){
+ cov[i]++;
+ }
+ }
+
+ }
+
+ /*
+ * if we have chosen additional ORFs for the current direction, try to find the best combination of operons
+ * orgForward = 1 indicates, that original set was on forward strand, = 0 means reverse
+ *
+ * chosen ORFs whose index is contained in posChosenBef are treated as additional ORFs, all others as ORFs of the original strand.
+ * both groups are sorted and split into connected components (see defineConnectedComps); the components of one group
+ * are separated with the help of the coverage (static array cov) of the other group.
+ * the returned vector starts with the marker entry {-1}, followed by one entry per component as produced by fillVec;
+ * the group with the left-most ORF is written first (the original strand wins ties).
+ * NOTE(review): get(0) is called on both groups (here and in defineConnectedComps) without an emptiness check,
+ * so an empty group raises an ArrayIndexOutOfBoundsException - confirm that the callers guarantee non-empty groups.
+ */
+
+ public static Vector<int[]> determineBestComposition(Vector<int[]> mergedORFs, Vector<Integer> chosenORFs, Vector<Integer> posChosenBef,String inputSeq,Gene gene, int orgForward){
+
+ Vector<int[]> orfsVec = new Vector<int[]>();
+
+ int orgAdd = -1; // strand indicator for the additional ORFs, always the opposite of orgForward
+ if(orgForward == 1){
+ orgAdd = 0;
+ }else{
+ orgAdd = 1;
+ }
+
+ Vector<int[]> originalStrandORFs = new Vector<int[]>();
+ Vector<int[]> additionalORFs = new Vector<int[]>();
+ Vector<int[]> connectedComp_add = new Vector<int[]>(); // contains one connected component per int[] (boundaries at 0,1 and the posis of genes at following positions) for addtional orfs
+ Vector<int[]> connectedComp_norm = new Vector<int[]>();
+
+ for(int i : chosenORFs){
+
+ if(posChosenBef.contains(i)){
+ additionalORFs.add(mergedORFs.get(i));
+ }else{
+ originalStrandORFs.add(mergedORFs.get(i));
+ }
+
+ }
+
+ // determine connected comp for additional orfs
+
+ additionalORFs = sort_ORFs(additionalORFs);
+
+ cov = new int[inputSeq.length()];
+ calculateCoverage(originalStrandORFs); // cov now marks the original-strand ORFs, used to split the additional ORFs
+
+ connectedComp_add = defineConnectedComps(additionalORFs,connectedComp_add);
+
+ // now for original orfs
+
+ originalStrandORFs = sort_ORFs(originalStrandORFs);
+
+ cov = new int[inputSeq.length()];
+ calculateCoverage(additionalORFs); // cov now marks the additional ORFs, used to split the original-strand ORFs
+
+ connectedComp_norm = defineConnectedComps(originalStrandORFs,connectedComp_norm);
+
+ // fill orfsVec
+
+ orfsVec.add(new int[] {-1});
+
+ if(originalStrandORFs.get(0)[0] <= additionalORFs.get(0)[0]){
+ orfsVec = fillVec(connectedComp_norm,orgForward,gene,orfsVec,0); // first component of the left-most group starts at offset 0
+ orfsVec = fillVec(connectedComp_add,orgAdd,gene,orfsVec,connectedComp_add.get(0)[0]);
+ }else{
+ orfsVec = fillVec(connectedComp_add,orgAdd,gene,orfsVec,0); // first component of the left-most group starts at offset 0
+ orfsVec = fillVec(connectedComp_norm,orgForward,gene,orfsVec,connectedComp_norm.get(0)[0]);
+ }
+
+ return orfsVec;
+ }
+
+ /*
+ * fill the orf vector with operons for this candidate gene
+ * each connected component is copied into a new array that is one entry longer:
+ * entry 0 holds the strand, all following entries are the component's entries shifted by gene.startPos
+ * for the first component the left boundary is replaced by (gene.startPos + startPosition)
+ */
+
+ public static Vector<int[]> fillVec(Vector<int[]> connectedComp,int strand,Gene gene,Vector<int[]> orfsVec,int startPosition){
+
+     for(int compIdx = 0; compIdx<connectedComp.size();++compIdx){
+
+         int[] component = connectedComp.get(compIdx);
+         int[] shifted = new int[component.length+1];
+
+         shifted[0] = strand;
+
+         // left boundary: given start position for the first component, own boundary for all others
+         int leftOffset = (compIdx==0) ? startPosition : component[0];
+         shifted[1] = gene.startPos + leftOffset;
+         shifted[2] = gene.startPos + component[1];
+
+         // remaining entries are the positions of the single ORFs
+         int source = 2;
+         while(source<component.length){
+             shifted[source+1] = gene.startPos + component[source];
+             ++source;
+         }
+
+         orfsVec.add(shifted);
+     }
+
+     return orfsVec;
+ }
+
+ /*
+ * search for connected components in additional Orfs
+ */
+
+ public static Vector<int[]> defineConnectedComps(Vector<int[]> additionalORFs, Vector<int[]> connectedComp){
+
+ int[] tempBound = new int[2];
+ Vector<int[]> tempOrfs = new Vector<int[]>();
+ tempBound[0] = additionalORFs.get(0)[0];
+ tempBound[1] = additionalORFs.get(0)[1];
+ tempOrfs.add(additionalORFs.get(0));
+
+ for(int i = 1; i<additionalORFs.size();++i){
+ int[] orf = additionalORFs.get(i);
+
+ if(orf[0] <= tempBound[1]){ // connected
+ tempBound[1] = orf[1];
+ tempOrfs.add(orf);
+ }else{
+ int orfWithin = 0;
+ for(int pos = tempBound[1]+1; pos < orf[0];++pos){
+ if(cov[pos] > 0){
+ orfWithin = 1;
+ break;
+ }
+ }
+
+ if(orfWithin == 0){
+ tempBound[1] = orf[1];
+ tempOrfs.add(orf);
+ }else{
+ // define as not connected
+ int[] tempComp = new int[((tempOrfs.size()*2)+2)];
+ tempComp[0] = tempBound[0];
+ tempComp[1] = tempBound[1];
+ int posArr = 2;
+ for(int[] tempOrf : tempOrfs){
+ tempComp[posArr++] = tempOrf[0];
+ tempComp[posArr++] = tempOrf[1];
+ }
+ connectedComp.add(tempComp);
+
+ // newly initialize
+
+ tempBound = new int[2];
+ tempOrfs = new Vector<int[]>();
+ tempBound[0] = orf[0];
+ tempBound[1] = orf[1];
+ tempOrfs.add(orf);
+ }
+ }
+ }
+
+ int[] tempComp = new int[((tempOrfs.size()*2)+2)];
+ tempComp[0] = tempBound[0];
+ tempComp[1] = tempBound[1];
+ int posArr = 2;
+ for(int[] tempOrf : tempOrfs){
+ tempComp[posArr++] = tempOrf[0];
+ tempComp[posArr++] = tempOrf[1];
+ }
+ connectedComp.add(tempComp);
+ return connectedComp;
+ }
+}
\ No newline at end of file
diff --git a/src/geneFinder/ReadInParameters_GeneFinder.java b/src/geneFinder/ReadInParameters_GeneFinder.java
new file mode 100755
index 0000000..70ae2dd
--- /dev/null
+++ b/src/geneFinder/ReadInParameters_GeneFinder.java
@@ -0,0 +1,485 @@
+package geneFinder;
+
+import java.io.*;
+import java.net.URLDecoder;
+import java.util.*;
+
+/**
+ * class to read in the command line parameters and get all information
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class ReadInParameters_GeneFinder {
+
+ /*
+ * do the read in:
+ * parses the command line arguments, stores the recognised options in the static fields of GeneFinder,
+ * applies defaults for every option that was not given, runs "mkdir" for the "resultsRun" directory and sets GeneFinder.logFile
+ * exits with status 0 if no genome input was found or if neither rna reads nor a sam file were given
+ * NOTE(review): every option that takes a value reads args[i+1] without a bounds check, so a value-taking option
+ * given as the last argument raises an ArrayIndexOutOfBoundsException - confirm whether that is acceptable
+ * NOTE(review): File.list() returns null for a path that is not a readable directory, which makes the
+ * fileList.length accesses below throw a NullPointerException
+ * NOTE(review): unless printHelp_GF() terminates the program, parsing continues after the help text was printed
+ */
+
+ public static void readIn_GF(String[] args){
+
+ if(args.length == 0 || args[0].equals("-h") || args[0].equals("--help") || args[0].equals("-help")){
+ printHelp_GF();
+ }
+
+ String parameter = Arrays.toString(args);
+
+ // read in everything:
+
+ String inputText = "";
+ System.out.println();
+ inputText += "Input:\n";
+
+ boolean foundGenome = false;
+ boolean foundRna = false;
+ boolean foundThreadNumber = false;
+ boolean foundOutFileName = false;
+ boolean foundTool = false;
+ boolean foundLength = false;
+ boolean havePathOut = false;
+ boolean foundMinCov = false;
+ boolean foundMaxCov = false;
+ boolean foundEndCov = false;
+ boolean foundMaxRepHits = false;
+ boolean foundHelpPath = false;
+ boolean foundIntronMin = false;
+ boolean foundAmbiOpti = false;
+ boolean foundOptiMethod = false;
+ boolean foundDispCov = false;
+ boolean foundSolveOn = false;
+ boolean foundIter = false;
+ boolean foundnoSplLim = false;
+ boolean foundSam = false;
+ boolean foundSplitIndicator = false;
+ boolean foundSecondIndicator = false;
+ boolean foundProkaryote = false;
+ boolean foundSequential = false;
+ boolean foundInprogea = false;
+
+
+ if(!parameter.isEmpty() && args.length > 0){ // Arrays.toString never yields an empty string ("[]" for no arguments), so only args.length decides here
+ System.out.println();
+ for(int i = 0; i<args.length;i++){
+ String arg = args[i];
+
+ if(arg.equals("-iG")){ //genome
+
+ GeneFinder.pathToGenomeFiles = args[i+1];
+
+ if(args[i+1].endsWith(".fasta") || args[i+1].endsWith(".fa")){ // single genome file: split into directory part and file name
+ File dir = new File(args[i+1]);
+ String[] nameSplit = args[i+1].split("/");
+ String name = nameSplit[nameSplit.length-1];
+ GeneFinder.pathToGenomeFiles = args[i+1].substring(0,(args[i+1].length()-name.length()));
+ foundGenome = true;
+
+ inputText += "genome: \n";
+
+ inputText += " - " + name + "\n";
+ GeneFinder.genomeFilesWithNames.put(dir,name);
+
+ }else{ // otherwise the argument is treated as a directory and all contained .fasta/.fa files are used
+ File dir = new File(args[i+1]);
+ String[] fileList = dir.list(new FilenameFilter() {
+ public boolean accept(File d, String name) {
+ if(name.endsWith(".fasta") || name.endsWith(".fa")){
+ return true;
+ }else{
+ return false;
+ }
+ }
+ });
+ if(fileList.length != 0){
+ foundGenome = true;
+ }
+ inputText += "genomes: \n";
+ for(String name : fileList){
+ inputText += " - " + name + "\n";
+ File newFile = new File(args[i+1]+name); // plain concatenation: the directory argument has to end with a path separator
+ GeneFinder.genomeFilesWithNames.put(newFile,name);
+ }
+ }
+ } else if(arg.equals("-iR")){ //rna reads
+
+ String pathToRnaFiles = args[i+1];
+
+ if(pathToRnaFiles.endsWith(".fastq") || pathToRnaFiles.endsWith(".fq")){ // single read file
+ File dir = new File(pathToRnaFiles);
+ String[] nameSplit = pathToRnaFiles.split("/");
+ String name = nameSplit[nameSplit.length-1];
+ foundRna = true;
+
+ inputText += "rna file: \n";
+
+ inputText += " - " + name + "\n";
+ GeneFinder.rnaFilesWithNames.put(dir,pathToRnaFiles);
+
+ }else{ // otherwise the argument is treated as a directory and all contained .fastq/.fq files are used
+ File dir = new File(pathToRnaFiles);
+ String[] fileList = dir.list(new FilenameFilter() {
+ public boolean accept(File d, String name) {
+ if(name.endsWith(".fastq") || name.endsWith(".fq")){
+ return true;
+ }else{
+ return false;
+ }
+ }
+ });
+ if(fileList.length != 0){
+ foundRna = true;
+ }
+ inputText += "rna files: \n";
+ for(String name : fileList){
+ inputText += " - " + name + "\n";
+ File newFile = new File(pathToRnaFiles+name); // plain concatenation: the directory argument has to end with a path separator
+ GeneFinder.rnaFilesWithNames.put(newFile,pathToRnaFiles+name);
+ }
+ }
+
+ } else if(arg.equals("-outName")){ // output file name
+
+ GeneFinder.outputName = args[i+1];
+ inputText += "Outfile name: " + args[i+1] + "\n";
+ foundOutFileName = true;
+
+ }else if(arg.equals("-rL")){ // read length
+
+ GeneFinder.readLength = Integer.parseInt(args[i+1]);
+ foundLength = true;
+ inputText += "Read length: " + GeneFinder.readLength +"\n";
+
+ } else if(arg.equals("-mT")){ // read mapper
+ String mapper = args[i+1];
+ foundTool = true;
+ if(mapper.equals("bwa")){
+ GeneFinder.useTopHat = false;
+ GeneFinder.useBWAsw = false;
+ }else if(mapper.equals("bwasw")){
+ GeneFinder.useTopHat = false;
+ GeneFinder.useBWAsw = true;
+ }else{ // every other value selects TopHat
+ GeneFinder.useTopHat = true;
+ }
+
+ } else if(arg.equals("-minCov")){ // minimal number of required coverage for the beginning of a gene
+ GeneFinder.minCoverage = Double.parseDouble(args[i+1]);
+ foundMinCov = true;
+ inputText += "Minimal required coverage: " + GeneFinder.minCoverage + "\n";
+
+ } else if(arg.equals("-maxCov")){ // maximal coverage threshold
+ GeneFinder.maxCov = Double.parseDouble(args[i+1]);
+ foundMaxCov = true;
+ inputText += "Maximal coverage threshold: " + GeneFinder.maxCov + "\n";
+ } else if(arg.equals("-endCov")){ // minimal number of required coverage for the end of a gene
+ double endCov = Double.parseDouble(args[i+1]);
+ foundEndCov = true;
+ if(endCov == -1){
+ GeneFinder.endCoverage = -1; // indicates that it will be reassigned once we obtained the minimum coverage
+ }else if(endCov == 0){
+ GeneFinder.endCoverage = 0.0;
+ } else{
+ GeneFinder.endCoverage = endCov - 0.001; // subtract 0.001 to make sure that simple greater than is possible in extraction (necessary for endCov == 0)
+ }
+
+ inputText += "Minimal required end coverage: " + GeneFinder.endCoverage + "\n";
+
+ } else if(arg.equals("-settingMapper")){ // setting for topHat
+ GeneFinder.settingMapper = args[i+1].substring(1,(args[i+1].length()-1)); // drops the first and the last character of the value, presumably enclosing brackets or quotes - TODO confirm
+ inputText += "Setting of alignment tool: " + GeneFinder.settingMapper + "\n";
+
+ } else if(arg.equals("-nT")){ // number of threads allowed to be used
+ GeneFinder.numberThreads = Integer.parseInt(args[i+1]);
+ foundThreadNumber = true;
+ inputText += "Number of threads used in parallel: " + GeneFinder.numberThreads + "\n";
+
+ } else if(arg.equals("-out")){ // output directory
+ GeneFinder.pathOut = args[i+1];
+ havePathOut = true;
+ inputText += "Path to output: " + GeneFinder.pathOut + "\n";
+
+ } else if(arg.equals("-samForSequential")){ // sam file for the sequential analysis, also switches the sequential mode on
+ GeneFinder.haveSam_ChromSorted = args[i+1];
+ GeneFinder.useSequential = true;
+ foundSequential = true;
+ inputText += "Perform sequential analysis. Using sam file: " + GeneFinder.haveSam_ChromSorted + "\n";
+
+ } else if(arg.equals("-haveSam")){ // already existing sam file
+ GeneFinder.haveSam = args[i+1];
+ foundSam = true;
+ inputText += "Using sam file: " + GeneFinder.haveSam + "\n";
+
+ } else if(arg.equals("-scripts")){ // help file directory
+ GeneFinder.pathToHelpFiles = args[i+1];
+ foundHelpPath = true;
+ inputText += "Path to help files: " + GeneFinder.pathToHelpFiles + "\n";
+
+ } else if(arg.equals("-maxReportedHitsBWA")){ // max reported hits in BWA output
+ GeneFinder.maxReportedHitsBWA = Integer.parseInt(args[i+1]);
+ foundMaxRepHits = true;
+ inputText += "maximal number of reported hits for BWA: " + GeneFinder.maxReportedHitsBWA + "\n";
+
+ } else if(arg.equals("-interval")){ // minimal intron length
+ GeneFinder.interval = Integer.parseInt(args[i+1]);
+ foundIntronMin = true;
+ if(GeneFinder.interval == -1){
+ inputText += "use read length as minimal interval length\n";
+ }else{
+ inputText += "minimal interval length: " + GeneFinder.interval + "\n";
+ }
+
+ } else if(arg.equals("-noAmbiOpti")){ // turn on or off the optimization of ambiguous reads
+ foundAmbiOpti = true;
+ GeneFinder.noAmbiOpti = true;
+ inputText += "Ambiguous reads are excluded from analysis.\n";
+ } else if(arg.equals("-opti")){ // optimizer: "glpk" selects glpk, every other value selects cplex
+ String optimizer = args[i+1];
+ foundOptiMethod = true;
+ if(optimizer.equals("glpk")){
+ GeneFinder.useCPLEX = false;
+ GeneFinder.useGLPK = true;
+ inputText += "Using glpk for ambiguous read optimization.\n";
+ } else{
+ GeneFinder.useCPLEX = true;
+ GeneFinder.useGLPK = false;
+ inputText += "Using cplex for ambiguous read optimization.\n";
+ }
+ } else if(arg.equals("-mem")){
+ GeneFinder.memForCplex = Integer.parseInt(args[i+1]);
+ } else if(arg.equals("-dispCov")){ // only the value 1 switches the display on
+ foundDispCov = true;
+ int display = Integer.parseInt(args[i+1]);
+ if(display == 1){
+ GeneFinder.dispCov = true;
+ } else{
+ GeneFinder.dispCov = false;
+ }
+ } else if(arg.equals("-solverOn")){ // only the value "y" switches it on
+ foundSolveOn = true;
+
+ if(args[i+1].equals("y")){
+ GeneFinder.optiSolve = true;
+ } else{
+ GeneFinder.optiSolve = false;
+ }
+ } else if(arg.equals("-splitRunAndOpti")){ // only the value "y" switches it on
+ foundSplitIndicator = true;
+
+ if(args[i+1].equals("y")){
+ GeneFinder.splitRunAndOpti = true;
+ } else{
+ GeneFinder.splitRunAndOpti = false;
+ }
+ } else if(arg.equals("-secondPart")){ // only the value "y" switches it on
+ foundSecondIndicator = true;
+
+ if(args[i+1].equals("y")){
+ GeneFinder.secondPart = true;
+ } else{
+ GeneFinder.secondPart = false;
+ }
+ } else if(arg.equals("-prokaryote")){ // flag without value
+ foundProkaryote = true;
+ GeneFinder.isProkaryote = true;
+ /*if(args[i+1].equals("y")){
+ GeneFinder.isProkaryote = true;
+ } else{
+ GeneFinder.isProkaryote = false;
+ }*/
+ } else if(arg.equals("-iter")){ // every value other than 1 is mapped to iteration 2
+ foundIter = true;
+ int iteration = Integer.parseInt(args[i+1]);
+ if(iteration == 1){
+ GeneFinder.iteration = 1;
+ } else{
+ GeneFinder.iteration = 2;
+ }
+ } else if(arg.equals("-splLim")){ // threshold for splice site acceptance
+ foundnoSplLim = true;
+ double num = Double.parseDouble(args[i+1]);
+ if(num == 0){
+ GeneFinder.spliceLim = 0.0;
+ inputText += "Accept all present splice sites. \n";
+ } else if(num == -1){
+ GeneFinder.spliceLim = -1.0;
+ inputText += "Using minimal coverage as a threshold for splice site acceptance. \n";
+ } else{
+ GeneFinder.spliceLim = num;
+ inputText += "Using " + num + " as a threshold for splice site acceptance. \n";
+ }
+ } else if(arg.equals("-inprogea")){ // flag without value
+ foundInprogea = true;
+ GeneFinder.inprogeaCall = true;
+ }
+
+ }
+ }
+
+ // defaults and error messages:
+
+ if(!havePathOut){
+ GeneFinder.pathOut = "";
+ }
+ if(!foundGenome){
+ System.out.println("No genome file specified. Use \"-h\" to print usage options. ");
+ System.exit(0);
+ }
+ if(!foundRna){
+ if(!foundSam){ // reads may be omitted if a sam file was given
+ System.out.println("No rna file specified. Use \"-h\" to print usage options. ");
+ System.exit(0);
+ }
+ }
+ if(!foundThreadNumber){
+ GeneFinder.numberThreads = 1;
+ }
+ if(!foundTool){
+ GeneFinder.useTopHat = true;
+ }
+ if(!foundMinCov){
+ GeneFinder.minCoverage = -1;
+ inputText += "Estimate minimal coverage from the alignment. \n";
+ }
+ if(!foundMaxCov){
+ GeneFinder.maxCov = -1;
+ }
+ if(!foundEndCov){
+ GeneFinder.endCoverage = -1;
+ }
+ if(!GeneFinder.useTopHat && !foundMaxRepHits){
+ GeneFinder.maxReportedHitsBWA = 2;
+ }
+ if(!foundHelpPath){
+ //GeneFinder.pathToHelpFiles ="";
+ String path = GeneFinder.class.getProtectionDomain().getCodeSource().getLocation().getPath();
+
+ try {
+ String decodedPath = URLDecoder.decode(path, "UTF-8");
+ String scriptPath = decodedPath.substring(0,decodedPath.length()-9); // cuts the last 9 characters, presumably the file name "GIIRA.jar" - TODO confirm
+ GeneFinder.pathToHelpFiles = scriptPath + "scripts/";
+ } catch (UnsupportedEncodingException e) {
+ e.printStackTrace();
+ }
+ }
+ if(!foundLength){
+ GeneFinder.readLength = -1;
+ }
+ if(!foundIntronMin){
+ GeneFinder.interval = -1;
+ }
+ if(!foundOutFileName){
+ GeneFinder.outputName = "genes";
+ }
+ if(!foundAmbiOpti){
+ GeneFinder.noAmbiOpti = false;
+ }
+ if(!foundOptiMethod){
+ GeneFinder.useCPLEX = true;
+ }
+ if(!foundSolveOn){
+ GeneFinder.optiSolve = true;
+ }
+ if(!foundDispCov){
+ GeneFinder.dispCov = false;
+ }
+ if(!foundIter){
+ GeneFinder.iteration = 1;
+ }
+ if(!foundnoSplLim){
+ GeneFinder.spliceLim = -1;
+ inputText += "Using minimal coverage as a threshold for splice site acceptance. \n";
+ }
+ if(!foundSam){
+ GeneFinder.haveSam = null;
+ }
+ if(!foundSplitIndicator){
+ GeneFinder.splitRunAndOpti = false;
+ }
+ if(!foundSecondIndicator){
+ GeneFinder.secondPart = false;
+ }
+ if(!foundProkaryote){
+ GeneFinder.isProkaryote = false;
+ }
+ if(!foundSequential){
+ GeneFinder.useSequential = false;
+ }
+ if(!foundInprogea){
+ GeneFinder.inprogeaCall = false;
+ }
+
+ Runtime rtAlign = Runtime.getRuntime();
+ Process firstExe;
+ try {
+ String exe = "mkdir "+GeneFinder.pathOut+"resultsRun"; // NOTE(review): Runtime.exec(String) splits the command at whitespace, so an output path containing blanks breaks this call
+ GeneFinder.logFile = new File(GeneFinder.pathOut+"resultsRun/log_it" + GeneFinder.iteration + ".txt");
+ firstExe = rtAlign.exec(exe);
+ firstExe.waitFor(); // the exit status of mkdir is not evaluated
+
+ if(!GeneFinder.secondPart){
+ System.out.println(inputText);
+ WriteOutput.writeToLogFile(inputText);
+ }
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+
+
+
+ }
+
+ /*
+ * Writes the GIIRA banner, the license note, a usage example and the list of all
+ * command line options to standard out and then terminates the JVM with exit code 0.
+ */
+
+ public static void printHelp_GF(){
+
+     // banner, copyright note and usage example; every entry ends up on its own output line
+     String[] introLines = {
+         "",
+         "GIIRA (Gene Identification Incorporating RNA-Seq data and Ambiguous reads) is a method to identify potential gene regions in a genome "
+             + "based on a RNA-Seq mapping and incorporating ambiguously mapped reads.",
+         "",
+         "Copyright (c) 2013,",
+         "Franziska Zickmann, ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany,",
+         "Distributed under the GNU Lesser General Public License, version 3.0.",
+         "",
+         "usage example: \n java -jar GIIRA.jar -iG genomeFile.fasta -iR rnaFile.fastq -libPath [PATH_CPLEX_LIB] -cp [PATH_CPLEX/cplex.jar] \n",
+         ""
+     };
+
+     for(String introLine : introLines){
+         System.out.println(introLine);
+     }
+
+     // the option descriptions are glued together unchanged and emitted with one single println
+     // (the option -splitRunAndOpti exists in the parser but is intentionally not advertised here)
+     String[] optionParts = {
+         "options: \n -h : help text and exit \n",
+         " \n -iG [pathToGenomes] : specify path to directory with genome files in fasta format \n",
+         " \n -iR [pathToRna] : specify path to directory with rna read files in fastq format \n",
+         " \n -scripts [absolutePath] : specify the absolute path to the directory containing the required helper scripts, DEFAULT: directory of GIIRA.jar \n",
+         " \n -out [pathToResults] : specify the directory that shall contain the results files \n",
+         " \n -outName [outputName] : specify desired name for output files, DEFAULT: genes \n",
+         " \n -haveSam [samfileName]: if a sam file already exists, provide the name, else a mapping is performed. NOTE: the sam file has to be sorted according to read names! \n",
+         " \n -nT [numberThreads] : specify the maximal number of threads that are allowed to be used, DEFAULT: 1 \n",
+         " \n -mT [tophat/bwa/bwasw] : specify desired tool for the read mapping, DEFAULT: tophat \n",
+         " \n -opti [cplex/glpk] : specify the desired optimization method, either using CPLEX optimizer (cplex, DEFAULT) or glpk solver (glpk) \n",
+         " \n -libPath [PATH] : if cplex is the desired optimizer, specify the absolute path to the cplex library Djava.library.path \n",
+         " \n -cp [PATH] : if cplex is the desired optimizer, specify the absolute path to the cplex jar file cplex.jar\n",
+         " \n -mem [int] : specify the amount of memory that cplex is allowed to use \n",
+         " \n -maxReportedHits [int] : if using BWA as mapping tool, specify the maximal number of reported hits, DEFAULT: 2 \n",
+         " \n -prokaryote : if specified, genome is treated as prokaryotic, no spliced reads are accepted, and structural genes are resolved. DEFAULT: n \n",
+         " \n -minCov [double] : specify the minimum required coverage of the gene candidate extraction, DEFAULT: -1 (is estimated from mapping) \n",
+         " \n -maxCov [double] : optional maximal coverage threshold, can also be estimated from mapping (DEFAULT) \n",
+         " \n -endCov [double] : if the coverage falls below this value, the currently open candidate gene is closed. This value can be estimated from the minimum coverage (-1); DEFAULT: -1 \n",
+         " \n -dispCov [0/1] : estimate (1) the coverage histogram for the read mapping, DEFAULT: 0 \n",
+         " \n -interval [int] : specify the minimal size of an interval between near candidate genes, if \"-1\" it equals the read length. DEFAULT: -1 \n ",
+         " \n -splLim [double] : specify the minimal coverage that is required to accept a splice site, if (-1) the threshold is equal to minCov, DEFAULT: -1 \n",
+         " \n -rL [int] : specify read length, otherwise this information is extracted from SAM file (DEFAULT) \n",
+         " \n -samForSequential [pathToSamFile] : if it is desired to analyse chromosomes in a sequential manner, provide a chromosome sorted sam file in addition to the one sorted by read names, DEFAULT: noSequential \n",
+         " \n -noAmbiOpti : if specified, ambiguous hits are not included in the analysis \n",
+         " \n -settingMapper [(list of parameters)] : A comma-separated list of the desired parameters for TopHat or BWA. Please provide \n",
+         " for each parameter a pair of indicator and value, separated by an equality sign. \n",
+         " Note that paramters intended for the 3 different parts (indexing, aln, sam) of BWA have to be separated by a lowercase bar \n ",
+         " Example: -settingMapper [-a=is_-t=5,-N_-n=5]"
+     };
+
+     StringBuilder optionText = new StringBuilder();
+     for(String optionPart : optionParts){
+         optionText.append(optionPart);
+     }
+     System.out.println(optionText.toString());
+
+     // showing the help always ends the program
+     System.exit(0);
+
+ }
+
+}
diff --git a/src/geneFinder/SamParser.java b/src/geneFinder/SamParser.java
new file mode 100755
index 0000000..7e8eda5
--- /dev/null
+++ b/src/geneFinder/SamParser.java
@@ -0,0 +1,1248 @@
+package geneFinder;
+
+import java.io.*;
+import java.util.TreeMap;
+import java.util.Vector;
+
+import types.*;
+
+/**
+* parse the sam file derived from the read mapping,
+* Copyright (c) 2013,
+* Franziska Zickmann,
+* ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+* Distributed under the GNU Lesser General Public License, version 3.0
+*
+*/
+
+public class SamParser {
+
+ public static int posiNotContained; // removed hits whose position had no entry in positionTOmappingRnas (counted in updateMapsForRemovedElement, reset at the start of each parse)
+ public static int removedHits; // positions dropped from positionTOmappingRnas because their read list became empty (counted in updateMapsForRemovedElement, reset at the start of each parse)
+
+ /**
+ * Parses the SAM file of the read mapping and extracts only the basic information that
+ * is needed to prepare the cluster extraction; no initial scores are assigned.
+ *
+ * What the method does, in order:
+ * - if GeneFinder.minCoverage is -1 it is replaced by the result of estimateCoverage();
+ *   otherwise estimateCoverage() is only called when dispCov is set and iteration is not 2
+ * - if GeneFinder.useSequential is set, partParser(inFile2,nameRef) runs first, the iteration
+ *   is set to 2 and WriteOutput.sortReassignSamFile() is called
+ * - every accepted alignment line of inFile becomes one hit entry (Object[7]) that is appended
+ *   to contigsMappedOn of the Rna object of its read; the read is also registered in
+ *   positionTOmappingRnas of the contig. Lines of one read have to follow each other directly,
+ *   because a new Rna is started whenever the read name differs from the previous line
+ * - in iteration 1, lines that start with "@" or have fewer than 4 fields are appended to
+ *   resultsRun/reassignedReads.sam
+ * - in iteration 2, the hits of each ambiguous read are reconciled with
+ *   resultsRun/reassignedReads_sorted.sam via compareMappingsWithReaSam
+ * - finally the statistics are printed and logged and GeneFinder.ambiProportion is set
+ *   (only if GeneFinder.secondPart is false)
+ *
+ * Layout of a hit entry as filled in here:
+ * [0] Contig, [1] 0-based alignment start, [2] CIGAR string, [3] 1 - 10^(-MAPQ/10),
+ * [4] int[]{intron size, intron start} (set by performSplitExtract for CIGARs with N),
+ * [5] result of CigarParser.extractAllAlignDiffs (for additional BWA hits a Vector of
+ * int[]{position, base code}), [6] "+" or "-" taken from the XS:A tag.
+ *
+ * The JVM is terminated via System.exit(0) if the read length is unknown and the file holds
+ * no read sequence, or if an accepted alignment line has no MD:Z: tag.
+ *
+ * @param inFile SAM file that is parsed here
+ * @param inFile2 only handed on to partParser when GeneFinder.useSequential is set
+ * @param nameRef only handed on to partParser when GeneFinder.useSequential is set
+ */
+
+ public void samFileParser(File inFile, File inFile2, String nameRef){
+
+ if(GeneFinder.minCoverage == -1){
+ // no threshold given: estimate it via estimateCoverage(), which also fills GeneFinder.maxCov from the coverage file
+
+ GeneFinder.minCoverage = estimateCoverage();
+
+ if(!GeneFinder.secondPart){
+ System.out.println("Estimated minimum coverage: " + GeneFinder.minCoverage);
+ System.out.println("Estimated maximum coverage threshold: " + GeneFinder.maxCov);
+ WriteOutput.writeToLogFile("Estimated minimum coverage: " + GeneFinder.minCoverage + "\n");
+ }
+
+ }else if (GeneFinder.dispCov && (GeneFinder.iteration != 2)){
+ estimateCoverage();
+ }
+
+ if(GeneFinder.useSequential){
+ partParser(inFile2,nameRef);
+ GeneFinder.iteration = 2;
+ WriteOutput.sortReassignSamFile();
+ }
+
+ if(!GeneFinder.secondPart){
+ if(GeneFinder.iteration == 2){
+ System.out.print("Parse SAM file second time.... ");
+ WriteOutput.writeToLogFile("Parse SAM file second time.... ");
+ }else{
+ System.out.print("Parse SAM file.... ");
+ WriteOutput.writeToLogFile("Parse SAM file.... ");
+ }
+ }
+
+ long timebef = System.currentTimeMillis();
+
+ long rnaCount = 0; // reads for which an Rna object was created
+ long timeAfter = 0;
+ int multiCount = 0; // reads that were marked as ambiguous (isMulti = 1)
+ int multiTotalCount = 0; // hits that belong to ambiguous reads
+ int totalHitCount = 0; // alignment lines that passed the filter below
+
+ int reassignedCount = 0;
+ int notReaCount = 0;
+ // reset the class-wide counters that updateMapsForRemovedElement maintains
+ posiNotContained = 0;
+ removedHits = 0;
+
+ try{
+
+ BufferedReader rnaIn = new BufferedReader(new FileReader(inFile));
+
+ BufferedReader br = null;
+ String lineReaSam = "";
+ // iteration 2: open the sorted reassignment file and advance to its first alignment line
+ if(GeneFinder.iteration == 2){
+ br = new BufferedReader(new FileReader(new File(GeneFinder.pathOut+"resultsRun/reassignedReads_sorted.sam")));
+ while((lineReaSam = br.readLine()) != null){
+ String[] parts1 = lineReaSam.split(" ");
+ if(!parts1[0].startsWith("@") && parts1.length >=4){
+ break; // now lineReaSam is at first read
+ }
+ }
+ }
+
+ String line;
+
+ while((line = rnaIn.readLine()) != null){
+ // NOTE(review): the delimiter below shows up as a single blank in this listing while the inner loop splits on "\t" - presumably a literal tab character; confirm against the original file
+ String[] parts1 = line.split(" ");
+
+ if(!parts1[0].startsWith("@") && parts1.length >=4){
+ // now go to next while loop to skip previous check
+ String currentReadID = new String();
+ Rna currentRead = null;
+
+ if(GeneFinder.readLength == -1){
+ if(parts1[10].length() == 1){
+ System.out.println("No read sequence in sam file, please specify with option -rL.");
+ System.exit(0);
+ }
+ GeneFinder.readLength = parts1[10].length();
+ if(!GeneFinder.secondPart){
+ System.out.println();
+ System.out.println("Read length: " + GeneFinder.readLength);
+ }
+ }
+ // inner loop: starts with the line that was just read and consumes all remaining lines of the file
+ do{
+ String[] parts = line.split("\t");
+ // field 2 (FLAG) as binary digits; the third digit from the right is the bit with value 4
+ String flag = Integer.toBinaryString(Integer.parseInt(parts[1]));
+ String[] flagArr = flag.split("");
+
+ // test if mapped
+ // a line is skipped if the value-4 bit is set, if it is spliced (N) in prokaryote mode, or if its CIGAR contains H, P or S
+ if(!((flagArr.length >= 3) && ((flagArr[((flagArr.length)-3)]).equals("1"))) && (!(GeneFinder.isProkaryote && parts[5].contains("N"))) && !((parts[5].contains("H")) || (parts[5].contains("P")) || (parts[5].contains("S")))){ // part 5 restriction for blat alignment....
+
+ // get the contig
+
+ String contigName = parts[2];
+ Contig thisContig;
+ if(GeneFinder.mappedContigs.containsKey(contigName)){
+ // already seen this contig
+ thisContig = GeneFinder.mappedContigs.get(contigName);
+ }else{
+ thisContig = new Contig();
+ thisContig.contigName = contigName;
+ GeneFinder.mappedContigs.put(contigName,thisContig);
+ GeneFinder.contigTOname.put(thisContig,contigName);
+ }
+
+ totalHitCount++;
+
+ if(!parts[0].equals(currentReadID)){ // now we have proceeded to a new read
+ // before switching reads: in iteration 2 reconcile the finished ambiguous read with its reassigned hits
+ if(GeneFinder.iteration == 2 && currentRead != null && currentRead.isMulti == 1){
+ // NOTE(review): keys are field 4 of the reassigned line as written, while hit positions are stored as POS-1 - presumably the reassigned file already holds 0-based positions; confirm against WriteOutput
+ TreeMap<Integer,String> allReassigned = new TreeMap<Integer,String>(); // will contain all lines than deal with this read
+ if(lineReaSam != null){
+ do{
+ String[] partsReaSam = lineReaSam.split(" ");
+
+ if(currentRead.rnaID.equals(partsReaSam[0])){
+ allReassigned.put(Integer.parseInt(partsReaSam[3]),partsReaSam[2]);
+ }else{
+ break;
+ }
+
+ }while((lineReaSam = br.readLine()) != null);
+
+ if(!allReassigned.isEmpty()){
+ compareMappingsWithReaSam(currentRead,allReassigned);
+ reassignedCount++;
+ }else{
+ notReaCount++;
+ }
+ }
+
+ }
+
+ currentReadID = parts[0];
+
+ // set up new rna node
+
+ Rna newRna = new Rna();
+ newRna.rnaID = parts[0];
+ newRna.isMulti = 0;
+ newRna.hitNum = 1;
+ newRna.assignedNum = 0;
+
+ // assign quality,
+
+ double qualScoreRead = 0.0;
+ double maxQual = Double.MIN_VALUE;
+ double minQual = Double.MAX_VALUE;
+
+ if(parts[10].length() > 1){
+ // per base: 1 - 10^(-c/10) where c is the raw character code of the quality symbol (no offset is subtracted here)
+ for(int i = 0; i< parts[10].length();i++){
+ int posQualInt = parts[10].charAt(i);
+ double baseQual = 1.0 - (Math.pow(10.0, (-((double) posQualInt/10.0))));
+ if(baseQual < minQual){
+ minQual = baseQual;
+ }
+ if(baseQual > maxQual){
+ maxQual = baseQual;
+ }
+ qualScoreRead += baseQual;
+
+ }
+ // read quality: mean base value divided by (1 + spread between best and worst base)
+ newRna.quality = ((qualScoreRead/((double) parts[10].length())))/(maxQual-minQual+1);
+
+ }else{
+ newRna.quality = 1.0;
+ }
+
+ int thisAlignmentPosition = (Integer.parseInt(parts[3])) - 1; // TopHat and BWA refer to the 1-based start, so subtract -1 (because we refer to 0-based start)
+ // field 5 (MAPQ) converted to 1 - 10^(-MAPQ/10)
+ double thisMappingQuality = (1-(Math.pow(10.0,(-(Integer.parseInt(parts[4]))/10.0))));
+ // register the read at this start position of the contig
+ if(thisContig.positionTOmappingRnas.containsKey(thisAlignmentPosition)){
+ if(!thisContig.positionTOmappingRnas.get(thisAlignmentPosition).contains(newRna)){
+ thisContig.positionTOmappingRnas.get(thisAlignmentPosition).add(newRna);
+ }
+
+ }else{
+ Vector<Rna> itsRnas = new Vector<Rna>();
+ itsRnas.add(newRna);
+ thisContig.positionTOmappingRnas.put(thisAlignmentPosition,itsRnas);
+ }
+ // hit entry: slots 0-2 are contig, 0-based start and CIGAR string
+ Object[] mappedContig_and_Info = new Object[7];
+ mappedContig_and_Info[0] = thisContig;
+ mappedContig_and_Info[1] = thisAlignmentPosition;
+ mappedContig_and_Info[2] = parts[5];
+
+ // this is for mismatch info
+ // the MD tag value is the text after the last "MD:Z:" up to the next tab
+ String mdTag = "";
+ if(line.contains("MD:Z:")){
+ String[] splitMD = line.split("MD:Z:");
+ String[] splitMD2 = splitMD[splitMD.length-1].split("\t");
+ mdTag = splitMD2[0];
+ }else{
+ System.err.println("No MD tag!");
+ System.exit(0);
+ }
+ mappedContig_and_Info[5] = CigarParser.extractAllAlignDiffs(parts[5],mdTag,parts[9]);
+
+ /*if(parts[5].contains("X")){
+ String[] cigarSplitX = parts[5].split("X");
+ int posTmp = 0;
+ Vector<int[]> missMatchInfo = new Vector<int[]>(); // entries are of form: position (within contig),baseCode (A=0;C=1;G=2;T=3)
+ for(String tmp : cigarSplitX){
+ // the more parts, the more mismatches; note that the last part have to be checked if it ends with another letter!
+ if(!(tmp.endsWith("M") || tmp.endsWith("N"))){
+ String[] tmpSplit = tmp.split("[MN]");
+ for(String tmp2 : tmpSplit){
+ posTmp += Integer.parseInt(tmp2);
+ }
+ // now we have the position of a mismatch
+ int[] mmInfo = new int[2];
+ mmInfo[0] = thisAlignmentPosition + posTmp;
+ // now get base at this position
+ String base = parts[9].substring(posTmp,posTmp+1);
+ mmInfo[1] = returnBaseCode(base);
+ missMatchInfo.add(mmInfo);
+ }
+ }
+
+ mappedContig_and_Info[5] = missMatchInfo;
+ }*/
+
+ mappedContig_and_Info[3] = thisMappingQuality;
+ // spliced alignment: performSplitExtract stores intron size/start in slot 4 and counts the splice site on the contig
+ if((parts[5].contains("N"))){
+ mappedContig_and_Info = performSplitExtract(mappedContig_and_Info, thisAlignmentPosition,thisContig,parts[5]);
+ }
+ // strand from the XS:A tag; slot 6 stays null if the line carries neither variant
+ if(line.contains("XS:A:+")){
+ mappedContig_and_Info[6] = "+";
+ }else if(line.contains("XS:A:-")){
+ mappedContig_and_Info[6] = "-";
+ }
+
+ if(parts[5].contains("I") || parts[5].contains("D")){
+ // intron size (0 for unspliced hits) is handed on so that countInDels can shift its positions
+ int splitSize = 0;
+ if(mappedContig_and_Info[4] != null){
+ splitSize = ((int[]) mappedContig_and_Info[4])[0];
+ }
+
+ countInDels(parts[5],thisAlignmentPosition,thisContig,splitSize,false);
+ }
+
+ newRna.contigsMappedOn.add(mappedContig_and_Info);
+
+ // note: new try to avoid out of mem: rnaList.add(newRna);
+ rnaCount++;
+ currentRead = newRna;
+ // NOTE(review): the BWA branch reads the optional tags from the fixed columns 13, 14 and 19 - presumably this matches the tag order of the BWA version used; confirm
+ if(!GeneFinder.useTopHat){
+ // multiple hits in BWA file are indicated with XO:A:R and named in tag XA
+ String[] splitPart = parts[13].split(":");
+ String[] splitPartX1 = parts[14].split(":");
+ if((Integer.parseInt(splitPart[2]) > 1) && ((Integer.parseInt(splitPart[2]) + Integer.parseInt(splitPartX1[2])) <= GeneFinder.maxReportedHitsBWA)){
+ // parse through all other hits
+ String[] multiHits = parts[19].split(";");
+ for(String hit : multiHits){
+ multiTotalCount++;
+ if(currentRead.isMulti == 0){
+ multiCount++;
+ currentRead.isMulti = 1;
+ }
+ multiTotalCount++; // NOTE(review): multiTotalCount was already incremented at the top of this loop body, so every additional hit counts twice - confirm whether intended
+ String[] hitInfo = hit.split(",");
+
+ // first get the contig
+ Contig secondContig; // NOTE(review): substring(5) is applied to every entry; presumably it strips a 5-character tag prefix such as "XA:Z:", which only the first entry of the column can carry - confirm
+ if(GeneFinder.mappedContigs.containsKey(hitInfo[0].substring(5))){
+ // already seen this contig
+ secondContig = GeneFinder.mappedContigs.get(hitInfo[0].substring(5));
+ }else{
+ secondContig = new Contig();
+ secondContig.contigName = hitInfo[0].substring(5);
+ GeneFinder.mappedContigs.put(hitInfo[0].substring(5),secondContig);
+ }
+
+ // now mapping position
+ // the first character of the position field is dropped (presumably the strand sign), then the value is shifted to 0-based
+ int mapPos = Integer.parseInt(hitInfo[1].substring(1)) - 1;
+
+ if(secondContig.positionTOmappingRnas.containsKey(mapPos)){
+ if(!secondContig.positionTOmappingRnas.get(mapPos).contains(currentRead)){
+ secondContig.positionTOmappingRnas.get(mapPos).add(currentRead);
+ }
+ }else{
+ Vector<Rna> itsRnas = new Vector<Rna>();
+ itsRnas.add(currentRead);
+ secondContig.positionTOmappingRnas.put(mapPos,itsRnas);
+ }
+
+ Object[] mappedContig_and_Info_second = new Object[7];
+ mappedContig_and_Info_second[0] = secondContig;
+ mappedContig_and_Info_second[1] = mapPos;
+ mappedContig_and_Info_second[2] = hitInfo[2];
+ mappedContig_and_Info_second[3] = thisMappingQuality; // the additional hit reuses the mapping quality of the first hit
+
+ // this is for mismatch info
+ // NOTE(review): the in-del counting further down sits inside this X-check, so it only runs for CIGARs that also contain X; slot 4 is never set for these hits, so splitSize is always 0 there
+ if(hitInfo[2].contains("X")){
+
+ String[] cigarSplitX = hitInfo[2].split("X");
+ int posTmp = 0;
+ Vector<int[]> missMatchInfo = new Vector<int[]>(); // entries are of form: position (within contig),baseCode (A=0;C=1;G=2;T=3)
+ for(String tmp : cigarSplitX){
+ // the more parts, the more mismatches; note that the last part have to be checked if it ends with another letter!
+ if(!(tmp.endsWith("M") || tmp.endsWith("N"))){
+ String[] tmpSplit = tmp.split("[MN]");
+ for(String tmp2 : tmpSplit){
+ posTmp += Integer.parseInt(tmp2);
+ }
+ // now we have the position of a mismatch
+ int[] mmInfo = new int[2];
+ mmInfo[0] = mapPos + posTmp;
+ // now get base at this position
+ String base = parts[9].substring(posTmp,posTmp+1);
+ mmInfo[1] = returnBaseCode(base);
+ missMatchInfo.add(mmInfo);
+ }
+ }
+
+ mappedContig_and_Info_second[5] = missMatchInfo;
+
+ if(hitInfo[2].contains("I") || hitInfo[2].contains("D")){
+
+ int splitSize = 0;
+ if(mappedContig_and_Info_second[4] != null){
+ splitSize = ((int[]) mappedContig_and_Info_second[4])[0];
+ }
+
+ countInDels(hitInfo[2],mapPos,secondContig,splitSize,false);
+ }
+ }
+
+ currentRead.contigsMappedOn.add(mappedContig_and_Info_second);
+ currentRead.hitNum++;
+ }
+ }
+ }
+
+ }else{
+ // still same read, but with different mapping
+ if(currentRead.isMulti == 0){
+ multiCount++;
+ multiTotalCount++; // to count also the hit before this one
+ currentRead.isMulti = 1;
+ }
+
+ multiTotalCount++;
+ int thisAlignmentPosition = (Integer.parseInt(parts[3])) - 1;
+ double thisMappingQuality = (1-(Math.pow(10.0,(-(Integer.parseInt(parts[4]))/10.0))));
+ // from here on the hit is recorded exactly like the first hit of a read, but attached to the existing Rna
+ if(thisContig.positionTOmappingRnas.containsKey(thisAlignmentPosition)){
+ if(!thisContig.positionTOmappingRnas.get(thisAlignmentPosition).contains(currentRead)){
+ thisContig.positionTOmappingRnas.get(thisAlignmentPosition).add(currentRead);
+ }
+ }else{
+ Vector<Rna> itsRnas = new Vector<Rna>();
+ itsRnas.add(currentRead);
+ thisContig.positionTOmappingRnas.put(thisAlignmentPosition,itsRnas);
+ }
+
+ Object[] mappedContig_and_Info = new Object[7];
+ mappedContig_and_Info[0] = thisContig;
+ mappedContig_and_Info[1] = thisAlignmentPosition;
+ mappedContig_and_Info[2] = parts[5];
+ mappedContig_and_Info[3] = thisMappingQuality;
+
+ // this is for mismatch info
+
+ String mdTag = "";
+ if(line.contains("MD:Z:")){
+ String[] splitMD = line.split("MD:Z:");
+ String[] splitMD2 = splitMD[splitMD.length-1].split("\t");
+ mdTag = splitMD2[0];
+ }else{
+ System.err.println("No MD tag!");
+ System.exit(0);
+ }
+ mappedContig_and_Info[5] = CigarParser.extractAllAlignDiffs(parts[5],mdTag,parts[9]);
+
+ /*if(parts[5].contains("X")){
+ String[] cigarSplitX = parts[5].split("X");
+ int posTmp = 0;
+ Vector<int[]> missMatchInfo = new Vector<int[]>(); // entries are of form: position (within contig),baseCode (A=0;C=1;G=2;T=3)
+ for(String tmp : cigarSplitX){
+ // the more parts, the more mismatches; note that the last part have to be checked if it ends with another letter!
+ if(!(tmp.endsWith("M") || tmp.endsWith("N"))){
+ String[] tmpSplit = tmp.split("[MN]");
+ for(String tmp2 : tmpSplit){
+ posTmp += Integer.parseInt(tmp2);
+ }
+ // now we have the position of a mismatch
+ int[] mmInfo = new int[2];
+ mmInfo[0] = thisAlignmentPosition + posTmp;
+ // now get base at this position
+ String base = parts[9].substring(posTmp,posTmp+1);
+ mmInfo[1] = returnBaseCode(base);
+ missMatchInfo.add(mmInfo);
+ }
+ }
+
+ mappedContig_and_Info[5] = missMatchInfo;
+ }*/
+
+ if(parts[5].contains("N")){
+
+ mappedContig_and_Info = performSplitExtract(mappedContig_and_Info, thisAlignmentPosition,thisContig,parts[5]);
+
+ }
+
+ if(line.contains("XS:A:+")){
+ mappedContig_and_Info[6] = "+";
+ }else if(line.contains("XS:A:-")){
+ mappedContig_and_Info[6] = "-";
+ }
+
+ if(parts[5].contains("I") || parts[5].contains("D")){
+
+ int splitSize = 0;
+ if(mappedContig_and_Info[4] != null){
+ splitSize = ((int[]) mappedContig_and_Info[4])[0];
+ }
+
+ countInDels(parts[5],thisAlignmentPosition,thisContig,splitSize,false);
+ }
+
+ currentRead.contigsMappedOn.add(mappedContig_and_Info);
+ currentRead.hitNum++;
+ }
+
+ }
+
+ }while((line = rnaIn.readLine()) != null);
+ // end of file: reconcile the last read as well (iteration 2 only); unlike above, smaller read names are skipped and reassignedCount/notReaCount are not updated
+ if(GeneFinder.iteration == 2 && currentRead != null && currentRead.isMulti == 1){
+
+ TreeMap<Integer,String> allReassigned = new TreeMap<Integer,String>(); // will contain all lines than deal with this read
+ if(lineReaSam != null){
+ do{
+ String[] partsReaSam = lineReaSam.split(" ");
+
+ if(partsReaSam[0].compareTo(currentRead.rnaID) > 0){
+ break; // we exceeded this read, so stop
+ }else if(currentRead.rnaID.equals(partsReaSam[0])){
+ allReassigned.put(Integer.parseInt(partsReaSam[3]),partsReaSam[2]);
+ }
+ }while((lineReaSam = br.readLine()) != null);
+
+ if(!allReassigned.isEmpty()){
+ compareMappingsWithReaSam(currentRead,allReassigned);
+ }
+ }
+
+ }
+ }else{ // line starts with "@" or has fewer than 4 fields
+ if(GeneFinder.iteration == 1){
+ WriteOutput.writeToOtherFile(GeneFinder.pathOut+"resultsRun/reassignedReads.sam",line+"\n");
+ }
+ }
+ }
+
+ rnaIn.close();
+ if(GeneFinder.iteration == 2){
+ br.close();
+ }
+
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ // no interval given by the user: use read length + 1
+ if(GeneFinder.interval == -1){
+ GeneFinder.interval = GeneFinder.readLength + 1;
+ if(!GeneFinder.secondPart){
+ System.out.println("Intron minimum: " + GeneFinder.interval);
+ }
+
+ }
+ timeAfter = System.currentTimeMillis();
+ // statistics; note that ambiProportion is only set inside this block, i.e. when secondPart is false
+ if(!GeneFinder.secondPart){
+ if(GeneFinder.iteration == 2){
+ System.out.println("reassigned: " + reassignedCount);
+ System.out.println("not reassigned: " + notReaCount);
+ }
+ System.out.println("Done.");
+ System.out.println((rnaCount) + " rnas have been mapped to the reference.");
+ System.out.println((multiCount) + " of them are ambiguous reads.");
+ System.out.println("The total number of hits is " + (totalHitCount) + ".");
+ System.out.println("Total number of multiple hits: " + multiTotalCount + ".");
+ System.out.println("Time needed to parse SAM file: "+ (double) (timeAfter-timebef)/1000.0 +"s.");
+
+ GeneFinder.ambiProportion = (double) ((double)multiTotalCount/(double)totalHitCount);
+ System.out.println("Proportion of ambiguous reads: " + GeneFinder.ambiProportion);
+
+ System.out.println();
+ WriteOutput.writeToLogFile("Done.\nTime needed to parse SAM file: "+ (double) (timeAfter-timebef)/1000.0 +"s.\n" + rnaCount + " rnas have been mapped to the reference. \n" + (multiCount) + " of them are ambiguous reads. \nThe total number of hits is " + (totalHitCount) + ".\nTotal number of multiple hits: " + multiTotalCount + ".\nProportion of ambiguous reads: " + GeneFinder.ambiProportion + ".\n\n");
+
+ }
+
+ }
+
+ /**
+ * Locates the first intron (CIGAR operation N) of a spliced alignment, stores its size and
+ * start position in the hit entry and counts the splice site on the contig.
+ *
+ * The CIGAR string is walked once from left to right. The operations M, D, X and = advance
+ * the position on the contig; insertions (I) and all other operations do not. The former
+ * implementation added every length first and afterwards subtracted only the first insertion,
+ * so alignments with more than one insertion in front of the intron got a shifted splice
+ * position; it also failed with a NumberFormatException on CIGARs that use "=".
+ *
+ * @param mappedContig_and_Info hit entry; slot 4 receives int[]{intron size, intron start}
+ * @param thisAlignmentPosition start position of the alignment on the contig
+ * @param thisContig contig whose splicePositions counter is incremented for the intron start
+ * @param cigar CIGAR string of the alignment
+ * @return the passed hit entry (untouched if the CIGAR holds no N operation)
+ */
+
+ public static Object[] performSplitExtract(Object[] mappedContig_and_Info, int thisAlignmentPosition, Contig thisContig, String cigar){
+
+     int currentPos = thisAlignmentPosition;
+     int opLength = 0; // length of the operation that is currently being read
+
+     for(int i = 0; i < cigar.length(); i++){
+         char symbol = cigar.charAt(i);
+
+         if(symbol >= '0' && symbol <= '9'){
+             opLength = (opLength * 10) + (symbol - '0');
+             continue;
+         }
+
+         if(symbol == 'N'){
+             int[] spliceInfo = new int[2]; // first entry: intron size, second: start position of intron
+             spliceInfo[0] = opLength;
+             spliceInfo[1] = currentPos;
+             mappedContig_and_Info[4] = spliceInfo;
+
+             if(thisContig.splicePositions.containsKey(currentPos)){
+                 thisContig.splicePositions.put(currentPos,thisContig.splicePositions.get(currentPos)+1);
+             }else{
+                 thisContig.splicePositions.put(currentPos, 1);
+             }
+
+             break; // only the first intron of an alignment is recorded
+         }
+
+         if(symbol == 'M' || symbol == 'D' || symbol == 'X' || symbol == '='){
+             // these operations cover bases of the contig, so the intron starts further right
+             currentPos += opLength;
+         }
+
+         opLength = 0;
+     }
+
+     return mappedContig_and_Info;
+ }
+
+ /*
+ * Maps a base symbol to its integer code: A=0, C=1, G=2, T=3, N=4, "gap"=5.
+ * The comparison is case sensitive; every other string is mapped to 6.
+ */
+
+ public static int returnBaseCode(String base){
+
+     // the index of a symbol in this table is its code
+     final String[] knownSymbols = {"A","C","G","T","N","gap"};
+
+     for(int code = 0; code < knownSymbols.length; code++){
+         if(base.equals(knownSymbols[code])){
+             return code;
+         }
+     }
+
+     return 6; // unknown symbol
+ }
+
+ /*
+ * Sums up the deleted (D) and inserted (I) bases of a CIGAR string and adjusts the
+ * positionTOdiff counters of the contig accordingly. All touched positions are relative to
+ * thisAlignmentPosition + (readLength-1) + splitSize: deletions touch the positions to the
+ * right of it, insertions that position and the ones to its left.
+ *
+ * Regular call (calledFromSequential == false): each deleted base adds 1 and each inserted
+ * base subtracts 1, missing entries are created.
+ * Sequential call (calledFromSequential == true): the opposite adjustment is applied to
+ * already existing entries only, and an entry that reaches 0 is removed.
+ */
+
+ public static void countInDels(String cigar, int thisAlignmentPosition,Contig thisContig, int splitSize, boolean calledFromSequential){
+
+     int deletedBases = sumCigarLengths(cigar,"D");
+     int insertedBases = sumCigarLengths(cigar,"I");
+
+     int anchor = thisAlignmentPosition + (GeneFinder.readLength-1) + splitSize;
+
+     // deletions: anchor+1 ... anchor+deletedBases
+     for(int offset = 1; offset <= deletedBases; ++offset){
+         if(calledFromSequential){
+             undoDiffAt(thisContig, anchor + offset, -1);
+         }else{
+             addDiffAt(thisContig, anchor + offset, 1);
+         }
+     }
+
+     // insertions: anchor ... anchor-(insertedBases-1)
+     for(int offset = 0; offset < insertedBases; ++offset){
+         if(calledFromSequential){
+             undoDiffAt(thisContig, anchor - offset, 1);
+         }else{
+             addDiffAt(thisContig, anchor - offset, -1);
+         }
+     }
+
+ }
+
+ /*
+ * adds up the numbers that directly precede the given operation letter; a chunk after the
+ * last occurrence of the letter is never looked at
+ */
+
+ private static int sumCigarLengths(String cigar, String operation){
+
+     if(!cigar.contains(operation)){
+         return 0;
+     }
+
+     String[] chunks = cigar.split(operation);
+     int total = 0;
+
+     for(int idx = 0; idx < chunks.length - 1; ++idx){
+         String chunk = chunks[idx];
+         if(chunk.matches(".*[a-zA-Z].*")){
+             // other operations precede the number, the wanted length is the last number of the chunk
+             String[] numbers = chunk.split("[a-zA-Z]");
+             total += Integer.parseInt(numbers[numbers.length-1]);
+         }else{
+             total += Integer.parseInt(chunk);
+         }
+     }
+
+     return total;
+ }
+
+ /*
+ * changes the counter of a position by delta, a missing counter starts with delta
+ */
+
+ private static void addDiffAt(Contig contig, int position, int delta){
+
+     if(contig.positionTOdiff.containsKey(position)){
+         int changed = contig.positionTOdiff.get(position) + delta;
+         contig.positionTOdiff.put(position,changed);
+     }else{
+         contig.positionTOdiff.put(position,delta);
+     }
+ }
+
+ /*
+ * changes an existing counter by delta and drops it when it reaches 0; positions without
+ * counter are left alone
+ */
+
+ private static void undoDiffAt(Contig contig, int position, int delta){
+
+     if(contig.positionTOdiff.containsKey(position)){
+         int changed = contig.positionTOdiff.get(position) + delta;
+         contig.positionTOdiff.put(position,changed);
+         if(contig.positionTOdiff.get(position) == 0){
+             contig.positionTOdiff.remove(position);
+         }
+     }
+ }
+
+ /*
+ * Keeps only those hits of a read that are confirmed by the reassigned hits: a hit survives
+ * if allReassigned holds its position as key and the stored name equals the name of the
+ * contig of the hit. Every other hit is removed from the read, after the contig maps were
+ * updated for it. A read that ends up with exactly one hit is no longer marked as ambiguous.
+ */
+
+ public static void compareMappingsWithReaSam(Rna read,TreeMap<Integer,String> allReassigned){
+
+     // go from the back to the front, so removing an element never shifts an index that is still to be visited
+     int idx = read.contigsMappedOn.size();
+     while(--idx >= 0){
+         Object[] candidate = read.contigsMappedOn.get(idx);
+         int position = ((Integer) candidate[1]).intValue();
+
+         boolean confirmed = false;
+         if(allReassigned.containsKey(position)){
+             String reassignedContig = allReassigned.get(position);
+             confirmed = reassignedContig.equals(GeneFinder.contigTOname.get(((Contig) candidate[0])));
+         }
+
+         if(!confirmed){
+             updateMapsForRemovedElement(candidate,read);
+             read.contigsMappedOn.removeElementAt(idx);
+         }
+     }
+
+     if(read.contigsMappedOn.size() == 1){
+         read.isMulti = 0;
+     }
+ }
+
+ /**
+ * Removes the bookkeeping of a hit that is discarded from a read.
+ *
+ * The read is taken out of the list stored for the hit position in positionTOmappingRnas of
+ * the contig; a list that becomes empty is removed and counted in removedHits. If the position
+ * has no entry at all, posiNotContained is incremented instead. For spliced hits (slot 4 set)
+ * the counter of the intron start in splicePositions is decremented and dropped at 0.
+ *
+ * The zero check of the splice counter now only runs when the counter exists; before, a
+ * missing counter caused a NullPointerException while unboxing the result of get().
+ *
+ * @param hit hit entry: [0] Contig, [1] alignment position, [4] int[]{intron size, intron start} or null
+ * @param read read the hit belongs to
+ */
+
+ public static void updateMapsForRemovedElement(Object[] hit, Rna read){
+
+     Contig thisContig = (Contig) hit[0];
+     Integer position = (Integer) hit[1];
+
+     if(thisContig.positionTOmappingRnas.containsKey(position)){
+         // removeElement simply does nothing if the read is not in the list
+         thisContig.positionTOmappingRnas.get(position).removeElement(read);
+         if(thisContig.positionTOmappingRnas.get(position).size() == 0){
+             thisContig.positionTOmappingRnas.remove(position);
+             removedHits++;
+         }
+     }else{
+         //System.out.println("posi not contained for " + read.rnaID);
+         posiNotContained++;
+     }
+
+     if(hit[4] != null){
+         int[] spliceInfo = (int[]) hit[4];
+         if(thisContig.splicePositions.containsKey(spliceInfo[1])){
+             int remaining = thisContig.splicePositions.get(spliceInfo[1]) - 1;
+             if(remaining == 0){
+                 thisContig.splicePositions.remove(spliceInfo[1]);
+             }else{
+                 thisContig.splicePositions.put(spliceInfo[1],remaining);
+             }
+         }
+     }
+
+ }
+
+ /**
+ * Provides the coverage estimate that is stored in resultsRun/covMean.txt.
+ *
+ * If the file can be read, its first line is returned as mean coverage and its second line is
+ * stored in GeneFinder.maxCov. If reading fails with an IOException (e.g. the file does not
+ * exist yet), the helper script getMeanCov.py is run on the SAM file to create it and the file
+ * is read again. The SAM file is GeneFinder.haveSam if set, otherwise the default output of
+ * the chosen mapper in resultsRun.
+ *
+ * Compared to the former version the reader is now always closed, also when a line cannot be
+ * parsed as a number, and the duplicated read logic lives in one helper.
+ *
+ * @return the mean coverage; 0.0 if the file holds no line or could not be read after the script ran
+ */
+
+ public static Double estimateCoverage(){
+
+     if(!GeneFinder.secondPart){
+         System.out.println("Estimate coverage.... ");
+         WriteOutput.writeToLogFile("Estimate coverage.... \n");
+     }
+
+     long timebef= System.currentTimeMillis();
+
+     // a sam file provided by the user wins, otherwise take the default output of the mapper
+     String nameSam;
+     if(GeneFinder.haveSam != null){
+         nameSam = GeneFinder.haveSam;
+     }else if(GeneFinder.useTopHat){
+         nameSam = GeneFinder.pathOut+"resultsRun/accepted_hits.sam";
+     }else{
+         nameSam = GeneFinder.pathOut+"resultsRun/aln_BWA.sam";
+     }
+
+     String nameOutfile = GeneFinder.pathOut+"resultsRun/covMean.txt";
+
+     double meanCov = 0.0;
+
+     try {
+
+         // if covMean.txt already in results directory, we do not need to estimate the coverage again
+         meanCov = readCoverageFile(nameOutfile);
+
+     } catch (IOException e) {
+
+         // no coverage file provided, so create a new one
+
+         String firstExe = "python " + GeneFinder.pathToHelpFiles+"getMeanCov.py " + nameSam + " " + nameOutfile;
+         Giira.callAndHandleOutput(firstExe);
+
+         long timeAfter = System.currentTimeMillis();
+
+         if(!GeneFinder.secondPart){
+             System.out.println("Done. Time needed: "+ (double) (timeAfter-timebef)/1000.0 +"s.");
+             WriteOutput.writeToLogFile("Done. Time needed: "+ (double) (timeAfter-timebef)/1000.0 +"s. \n");
+         }
+
+         try {
+             meanCov = readCoverageFile(nameOutfile);
+         }catch (IOException e2) {
+             System.err.println("File not found!");
+         }
+
+     }
+
+     return (meanCov);
+ }
+
+ /*
+ * reads the coverage file: first line = mean coverage (returned, 0.0 if the file is empty),
+ * second line = maximum coverage (stored in GeneFinder.maxCov); the reader is closed in any case
+ */
+
+ private static double readCoverageFile(String fileName) throws IOException{
+
+     double meanCov = 0.0;
+
+     BufferedReader br = new BufferedReader(new FileReader(fileName));
+     try{
+         String line;
+         if((line = br.readLine()) != null){
+             meanCov = Double.parseDouble(line);
+         }
+         if((line = br.readLine()) != null){
+             GeneFinder.maxCov = Double.parseDouble(line);
+         }
+     }finally{
+         br.close();
+     }
+
+     return meanCov;
+ }
+
+
+ /*
+ * parses a sam file sorted according to the chromosomes
+ * at the moment no BWA support!
+ *
+ * inFile: the sam file to read
+ * nameRef: handed on unchanged to GeneFinder.geneFinder_managing
+ *
+ * What the method does, as far as visible here:
+ * - lines whose first field starts with "@" or that have fewer than 4 fields are appended to
+ * resultsRun/reassignedReads.sam (only if GeneFinder.iteration == 1)
+ * - at the first alignment line GeneFinder.readLength is derived from the length of field 11
+ * if it is still -1
+ * - the inner do-while then handles the remaining lines: accepted hits are stored per contig in
+ * GeneFinder.mappedContigs; every time a contig name not yet in that map shows up, statistics
+ * for the previous contig are printed and logged, GeneFinder.geneFinder_managing is called
+ * for it, both contig maps are cleared and the per-contig counters are reset
+ * - reads carrying a "CC:Z:" tag are tracked in seenReads; if the tag is not "CC:Z:=" the read
+ * is counted as interchromosomal, its earlier hits are removed again and the inner loop is left
+ * - after the last line the remaining contig is processed the same way, and GeneFinder.interval
+ * is set to readLength + 1 if it is still -1
+ *
+ * The JVM is terminated via System.exit(0) if field 11 has length 1 or a hit has no MD tag.
+ * IOExceptions are only printed; the reader is not closed in that case.
+ * reassignedCount and notReaCount are never changed in this method.
+ */
+
+ public void partParser(File inFile,String nameRef){
+
+ long timebef = System.currentTimeMillis();
+
+ long rnaCount = 0;
+ long timeAfter = 0;
+ int multiCount = 0;
+ int multiTotalCount = 0;
+ int totalHitCount = 0;
+
+ int reassignedCount = 0;
+ int notReaCount = 0;
+
+ int interChromoCount = 0;
+ int interChromoTotalCount = 0;
+
+ posiNotContained = 0;
+ removedHits = 0;
+
+ TreeMap<String,Vector<Object>> seenReads = new TreeMap<String,Vector<Object>>(); // first entry in Vec: ok (1) or not ok (0), second: read
+
+ try{
+
+ BufferedReader rnaIn = new BufferedReader(new FileReader(inFile));
+
+ String currentName = "";
+
+ String line;
+
+ while((line = rnaIn.readLine()) != null){
+
+ String[] parts1 = line.split(" "); // NOTE(review): splits on " " here but on "\t" inside the do-while - confirm the separator
+
+ if(!parts1[0].startsWith("@") && parts1.length >=4){
+ // now go to next while loop to skip previous check
+ //String currentReadID = new String();
+ //Rna currentRead = null;
+
+ if(GeneFinder.readLength == -1){
+ if(parts1[10].length() == 1){ // index 10 is read although only length >= 4 was checked above
+ System.out.println("No read sequence in sam file, please specify with option -rL.");
+ System.exit(0); // NOTE(review): terminates the JVM with status 0 on this error path
+ }
+ GeneFinder.readLength = parts1[10].length();
+ if(!GeneFinder.secondPart){
+ System.out.println();
+ System.out.println("Read length: " + GeneFinder.readLength);
+ }
+ }
+
+ do{ // handles the current and all following lines; only left early through the break statements below
+ String[] parts = line.split("\t");
+
+ String flag = Integer.toBinaryString(Integer.parseInt(parts[1]));
+ String[] flagArr = flag.split("");
+
+ // test if mapped: the hit is skipped if the third binary digit from the right of the flag is 1,
+ // if the cigar string (parts[5]) contains H, P or S, or if it contains N while GeneFinder.isProkaryote is set
+
+ if(!((flagArr.length >= 3) && ((flagArr[((flagArr.length)-3)]).equals("1"))) && (!(GeneFinder.isProkaryote && parts[5].contains("N"))) && !((parts[5].contains("H")) || (parts[5].contains("P")) || (parts[5].contains("S")))){ // part 5 restriction for blat alignment....
+
+ // get the contig
+
+ String contigName = parts[2];
+ Contig thisContig;
+ if(GeneFinder.mappedContigs.containsKey(contigName)){
+ // already seen this contig
+ thisContig = GeneFinder.mappedContigs.get(contigName);
+ }else{
+ if(!GeneFinder.mappedContigs.keySet().isEmpty()){
+ // start new round here!! (report the finished contig and run the gene finder on it before switching)
+
+ timeAfter = System.currentTimeMillis();
+
+ System.out.println((rnaCount) + " rnas have been mapped to the reference.");
+ System.out.println((multiCount) + " of them are ambiguous reads.");
+ System.out.println("The total number of hits is " + (totalHitCount) + ".");
+ System.out.println("Total number of multiple hits: " + multiTotalCount + ".");
+ System.out.println("Number of interchromosomal reads (so far): " + interChromoCount + ".");
+ System.out.println("Number of interchromosomal hits (so far): " + interChromoTotalCount + ".");
+ System.out.println("Time for parsing chromosome: "+ (double) (timeAfter-timebef)/1000.0 +"s.");
+
+ GeneFinder.ambiProportion = (double) ((double)multiTotalCount/(double)totalHitCount);
+ System.out.println("Proportion of ambiguous reads: " + GeneFinder.ambiProportion);
+
+ System.out.println();
+ WriteOutput.writeToLogFile(rnaCount + " rnas have been mapped to the reference. \n" + (multiCount) + " of them are ambiguous reads. \nThe total number of hits is " + (totalHitCount) + ".\nTotal number of multiple hits: " + multiTotalCount + ".\nProportion of ambiguous reads: " + GeneFinder.ambiProportion + ".\n\n");
+
+ String nameOut = "_" + currentName;
+ GeneFinder.geneFinder_managing(nameRef,nameOut);
+ }
+ currentName = contigName;
+ timebef = System.currentTimeMillis();
+
+ thisContig = new Contig();
+ thisContig.contigName = contigName;
+ GeneFinder.mappedContigs.clear(); // only the current contig is kept in the maps
+ GeneFinder.contigTOname.clear();
+ GeneFinder.mappedContigs.put(contigName,thisContig);
+ GeneFinder.contigTOname.put(thisContig,contigName);
+ rnaCount = 0; // the interChromo counters are not reset here, hence "(so far)" in the output
+ multiCount = 0;
+ totalHitCount = 0;
+ multiTotalCount = 0;
+ System.out.println();
+ System.out.println("Starting chromosome " + contigName + ".");
+ }
+
+ totalHitCount++;
+
+ Rna read;
+
+ if(seenReads.keySet().contains(parts[0])){
+
+ Vector<Object> temp = seenReads.get(parts[0]);
+
+ if(((Integer)temp.get(0)) != 0){
+
+ //multiCount++;
+ multiTotalCount++;
+
+ // read ok up to now, check tag XX
+
+ if(line.contains("CC:Z:")){
+ if(!line.contains("CC:Z:=")){
+
+ // search for all former hits and delete read
+
+ searchAndDeleteFromContig((Rna)temp.get(1),thisContig);
+
+ temp.clear();
+ temp.add(0); // mark the read as "not ok"; the Rna object is no longer stored
+ seenReads.put(parts[0],temp);
+
+ if(totalHitCount % 100000 == 0){
+
+ Runtime r=Runtime.getRuntime();
+ r.gc();
+ r.gc();
+
+ }
+
+ interChromoTotalCount++;
+ break; // leaves the do-while; the outer while then reads the next line
+ }
+ }
+ }else{
+ interChromoTotalCount++;
+ break; // read was marked "not ok" before, so this hit is dropped as well
+ }
+ read = (Rna)temp.get(1);
+ read.isMulti = 1;
+ read.hitNum++;
+ //read.assignedNum++;
+
+ }else{
+ rnaCount++;
+ if(line.contains("CC:Z:")){
+ multiCount++;
+ if(!line.contains("CC:Z:=")){
+ interChromoCount++;
+ interChromoTotalCount++;
+ Vector<Object> temp = new Vector<Object>();
+ temp.add(0);
+ seenReads.put(parts[0],temp);
+ interChromoTotalCount++; // NOTE(review): second increment for the same hit (see four lines above) - confirm whether intended
+ break; // leaves the do-while; the outer while then reads the next line
+ }
+
+ }
+ // create new read
+ read = new Rna();
+ read.rnaID = parts[0];
+ read.isMulti = 0;
+ read.hitNum = 1;
+ read.assignedNum = 0;
+
+ // assign quality: mean of the per-character values 1 - 10^(-c/10), divided by (max - min + 1) of these values
+
+ double qualScoreRead = 0.0;
+ double maxQual = Double.MIN_VALUE;
+ double minQual = Double.MAX_VALUE;
+
+ if(parts[10].length() > 1){
+
+ for(int i = 0; i< parts[10].length();i++){
+ int posQualInt = parts[10].charAt(i); // raw character code, no offset is subtracted - NOTE(review): confirm this is intended
+ double baseQual = 1.0 - (Math.pow(10.0, (-((double) posQualInt/10.0))));
+ if(baseQual < minQual){
+ minQual = baseQual;
+ }
+ if(baseQual > maxQual){
+ maxQual = baseQual;
+ }
+ qualScoreRead += baseQual;
+
+ }
+
+ read.quality = ((qualScoreRead/((double) parts[10].length())))/(maxQual-minQual+1);
+
+ }else{
+ read.quality = 1.0;
+ }
+
+ if(line.contains("CC:Z:")){ // only reads carrying a CC:Z: tag are remembered in seenReads
+ Vector<Object> temp = new Vector<Object>();
+ temp.add(1);
+ temp.add(read);
+ seenReads.put(parts[0],temp);
+ }
+ }
+
+ int thisAlignmentPosition = (Integer.parseInt(parts[3])) - 1; // TopHat and BWA refer to the 1-based start, so subtract -1 (because we refer to 0-based start)
+
+ double thisMappingQuality = (1-(Math.pow(10.0,(-(Integer.parseInt(parts[4]))/10.0))));
+
+ if(thisContig.positionTOmappingRnas.containsKey(thisAlignmentPosition)){
+ if(!thisContig.positionTOmappingRnas.get(thisAlignmentPosition).contains(read)){
+ thisContig.positionTOmappingRnas.get(thisAlignmentPosition).add(read);
+ }
+ }else{
+ Vector<Rna> itsRnas = new Vector<Rna>();
+ itsRnas.add(read);
+ thisContig.positionTOmappingRnas.put(thisAlignmentPosition,itsRnas);
+ }
+
+ Object[] mappedContig_and_Info = new Object[7]; // [contig, alignPos, cigarString, mapQual, spliceInfo, mismatchInfo, direcInfo]
+ mappedContig_and_Info[0] = thisContig;
+ mappedContig_and_Info[1] = thisAlignmentPosition;
+ mappedContig_and_Info[2] = parts[5];
+
+ // this is for mismatch info
+
+ String mdTag = "";
+ if(line.contains("MD:Z:")){
+ String[] splitMD = line.split("MD:Z:");
+ String[] splitMD2 = splitMD[splitMD.length-1].split("\t");
+ mdTag = splitMD2[0];
+ }else{
+ System.err.println("No MD tag!");
+ System.exit(0); // NOTE(review): terminates the JVM with status 0 on this error path
+ }
+ mappedContig_and_Info[5] = CigarParser.extractAllAlignDiffs(parts[5],mdTag,parts[9]);
+
+ /*if(parts[5].contains("X")){
+ String[] cigarSplitX = parts[5].split("X");
+ int posTmp = 0;
+ Vector<int[]> missMatchInfo = new Vector<int[]>(); // entries are of form: position (within contig),baseCode (A=0;C=1;G=2;T=3)
+ for(String tmp : cigarSplitX){
+ // the more parts, the more mismatches; note that the last part have to be checked if it ends with another letter!
+ if(!(tmp.endsWith("M") || tmp.endsWith("N"))){
+ String[] tmpSplit = tmp.split("[MN]");
+ for(String tmp2 : tmpSplit){
+ posTmp += Integer.parseInt(tmp2);
+ }
+ // now we have the position of a mismatch
+ int[] mmInfo = new int[2];
+ mmInfo[0] = thisAlignmentPosition + posTmp;
+ // now get base at this position
+ String base = parts[9].substring(posTmp,posTmp+1);
+ mmInfo[1] = returnBaseCode(base);
+ missMatchInfo.add(mmInfo);
+ }
+ }
+
+ mappedContig_and_Info[5] = missMatchInfo;
+ }*/
+
+ mappedContig_and_Info[3] = thisMappingQuality;
+
+ if((parts[5].contains("N"))){
+ mappedContig_and_Info = performSplitExtract(mappedContig_and_Info, thisAlignmentPosition,thisContig,parts[5]);
+ }
+
+ if(line.contains("XS:A:+")){
+ mappedContig_and_Info[6] = "+";
+ }else if(line.contains("XS:A:-")){
+ mappedContig_and_Info[6] = "-";
+ }
+
+ if(parts[5].contains("I") || parts[5].contains("D")){
+
+ int splitSize = 0;
+ if(mappedContig_and_Info[4] != null){ // index 4 is not assigned in this method; presumably filled by performSplitExtract
+ splitSize = ((int[]) mappedContig_and_Info[4])[0];
+ }
+
+ countInDels(parts[5],thisAlignmentPosition,thisContig,splitSize,false);
+ }
+
+ read.contigsMappedOn.add(mappedContig_and_Info);
+
+ }
+
+ }while((line = rnaIn.readLine()) != null);
+
+ }else{ // first field starts with "@" or the line has fewer than 4 fields
+ if(GeneFinder.iteration == 1){
+ WriteOutput.writeToOtherFile(GeneFinder.pathOut+"resultsRun/reassignedReads.sam",line+"\n");
+ }
+ }
+ }
+
+ if(!GeneFinder.mappedContigs.keySet().isEmpty()){
+ // start new round here!! (end of file: process the contig that is still held in the maps)
+
+ timeAfter = System.currentTimeMillis();
+
+ if(GeneFinder.iteration == 2){
+ System.out.println("reassigned: " + reassignedCount);
+ System.out.println("not reassigned: " + notReaCount);
+ }
+
+ System.out.println((rnaCount) + " rnas have been mapped to the reference.");
+ System.out.println((multiCount) + " of them are ambiguous reads.");
+ System.out.println("The total number of hits is " + (totalHitCount) + ".");
+ System.out.println("Total number of multiple hits: " + multiTotalCount + ".");
+ System.out.println("Number of interchromosomal reads (so far): " + interChromoCount + ".");
+ System.out.println("Number of interchromosomal hits (so far): " + interChromoTotalCount + ".");
+ System.out.println("Time for parsing chromosome: "+ (double) (timeAfter-timebef)/1000.0 +"s.");
+
+ GeneFinder.ambiProportion = (double) ((double)multiTotalCount/(double)totalHitCount);
+ System.out.println("Proportion of ambiguous reads: " + GeneFinder.ambiProportion);
+
+ System.out.println();
+ WriteOutput.writeToLogFile(rnaCount + " rnas have been mapped to the reference. \n" + (multiCount) + " of them are ambiguous reads. \nThe total number of hits is " + (totalHitCount) + ".\nTotal number of multiple hits: " + multiTotalCount + ".\nProportion of ambiguous reads: " + GeneFinder.ambiProportion + ".\n\n");
+
+ String nameOut = currentName; // NOTE(review): no "_" prefix here, unlike the call for earlier contigs above - confirm whether intended
+ GeneFinder.geneFinder_managing(nameRef,nameOut);
+ }
+
+ rnaIn.close(); // not reached when an exception is thrown above
+
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+
+ if(GeneFinder.interval == -1){
+ GeneFinder.interval = GeneFinder.readLength + 1;
+ if(!GeneFinder.secondPart){
+ System.out.println("Intron minimum: " + GeneFinder.interval);
+ }
+
+ }
+
+ }
+
+ /*
+  * Undoes every mapping stored in read.contigsMappedOn on the given contig:
+  * the read is taken out of the per-position read lists (empty lists are dropped),
+  * the counter of each recorded splice position is decremented, and countInDels is
+  * called with its last argument set to true. Afterwards the mapping list of the read is emptied.
+  * Inconsistencies (unknown position, read or splice position) are only reported on stdout.
+  */
+
+ public static void searchAndDeleteFromContig(Rna read ,Contig thisContig){
+
+     // positions handled so far; used to report a missing position or read only once per position
+     Vector<Integer> handledPositions = new Vector<Integer>();
+
+     for(Object[] mapping : read.contigsMappedOn){
+         // layout: [contig, alignPos, cigarString, mapQual, spliceInfo, mismatchInfo, direcInfo]
+
+         int alignPos = (Integer) mapping[1];
+         boolean firstVisit = !handledPositions.contains(alignPos);
+
+         if(!thisContig.positionTOmappingRnas.containsKey(alignPos)){
+             if(firstVisit){
+                 System.out.println("Position not contained! " + read.rnaID);
+             }
+         }else{
+             // remove() reports whether the read was present, which replaces the separate contains() check
+             boolean removed = thisContig.positionTOmappingRnas.get(alignPos).remove(read);
+             if(!removed && firstVisit){
+                 System.out.println("Read not contained!");
+             }
+
+             if(thisContig.positionTOmappingRnas.get(alignPos).isEmpty()){
+                 thisContig.positionTOmappingRnas.remove(alignPos);
+             }
+         }
+
+         handledPositions.add(alignPos);
+
+         // splice information: first entry is the intron size, second the start position of the intron
+         int splitSize = 0;
+         if(mapping[4] != null){
+             int[] spliceInfo = (int[]) mapping[4];
+
+             if(thisContig.splicePositions.containsKey(spliceInfo[1])){
+                 thisContig.splicePositions.put(spliceInfo[1],thisContig.splicePositions.get(spliceInfo[1])-1);
+             }else{
+                 System.out.println("Splice position not contained!");
+             }
+
+             splitSize = spliceInfo[0];
+         }
+
+         // presumably reverts the indel bookkeeping made when the hit was added (last argument true)
+         countInDels((String) mapping[2],alignPos,thisContig,splitSize,true);
+     }
+
+     read.contigsMappedOn.clear();
+ }
+
+}
diff --git a/src/geneFinder/TopHat_Call.java b/src/geneFinder/TopHat_Call.java
new file mode 100755
index 0000000..91e7465
--- /dev/null
+++ b/src/geneFinder/TopHat_Call.java
@@ -0,0 +1,103 @@
+package geneFinder;
+
+/**
+ * call TopHat to map the rna reads against the reference genome
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+import java.io.File;
+
+ public class TopHat_Call {
+
+     /*
+      * Runs the external mapping pipeline for the given reference (path without the ".fa" extension):
+      * 1. bowtie2-build on <nameRefFile>.fa
+      * 2. tophat2 on the read files registered in GeneFinder.rnaFilesWithNames, writing to <pathOut>resultsRun
+      * 3. samtools sort -n / samtools view to obtain resultsRun/accepted_hits.sam sorted by read name
+      * Options from GeneFinder.settingMapper (comma separated, "key" or "key=value") are only passed to tophat2,
+      * not to bowtie2-build. Progress and timing go to stdout and to the log file.
+      */
+
+     public void callTopHat(String nameRefFile){
+
+         System.out.println("Start to do the alignment with TopHat...");
+         WriteOutput.writeToLogFile("Start to do the alignment with TopHat... ");
+
+         final long startMillis = System.currentTimeMillis();
+
+         // step 1: index the reference with bowtie2
+         final File referenceFasta = new File(nameRefFile+".fa");
+         System.out.print("Indexing with Bowtie.... ");
+         Giira.callAndHandleOutput("bowtie2-build " + referenceFasta + " " + nameRefFile);
+
+         // step 2: translate the comma separated settings into command line options
+         final StringBuilder optionBuilder = new StringBuilder();
+         if(GeneFinder.settingMapper != null){
+             for(String setting : GeneFinder.settingMapper.split(",")){
+                 final String[] keyAndValue = setting.split("=");
+                 optionBuilder.append(keyAndValue[0]).append(" ");
+                 if(keyAndValue.length != 1){ // a value follows the key
+                     optionBuilder.append(keyAndValue[1]).append(" ");
+                 }
+             }
+         }
+         final String optionString = optionBuilder.toString();
+
+         System.out.println("Done. \nPerform alignment.... ");
+
+         if(optionString.length() > 1){
+             System.out.println("Options for TopHat: " + optionString);
+             WriteOutput.writeToLogFile("Options for TopHat: " + optionString + "\n");
+         }
+
+         // step 3: collect the read files; a single file is handed over as the File itself,
+         // several files as a comma separated list of their registered names
+         final String outDir = GeneFinder.pathOut+"resultsRun";
+         final boolean singleReadFile = GeneFinder.rnaFilesWithNames.keySet().size() == 1;
+
+         File onlyReadFile = null;
+         final StringBuilder joinedNames = new StringBuilder();
+         for(File readFile : GeneFinder.rnaFilesWithNames.keySet()){
+             joinedNames.append(GeneFinder.rnaFilesWithNames.get(readFile)).append(",");
+             if(singleReadFile){
+                 onlyReadFile = readFile;
+             }
+         }
+         final String fileNames = joinedNames.substring(0,joinedNames.length()-1); // cut the trailing comma
+
+         final String readArgument = singleReadFile ? String.valueOf(onlyReadFile) : fileNames;
+
+         // step 4: tophat2 with unsorted bam output, which is name-sorted and converted to sam afterwards
+         Giira.callAndHandleOutput("tophat2 --no-sort-bam " + optionString + "-o " + outDir + " " + nameRefFile + " " + readArgument);
+
+         final String sortCall = "samtools sort -n " + outDir + "/accepted_hits.bam " + outDir + "/accepted_hits_sorted";
+         final String viewCall = "samtools view -h -o " + outDir + "/accepted_hits.sam " + outDir + "/accepted_hits_sorted.bam"; // accepted_hits.sam is used for the further analysis
+         Giira.callAndHandleOutput(sortCall);
+         Giira.callAndHandleOutput(viewCall);
+
+         // report the elapsed time
+         System.out.println("Done.");
+         final long endMillis = System.currentTimeMillis();
+         final double seconds = (double) (endMillis-startMillis)/1000.0;
+         System.out.println("Time required for the alignment: "+ seconds +"s.");
+         WriteOutput.writeToLogFile("Done.\n Time required for the alignment: "+ seconds +"s.\n\n");
+
+     }
+
+ }
diff --git a/src/geneFinder/WriteOutput.java b/src/geneFinder/WriteOutput.java
new file mode 100755
index 0000000..d6ed517
--- /dev/null
+++ b/src/geneFinder/WriteOutput.java
@@ -0,0 +1,480 @@
+package geneFinder;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.*;
+
+import types.*;
+
+/**
+ * write the result of the gene finding procedure to several output files
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class WriteOutput {
+
+ public static int geneNumTotal;
+
+ /*
+  * Opens the log file (GeneFinder.logFile) in append mode and writes the given text to it.
+  * I/O problems are printed via printStackTrace and otherwise ignored.
+  * The writer is closed in a finally block so that the file handle is also released
+  * when write() fails.
+  */
+
+ public static void writeToLogFile(String text){
+
+     FileWriter log = null;
+     try {
+         log = new FileWriter(GeneFinder.logFile,true);
+         log.write(text);
+     } catch (IOException e) { // also covers FileNotFoundException
+         e.printStackTrace();
+     } finally {
+         if(log != null){
+             try {
+                 log.close();
+             } catch (IOException e) {
+                 e.printStackTrace();
+             }
+         }
+     }
+ }
+
+ /*
+  * Opens the file with the given name in append mode and writes the given text to it
+  * (used e.g. for the sam file containing the reassigned hits).
+  * I/O problems are printed via printStackTrace and otherwise ignored.
+  * The writer is closed in a finally block so that the file handle is also released
+  * when write() fails.
+  */
+
+ public static void writeToOtherFile(String filename, String text){
+
+     FileWriter f = null;
+     try {
+         f = new FileWriter(filename,true);
+         f.write(text);
+     } catch (IOException e) { // also covers FileNotFoundException
+         e.printStackTrace();
+     } finally {
+         if(f != null){
+             try {
+                 f.close();
+             } catch (IOException e) {
+                 e.printStackTrace();
+             }
+         }
+     }
+ }
+
+ /*
+  * Calls the helper script sortReaSam.py (located in GeneFinder.pathToHelpFiles) with GeneFinder.pathOut
+  * as its argument; the script is expected to sort the sam file of the reassigned hits by read name.
+  */
+
+ public static void sortReassignSamFile(){
+     Giira.callAndHandleOutput("python " + GeneFinder.pathToHelpFiles+"sortReaSam.py " + GeneFinder.pathOut);
+ }
+
+ /*
+ * opens the sam file containing the reassigned hits and appends the given string
+ */
+
+ public static void removeReassignSamFile(){
+
+ try {
+ Runtime removeFile = Runtime.getRuntime();
+ Process exe3 = removeFile.exec("rm " + GeneFinder.pathOut+"resultsRun/reassignedReads.bam");
+ exe3.waitFor();
+ Process exe4 = removeFile.exec("rm " + GeneFinder.pathOut+"resultsRun/reassignedReads_sorted.bam");
+ exe4.waitFor();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /*
+  * Writes all genes with a non-zero score of every contig in GeneFinder.mappedContigs to a GTF file:
+  * <pathOut>resultsRun/stats<outputName>_final.gtf if GeneFinder.iteration == 2,
+  * <pathOut>resultsRun/stats<outputName><namePartOut>.gtf otherwise.
+  * For each gene one "transcript" line plus one "exon" line per exon is written; genes with
+  * alternative transcripts get one such group per transcript. Summary counts are printed and logged.
+  *
+  * minMax: scores are normalized in place by dividing through (minMax[1] - minMax[0] + 1)
+  * namePartOut: part of the output file name (not used in iteration 2)
+  *
+  * Side effects: updates geneNumTotal, overwrites cluster.score with the normalized value, fills
+  * cluster.exonsOfTranscripts, and replaces thisContig.allGenes by an empty vector if
+  * GeneFinder.iteration == 1 and GeneFinder.inprogeaCall is not set.
+  * I/O problems are printed via printStackTrace; the writer is closed in any case.
+  */
+
+ public static void writeGeneFinderOutput(double[] minMax, String namePartOut){
+
+     BufferedWriter bwGTF = null;
+
+     try{
+
+         if(GeneFinder.iteration == 2){
+             bwGTF = new BufferedWriter(new FileWriter(new File(GeneFinder.pathOut+"resultsRun/stats"+ GeneFinder.outputName + "_final.gtf")));
+         }else{
+             bwGTF = new BufferedWriter(new FileWriter(new File(GeneFinder.pathOut+"resultsRun/stats"+ GeneFinder.outputName + namePartOut + ".gtf")));
+         }
+
+         geneNumTotal = 0;
+         int geneRefNonZero = 0;
+
+         int numGenesWithOnlyMulti = 0;
+         int numGenesWithTotalSupportBelowThreshold = 0;
+
+         for(String contigName : GeneFinder.mappedContigs.keySet()){
+             Contig thisContig = GeneFinder.mappedContigs.get(contigName);
+
+             for(Gene cluster : thisContig.allGenes){
+
+                 geneNumTotal++;
+
+                 if(cluster.score != 0){
+
+                     cluster.score = (cluster.score/( minMax[1]- minMax[0] + 1 ) ); // normalization
+
+                     int supportingReads = cluster.idTOassociatedRnas.keySet().size();
+                     double coverage = ((((double) supportingReads) * (double) GeneFinder.readLength)/( (double) cluster.exonLength));
+
+                     String multiInfo = "y"; // y stands for non-multi support, n stands for only multi support
+                     String supportInfo = "y"; // y stands for sufficient total support, n for not sufficient support
+                     String hasEnoughUniques = "y"; // n if both the proportion of unique reads and the unique coverage are below 0.1
+
+                     if(cluster.numOfMultis == supportingReads){
+                         numGenesWithOnlyMulti++;
+                         multiInfo = "n";
+                     }
+
+                     if(coverage < GeneFinder.minCoverage){
+                         numGenesWithTotalSupportBelowThreshold++;
+                         supportInfo = "n";
+                     }
+
+                     int uniqueNum = (supportingReads - cluster.numOfMultis);
+
+                     if((((double)uniqueNum)/((double)supportingReads) < 0.1)){ // proportion of unique reads
+                         if((((double) uniqueNum) * (double) GeneFinder.readLength)/((double) cluster.exonLength) < 0.1){ // uniqueCov
+                             hasEnoughUniques = "n";
+                         }
+                     }
+
+                     geneRefNonZero++;
+
+                     String strandInfo = "+"; // has to be updated if reverse strand
+
+                     if(cluster.onRevStrand){
+                         strandInfo = "-";
+                     }
+
+                     // attribute tail shared by all lines written for this gene
+                     String supportAttributes = "; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n";
+
+                     boolean onlyNewTranscriptStarts = DefineAlternativeTranscripts.searchForTranscripts(cluster);
+
+                     if(cluster.alternativeTranscripts.isEmpty()){
+                         // first write out transcript
+                         bwGTF.write(contigName + "\tGIIRA\ttranscript\t" + (cluster.startPos+1) + "\t" + (cluster.stopPos+1) +
+                                 "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"" + supportAttributes);
+
+                         // now write out all exons
+
+                         int exonNum = 1;
+                         for(int[] exon : cluster.exonsOfGene){
+                             bwGTF.write(contigName + "\tGIIRA\texon\t" + (exon[0]+1) + "\t" + exon[1] +
+                                     "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; exon_number " + (exonNum++) + supportAttributes);
+                         }
+                     }else{
+                         int transcriptID = 1;
+
+                         if(cluster.possibleIntrons.keySet().size() == 0 || onlyNewTranscriptStarts){
+                             // only alternative transcripts because of new transcripts starts, add the start to ensure that we grab all exons in at least one alternative
+                             cluster.alternativeTranscripts.add(new Object[]{cluster.startPos,cluster.startPos,cluster.startPos,cluster.startPos});
+                         }
+
+                         FindExonsOfGene.sortExons(cluster);
+                         DefineAlternativeTranscripts.eraseEqualTranscripts(cluster);
+
+                         for(Object[] definingIntron : cluster.alternativeTranscripts){
+
+                             // compare the boxed positions by value; "==" on two Integer objects compares references
+                             // and is only reliable for small cached values
+                             Integer intronStart = (Integer) definingIntron[0];
+                             Integer intronEnd = (Integer) definingIntron[1];
+                             boolean samePosition = (intronStart == null) ? (intronEnd == null) : intronStart.equals(intronEnd);
+
+                             if(!(samePosition && (definingIntron.length >= 6) && ((Integer)definingIntron[4] == -1))){
+                                 Object[] returnObject = DefineAlternativeTranscripts.assignExonsToTranscripts(cluster,definingIntron);
+
+                                 @SuppressWarnings("unchecked") // element 0 is used as the exon list of the transcript
+                                 Vector<int[]> exonsOfTranscript = (Vector<int[]>) returnObject[0];
+                                 cluster.exonsOfTranscripts.add(exonsOfTranscript);
+
+                                 int transcriptStart = (Integer) returnObject[1];
+                                 int transcriptEnd = (Integer) returnObject[2];
+                                 if(!exonsOfTranscript.isEmpty()){
+                                     bwGTF.write(contigName + "\tGIIRA\ttranscript\t" + (transcriptStart+1) + "\t" + (transcriptEnd) +
+                                             "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + "." + transcriptID + "\"" + supportAttributes);
+
+                                     int exonNum = 1;
+                                     for(int[] exon : exonsOfTranscript){
+                                         bwGTF.write(contigName + "\tGIIRA\texon\t" + (exon[0]+1) + "\t" + exon[1] +
+                                                 "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + "." + transcriptID + "\"; exon_number " + (exonNum++) + supportAttributes);
+                                     }
+
+                                     transcriptID++;
+                                 }
+
+                             }
+
+                         }
+                     }
+
+                 }
+             }
+
+             if(GeneFinder.iteration == 1 && !GeneFinder.inprogeaCall){
+                 thisContig.allGenes = new Vector<Gene>();
+             }
+
+         }
+
+         System.out.println("Number of identified genes with total coverage below threshold: " + numGenesWithTotalSupportBelowThreshold);
+         System.out.println("Number of identified genes with only ambiguous support: " + numGenesWithOnlyMulti);
+         System.out.println("Number of identified genes on reference: " + geneRefNonZero);
+
+         writeToLogFile("\n\nNumber of identified genes on reference: " + geneRefNonZero + " \nNumber of identified genes with total coverage below threshold: " + numGenesWithTotalSupportBelowThreshold + "\nNumber of identified genes with only ambiguous support: " + numGenesWithOnlyMulti + "\n");
+
+     } catch (IOException e) { // also covers FileNotFoundException
+         e.printStackTrace();
+     } finally {
+         // close (and thereby flush) the writer even if writing failed half-way
+         if(bwGTF != null){
+             try {
+                 bwGTF.close();
+             } catch (IOException e) {
+                 e.printStackTrace();
+             }
+         }
+     }
+
+ }
+
+ /*
+ * output for prokaryotes
+ */
+
+ public static void writeGeneFinderOutput_Prokaryote(double[] minMax, String namePartOut){
+
+ try{
+
+ BufferedWriter bwGTF_ccAna = null;
+ BufferedWriter bwGTF_prok = null;
+
+ bwGTF_ccAna = new BufferedWriter(new FileWriter(new File(GeneFinder.pathOut+"resultsRun/stats"+ GeneFinder.outputName + namePartOut +"_ccAna.gtf")));
+ bwGTF_prok = new BufferedWriter(new FileWriter(new File(GeneFinder.pathOut+"resultsRun/stats"+ GeneFinder.outputName + namePartOut +"_prok.gtf")));
+
+ int exonLength_covered_total = 0;
+ geneNumTotal = 0;
+ int geneRefNonZero = 0;
+
+ int numGenesWithOnlyMulti = 0;
+ int numGenesWithSupportBelowThreshold = 0;
+ int numGenesWithTotalSupportBelowThreshold = 0;
+ int noCistron = 0;
+ int moreThanOneCistron = 0;
+ int moreThanTwoCistrons = 0;
+ int moreOperons = 0;
+ int operonSplitNum = 0;
+ double operonSplitAv = 0.0;
+
+ for(String contigName : GeneFinder.mappedContigs.keySet()){
+ Contig thisContig = GeneFinder.mappedContigs.get(contigName);
+
+ for(Gene cluster : thisContig.allGenes){
+
+ geneNumTotal++;
+
+ if(cluster.score != 0){
+
+ cluster.score = (cluster.score/( minMax[1]- minMax[0] + 1 ) ); // normalization
+
+ String multiInfo = "y"; // n stands for non-multi support, y stands for only multi support
+ String supportInfo = "y"; // yes stands for sufficient total support, no for not sufficient support
+ String hasEnoughUniques = "y"; // is n, if proportion of unique reads is below 1% and also unique coverage
+
+ if(cluster.numOfMultis == cluster.idTOassociatedRnas.keySet().size()){
+ numGenesWithOnlyMulti++;
+ multiInfo = "n";
+
+ }else{
+ if(GeneFinder.noAmbiOpti){
+ if(((((double) cluster.idTOassociatedRnas.size()) * (double) GeneFinder.readLength)/( (double) cluster.exonLength)) < GeneFinder.minCoverage){
+ numGenesWithSupportBelowThreshold++;
+ }
+ }else{
+ if(cluster.uniqueCov < GeneFinder.minCoverage){
+ numGenesWithSupportBelowThreshold++;
+ }
+ }
+
+ }
+
+ if(((((double) cluster.idTOassociatedRnas.size()) * (double) GeneFinder.readLength)/( (double) cluster.exonLength)) < GeneFinder.minCoverage){
+ numGenesWithTotalSupportBelowThreshold++;
+ supportInfo = "n";
+ }
+
+ double coverage = ((((double) cluster.idTOassociatedRnas.size()) * (double) GeneFinder.readLength)/( (double) cluster.exonLength));
+
+ int uniqueNum = (cluster.idTOassociatedRnas.keySet().size() - cluster.numOfMultis);
+
+ if((((double)uniqueNum)/((double)cluster.idTOassociatedRnas.keySet().size()) < 0.1)){ // proportion of multi reads
+ if((((double) uniqueNum) * (double) GeneFinder.readLength)/((double) cluster.exonLength) < 0.1){ // uniqueCov
+ hasEnoughUniques = "n";
+ }
+ }
+
+
+ //else{
+ geneRefNonZero++;
+ exonLength_covered_total += cluster.exonLength;
+
+ String strandInfo = "+"; // has to be updated if reverse strand
+
+ if(!cluster.operonDirectionIsForward){
+ strandInfo = "-";
+ }
+
+ if(cluster.operonOrfs.isEmpty()){
+ // first write out transcript
+
+ noCistron++;
+
+ // CC ANA:
+
+ bwGTF_ccAna.write(contigName + "\tGIIRA\ttranscript\t" + (cluster.startPos+1) + "\t" + (cluster.stopPos+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+ // PROK FILE:
+
+ bwGTF_prok.write(contigName + "\tGIIRA\tgene\t" + (cluster.startPos+1) + "\t" + (cluster.stopPos+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+ // now write out all exons
+
+ int exonNum = 1;
+ for(int[] exon : cluster.exonsOfGene){
+ bwGTF_ccAna.write(contigName + "\tGIIRA\texon\t" + (exon[0]+1) + "\t" + exon[1] +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; exon_number " + (exonNum++) + "; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+ }
+
+ exonNum = 1;
+
+ for(int[] exon : cluster.exonsOfGene){
+ bwGTF_prok.write(contigName + "\tGIIRA\tCDS\t" + (exon[0]+1) + "\t" + exon[1] +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; exon_number " + (exonNum++) + "; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+ }
+
+
+ } else if(cluster.operonOrfs.size() == 2){
+
+ bwGTF_ccAna.write(contigName + "\tGIIRA\ttranscript\t" + (cluster.operonOrfs.get(1)[0]+1) + "\t" + (cluster.operonOrfs.get(1)[1]+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+ bwGTF_ccAna.write(contigName + "\tGIIRA\texon\t" + (cluster.operonOrfs.get(1)[0]+1) + "\t" + (cluster.operonOrfs.get(1)[1]+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; exon_number 1" + "; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+ bwGTF_prok.write(contigName + "\tGIIRA\tgene\t" + (cluster.startPos+1) + "\t" + (cluster.stopPos+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+ bwGTF_prok.write(contigName + "\tGIIRA\tCDS\t" + (cluster.operonOrfs.get(1)[0]+1) + "\t" + (cluster.operonOrfs.get(1)[1]+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; exon_number 1" + "; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+ } else{
+
+ moreThanOneCistron++;
+ if(cluster.operonOrfs.size() > 3){
+ moreThanTwoCistrons++;
+ }
+
+ //manage 1/-1 for operon distinction
+
+ if(cluster.operonOrfs.get(0)[0] == 1){
+ bwGTF_prok.write(contigName + "\tGIIRA\tgene\t" + (cluster.startPos+1) + "\t" + (cluster.stopPos+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+
+ for(int i = 1; i<cluster.operonOrfs.size();++i){
+
+ int[] orf = cluster.operonOrfs.get(i);
+
+ bwGTF_ccAna.write(contigName + "\tGIIRA\ttranscript\t" + (orf[0]+1) + "\t" + (orf[1]+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "_" + (i) + "\"; transcript_id \"Transcript." + cluster.geneID + "_" + (i) + ".1\"; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+
+ bwGTF_ccAna.write(contigName + "\tGIIRA\texon\t" + (orf[0]+1) + "\t" + (orf[1]+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "_" + (i) + "\"; transcript_id \"Transcript." + cluster.geneID + "_" + (i) + ".1\"; exon_number 1" + "; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+ bwGTF_prok.write(contigName + "\tGIIRA\tCDS\t" + (orf[0]+1) + "\t" + (orf[1]+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "\"; transcript_id \"Transcript." + cluster.geneID + ".1\"; exon_number " + (i) + "; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+ }
+ }else{
+ moreOperons++;
+ for(int i = 1; i<cluster.operonOrfs.size();++i){
+
+ operonSplitNum++;
+
+ if(cluster.operonOrfs.get(i)[0] == 0){
+ strandInfo = "-";
+ }else{
+ strandInfo = "+";
+ }
+
+ bwGTF_prok.write(contigName + "\tGIIRA\tgene\t" + (cluster.operonOrfs.get(i)[1]+1) + "\t" + (cluster.operonOrfs.get(i)[2]+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "_" + i + "\"; transcript_id \"Transcript." + cluster.geneID + "_"+ i + ".1" + "\"; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+
+ int exonNum = 1;
+ for(int posArr = 3; posArr<cluster.operonOrfs.get(i).length;++posArr){
+
+ int[] orf = new int[2];
+ orf[0] = cluster.operonOrfs.get(i)[posArr++];
+ orf[1] = cluster.operonOrfs.get(i)[posArr];
+
+ bwGTF_ccAna.write(contigName + "\tGIIRA\ttranscript\t" + (orf[0]+1) + "\t" + (orf[1]+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "_" + i + "_" + (exonNum) + "\"; transcript_id \"Transcript." + cluster.geneID + "_" + i + "_" + (exonNum) + ".1\"; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+
+ bwGTF_ccAna.write(contigName + "\tGIIRA\texon\t" + (orf[0]+1) + "\t" + (orf[1]+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "_" + i + "_" + (exonNum) + "\"; transcript_id \"Transcript." + cluster.geneID + "_" + i + "_" + (exonNum) + ".1\"; exon_number 1" + "; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+ bwGTF_prok.write(contigName + "\tGIIRA\tCDS\t" + (orf[0]+1) + "\t" + (orf[1]+1) +
+ "\t" + cluster.score + "\t"+strandInfo+"\t0\tgene_id \"Gene." + cluster.geneID + "_" + i + "\"; transcript_id \"Transcript." + cluster.geneID + "_" + i + ".1\"; exon_number " + (exonNum++) + "; coverage: " + coverage + "; alsoUniqueSupport: " + multiInfo + "; coverageSupport: " + supportInfo + "; hasEnoughUniques: " + hasEnoughUniques + ";\n");
+
+ }
+
+ }
+
+ operonSplitAv = ((double) operonSplitNum)/((double) moreOperons);
+ }
+
+ }
+
+ }
+ }
+
+ if(GeneFinder.iteration == 1 && !GeneFinder.inprogeaCall){
+ thisContig.allGenes = new Vector<Gene>();
+ }
+
+ }
+
+ //System.out.println("Number of prokaryotic genes without operon cistron: " + noCistron);
+ //System.out.println("Number of prokaryotic genes with more than one cistron: " + moreThanOneCistron);
+ //System.out.println("Number of prokaryotic genes with more than two cistrons: " + moreThanTwoCistrons);
+ //System.out.println("Number of identified genes with support below threshold: " + numGenesWithSupportBelowThreshold);
+ System.out.println("Number of identified genes with total coverage below threshold: " + numGenesWithTotalSupportBelowThreshold);
+ System.out.println("Number of identified genes with only ambiguous support: " + numGenesWithOnlyMulti);
+ System.out.println("Number of identified genes on reference: " + geneRefNonZero);
+ System.out.println("More than one operon in transcript: " + moreOperons + " with average split number: " + operonSplitAv);
+
+ writeToLogFile("\n\nNumber of identified genes on reference: " + geneRefNonZero + " \nNumber of identified genes with total coverage below threshold: " + numGenesWithTotalSupportBelowThreshold + "\nNumber of identified genes with only ambiguous support: " + numGenesWithOnlyMulti + "\n");
+
+ bwGTF_ccAna.close();
+ bwGTF_prok.close();
+ }catch (FileNotFoundException r) {
+ r.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+}
diff --git a/src/types/Contig.java b/src/types/Contig.java
new file mode 100755
index 0000000..95899a7
--- /dev/null
+++ b/src/types/Contig.java
@@ -0,0 +1,27 @@
+package types;
+
+import java.util.TreeMap;
+import java.util.Vector;
+
+/**
+ * stores contigs and their mapped positions, splicing information and assigned genes
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+public class Contig {
+ 
+ public String contigName; // name of this contig; serves as its key
+ 
+ public TreeMap<Integer,Vector<Rna>> positionTOmappingRnas = new TreeMap<Integer,Vector<Rna>>(); // position -> rnas mapping there; a TreeMap (sorted by key) such that we can traverse all mapped positions in order
+ 
+ public Vector<Gene> allGenes = new Vector<Gene>(); // stores all assigned gene candidates of this contig
+ 
+ public TreeMap<Integer ,Integer> splicePositions = new TreeMap<Integer,Integer>(); // key: position in contig; value: number of reads
+ 
+ public TreeMap<Integer ,Integer> positionTOdiff = new TreeMap<Integer,Integer>(); // remembers read differences per position, needed for insertions + deletions: coverage is increased/decreased according to the value
+ 
+}
\ No newline at end of file
diff --git a/src/types/Gene.java b/src/types/Gene.java
new file mode 100755
index 0000000..bde10c8
--- /dev/null
+++ b/src/types/Gene.java
@@ -0,0 +1,74 @@
+package types;
+
+/**
+ * all properties of genes identified by the gene finder
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ */
+
+import java.util.HashMap;
+import java.util.TreeMap;
+import java.util.Vector;
+
+public class Gene {
+ 
+ public int geneID; // key; also used to build the gene_id / transcript_id attributes in the GTF output
+ 
+ public String sequence = new String(); // interval determined by start and stop codon of this candidate gene, introns are included
+ public Vector<int[]> operonOrfs = new Vector<int[]>(); // only for prokaryotes, contains operon orfs. NOTE(review): layout inferred from the GTF writer, confirm against the code that fills it: element 0 is a header; if header[0] == 1, elements 1..n are {orfStart, orfEnd}; otherwise each element i >= 1 is {strand flag (0 = "-", else "+"), geneStart, geneEnd, orfStart, orfEnd, ...}
+ public boolean operonDirectionIsForward; // true if forward
+ 
+ public int exonLength; // sum of all exon lengths
+ 
+ public double score; // score depending on assigned reads, reflects the probability of this candidate to be a true gene
+ 
+ public int startPos; // position of start codon (the GTF writer emits startPos+1)
+ public int[] possibleStarts_Forward = {-1,-1,-1}; // each gene has 6 possible starts (3 for each strand) within a specified interval; -1 is the initial value
+ public int[] possibleStarts_Reverse = {-1,-1,-1};
+ 
+ public int stopPos; // position of stop codon (the GTF writer emits stopPos+1)
+ public int[] possibleStops_Forward = {-1,-1,-1}; // each gene has 6 possible stops (3 for each strand) within a specified interval; -1 is the initial value
+ public int[] possibleStops_Reverse = {-1,-1,-1};
+ 
+ public int coreSeq; // presumably the size of the gene's core (TODO confirm): if the core is only as big as the read length, this gene is penalized, because it is very likely to be a false positive
+ 
+ public TreeMap<Integer,Object[]> possibleIntrons = new TreeMap<Integer,Object[]>(); // value layout: first entry: Vector<int[]>, second entry: Vector<Vector<Rna>> -> introns of this site
+ // with supporting rnas; third entry: exon end if rnas exist that do not support this split, -1 otherwise; fourth entry: starting site for "fake splits"
+ 
+ public TreeMap<Integer,Vector<Rna>> possibleFussyExons = new TreeMap<Integer,Vector<Rna>>(); // stores all rnas that support a "fussy exon" where reads do not support the splice site
+ 
+ public Vector<int[]> exonsOfGene = new Vector<int[]>(); // contains all exons of this gene
+ 
+ public Vector<Object[]> alternativeTranscripts = new Vector<Object[]>(); // if this gene has alternative transcripts, these are defined by one intron, which is stored here
+ public Vector<int[]> eraseIntrons_temp = new Vector<int[]>(); // NOTE(review): presumably a temporary list of introns to be erased - confirm against callers
+ public Vector<int[]> intronEndsThatAreNotContinued = new Vector<int[]>(); // if an intron end leads to an exon that is not continued, it is stored here to be remembered for exonSearch
+ 
+ public double uniqueCov; // coverage of this gene only by unique reads
+ public double totalCov; // coverage of this gene given by all assigned reads
+ public int numOfMultis; // number of ambiguous reads mapping this gene
+ public boolean hasStop_temp; // indicator for candidate extraction, signals whether the gene did not find a stop during the first search
+ 
+ public boolean onRevStrand; // indicates whether ORF is on forward or reverse strand (presumably true = reverse strand, going by the name - confirm)
+ 
+ public boolean realDirectionNotKnown; // allow a gene to switch the direction if necessary
+ 
+ public HashMap<String,Object[]> idTOassociatedRnas = new HashMap<String,Object[]>(); // rna id -> Object[]: [0] = rna, [1] = supportedSplit (int[] with spliceKey and intronEnd, (-1,-1) if not present), [2] = supportedFussyExon (int or -1)
+ 
+ public int[] direcCounter = {0,0}; // counts the reads supporting forward [0] and reverse [1] strand, only possible if XS tag provided
+ 
+ public Vector<String> moreThanOneHitRnas = new Vector<String>(); // if a read maps more than once to this gene, add its id here as a backup
+ 
+ public Gene twinNode; // NOTE(review): presumably the twin cluster of this gene, unset if there is none - confirm against callers
+ public boolean freeToResolve; // indicates if this cluster and its twin can be locally resolved; if false, this is only possible during the ambiguous read optimization
+ 
+ public boolean hadTwinBefore; // if true we have to mark the associated transcript as one to be 6-frame translated (because it is uncertain which strand is correct)
+ 
+ public boolean isMergedTwin; // indicates the twin that has been merged with other clusters (if this has happened only to one twin)
+ 
+ /////////////// FOR RNA-ProGen //////////////////////
+ public Vector<Vector<int[]>> exonsOfTranscripts = new Vector<Vector<int[]>>(); // NOTE(review): presumably one exon list (Vector<int[]>) per transcript - confirm
+ public boolean wasLookedAt; // necessary for score update
+ public double scoreRnaPart; // NOTE(review): meaning not visible here; presumably the rna-derived part of the score - confirm
+}
diff --git a/src/types/Rna.java b/src/types/Rna.java
new file mode 100755
index 0000000..590f31d
--- /dev/null
+++ b/src/types/Rna.java
@@ -0,0 +1,30 @@
+package types;
+
+/**
+ * rna (read) object: stores the read's quality, its mapping hits and its assignments to gene candidates
+ * Copyright (c) 2013,
+ * Franziska Zickmann,
+ * ZickmannF at rki.de, Robert Koch-Institute, Berlin, Germany
+ * Distributed under the GNU Lesser General Public License, version 3.0
+ *
+ */
+
+import java.util.Vector;
+
+public class Rna {
+ 
+ public String rnaID; // id of this read; serves as its key
+ 
+ public double quality; // quality of this read (NOTE(review): scale and source are not visible here - confirm)
+ 
+ public Vector<Object[]> contigsMappedOn = new Vector<Object[]>(); // contains one array per hit, each of the form [contig, alignPos, cigarString, mapQual, spliceInfo, mismatchInfo, direcInfo]
+ 
+ public int isMulti; // indicator if this read is an ambiguous read (NOTE(review): an int, not a boolean - value encoding to be confirmed)
+ 
+ public int hitNum; // the original number of hits
+ 
+ public int assignedNum; // shows the number of assignments to a candidate
+ 
+ public Vector<Integer> isSharedBy = new Vector<Integer>(); // necessary for twin clusters -> contains the ids of all clusters that share this rna
+ 
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/giira.git
More information about the debian-med-commit
mailing list