[med-svn] [giira] 03/09: New upstream version 0.0.20140625
Andreas Tille
tille at debian.org
Mon Jan 9 11:39:59 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository giira.
commit 54ea89068c995776063bbfac8dd31e9fe157d1ea
Author: Andreas Tille <tille at debian.org>
Date: Mon Jan 9 11:55:59 2017 +0100
New upstream version 0.0.20140625
---
src/geneFinder/ExtractGeneCandidates.java | 3 +
src/geneFinder/FrameSearch.java | 236 ++++++++++++++++++++++++
src/geneFinder/GeneFinder.java | 10 +-
src/geneFinder/Giira.java | 5 +-
src/geneFinder/ProkaryoteExtraction.java | 3 +
src/geneFinder/Prokaryote_Specials.java | 211 +++++++++++++++++++++
src/geneFinder/ReadInParameters_GeneFinder.java | 88 ++++++++-
src/geneFinder/SamParser.java | 76 ++++++--
src/types/Rna.java | 2 +-
9 files changed, 612 insertions(+), 22 deletions(-)
diff --git a/src/geneFinder/ExtractGeneCandidates.java b/src/geneFinder/ExtractGeneCandidates.java
index a663c7c..3bc9bb2 100755
--- a/src/geneFinder/ExtractGeneCandidates.java
+++ b/src/geneFinder/ExtractGeneCandidates.java
@@ -57,6 +57,9 @@ public class ExtractGeneCandidates {
if(line.startsWith(">")){
// test if correct contig
if(line.substring(1).startsWith(contigName)){
+ if(!((line.substring(1).startsWith(contigName+" ")) || (line.substring(1).length() == contigName.length()))){
+ continue; // as an additional check to avoid picking the wrong contig because of name sub-similarities
+ }
// found right one, now extract sequence
while(((line = br.readLine()) != null) && (line.length() != 0) && (!(line.startsWith(">")))){
String line2 = "";
diff --git a/src/geneFinder/FrameSearch.java b/src/geneFinder/FrameSearch.java
index d469eaa..76fdd1f 100755
--- a/src/geneFinder/FrameSearch.java
+++ b/src/geneFinder/FrameSearch.java
@@ -26,6 +26,12 @@ public class FrameSearch {
public static int findPossibleStarts_Forward(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop){
+ if(!GeneFinder.alternativeCodons.isEmpty()){
+ if(GeneFinder.alternativeCodons.containsKey("START FO")){
+ return FrameSearch.findPossibleStarts_Forward_AlternativeStarts(cluster, contigSeq, posAr, tempStop, GeneFinder.alternativeCodons.get("START FO"));
+ }
+ }
+
if(tempStop < (int)Math.max(0,(cluster.startPos-GeneFinder.readLength))){
return -1;
}
@@ -69,6 +75,12 @@ public class FrameSearch {
public static int findPossibleStarts_Reverse(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop){
+ if(!GeneFinder.alternativeCodons.isEmpty()){
+ if(GeneFinder.alternativeCodons.containsKey("START RE")){
+ return FrameSearch.findPossibleStarts_Reverse_AlternativeStarts(cluster, contigSeq, posAr, tempStop, GeneFinder.alternativeCodons.get("START RE"));
+ }
+ }
+
int start_RE = -1;
if(tempStop < (int)Math.max(0,(cluster.startPos-GeneFinder.readLength))){
@@ -113,6 +125,112 @@ public class FrameSearch {
}
/*
+ * Start-codon search (forward direction) used when a custom codon list is configured.
+ * Looks at the window [max(0, cluster.startPos - readLength), tempStop) of the contig and
+ * takes the right-most occurrence of any codon in alternativeStarts. The hit is stored in
+ * cluster.possibleStarts_Forward unless one of the first posAr entries already lies in the
+ * same reading frame (difference divisible by 3). The search then repeats recursively with
+ * the hit as new upper bound.
+ * Returns the absolute position of the first (right-most) hit, or -1 if nothing was found.
+ */
+
+ public static int findPossibleStarts_Forward_AlternativeStarts(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop,String[] alternativeStarts){
+
+     final int windowBegin = (int) Math.max(0,(cluster.startPos-GeneFinder.readLength));
+
+     if(tempStop < windowBegin){
+         return -1;
+     }
+
+     final String window = contigSeq.substring(windowBegin, tempStop);
+
+     // right-most offset over all codons; stays -1 when no codon occurs in the window
+     int bestOffset = -1;
+     for(String codon : alternativeStarts){
+         bestOffset = Math.max(bestOffset, window.lastIndexOf(codon));
+     }
+
+     if(bestOffset == -1){
+         return -1;
+     }
+
+     final int hit = windowBegin + bestOffset;
+
+     // is a start in this reading frame already remembered?
+     boolean frameKnown = false;
+     int k = 0;
+     while(k < posAr && !frameKnown){
+         frameKnown = ((cluster.possibleStarts_Forward[k] - hit) % 3 == 0);
+         k++;
+     }
+
+     if(!frameKnown){
+         cluster.possibleStarts_Forward[posAr++] = hit;
+     }
+
+     // continue further upstream; the result of the nested search is not needed here
+     findPossibleStarts_Forward_AlternativeStarts(cluster,contigSeq,posAr,hit,alternativeStarts);
+
+     return hit;
+ }
+
+ /*
+ * Counterpart of the forward start search for the reverse direction, used when a custom
+ * codon list is configured. Looks at the window [max(0, cluster.startPos - readLength), tempStop)
+ * and takes the right-most occurrence of any codon in alternativeStops (the caller passes the
+ * list stored under "START RE"). The hit is stored in cluster.possibleStarts_Reverse unless one
+ * of the first posAr entries already lies in the same reading frame. The search then repeats
+ * recursively with the hit as new upper bound.
+ * Returns the absolute position of the first (right-most) hit, or -1 if nothing was found.
+ */
+
+ public static int findPossibleStarts_Reverse_AlternativeStarts(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop,String[] alternativeStops){
+
+     final int windowBegin = (int) Math.max(0,(cluster.startPos-GeneFinder.readLength));
+
+     if(tempStop < windowBegin){
+         return -1;
+     }
+
+     final String window = contigSeq.substring(windowBegin, tempStop);
+
+     // right-most offset over all codons; stays -1 when no codon occurs in the window
+     int bestOffset = -1;
+     for(String codon : alternativeStops){
+         bestOffset = Math.max(bestOffset, window.lastIndexOf(codon));
+     }
+
+     if(bestOffset == -1){
+         return -1;
+     }
+
+     final int hit = windowBegin + bestOffset;
+
+     // is a position in this reading frame already remembered?
+     boolean frameKnown = false;
+     int k = 0;
+     while(k < posAr && !frameKnown){
+         frameKnown = ((cluster.possibleStarts_Reverse[k] - hit) % 3 == 0);
+         k++;
+     }
+
+     if(!frameKnown){
+         cluster.possibleStarts_Reverse[posAr++] = hit;
+     }
+
+     // continue further to the left; the result of the nested search is not needed here
+     findPossibleStarts_Reverse_AlternativeStarts(cluster,contigSeq,posAr,hit,alternativeStops);
+
+     return hit;
+ }
+
+ /*
* new way of gene extraction by remembering all starts that are not in the same frame (maxNum = 3)
* after that, starts and stops are checked if we find a combination that defines the frame of the cluster
*
@@ -122,6 +240,12 @@ public class FrameSearch {
public static int findPossibleStops_Forward(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart){
+ if(!GeneFinder.alternativeCodons.isEmpty()){
+ if(GeneFinder.alternativeCodons.containsKey("STOP FO")){
+ return FrameSearch.findPossibleStops_Forward_AlternativeStops(cluster, contigSeq, posAr, tempStart, GeneFinder.alternativeCodons.get("STOP FO"));
+ }
+ }
+
int stop_FO = -1;
if(tempStart > (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)){
@@ -174,6 +298,12 @@ public class FrameSearch {
public static int findPossibleStops_Reverse(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart){
+ if(!GeneFinder.alternativeCodons.isEmpty()){
+ if(GeneFinder.alternativeCodons.containsKey("STOP RE")){
+ return FrameSearch.findPossibleStops_Reverse_AlternativeStop(cluster, contigSeq, posAr, tempStart, GeneFinder.alternativeCodons.get("STOP RE"));
+ }
+ }
+
if(tempStart > (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)){
return -1;
}
@@ -206,6 +336,112 @@ public class FrameSearch {
}
/*
+ * Stop-codon search (forward direction) used when a custom codon list is configured.
+ * Looks at the window [tempStart, min(contig length, cluster.stopPos-2 + readLength + 1)) and
+ * takes the left-most occurrence of any codon in alternativeStops. The hit is stored in
+ * cluster.possibleStops_Forward unless one of the first posAr entries already lies in the
+ * same reading frame (difference divisible by 3). The search then repeats recursively,
+ * starting three positions behind the hit.
+ * Returns the absolute position of the first (left-most) hit, or -1 if nothing was found.
+ */
+
+ public static int findPossibleStops_Forward_AlternativeStops(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart, String[] alternativeStops){
+
+     final int windowEnd = (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1);
+
+     if(tempStart > windowEnd){
+         return -1;
+     }
+
+     final String window = contigSeq.substring(tempStart, windowEnd);
+
+     // smallest non-negative offset over all codons; stays -1 when no codon occurs in the window
+     int nearest = -1;
+     for(String codon : alternativeStops){
+         final int offset = window.indexOf(codon);
+         if(offset > -1 && (nearest == -1 || offset < nearest)){
+             nearest = offset;
+         }
+     }
+
+     if(nearest == -1){
+         return -1;
+     }
+
+     final int hit = tempStart + nearest;
+
+     // is a stop in this reading frame already remembered?
+     boolean frameKnown = false;
+     int k = 0;
+     while(k < posAr && !frameKnown){
+         frameKnown = ((cluster.possibleStops_Forward[k] - hit) % 3 == 0);
+         k++;
+     }
+
+     if(!frameKnown){
+         cluster.possibleStops_Forward[posAr++] = hit;
+     }
+
+     // continue behind this codon; the result of the nested search is not needed here
+     findPossibleStops_Forward_AlternativeStops(cluster,contigSeq,posAr,hit+3,alternativeStops);
+
+     return hit;
+ }
+
+ /*
+ * Counterpart of the forward stop search for the reverse direction, used when a custom
+ * codon list is configured. Looks at the window
+ * [tempStart, min(contig length, cluster.stopPos-2 + readLength + 1)) and takes the left-most
+ * occurrence of any codon in alternativeStarts (the caller passes the list stored under
+ * "STOP RE"). The hit is stored in cluster.possibleStops_Reverse unless one of the first posAr
+ * entries already lies in the same reading frame. The search then repeats recursively,
+ * starting three positions behind the hit.
+ * Returns the absolute position of the first (left-most) hit, or -1 if nothing was found.
+ */
+
+ public static int findPossibleStops_Reverse_AlternativeStop(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart, String[] alternativeStarts){
+
+     final int windowEnd = (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1);
+
+     if(tempStart > windowEnd){
+         return -1;
+     }
+
+     final String window = contigSeq.substring(tempStart, windowEnd);
+
+     // smallest non-negative offset over all codons; stays -1 when no codon occurs in the window
+     int nearest = -1;
+     for(String codon : alternativeStarts){
+         final int offset = window.indexOf(codon);
+         if(offset > -1 && (nearest == -1 || offset < nearest)){
+             nearest = offset;
+         }
+     }
+
+     if(nearest == -1){
+         return -1;
+     }
+
+     final int hit = tempStart + nearest;
+
+     // is a position in this reading frame already remembered?
+     boolean frameKnown = false;
+     int k = 0;
+     while(k < posAr && !frameKnown){
+         frameKnown = ((cluster.possibleStops_Reverse[k] - hit) % 3 == 0);
+         k++;
+     }
+
+     if(!frameKnown){
+         cluster.possibleStops_Reverse[posAr++] = hit;
+     }
+
+     // continue behind this codon; the result of the nested search is not needed here
+     findPossibleStops_Reverse_AlternativeStop(cluster,contigSeq,posAr,hit+3,alternativeStarts);
+
+     return hit;
+ }
+
+ /*
* test if there is one of the possible start-stop codon pairs which is in frame
* take the smallest interval possible
*/
diff --git a/src/geneFinder/GeneFinder.java b/src/geneFinder/GeneFinder.java
index 5dba17a..6a81249 100755
--- a/src/geneFinder/GeneFinder.java
+++ b/src/geneFinder/GeneFinder.java
@@ -30,6 +30,8 @@ public class GeneFinder {
public static Map<File,String> genomeFilesWithNames = new HashMap<File,String>();
public static Map<File,String> rnaFilesWithNames = new HashMap<File,String>();
+
+ public static Map<String,String[]> alternativeCodons = new HashMap<String,String[]>();
public static boolean useTopHat; // indicator for mapping tool
public static String settingMapper; // setting for the mapping tool, differs slightly depending on which tool was chosen
@@ -80,13 +82,7 @@ public class GeneFinder {
public static Object[] manager(String[] args){
- ReadInParameters_GeneFinder.readIn_GF(args);
-
- /*Gene gene = new Gene();
- gene.startPos = 0;
- String seq = readInFasta();
- Prokaryote_Specials.define_OrfsInOperon(seq,gene);
- System.exit(0);*/
+ ReadInParameters_GeneFinder.readIn_GF(args);
long timeBef = System.currentTimeMillis();
diff --git a/src/geneFinder/Giira.java b/src/geneFinder/Giira.java
index 4f264b5..f669535 100755
--- a/src/geneFinder/Giira.java
+++ b/src/geneFinder/Giira.java
@@ -43,7 +43,10 @@ public class Giira {
try {
String decodedPath = URLDecoder.decode(path, "UTF-8");
- String scriptPath = decodedPath.substring(0,decodedPath.length()-9);
+ String[] pathArr = decodedPath.split("/");
+ int lengthName = pathArr[pathArr.length-1].length();
+ String scriptPath = decodedPath.substring(0,decodedPath.length()-lengthName);
+ //String scriptPath = decodedPath.substring(0,decodedPath.length()-9);
//System.out.println("Path of Giira: " + decodedPath);
classPath = "";
diff --git a/src/geneFinder/ProkaryoteExtraction.java b/src/geneFinder/ProkaryoteExtraction.java
index 65c12a4..9e19847 100755
--- a/src/geneFinder/ProkaryoteExtraction.java
+++ b/src/geneFinder/ProkaryoteExtraction.java
@@ -56,6 +56,9 @@ public class ProkaryoteExtraction {
if(line.startsWith(">")){
// test if correct contig
if(line.substring(1).startsWith(contigName)){
+ if(!((line.substring(1).startsWith(contigName+" ")) || (line.substring(1).length() == contigName.length()))){
+ continue; // as an additional check to avoid picking the wrong contig because of name sub-similarities
+ }
// found right one, now extract sequence
while(((line = br.readLine()) != null) && (line.length() != 0) && (!(line.startsWith(">")))){
String line2 = "";
diff --git a/src/geneFinder/Prokaryote_Specials.java b/src/geneFinder/Prokaryote_Specials.java
index ed886ee..bbb3cfc 100755
--- a/src/geneFinder/Prokaryote_Specials.java
+++ b/src/geneFinder/Prokaryote_Specials.java
@@ -234,6 +234,12 @@ public class Prokaryote_Specials {
public static Vector<int[]> searchFO_orfs(String inputSeq){
+ if(!GeneFinder.alternativeCodons.isEmpty()){
+ if(GeneFinder.alternativeCodons.containsKey("START FO")){
+ return searchFO_orfs_alternativeCodons(inputSeq, GeneFinder.alternativeCodons.get("START FO"), GeneFinder.alternativeCodons.get("STOP FO"));
+ }
+ }
+
Vector<int[]> allORFs_FO = new Vector<int[]>();
int foundNewATG = 1;
@@ -315,12 +321,118 @@ public class Prokaryote_Specials {
}
/*
+ * ORF search in forward direction for user-supplied start and stop codons.
+ * Walks inputSeq from left to right: for every start codon found, the nearest in-frame stop
+ * codon behind it is looked up; pairs further apart than 30 positions are returned as
+ * {start, stop+2} unless checkIfORFcovered reports them as covered already.
+ */
+
+ public static Vector<int[]> searchFO_orfs_alternativeCodons(String inputSeq, String[] alternativeStarts_FO, String[] alternativeStops_FO){
+
+ Vector<int[]> allORFs_FO = new Vector<int[]>();
+
+ int foundNewATG = 1;
+ // position in inputSeq from which the next start codon is searched
+ int posLastATG = 0;
+
+ do{
+
+ int startPos = -1;
+
+ String startPart_alt = inputSeq.substring(posLastATG);
+
+ int startSub_alt[] = new int[alternativeStarts_FO.length];
+
+ for(int i = 0; i<alternativeStarts_FO.length;++i){
+ startSub_alt[i] = startPart_alt.indexOf(alternativeStarts_FO[i]);
+ }
+
+ // ascending sort, so the first entry > -1 is the nearest start codon (offset relative to posLastATG)
+ java.util.Arrays.sort(startSub_alt);
+
+ for(int i = 0; i < startSub_alt.length;++i){
+ if(startSub_alt[i] > -1){
+ startPos = startSub_alt[i];
+ break;
+ }
+ }
+
+ int stopPos = -1;
+
+ // position in inputSeq from which the next stop codon is searched
+ int posLastStart = -1;
+
+ if(startPos == -1){
+ // no further start codon in the remaining sequence, the search is finished
+ foundNewATG = 0;
+ break;
+ }else{
+ // convert the offset into an absolute position; both searches resume 3 positions behind the start
+ startPos = startPos + posLastATG;
+ posLastATG = startPos + 3;
+ posLastStart = startPos + 3;
+ }
+
+ int goOn = 0;
+
+ do{
+ goOn = 0;
+
+ String stopPart = inputSeq.substring(posLastStart);
+
+ int stopSub[] = new int[alternativeStops_FO.length];
+
+ for(int i = 0; i<alternativeStops_FO.length;++i){
+ stopSub[i] = stopPart.indexOf(alternativeStops_FO[i]);
+ }
+
+ // ascending sort, so the first entry > -1 is the nearest stop codon (offset relative to posLastStart)
+ java.util.Arrays.sort(stopSub);
+
+ for(int i = 0; i < stopSub.length;++i){
+ if(stopSub[i] > -1){
+ if(((((posLastStart + stopSub[i])-startPos) % 3) == 0)){
+ // stop codon is in frame with the start codon
+ stopPos = posLastStart + stopSub[i];
+ }else{
+ // out of frame: search again, beginning one position behind this stop codon
+ posLastStart = posLastStart + stopSub[i]+1;
+ goOn = 1;
+ }
+ break;
+ }
+ }
+
+ // if no stop codon was found at all, goOn stays 0 and this start codon is dropped
+ if(stopPos != -1){
+
+ if(stopPos-startPos > 30){
+ if(!checkIfORFcovered(allORFs_FO,new int[]{startPos,(stopPos+2)})){
+ allORFs_FO.add(new int[]{startPos,(stopPos+2)});
+ // cov is declared outside this hunk; every position of the accepted ORF is counted once
+ for(int i=startPos;i<=stopPos+2;++i){
+ cov[i]++;
+ }
+ }else{
+ alreadyCovered++;
+ }
+ }else{
+ // start and stop are at most 30 positions apart: candidate is only counted, not reported
+ notCounted++;
+ }
+
+ break;
+ }
+
+ }while(goOn == 1);
+
+
+ }while(foundNewATG == 1);
+
+
+ return allORFs_FO;
+ }
+
+ /*
* searches all ORFs assuming reverse direction
* note: no length limit is set, ORFs too short should be penalized in the BIC scoring
*/
public static Vector<int[]> searchRE_orfs(String inputSeq){
+ if(!GeneFinder.alternativeCodons.isEmpty()){
+ if(GeneFinder.alternativeCodons.containsKey("START RE")){
+ return searchRE_orfs_alternativeCodons(inputSeq, GeneFinder.alternativeCodons.get("STOP RE"), GeneFinder.alternativeCodons.get("START RE")); // are stored the other way around so start is stop and vice versa
+ }
+ }
+
Vector<int[]> allORFs_RE= new Vector<int[]>();
int foundNewCAT = 1;
@@ -402,6 +514,105 @@ public class Prokaryote_Specials {
}
/*
+ * ORF search in reverse direction for user-supplied codons.
+ * Walks inputSeq from right to left: for every codon of alternativeStarts_RE found, the nearest
+ * in-frame codon of alternativeStops_RE to its left is looked up; pairs further apart than 30
+ * positions are returned as {stop, start+2} unless checkIfORFcovered reports them as covered already.
+ */
+
+ public static Vector<int[]> searchRE_orfs_alternativeCodons(String inputSeq, String[] alternativeStarts_RE, String[] alternativeStops_RE){
+
+ Vector<int[]> allORFs_RE= new Vector<int[]>();
+
+ int foundNewCAT = 1;
+ // exclusive right border of the region in which the next start codon is searched
+ int posLastCAT = inputSeq.length();
+
+ do{
+ int startPos = -1;
+
+ String startPart_alt = inputSeq.substring(0,posLastCAT);
+
+ int startSub_alt[] = new int[alternativeStarts_RE.length];
+
+ for(int i = 0; i<alternativeStarts_RE.length;++i){
+ startSub_alt[i] = startPart_alt.lastIndexOf(alternativeStarts_RE[i]);
+ }
+
+ // ascending sort, scanned from the end: the first entry > -1 is the right-most start codon
+ java.util.Arrays.sort(startSub_alt);
+
+ for(int i = startSub_alt.length -1; i>= 0;i--){
+ if(startSub_alt[i] > -1){
+ startPos = startSub_alt[i];
+ break;
+ }
+ }
+
+ int stopPos = -1;
+
+ // exclusive right border of the region in which the next stop codon is searched
+ int posLastStop = -1;
+
+ if(startPos == -1){
+ // no further start codon in the remaining sequence, the search is finished
+ foundNewCAT = 0;
+ break;
+ }else{
+ // positions are absolute here (substring starts at 0); both searches resume left of the start
+ posLastCAT = startPos;
+ posLastStop = startPos;
+ }
+
+ int goOn = 0;
+
+ do{
+ goOn = 0;
+
+ String stopPart = inputSeq.substring(0,posLastStop);
+
+ int stopSub[] = new int[alternativeStops_RE.length];
+
+ for(int i = 0; i<alternativeStops_RE.length;++i){
+ stopSub[i] = stopPart.lastIndexOf(alternativeStops_RE[i]);
+ }
+
+ // ascending sort, scanned from the end: the first entry > -1 is the right-most stop codon
+ java.util.Arrays.sort(stopSub);
+
+ for(int i = stopSub.length -1; i>= 0;i--){
+ if(stopSub[i] > -1){
+ if(((startPos-stopSub[i]) % 3) == 0){
+ // stop codon is in frame with the start codon
+ stopPos = stopSub[i];
+ }else{
+ // out of frame: cut the region at stop+2 so that, for 3-letter codons, this occurrence is not found again
+ posLastStop = stopSub[i]+2;
+ goOn = 1;
+ }
+ break;
+ }
+ }
+
+ // if no stop codon was found at all, goOn stays 0 and this start codon is dropped
+ if(stopPos != -1){
+
+ if(startPos-stopPos > 30){
+ if(!checkIfORFcovered(allORFs_RE,new int[]{stopPos,(startPos+2)})){
+ allORFs_RE.add(new int[]{stopPos,(startPos+2)});
+ // cov is declared outside this hunk; every position of the accepted ORF is counted once
+ for(int i=stopPos;i<=startPos+2;++i){
+ cov[i]++;
+ }
+ }else{
+ alreadyCovered++;
+ }
+ }else{
+ // start and stop are at most 30 positions apart: candidate is only counted, not reported
+ notCounted++;
+ }
+
+
+ break;
+ }
+
+ }while(goOn == 1);
+
+
+ }while(foundNewCAT == 1);
+
+
+ return allORFs_RE;
+ }
+
+ /*
* filter out all orfs that are completely included in bigger ones
*/
diff --git a/src/geneFinder/ReadInParameters_GeneFinder.java b/src/geneFinder/ReadInParameters_GeneFinder.java
index 9313070..3d0fd75 100755
--- a/src/geneFinder/ReadInParameters_GeneFinder.java
+++ b/src/geneFinder/ReadInParameters_GeneFinder.java
@@ -58,6 +58,7 @@ public class ReadInParameters_GeneFinder {
boolean foundProkaryote = false;
boolean foundSequential = false;
boolean foundInprogea = false;
+ boolean foundAlternativeCodons = false;
if(!parameter.isEmpty() && args.length > 0){
@@ -230,6 +231,12 @@ public class ReadInParameters_GeneFinder {
inputText += "minimal interval length: " + GeneFinder.interval + "\n";
}
+ } else if(arg.equals("-altCodon")){ // alternative start and stop codons
+ String pathToAlternative = args[i+1];
+ readInAlternativeStartsStops(pathToAlternative);
+ foundAlternativeCodons = true;
+ inputText += "Alternative Starts and stops provided \n";
+
} else if(arg.equals("-noAmbiOpti")){ // turn on or off the optimization of ambiguous reads
foundAmbiOpti = true;
GeneFinder.noAmbiOpti = true;
@@ -321,6 +328,21 @@ public class ReadInParameters_GeneFinder {
if(!havePathOut){
GeneFinder.pathOut = "";
+ }else{
+ // check if directory exists, if not, create it
+ File f = new File(GeneFinder.pathOut);
+ if(!f.exists()){
+ Runtime rtAlign = Runtime.getRuntime();
+ try {
+ String exe = "mkdir " + GeneFinder.pathOut;
+ Process pc = rtAlign.exec(exe);
+ pc.waitFor();
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
}
if(!foundGenome){
System.out.println("No genome file specified. Use \"-h\" to print usage options. ");
@@ -410,6 +432,7 @@ public class ReadInParameters_GeneFinder {
GeneFinder.inprogeaCall = false;
}
+
GeneFinder.logFile = new File(GeneFinder.pathOut+"log_it" + GeneFinder.iteration + ".txt");
if(!GeneFinder.secondPart){
System.out.println(inputText);
@@ -438,6 +461,66 @@ public class ReadInParameters_GeneFinder {
}
/*
+ * Reads the alternative start and stop codons from the given input file and stores them in
+ * GeneFinder.alternativeCodons.
+ * Expected format: one line per codon type, beginning with "START FO", "START RE", "STOP FO"
+ * or "STOP RE", followed by the codons, all fields tab separated. Other lines are ignored.
+ * For the reverse direction start and stop are switched on purpose: a "START RE" line is
+ * stored under the key "STOP RE" and vice versa.
+ * If the file cannot be read, the stack trace is printed and whatever was parsed up to that
+ * point (possibly nothing) is stored.
+ */
+
+ public static void readInAlternativeStartsStops(String altFile) {
+
+     Map<String,String[]> altCodons = new HashMap<String,String[]>();
+
+     BufferedReader br = null;
+
+     try {
+
+         br = new BufferedReader(new FileReader(altFile));
+
+         String line = "";
+
+         while((line = br.readLine()) != null){
+
+             // the four prefixes are mutually exclusive, so at most one key applies per line
+             String key = null;
+
+             if(line.startsWith("START FO")){
+                 key = "START FO";
+             }else if(line.startsWith("START RE")){
+                 key = "STOP RE"; // for GIIRA start and stops are switched for the reverse direction
+             }else if(line.startsWith("STOP FO")){
+                 key = "STOP FO";
+             }else if(line.startsWith("STOP RE")){
+                 key = "START RE"; // for GIIRA start and stops are switched for the reverse direction
+             }
+
+             if(key == null){
+                 // not a codon line; skipping it before splitting also avoids a negative array
+                 // size for lines consisting only of tabs (split() then returns an empty array)
+                 continue;
+             }
+
+             String[] lineArr = line.split("\t");
+
+             // everything behind the first field is a codon
+             String[] codons = new String[lineArr.length-1];
+             System.arraycopy(lineArr, 1, codons, 0, codons.length);
+
+             altCodons.put(key,codons);
+         }
+
+     } catch (FileNotFoundException e) {
+         e.printStackTrace();
+     } catch (IOException e) {
+         e.printStackTrace();
+     } finally {
+         // always release the file handle, also when reading failed
+         if(br != null){
+             try {
+                 br.close();
+             } catch (IOException e) {
+                 e.printStackTrace();
+             }
+         }
+     }
+
+     GeneFinder.alternativeCodons = altCodons;
+ }
+
+ /*
* print the help text to screen
*/
@@ -457,7 +540,7 @@ public class ReadInParameters_GeneFinder {
" \n -iG [pathToGenomes] : specify path to directory with genome files in fasta format \n" +
" \n -iR [pathToRna] : specify path to directory with rna read files in fastq format \n" +
" \n -scripts [absolutePath] : specify the absolute path to the directory containing the required helper scripts, DEFAULT: directory of GIIRA.jar \n" +
- " \n -out [pathToResults] : specify the directory that shall contain the results files \n" +
+ " \n -out [pathToResults] : specify the absolute pyth to the directory that shall contain the results files \n" +
" \n -outName [outputName] : specify desired name for output files, DEFAULT: genes \n" +
" \n -haveSam [samfileName]: if a sam file already exists, provide the name, else a mapping is performed. NOTE: the sam file has to be sorted according to read names! \n" +
" \n -nT [numberThreads] : specify the maximal number of threads that are allowed to be used, DEFAULT: 1 \n" +
@@ -468,7 +551,7 @@ public class ReadInParameters_GeneFinder {
//" \n -splitRunAndOpti [y/n] : indicates if the optimization and giira shall be run separately, to reduce the memory consumption (y), DEFAULT: n" +
" \n -mem [int] : specify the amount of memory that cplex is allowed to use \n" +
" \n -maxReportedHits [int] : if using BWA as mapping tool, specify the maximal number of reported hits, DEFAULT: 2 \n" +
- " \n -prokaryote : if specified, genome is treated as prokaryotic, no spliced reads are accepted, and structural genes are resolved. DEFAULT: n \n" +
+ " \n -prokaryote : if specified, genome is treated as prokaryotic, no spliced reads are accepted, and structural genes are resolved. DEFAULT: False \n" +
" \n -minCov [double] : specify the minimum required coverage of the gene candidate extraction, DEFAULT: -1 (is estimated from mapping) \n" +
" \n -maxCov [double] : optional maximal coverage threshold, can also be estimated from mapping (DEFAULT) \n" +
" \n -endCov [double] : if the coverage falls below this value, the currently open candidate gene is closed. This value can be estimated from the minimum coverage (-1); DEFAULT: -1 \n" +
@@ -476,6 +559,7 @@ public class ReadInParameters_GeneFinder {
" \n -interval [int] : specify the minimal size of an interval between near candidate genes, if \"-1\" it equals the read length. DEFAULT: -1 \n " +
" \n -splLim [double] : specify the minimal coverage that is required to accept a splice site, if (-1) the threshold is equal to minCov, DEFAULT: -1 \n" +
" \n -rL [int] : specify read length, otherwise this information is extracted from SAM file (DEFAULT) \n" +
+ " \n -altCodon [pathToAlternativeCodons] : specify path to txt file with alternative start and stop codons, see example file in scripts folder \n" +
" \n -samForSequential [pathToSamFile] : if it is desired to analyse chromosomes in a sequential manner, provide a chromosome sorted sam file in addition to the one sorted by read names, DEFAULT: noSequential \n" +
" \n -noAmbiOpti : if specified, ambiguous hits are not included in the analysis \n" +
" \n -settingMapper [(list of parameters)] : A comma-separated list of the desired parameters for TopHat or BWA. Please provide \n" +
diff --git a/src/geneFinder/SamParser.java b/src/geneFinder/SamParser.java
index 291befe..76377be 100755
--- a/src/geneFinder/SamParser.java
+++ b/src/geneFinder/SamParser.java
@@ -138,7 +138,21 @@ public class SamParser {
totalHitCount++;
- if(!parts[0].equals(currentReadID)){ // now we have proceeded to a new read
+ String adaptedName = "";
+
+ if(parts[0].contains(":")){
+ String[] nameParts = parts[0].split(":");
+ for(int i=0;i<nameParts.length;++i){
+ adaptedName += nameParts[i] + ";;;"; // necessary to avoid cplex or glpk errors
+ }
+
+ adaptedName = adaptedName.substring(0,(adaptedName.length()-3));
+ }else{
+ adaptedName = parts[0];
+ }
+
+
+ if(!adaptedName.equals(currentReadID)){ // now we have proceeded to a new read
if(GeneFinder.iteration == 2 && currentRead != null && currentRead.isMulti == 1){
@@ -147,7 +161,20 @@ public class SamParser {
do{
String[] partsReaSam = lineReaSam.split(" ");
- if(currentRead.rnaID.equals(partsReaSam[0])){
+ String adaptedNameReaSam = "";
+
+ if(partsReaSam[0].contains(":")){
+ String[] nameParts = partsReaSam[0].split(":");
+ for(int i=0;i<nameParts.length;++i){
+ adaptedNameReaSam += nameParts[i] + ";;;"; // necessary to avoid cplex or glpk errors
+ }
+
+ adaptedNameReaSam = adaptedNameReaSam.substring(0,(adaptedNameReaSam.length()-3));
+ }else{
+ adaptedNameReaSam = partsReaSam[0];
+ }
+
+ if(currentRead.rnaID.equals(adaptedNameReaSam)){
allReassigned.put(Integer.parseInt(partsReaSam[3]),partsReaSam[2]);
}else{
break;
@@ -165,12 +192,12 @@ public class SamParser {
}
- currentReadID = parts[0];
+ currentReadID = adaptedName;
// set up new rna node
Rna newRna = new Rna();
- newRna.rnaID = parts[0];
+ newRna.rnaID = adaptedName;
newRna.isMulti = 0;
newRna.hitNum = 1;
newRna.assignedNum = 0;
@@ -479,9 +506,22 @@ public class SamParser {
do{
String[] partsReaSam = lineReaSam.split(" ");
- if(partsReaSam[0].compareTo(currentRead.rnaID) > 0){
+ String adaptedNameReaSam = "";
+
+ if(partsReaSam[0].contains(":")){
+ String[] nameParts = partsReaSam[0].split(":");
+ for(int i=0;i<nameParts.length;++i){
+ adaptedNameReaSam += nameParts[i] + ";;;"; // necessary to avoid cplex or glpk errors
+ }
+
+ adaptedNameReaSam = adaptedNameReaSam.substring(0,(adaptedNameReaSam.length()-3));
+ }else{
+ adaptedNameReaSam = partsReaSam[0];
+ }
+
+ if(adaptedNameReaSam.compareTo(currentRead.rnaID) > 0){
break; // we exceeded this read, so stop
- }else if(currentRead.rnaID.equals(partsReaSam[0])){
+ }else if(currentRead.rnaID.equals(adaptedNameReaSam)){
allReassigned.put(Integer.parseInt(partsReaSam[3]),partsReaSam[2]);
}
}while((lineReaSam = br.readLine()) != null);
@@ -953,9 +993,23 @@ public class SamParser {
Rna read;
- if(seenReads.keySet().contains(parts[0])){
+ String adaptedName = "";
+
+ if(parts[0].contains(":")){
+ String[] nameParts = parts[0].split(":");
+ for(int i=0;i<nameParts.length;++i){
+ adaptedName += nameParts[i] + ";;;"; // necessary to avoid cplex or glpk errors
+ }
+
+ adaptedName = adaptedName.substring(0,(adaptedName.length()-3));
+ }else{
+ adaptedName = parts[0];
+ }
+
+
+ if(seenReads.keySet().contains(adaptedName)){
- Vector<Object> temp = seenReads.get(parts[0]);
+ Vector<Object> temp = seenReads.get(adaptedName);
if(((Integer)temp.get(0)) != 0){
@@ -973,7 +1027,7 @@ public class SamParser {
temp.clear();
temp.add(0);
- seenReads.put(parts[0],temp);
+ seenReads.put(adaptedName,temp);
if(totalHitCount % 100000 == 0){
@@ -1005,7 +1059,7 @@ public class SamParser {
interChromoTotalCount++;
Vector<Object> temp = new Vector<Object>();
temp.add(0);
- seenReads.put(parts[0],temp);
+ seenReads.put(adaptedName,temp);
interChromoTotalCount++;
break;
}
@@ -1049,7 +1103,7 @@ public class SamParser {
Vector<Object> temp = new Vector<Object>();
temp.add(1);
temp.add(read);
- seenReads.put(parts[0],temp);
+ seenReads.put(adaptedName,temp);
}
}
diff --git a/src/types/Rna.java b/src/types/Rna.java
index 590f31d..8e8e123 100755
--- a/src/types/Rna.java
+++ b/src/types/Rna.java
@@ -17,7 +17,7 @@ public class Rna {
public double quality;
- public Vector<Object[]> contigsMappedOn = new Vector<Object[]>(); // contains several Arrays á: [contig, alignPos, cigarString, mapQual,spliceInfo,mismatchInfo,direcInfo] (one for each hit)
+ public Vector<Object[]> contigsMappedOn = new Vector<Object[]>(); // contains several Arrays ala: [contig, alignPos, cigarString, mapQual,spliceInfo,mismatchInfo,direcInfo] (one for each hit)
public int isMulti; // indicator if this read is an ambiguous read
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/giira.git
More information about the debian-med-commit
mailing list