[med-svn] [giira] 03/09: New upstream version 0.0.20140625

Andreas Tille tille at debian.org
Mon Jan 9 11:39:59 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository giira.

commit 54ea89068c995776063bbfac8dd31e9fe157d1ea
Author: Andreas Tille <tille at debian.org>
Date:   Mon Jan 9 11:55:59 2017 +0100

    New upstream version 0.0.20140625
---
 src/geneFinder/ExtractGeneCandidates.java       |   3 +
 src/geneFinder/FrameSearch.java                 | 236 ++++++++++++++++++++++++
 src/geneFinder/GeneFinder.java                  |  10 +-
 src/geneFinder/Giira.java                       |   5 +-
 src/geneFinder/ProkaryoteExtraction.java        |   3 +
 src/geneFinder/Prokaryote_Specials.java         | 211 +++++++++++++++++++++
 src/geneFinder/ReadInParameters_GeneFinder.java |  88 ++++++++-
 src/geneFinder/SamParser.java                   |  76 ++++++--
 src/types/Rna.java                              |   2 +-
 9 files changed, 612 insertions(+), 22 deletions(-)

diff --git a/src/geneFinder/ExtractGeneCandidates.java b/src/geneFinder/ExtractGeneCandidates.java
index a663c7c..3bc9bb2 100755
--- a/src/geneFinder/ExtractGeneCandidates.java
+++ b/src/geneFinder/ExtractGeneCandidates.java
@@ -57,6 +57,9 @@ public class ExtractGeneCandidates {
 					if(line.startsWith(">")){
 						// test if correct contig
 						if(line.substring(1).startsWith(contigName)){
+							if(!((line.substring(1).startsWith(contigName+" ")) || (line.substring(1).length() == contigName.length()))){
+								continue;			// as an additional check to avoid picking the wrong contig because of name sub-similarities 											
+							}
 							// found right one, now extract sequence
 							while(((line = br.readLine()) != null) && (line.length() != 0) &&  (!(line.startsWith(">")))){
 								String line2 = "";
diff --git a/src/geneFinder/FrameSearch.java b/src/geneFinder/FrameSearch.java
index d469eaa..76fdd1f 100755
--- a/src/geneFinder/FrameSearch.java
+++ b/src/geneFinder/FrameSearch.java
@@ -26,6 +26,12 @@ public class FrameSearch {
 	
 	public static int findPossibleStarts_Forward(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop){
 		
+		if(!GeneFinder.alternativeCodons.isEmpty()){
+			if(GeneFinder.alternativeCodons.containsKey("START FO")){
+				return FrameSearch.findPossibleStarts_Forward_AlternativeStarts(cluster, contigSeq, posAr, tempStop, GeneFinder.alternativeCodons.get("START FO"));
+			}
+		}
+		
 		if(tempStop < (int)Math.max(0,(cluster.startPos-GeneFinder.readLength))){
 			return -1;
 		}
@@ -69,6 +75,12 @@ public class FrameSearch {
 	
 	public static int findPossibleStarts_Reverse(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop){
 		
+		if(!GeneFinder.alternativeCodons.isEmpty()){
+			if(GeneFinder.alternativeCodons.containsKey("START RE")){
+				return FrameSearch.findPossibleStarts_Reverse_AlternativeStarts(cluster, contigSeq, posAr, tempStop, GeneFinder.alternativeCodons.get("START RE"));
+			}
+		}
+		
 		int start_RE = -1;
 		
 		if(tempStop < (int)Math.max(0,(cluster.startPos-GeneFinder.readLength))){
@@ -113,6 +125,112 @@ public class FrameSearch {
 	}
 	
 	/*
+	 * if alternative start and stop codons are given, perform a more general search also respecting those codons
+	 * 
+	 */
+	
+	public static int findPossibleStarts_Forward_AlternativeStarts(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop,String[] alternativeStarts){
+		// Searches the window [max(0, cluster.startPos - readLength), tempStop) of contigSeq for the rightmost occurrence of any codon in alternativeStarts, stores it in cluster.possibleStarts_Forward if no stored start shares its frame, and recurses on the part of the window left of the hit. Returns the contig position of this call's hit, or -1 if there is none.
+		int start1 = -1;
+		// nothing to search once tempStop lies left of the window's lower bound
+		if(tempStop < (int)Math.max(0,(cluster.startPos-GeneFinder.readLength))){
+			return -1;
+		}
+		
+		String startPart = contigSeq.substring((int)Math.max(0,(cluster.startPos-GeneFinder.readLength)), tempStop);
+		// one slot per codon: window-relative offset of its last occurrence, or -1 if absent
+		int startSub_alt[] = new int[alternativeStarts.length];
+		
+		for(int i = 0; i<alternativeStarts.length;++i){
+			startSub_alt[i] = startPart.lastIndexOf(alternativeStarts[i]);
+		}
+		// ascending sort, so walking from the end meets the rightmost hit first
+		java.util.Arrays.sort(startSub_alt);
+		
+		for(int i = startSub_alt.length -1; i>= 0;i--){
+			if(startSub_alt[i] > -1){ 
+				start1 = (int) Math.max(0,(cluster.startPos-GeneFinder.readLength)) + startSub_alt[i]; 
+				break;
+			}
+		}
+		// no codon of the list occurs in the window
+		if(start1 == -1){
+			return start1;
+		}
+		// two positions share a frame when their distance is a multiple of 3; only the first posAr entries are compared
+		boolean foundSameFrame = false;
+		
+		for(int i = 0; i<posAr;++i){
+			if((cluster.possibleStarts_Forward[i] - start1) % 3 == 0){
+				foundSameFrame = true;
+				break;
+			}
+		}
+		// both branches issue the same recursive call (window now ends at start1); the only difference is that a new frame is stored first. NOTE(review): the store has no bounds check - the size of possibleStarts_Forward is not visible here, confirm it can hold every distinct frame.
+		if(!foundSameFrame){
+			cluster.possibleStarts_Forward[posAr++] = start1;
+			findPossibleStarts_Forward_AlternativeStarts(cluster,contigSeq,posAr,start1,alternativeStarts);
+		}else{
+			findPossibleStarts_Forward_AlternativeStarts(cluster,contigSeq,posAr,start1,alternativeStarts);
+		}
+		// the recursive results are discarded; the caller receives the hit of this invocation
+		return start1;
+	}
+
+	/*
+	 * if alternative start and stop codons are given, perform a more general search also respecting those codons
+	 * 
+	 */
+	
+	public static int findPossibleStarts_Reverse_AlternativeStarts(Gene cluster, StringBuffer contigSeq, int posAr, int tempStop,String[] alternativeStops){
+	// Same search as the forward variant, but hits go into cluster.possibleStarts_Reverse. NOTE(review): the parameter is named alternativeStops, yet findPossibleStarts_Reverse passes the map entry stored under "START RE" - confirm the naming is intended.
+	int start_RE = -1;
+	// nothing to search once tempStop lies left of the window's lower bound
+	if(tempStop < (int)Math.max(0,(cluster.startPos-GeneFinder.readLength))){
+		return -1;
+	}
+	
+	String startPart = contigSeq.substring((int)Math.max(0,(cluster.startPos-GeneFinder.readLength)), tempStop);
+	// one slot per codon: window-relative offset of its last occurrence, or -1 if absent
+	int startSub_alt[] = new int[alternativeStops.length];
+	
+	for(int i = 0; i<alternativeStops.length;++i){
+		startSub_alt[i] = startPart.lastIndexOf(alternativeStops[i]);
+	}
+	// ascending sort, so walking from the end meets the rightmost hit first
+	java.util.Arrays.sort(startSub_alt);
+	
+	for(int i = startSub_alt.length -1; i>= 0;i--){
+		if(startSub_alt[i] > -1){ 
+			start_RE = (int) Math.max(0,(cluster.startPos-GeneFinder.readLength)) + startSub_alt[i]; 
+			break;
+		}
+	}
+	// no codon of the list occurs in the window
+	if(start_RE == -1){
+		return start_RE;
+	}
+	// two positions share a frame when their distance is a multiple of 3; only the first posAr entries are compared
+	boolean foundSameFrame = false;
+	
+	for(int i = 0; i<posAr;++i){
+		if((cluster.possibleStarts_Reverse[i] - start_RE) % 3 == 0){
+			foundSameFrame = true;
+			break;
+		}
+	}
+	// both branches recurse with the window shortened to end at start_RE; a hit in a new frame is stored first
+	if(!foundSameFrame){
+		cluster.possibleStarts_Reverse[posAr++] = start_RE;
+		findPossibleStarts_Reverse_AlternativeStarts(cluster,contigSeq,posAr,start_RE,alternativeStops);
+	}else{
+		findPossibleStarts_Reverse_AlternativeStarts(cluster,contigSeq,posAr,start_RE,alternativeStops);
+	}
+	// the recursive results are discarded; the caller receives the hit of this invocation
+	return start_RE;
+}
+	
+	/*
 	 * new way of gene extraction by remembering all starts that are not in the same frame (maxNum = 3)
 	 * after that, starts and stops are checked if we find a combination that defines the frame of the cluster
 	 * 
@@ -122,6 +240,12 @@ public class FrameSearch {
 	
 	public static int findPossibleStops_Forward(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart){
 		
+		if(!GeneFinder.alternativeCodons.isEmpty()){
+			if(GeneFinder.alternativeCodons.containsKey("STOP FO")){
+				return FrameSearch.findPossibleStops_Forward_AlternativeStops(cluster, contigSeq, posAr, tempStart, GeneFinder.alternativeCodons.get("STOP FO"));
+			}
+		}
+		
 		int stop_FO = -1;
 		
 		if(tempStart > (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)){
@@ -174,6 +298,12 @@ public class FrameSearch {
 	
 	public static int findPossibleStops_Reverse(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart){
 		
+		if(!GeneFinder.alternativeCodons.isEmpty()){
+			if(GeneFinder.alternativeCodons.containsKey("STOP RE")){
+				return FrameSearch.findPossibleStops_Reverse_AlternativeStop(cluster, contigSeq, posAr, tempStart, GeneFinder.alternativeCodons.get("STOP RE"));
+			}
+		}
+		
 		if(tempStart > (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)){
 			return -1;
 		}
@@ -206,6 +336,112 @@ public class FrameSearch {
 	}
 	
 	/*
+	 * if alternative start and stop codons are given, perform a more general search also respecting those codons
+	 * 
+	 */
+	
+	public static int findPossibleStops_Forward_AlternativeStops(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart, String[] alternativeStops){
+		// Searches the window [tempStart, min(contigSeq.length(), cluster.stopPos-2 + readLength + 1)) for the leftmost occurrence of any codon in alternativeStops, stores it in cluster.possibleStops_Forward if no stored stop shares its frame, and recurses on the window starting 3 bases after the hit. Returns the contig position of this call's hit, or -1 if there is none.
+		int stop_FO = -1;
+		// nothing to search once tempStart lies right of the window's upper bound
+		if(tempStart > (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)){
+			return -1;
+		}
+		
+		String stopPart = contigSeq.substring(tempStart, (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1));
+		// one slot per codon: window-relative offset of its first occurrence, or -1 if absent
+		int stopSub_alt[] = new int[alternativeStops.length];
+		
+		for(int i = 0; i<alternativeStops.length;++i){
+			stopSub_alt[i] = stopPart.indexOf(alternativeStops[i]);
+		}
+		// ascending sort puts the -1 entries first; the first non-negative entry is the leftmost hit
+		java.util.Arrays.sort(stopSub_alt);
+		
+		for(int i = 0; i < stopSub_alt.length;++i){
+			if(stopSub_alt[i] > -1){ 
+				stop_FO = tempStart + stopSub_alt[i]; 
+				break;
+			}
+		}
+		// no codon of the list occurs in the window
+		if(stop_FO == -1){
+			return stop_FO;
+		}
+		// two positions share a frame when their distance is a multiple of 3; only the first posAr entries are compared
+		boolean foundSameFrame = false;
+		
+		for(int i = 0; i<posAr;++i){
+			if((cluster.possibleStops_Forward[i] - stop_FO) % 3 == 0){
+				foundSameFrame = true;
+				break;
+			}
+		}
+		// both branches recurse with the window starting at stop_FO+3; a hit in a new frame is stored first
+		if(!foundSameFrame){
+			cluster.possibleStops_Forward[posAr++] = stop_FO;
+			findPossibleStops_Forward_AlternativeStops(cluster,contigSeq,posAr,stop_FO+3,alternativeStops);
+		}else{
+			findPossibleStops_Forward_AlternativeStops(cluster,contigSeq,posAr,stop_FO+3,alternativeStops);
+		}
+		// the recursive results are discarded; the caller receives the hit of this invocation
+		return stop_FO;
+	}
+
+	/*
+	 * if alternative start and stop codons are given, perform a more general search also respecting those codons
+	 * 
+	 */
+	
+	public static int findPossibleStops_Reverse_AlternativeStop(Gene cluster, StringBuffer contigSeq, int posAr, int tempStart, String[] alternativeStarts){
+		// Same search as the forward stop variant, but hits go into cluster.possibleStops_Reverse. NOTE(review): the parameter is named alternativeStarts, yet findPossibleStops_Reverse passes the map entry stored under "STOP RE" - confirm the naming is intended.
+		int start1 = -1;
+		// nothing to search once tempStart lies right of the window's upper bound
+		if(tempStart > (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1)){
+			return -1;
+		}
+		
+		String stopPart = contigSeq.substring(tempStart, (int) Math.min(contigSeq.length(),cluster.stopPos-2 + GeneFinder.readLength + 1));
+		// one slot per codon: window-relative offset of its first occurrence, or -1 if absent
+		int stopSub_alt[] = new int[alternativeStarts.length];
+		
+		for(int i = 0; i<alternativeStarts.length;++i){
+			stopSub_alt[i] = stopPart.indexOf(alternativeStarts[i]);
+		}
+		// ascending sort puts the -1 entries first; the first non-negative entry is the leftmost hit
+		java.util.Arrays.sort(stopSub_alt);
+		
+		for(int i = 0; i < stopSub_alt.length;++i){
+			if(stopSub_alt[i] > -1){ 
+				start1 = tempStart + stopSub_alt[i]; 
+				break;
+			}
+		}
+		// no codon of the list occurs in the window
+		if(start1 == -1){
+			return start1;
+		}
+		// two positions share a frame when their distance is a multiple of 3; only the first posAr entries are compared
+		boolean foundSameFrame = false;
+		
+		for(int i = 0; i<posAr;++i){
+			if((cluster.possibleStops_Reverse[i] - start1) % 3 == 0){
+				foundSameFrame = true;
+				break;
+			}
+		}
+		// both branches recurse with the window starting at start1+3; a hit in a new frame is stored first
+		if(!foundSameFrame){
+			cluster.possibleStops_Reverse[posAr++] = start1;
+			findPossibleStops_Reverse_AlternativeStop(cluster,contigSeq,posAr,start1+3,alternativeStarts);
+		}else{
+			findPossibleStops_Reverse_AlternativeStop(cluster,contigSeq,posAr,start1+3,alternativeStarts);
+		}
+		// the recursive results are discarded; the caller receives the hit of this invocation
+		return start1;
+	}
+
+	/*
 	 * test if there is one of the possible start-stop codon pairs which is in frame
 	 * take the smallest interval possible
 	 */
diff --git a/src/geneFinder/GeneFinder.java b/src/geneFinder/GeneFinder.java
index 5dba17a..6a81249 100755
--- a/src/geneFinder/GeneFinder.java
+++ b/src/geneFinder/GeneFinder.java
@@ -30,6 +30,8 @@ public class GeneFinder {
 	
 	public static Map<File,String> genomeFilesWithNames = new HashMap<File,String>();
 	public static Map<File,String> rnaFilesWithNames = new HashMap<File,String>();
+	
+	public static Map<String,String[]> alternativeCodons = new HashMap<String,String[]>();
 
 	public static boolean useTopHat;			// indicator for mapping tool
 	public static String settingMapper;			// setting for the mapping tool, differs slightly depending on which tool was chosen
@@ -80,13 +82,7 @@ public class GeneFinder {
 	
 	public static Object[] manager(String[] args){
 		
-	ReadInParameters_GeneFinder.readIn_GF(args);
-		
-		/*Gene gene = new Gene();
-		gene.startPos = 0;
-		String seq = readInFasta();
-		Prokaryote_Specials.define_OrfsInOperon(seq,gene);
-		System.exit(0);*/
+	    ReadInParameters_GeneFinder.readIn_GF(args);
 		
 		long timeBef = System.currentTimeMillis();
 
diff --git a/src/geneFinder/Giira.java b/src/geneFinder/Giira.java
index 4f264b5..f669535 100755
--- a/src/geneFinder/Giira.java
+++ b/src/geneFinder/Giira.java
@@ -43,7 +43,10 @@ public class Giira {
 		
 		try {
 			String decodedPath = URLDecoder.decode(path, "UTF-8");
-			String scriptPath = decodedPath.substring(0,decodedPath.length()-9);
+			String[] pathArr = decodedPath.split("/");
+			int lengthName = pathArr[pathArr.length-1].length();
+			String scriptPath = decodedPath.substring(0,decodedPath.length()-lengthName);
+			//String scriptPath = decodedPath.substring(0,decodedPath.length()-9);
 			//System.out.println("Path of Giira: " + decodedPath);
 			
 			classPath = "";
diff --git a/src/geneFinder/ProkaryoteExtraction.java b/src/geneFinder/ProkaryoteExtraction.java
index 65c12a4..9e19847 100755
--- a/src/geneFinder/ProkaryoteExtraction.java
+++ b/src/geneFinder/ProkaryoteExtraction.java
@@ -56,6 +56,9 @@ public class ProkaryoteExtraction {
 					if(line.startsWith(">")){
 						// test if correct contig
 						if(line.substring(1).startsWith(contigName)){
+							if(!((line.substring(1).startsWith(contigName+" ")) || (line.substring(1).length() == contigName.length()))){
+								continue;			// as an additional check to avoid picking the wrong contig because of name sub-similarities 											
+							}
 							// found right one, now extract sequence
 							while(((line = br.readLine()) != null) && (line.length() != 0) &&  (!(line.startsWith(">")))){
 								String line2 = "";
diff --git a/src/geneFinder/Prokaryote_Specials.java b/src/geneFinder/Prokaryote_Specials.java
index ed886ee..bbb3cfc 100755
--- a/src/geneFinder/Prokaryote_Specials.java
+++ b/src/geneFinder/Prokaryote_Specials.java
@@ -234,6 +234,12 @@ public class Prokaryote_Specials {
 	
 	public static Vector<int[]> searchFO_orfs(String inputSeq){
 		
+		if(!GeneFinder.alternativeCodons.isEmpty()){
+			if(GeneFinder.alternativeCodons.containsKey("START FO")){
+				return searchFO_orfs_alternativeCodons(inputSeq, GeneFinder.alternativeCodons.get("START FO"), GeneFinder.alternativeCodons.get("STOP FO"));
+			}
+		}
+		
 		Vector<int[]> allORFs_FO = new Vector<int[]>();
 		
 		int foundNewATG = 1;
@@ -315,12 +321,118 @@ public class Prokaryote_Specials {
 	}
 	
 	/*
+	 * if alternative start and stop codons are specified, respect this in a more general orf search
+	 * 
+	 */
+	
+	public static Vector<int[]> searchFO_orfs_alternativeCodons(String inputSeq, String[] alternativeStarts_FO, String[] alternativeStops_FO){
+		// Scans inputSeq left to right: for every start codon from alternativeStarts_FO it looks for the nearest in-frame stop codon from alternativeStops_FO and collects the ORF as {startPos, stopPos+2}, where stopPos is the index of the stop codon's first base. NOTE(review): searchFO_orfs only checks that "START FO" exists before passing get("STOP FO"), so alternativeStops_FO may be null here - confirm.
+		Vector<int[]> allORFs_FO = new Vector<int[]>();
+		
+		int foundNewATG = 1;
+		int posLastATG = 0;
+		// outer loop: one iteration per start codon; ends when no start codon is left at or after posLastATG
+		do{
+			
+			int startPos = -1;
+			
+			String startPart_alt = inputSeq.substring(posLastATG);
+			// one slot per start codon: offset of its first occurrence in the remaining sequence, or -1 if absent
+			int startSub_alt[] = new int[alternativeStarts_FO.length];
+			
+			for(int i = 0; i<alternativeStarts_FO.length;++i){
+				startSub_alt[i] = startPart_alt.indexOf(alternativeStarts_FO[i]);
+			}
+			// ascending sort puts the -1 entries first; the first non-negative entry is the nearest start codon
+			java.util.Arrays.sort(startSub_alt);
+			
+			for(int i = 0; i < startSub_alt.length;++i){
+				if(startSub_alt[i] > -1){ 
+					startPos = startSub_alt[i]; 
+					break;
+				}
+			}
+					
+			int stopPos = -1;
+			
+			int posLastStart = -1;
+			// convert the relative hit to an absolute index; the next start search and the stop search both begin 3 bases later
+			if(startPos == -1){
+				foundNewATG = 0;
+				break;
+			}else{
+				startPos = startPos  + posLastATG;
+				posLastATG = startPos + 3;
+				posLastStart = startPos + 3;
+			}
+
+			int goOn = 0;
+			// inner loop: repeat the stop search while the nearest stop codon is out of frame
+			do{
+				goOn = 0;
+				
+				String stopPart = inputSeq.substring(posLastStart);
+				// one slot per stop codon: offset of its first occurrence after posLastStart, or -1 if absent
+				int stopSub[] = new int[alternativeStops_FO.length];
+				
+				for(int i = 0; i<alternativeStops_FO.length;++i){
+					stopSub[i] = stopPart.indexOf(alternativeStops_FO[i]);
+				}
+				
+				java.util.Arrays.sort(stopSub);
+				// only the nearest stop hit is examined: in frame -> accept it; out of frame -> resume the search one base behind it
+				for(int i = 0; i < stopSub.length;++i){
+					if(stopSub[i] > -1){ 
+						if(((((posLastStart + stopSub[i])-startPos) % 3) == 0)){
+							stopPos = posLastStart + stopSub[i]; 
+						}else{
+							posLastStart = posLastStart + stopSub[i]+1;
+							goOn = 1;
+						}
+						break;
+					}
+				}
+				// cov, alreadyCovered and notCounted are not declared in this method (presumably static fields of the class)
+				if(stopPos != -1){
+					// ORFs with stopPos-startPos <= 30 are only counted in notCounted; longer ones are added unless checkIfORFcovered reports them
+					if(stopPos-startPos > 30){
+						if(!checkIfORFcovered(allORFs_FO,new int[]{startPos,(stopPos+2)})){
+							allORFs_FO.add(new int[]{startPos,(stopPos+2)});
+							for(int i=startPos;i<=stopPos+2;++i){
+								cov[i]++;
+							}
+						}else{
+							alreadyCovered++;
+						}						
+					}else{
+						notCounted++;
+					}
+					
+					break;
+				}
+				// reaching this point with goOn == 0 means no stop codon was found at all; the start codon is dropped
+			}while(goOn == 1);
+			
+			
+		}while(foundNewATG == 1);
+		
+		
+		return allORFs_FO;
+	}
+	
+	/*
 	 * searches all ORFs assuming reverse direction
 	 * note: no length limit is set, ORFs too short should be penalized in the BIC scoring
 	 */
 	
 	public static Vector<int[]> searchRE_orfs(String inputSeq){
 		
+		if(!GeneFinder.alternativeCodons.isEmpty()){
+			if(GeneFinder.alternativeCodons.containsKey("START RE")){
+				return searchRE_orfs_alternativeCodons(inputSeq, GeneFinder.alternativeCodons.get("STOP RE"), GeneFinder.alternativeCodons.get("START RE")); // are stored the other way around so start is stop and vice versa
+			}
+		}
+		
 		Vector<int[]> allORFs_RE= new Vector<int[]>();
 		
 		int foundNewCAT = 1;
@@ -402,6 +514,105 @@ public class Prokaryote_Specials {
 	}
 	
 	/*
+	 * if alternative start and stop codons are specified, respect this in a more general orf search
+	 * 
+	 */
+	
+	public static Vector<int[]> searchRE_orfs_alternativeCodons(String inputSeq, String[] alternativeStarts_RE, String[] alternativeStops_RE){
+		// Scans inputSeq right to left: for every codon from alternativeStarts_RE it looks leftwards for the nearest in-frame codon from alternativeStops_RE and collects the ORF as {stopPos, startPos+2}. NOTE(review): searchRE_orfs only checks that "START RE" exists before passing get("STOP RE") as alternativeStarts_RE, so that array may be null here - confirm.
+		Vector<int[]> allORFs_RE= new Vector<int[]>();
+		
+		int foundNewCAT = 1;
+		int posLastCAT = inputSeq.length();
+		// outer loop: one iteration per start codon; ends when none is left in inputSeq[0, posLastCAT)
+		do{
+			int startPos = -1;
+			
+			String startPart_alt = inputSeq.substring(0,posLastCAT);
+			// one slot per start codon: index of its last occurrence in the prefix, or -1 if absent
+			int startSub_alt[] = new int[alternativeStarts_RE.length];
+			
+			for(int i = 0; i<alternativeStarts_RE.length;++i){
+				startSub_alt[i] = startPart_alt.lastIndexOf(alternativeStarts_RE[i]);
+			}
+			// ascending sort, so walking from the end meets the rightmost hit first
+			java.util.Arrays.sort(startSub_alt);
+			
+			for(int i = startSub_alt.length -1; i>= 0;i--){
+				if(startSub_alt[i] > -1){ 
+					startPos = startSub_alt[i]; 
+					break;
+				}
+			}
+			
+			int stopPos = -1;
+			
+			int posLastStop = -1;
+			// the next start search and the stop search are both limited to the part left of this hit
+			if(startPos == -1){
+				foundNewCAT = 0;
+				break;
+			}else{
+				posLastCAT = startPos;
+				posLastStop = startPos;
+			}
+			
+			int goOn = 0;
+			// inner loop: repeat the stop search while the nearest stop codon is out of frame
+			do{
+				goOn = 0;
+				
+				String stopPart = inputSeq.substring(0,posLastStop);
+			
+				int stopSub[] = new int[alternativeStops_RE.length];
+				
+				for(int i = 0; i<alternativeStops_RE.length;++i){
+					stopSub[i] = stopPart.lastIndexOf(alternativeStops_RE[i]);
+				}
+				
+				java.util.Arrays.sort(stopSub);
+				// only the rightmost stop hit is examined: in frame -> accept it; out of frame -> cut the prefix to end 2 bases past the hit's start so the next match must begin further left
+				for(int i = stopSub.length -1; i>= 0;i--){
+					if(stopSub[i] > -1){ 
+						if(((startPos-stopSub[i]) % 3) == 0){
+							stopPos = stopSub[i];
+						}else{
+							posLastStop = stopSub[i]+2;
+							goOn = 1;
+						}							
+						break;
+					}
+				}
+				// cov, alreadyCovered and notCounted are not declared in this method (presumably static fields of the class)
+				if(stopPos != -1){
+					// ORFs with startPos-stopPos <= 30 are only counted in notCounted; longer ones are added unless checkIfORFcovered reports them
+					if(startPos-stopPos > 30){
+						if(!checkIfORFcovered(allORFs_RE,new int[]{stopPos,(startPos+2)})){
+							allORFs_RE.add(new int[]{stopPos,(startPos+2)});
+							for(int i=stopPos;i<=startPos+2;++i){
+								cov[i]++;
+							}
+						}else{
+							alreadyCovered++;
+						}					
+					}else{
+						notCounted++;
+					}
+					
+					
+					break;
+				}
+				// reaching this point with goOn == 0 means no stop codon was found at all; the start codon is dropped
+			}while(goOn == 1);
+			
+			
+		}while(foundNewCAT == 1);
+		
+		
+		return allORFs_RE;
+	}
+	
+	/*
 	 * filter out all orfs that are completely included in bigger ones
 	 */
 	
diff --git a/src/geneFinder/ReadInParameters_GeneFinder.java b/src/geneFinder/ReadInParameters_GeneFinder.java
index 9313070..3d0fd75 100755
--- a/src/geneFinder/ReadInParameters_GeneFinder.java
+++ b/src/geneFinder/ReadInParameters_GeneFinder.java
@@ -58,6 +58,7 @@ public class ReadInParameters_GeneFinder {
 		boolean foundProkaryote = false;
 		boolean foundSequential = false;
 		boolean foundInprogea = false;
+		boolean foundAlternativeCodons = false;
 		
 		
 		if(!parameter.isEmpty() && args.length > 0){
@@ -230,6 +231,12 @@ public class ReadInParameters_GeneFinder {
 						inputText += "minimal interval length: " + GeneFinder.interval + "\n";	
 					}
 						
+				} else if(arg.equals("-altCodon")){  // alternative start and stop codons
+					String pathToAlternative = args[i+1];
+					readInAlternativeStartsStops(pathToAlternative);
+					foundAlternativeCodons = true;
+					inputText += "Alternative Starts and stops provided \n";
+				
 				} else if(arg.equals("-noAmbiOpti")){  // turn on or off the optimization of ambiguous reads
 					foundAmbiOpti = true;			
 					GeneFinder.noAmbiOpti = true;
@@ -321,6 +328,21 @@ public class ReadInParameters_GeneFinder {
 		
 		if(!havePathOut){
 			GeneFinder.pathOut = "";
+		}else{
+			// check if directory exists, if not, create it
+			File f = new File(GeneFinder.pathOut);
+			if(!f.exists()){
+				Runtime rtAlign = Runtime.getRuntime();
+				try {
+					String exe = "mkdir " + GeneFinder.pathOut;
+					Process pc = rtAlign.exec(exe);
+					pc.waitFor();			
+				} catch (IOException e) {
+					e.printStackTrace();
+				} catch (InterruptedException e) {
+					e.printStackTrace();
+				}
+			}
 		}
 		if(!foundGenome){
 			System.out.println("No genome file specified. Use \"-h\" to print usage options. ");
@@ -410,6 +432,7 @@ public class ReadInParameters_GeneFinder {
 			GeneFinder.inprogeaCall = false;
 		}
 		
+		
 		GeneFinder.logFile = new File(GeneFinder.pathOut+"log_it" + GeneFinder.iteration + ".txt");
 		if(!GeneFinder.secondPart){
 			System.out.println(inputText);
@@ -438,6 +461,66 @@ public class ReadInParameters_GeneFinder {
 	}	
 	
 	/*
+	 * reads in the alternative start and stop codons from a given input file
+	 * one line per codon type, with codons tab separated
+	 */
+	
+	public static void readInAlternativeStartsStops(String altFile) {
+		// Parses altFile line by line and replaces GeneFinder.alternativeCodons with the result. A line is recognised by its prefix ("START FO", "START RE", "STOP FO", "STOP RE"); every tab-separated field after the first becomes one codon of that entry.
+		Map<String,String[]> altCodons = new HashMap<String,String[]>();
+			
+		try {
+			// NOTE(review): br is never closed in this method
+			BufferedReader br = new BufferedReader(new FileReader(altFile));
+			
+			String line = "";
+			
+			while((line = br.readLine()) != null){
+				// temp holds the fields after the label; it is only stored when one of the prefixes below matches
+				String[] lineArr = line.split("\t");
+				String[] temp = new String[lineArr.length-1];
+				
+				if(line.startsWith("START FO")){
+
+					for(int i = 1; i<lineArr.length;++i){
+						temp[i-1] = lineArr[i];
+					}
+					
+					altCodons.put("START FO",temp);
+				}
+				if(line.startsWith("START RE")){
+					for(int i = 1; i<lineArr.length;++i){
+						temp[i-1] = lineArr[i];
+					}
+					// deliberate key swap: the file's "START RE" line is stored under "STOP RE"
+					altCodons.put("STOP RE",temp);	// for GIIRA start and stops are switched for the reverse direction
+				}
+				if(line.startsWith("STOP FO")){
+					for(int i = 1; i<lineArr.length;++i){
+						temp[i-1] = lineArr[i];
+					}
+					
+					altCodons.put("STOP FO",temp);
+				}
+				if(line.startsWith("STOP RE")){
+					for(int i = 1; i<lineArr.length;++i){
+						temp[i-1] = lineArr[i];
+					}
+					// deliberate key swap: the file's "STOP RE" line is stored under "START RE"
+					altCodons.put("START RE",temp);	// for GIIRA start and stops are switched for the reverse direction
+				}
+			}
+			
+		} catch (FileNotFoundException e) {
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+		// NOTE(review): reached even after an exception, so an empty or partially filled map is installed in that case
+		GeneFinder.alternativeCodons = altCodons;
+	}
+	
+	/*
 	 * print the help text to screen
 	 */
 	
@@ -457,7 +540,7 @@ public class ReadInParameters_GeneFinder {
 				" \n -iG [pathToGenomes] : specify path to directory with genome files in fasta format \n" +
 				" \n -iR [pathToRna] : specify path to directory with rna read files in fastq format \n" +
 				" \n -scripts [absolutePath] : specify the absolute path to the directory containing the required helper scripts, DEFAULT: directory of GIIRA.jar \n" +
-				" \n -out [pathToResults] : specify the directory that shall contain the results files \n" +
+				" \n -out [pathToResults] : specify the absolute pyth to the directory that shall contain the results files \n" +
 				" \n -outName [outputName] : specify desired name for output files, DEFAULT: genes \n" +
 				" \n -haveSam [samfileName]: if a sam file already exists, provide the name, else a mapping is performed. NOTE: the sam file has to be sorted according to read names! \n" +
 				" \n -nT [numberThreads] : specify the maximal number of threads that are allowed to be used, DEFAULT: 1 \n" +
@@ -468,7 +551,7 @@ public class ReadInParameters_GeneFinder {
 				//" \n -splitRunAndOpti [y/n] : indicates if the optimization and giira shall be run separately, to reduce the memory consumption (y), DEFAULT: n" +
 				" \n -mem [int] : specify the amount of memory that cplex is allowed to use \n" +
 				" \n -maxReportedHits [int] : if using BWA as mapping tool, specify the maximal number of reported hits, DEFAULT: 2 \n" +
-				" \n -prokaryote : if specified, genome is treated as prokaryotic, no spliced reads are accepted, and structural genes are resolved. DEFAULT: n \n" +
+				" \n -prokaryote : if specified, genome is treated as prokaryotic, no spliced reads are accepted, and structural genes are resolved. DEFAULT: False \n" +
 				" \n -minCov [double] : specify the minimum required coverage of the gene candidate extraction, DEFAULT: -1 (is estimated from mapping) \n" +
 				" \n -maxCov [double] : optional maximal coverage threshold, can also be estimated from mapping (DEFAULT) \n" +
 				" \n -endCov [double] : if the coverage falls below this value, the currently open candidate gene is closed. This value can be estimated from the minimum coverage (-1); DEFAULT: -1 \n" +
@@ -476,6 +559,7 @@ public class ReadInParameters_GeneFinder {
 				" \n -interval [int] : specify the minimal size of an interval between near candidate genes, if \"-1\" it equals the read length. DEFAULT: -1 \n " +
 				" \n -splLim [double] : specify the minimal coverage that is required to accept a splice site, if (-1) the threshold is equal to minCov, DEFAULT: -1 \n" +
 				" \n -rL [int] : specify read length, otherwise this information is extracted from SAM file (DEFAULT) \n" +
+				" \n -altCodon [pathToAlternativeCodons] : specify path to txt file with alternative start and stop codons, see example file in scripts folder \n" +
 				" \n -samForSequential [pathToSamFile] : if it is desired to analyse chromosomes in a sequential manner, provide a chromosome sorted sam file in addition to the one sorted by read names, DEFAULT: noSequential \n" +
 				" \n -noAmbiOpti : if specified, ambiguous hits are not included in the analysis \n" +					
 				" \n -settingMapper [(list of parameters)] : A comma-separated list of the desired parameters for TopHat or BWA. Please provide \n" +
diff --git a/src/geneFinder/SamParser.java b/src/geneFinder/SamParser.java
index 291befe..76377be 100755
--- a/src/geneFinder/SamParser.java
+++ b/src/geneFinder/SamParser.java
@@ -138,7 +138,21 @@ public class SamParser {
 
 							totalHitCount++;
 							
-							if(!parts[0].equals(currentReadID)){  // now we have proceeded to a new read
+							String adaptedName = "";
+							
+							if(parts[0].contains(":")){
+								String[] nameParts = parts[0].split(":");
+								for(int i=0;i<nameParts.length;++i){
+									adaptedName += nameParts[i] + ";;;";   // necessary to avoid cplex or glpk errors
+								}
+								
+								adaptedName = adaptedName.substring(0,(adaptedName.length()-3));
+							}else{
+								adaptedName = parts[0];
+							}
+							
+							
+							if(!adaptedName.equals(currentReadID)){  // now we have proceeded to a new read
 
 								if(GeneFinder.iteration == 2 && currentRead != null && currentRead.isMulti == 1){
 									
@@ -147,7 +161,20 @@ public class SamParser {
 										do{
 											String[] partsReaSam = lineReaSam.split("	");										
 											
-											if(currentRead.rnaID.equals(partsReaSam[0])){
+											String adaptedNameReaSam = "";
+											
+											if(partsReaSam[0].contains(":")){
+												String[] nameParts = partsReaSam[0].split(":");
+												for(int i=0;i<nameParts.length;++i){
+													adaptedNameReaSam += nameParts[i] + ";;;";   // necessary to avoid cplex or glpk errors
+												}
+												
+												adaptedNameReaSam = adaptedNameReaSam.substring(0,(adaptedNameReaSam.length()-3));
+											}else{
+												adaptedNameReaSam = partsReaSam[0];
+											}
+											
+											if(currentRead.rnaID.equals(adaptedNameReaSam)){
 												allReassigned.put(Integer.parseInt(partsReaSam[3]),partsReaSam[2]);
 											}else{
 												break;
@@ -165,12 +192,12 @@ public class SamParser {
 									
 								}
 
-								currentReadID = parts[0];				
+								currentReadID = adaptedName;				
 
 								// set up new rna node
 
 								Rna newRna = new Rna();
-								newRna.rnaID = parts[0];  		
+								newRna.rnaID = adaptedName;  		
 								newRna.isMulti = 0;
 								newRna.hitNum = 1;
 								newRna.assignedNum = 0;
@@ -479,9 +506,22 @@ public class SamParser {
 							do{
 								String[] partsReaSam = lineReaSam.split("	");
 								
-								if(partsReaSam[0].compareTo(currentRead.rnaID) > 0){
+								String adaptedNameReaSam = "";
+								
+								if(partsReaSam[0].contains(":")){
+									String[] nameParts = partsReaSam[0].split(":");
+									for(int i=0;i<nameParts.length;++i){
+										adaptedNameReaSam += nameParts[i] + ";;;";   // necessary to avoid cplex or glpk errors
+									}
+									
+									adaptedNameReaSam = adaptedNameReaSam.substring(0,(adaptedNameReaSam.length()-3));
+								}else{
+									adaptedNameReaSam = partsReaSam[0];
+								}
+								
+								if(adaptedNameReaSam.compareTo(currentRead.rnaID) > 0){
 									break;  // we exceeded this read, so stop
-								}else if(currentRead.rnaID.equals(partsReaSam[0])){
+								}else if(currentRead.rnaID.equals(adaptedNameReaSam)){
 									allReassigned.put(Integer.parseInt(partsReaSam[3]),partsReaSam[2]);
 								}
 							}while((lineReaSam = br.readLine()) != null);
@@ -953,9 +993,23 @@ public class SamParser {
 							
 							Rna read;
 							
-							if(seenReads.keySet().contains(parts[0])){
+							String adaptedName = "";
+							
+							if(parts[0].contains(":")){
+								String[] nameParts = parts[0].split(":");
+								for(int i=0;i<nameParts.length;++i){
+									adaptedName += nameParts[i] + ";;;";   // necessary to avoid cplex or glpk errors
+								}
+								
+								adaptedName = adaptedName.substring(0,(adaptedName.length()-3));
+							}else{
+								adaptedName = parts[0];
+							}
+							
+							
+							if(seenReads.keySet().contains(adaptedName)){
 								
-								Vector<Object> temp = seenReads.get(parts[0]);
+								Vector<Object> temp = seenReads.get(adaptedName);
 		
 								if(((Integer)temp.get(0)) != 0){
 									
@@ -973,7 +1027,7 @@ public class SamParser {
 											
 											temp.clear();
 											temp.add(0);
-											seenReads.put(parts[0],temp);
+											seenReads.put(adaptedName,temp);
 											
 											if(totalHitCount % 100000 == 0){
 		
@@ -1005,7 +1059,7 @@ public class SamParser {
 										interChromoTotalCount++;
 										Vector<Object> temp = new Vector<Object>();
 										temp.add(0);									
-										seenReads.put(parts[0],temp);
+										seenReads.put(adaptedName,temp);
 										interChromoTotalCount++;
 										break;
 									}
@@ -1049,7 +1103,7 @@ public class SamParser {
 									Vector<Object> temp = new Vector<Object>();
 									temp.add(1);
 									temp.add(read);
-									seenReads.put(parts[0],temp);	
+									seenReads.put(adaptedName,temp);	
 								}
 							}
 
diff --git a/src/types/Rna.java b/src/types/Rna.java
index 590f31d..8e8e123 100755
--- a/src/types/Rna.java
+++ b/src/types/Rna.java
@@ -17,7 +17,7 @@ public class Rna {
 	
 	public double quality; 	  	
 
-	public Vector<Object[]> contigsMappedOn = new Vector<Object[]>(); // contains several Arrays á: [contig, alignPos, cigarString, mapQual,spliceInfo,mismatchInfo,direcInfo] (one for each hit)
+	public Vector<Object[]> contigsMappedOn = new Vector<Object[]>(); // contains several Arrays ala: [contig, alignPos, cigarString, mapQual,spliceInfo,mismatchInfo,direcInfo] (one for each hit)
 	
 	public int isMulti;		// indicator if this read is an ambiguous read
 	

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/giira.git



More information about the debian-med-commit mailing list