[med-svn] [proteinortho] 01/05: Imported Upstream version 5.13+dfsg

Wed May 4 13:01:28 UTC 2016

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository proteinortho.

commit a6e05f7c0c82073170f52b8322ca933e2c160488
Author: Andreas Tille <tille at debian.org>
Date:   Wed May 4 14:50:54 2016 +0200

    Imported Upstream version 5.13+dfsg
---
 Makefile                      |  17 +++---
 manual.html                   |   6 +-
 po2tree.pl                    |   4 +-
 po_tree.c                     |   8 +--
 proteinortho5.pl              |   4 +-
 proteinortho5_clean_edges.cpp | 136 ------------------------------------------
 proteinortho5_clustering.cpp  |  80 ++++++++++++++++++++++---
 7 files changed, 90 insertions(+), 165 deletions(-)

diff --git a/Makefile b/Makefile
index 3c8b861..648850e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,21 +1,18 @@
 
 INSTALLDIR=/usr/local/bin
 
-CPP      = g++
-CPPFLAGS   = -Wall -O3
+CPP = g++
+CPPFLAGS += -Wall -O3 -Wno-unused-result
 
-all: proteinortho5_clustering proteinortho5_clean_edges proteinortho5_tree
+all: proteinortho5_clustering proteinortho5_tree
 
 proteinortho5_clustering: proteinortho5_clustering.cpp
-	$(CPP) $(CPPFLAGS) -o $@ $<
+	$(CPP) $(CPPFLAGS) $(LDFLAGS) -o $@ $<
 
-proteinortho5_tree: proteinortho5_clustering.cpp
-	$(CPP) $(CPPFLAGS) -o $@ $<
+proteinortho5_tree: po_tree.c
+	$(CPP) $(CPPFLAGS) $(LDFLAGS) -o $@ $<
 
-proteinortho5_clean_edges: proteinortho5_clean_edges.cpp
-	$(CPP) $(CPPFLAGS) -o $@ $<
-
-install: proteinortho5.pl proteinortho5_clustering proteinortho5_singletons.pl proteinortho5_clean_edges2.pl ffadj_mcs.py po_tree po_tree.pl
+install: proteinortho5.pl proteinortho5_clustering proteinortho5_singletons.pl proteinortho5_clean_edges2.pl ffadj_mcs.py po2tree.pl proteinortho5_tree
 	install -v $^ $(INSTALLDIR)
 
 test: proteinortho5.pl proteinortho5_clustering
diff --git a/manual.html b/manual.html
index fac0228..8ae8d68 100644
--- a/manual.html
+++ b/manual.html
@@ -4,7 +4,7 @@
 	</head>
 <body>
 <h1>Proteinortho Manual / PoFF Manual</h1>
-<small>This manual corresponds to version 5.12</small>
+<small>This manual corresponds to version 5.13</small>
 <h2>Introduction</h2>
 Proteinortho is a tool to detect orthologous genes within different species.
 For doing so, it compares similarities of given gene sequences and clusters them to find significant groups.
@@ -38,8 +38,8 @@ The sources come with a precompiled version of Proteinortho for 64bit Linux. If
 <h3>Building and installing from source</h3>
 <ul>
 <li> Fetch the latest source code archive from <a href="http://www.bioinf.uni-leipzig.de/Software/proteinortho/" target="_blank">www.bioinf.uni-leipzig.de/Software/proteinortho/</a>.
-<li> Extract the files e.g. via <code>tar -xzvf proteinortho_v5.06.tar.gz</code>
-<li> Change directory into the extracted folder e.g. via <code>cd proteinortho_v5.06</code>
+<li> Extract the files e.g. via <code>tar -xzvf proteinortho_v5.13.tar.gz</code>
+<li> Change directory into the extracted folder e.g. via <code>cd proteinortho_v5.13</code>
 <li> You can now run <code>./proteinortho5.pl</code> directly
 <li> If you want to recompile and install Proteinortho, type <code>make</code> followed <code>sudo make install</code> (requires root privileges).
 <li> In any case, run <code>make test</code> to make sure Proteinortho works as expected
diff --git a/po2tree.pl b/po2tree.pl
index 77d5435..e1a0590 100755
--- a/po2tree.pl
+++ b/po2tree.pl
@@ -34,7 +34,7 @@
 # @email lechner at staff.uni-marburg.de
 # @company Bioinformatics, University of Leipzig
 # @version 3.10
-# @date 08-12-2015
+# @date 16-03-2016
 #
 ##########################################################################################
 
@@ -142,7 +142,7 @@ system("cat '$file.tmp.matrix2' >>'$file.tmp.matrix'");
 
 # Run the main algorithm in C
 print STDERR "Calculating tree...\n";
-my $run = $scriptpath."po_tree";
+my $run = $scriptpath."proteinortho5_tree";
 my $out = qx($run $ARGV[0].tmp.matrix);
 $out =~ s/\[.+?\]//g;
 $out =~ s/\s:/:/g;
diff --git a/po_tree.c b/po_tree.c
index 8b6604d..769cf85 100644
--- a/po_tree.c
+++ b/po_tree.c
@@ -29,7 +29,7 @@
  *  @email marcus at bioinf.uni-leipzig.de
  *  @company Bioinformatics, University of Leipzig
  *  @version 1.10 (for PO5)
- *  @date 2016-02-22
+ *  @date 2016-03-16
  */
 
 
@@ -105,7 +105,7 @@ int main(int argc, char** argv) {
 void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen) {
 	/* affinity matrix */
 	int** aff = (int**)malloc(sizeof(int*)*viecher);
-	int i;
+	unsigned int i;
 	/* allocate further memory for affinity matrix */
 	for (i = 0; i<viecher; i++) {
 		aff[i] = (int*)malloc(sizeof(int)*viecher);
@@ -146,7 +146,7 @@ void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen) {
  *  returns:  nothing, but all involved data-structures represent the merge afterwards correctly
  */
 void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int** aff, char*** pnamen) {
-	int i, counter = 0;	
+	unsigned int i, counter = 0;	
 
 	/* only keep components that are present in both species */
 	/* foreach component */	
@@ -199,7 +199,7 @@ void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int**
 	baum = strcat(baum,(*pnamen)[max[1]]);
 	baum = strcat(baum,":");
 	baum = strcat(baum,length_1);
-/	baum = strcat(baum,")");
+	baum = strcat(baum,")");
 //	baum = strcat(baum,")[");
 //	baum = strcat(baum,anz);
 //	baum = strcat(baum,"]");
diff --git a/proteinortho5.pl b/proteinortho5.pl
index 719e64f..22bbe97 100755
--- a/proteinortho5.pl
+++ b/proteinortho5.pl
@@ -30,7 +30,7 @@
 # @author Marcus Lechner
 # @email lechner at staff.uni-marburg.de
 # @company University of Maruburg
-# @date 2016-02-22
+# @date 2016-04-22
 #
 ##########################################################################################
 
@@ -47,7 +47,7 @@ use Thread::Queue;
 ##########################################################################################
 # Variables
 ##########################################################################################
-our $version = "5.12";
+our $version = "5.13";
 our $step = 0;		# 0/1/2/3	-> do all / only apply step 1 / only apply step 2 / only apply step 3
 our $verbose = 1;	# 0/1		-> don't / be verbose
 our $debug = 0;		# 0/1		-> don't / show debug data
diff --git a/proteinortho5_clean_edges.cpp b/proteinortho5_clean_edges.cpp
deleted file mode 100644
index 0f81b3a..0000000
--- a/proteinortho5_clean_edges.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include <string>
-#include <map>
-#include <vector>
-
-using namespace std;
-
-int parse_excluded_edges(string filename, map<string, int> &excluded_edges);
-void tokenize(const string& str, vector<string>& tokens, const string& delimiters);
-int rewrite_graph(const string in, map<string, int> exclude);
-string chomp(string &str);
-
-
-int main(const int argc, char *argv[]) {
-
-	string usage = "USAGE: "+string(argv[0])+" -e <EXCLUSION FILE> GRAPH1 GRAPH2 ...";
-	
-	vector<string> graphs;
-	map<string, int> exclude;
-	for(int paras = 1; paras < argc; paras++) {
-		string arg = string(argv[paras]);
-		if( argc < 4 || arg == "-h" || arg == "--help") {
-			cout << usage;
-			return 0;
-		} else if(arg == "-e") {
-			parse_excluded_edges(string(argv[++paras]), exclude);
-		} else {
-			graphs.push_back(arg);
-		}
-	}
-
-	for(vector<string>::iterator it = graphs.begin(); it != graphs.end(); it++) {
-		rewrite_graph(*it, exclude);
-	}
-	
-	for(map<string, int>::iterator it = exclude.begin(); it != exclude.end(); it++) {
-		//cout << it->first << "\n";
-	}
-	
-}
-
-int rewrite_graph(const string in, map<string, int> exclude) {
-	ifstream graph_file(in.c_str());
-	if(!graph_file.is_open()) {
-		cerr << "could not read input file "+in;
-		return -1;
-	}	
-	
-	int count = 0;
-	string line;
-	while (getline(graph_file, line)) {
-		chomp(line);
-		vector<string> fields;
-		tokenize(line, fields, "\t");
-		if(fields.size() > 1 && fields[0].substr(0, 1) != "#") {
-			string a = fields[0];
-			string b = fields[1];
-			string c = b+" "+a;
-			if(a.compare(b) < 0 ) {
-				c = a+" "+b;
-			}
-			//cout << c << endl;
-			if(exclude.count(c) == 1) {
-				count++;
-				continue;
-			}
-		}
-		
-		cout << line << endl;
-	}
-
-	graph_file.close();
-	
-	cerr << "# excluded " << count << " edges from the graph." << endl;
-	return 0;
-}
-
-string chomp(string &str) {
-	string::size_type pos = str.find_last_not_of("\n\r \t");
-	if(pos != string::npos) {
-  		str = str.substr(0, pos+1);
-  	}
-  	return str;
-}
-    
-int parse_excluded_edges(string filename, map<string, int> &excluded_edges) {    
-    ifstream data(filename.c_str());
-    if(!data.is_open()) {
-    	cerr << "could not open file"+filename;
-    }
-
-    string line;
-    while(getline(data,line)) {
-        
-        chomp(line);
-        vector<string> v;
-        tokenize(line, v, "\t");
-        
-        if(v.size() < 2) {
-        	continue;
-        }
-        string a = v[0];
-        string b = v[1];
-        string c = b + " " + a;
-
-        if(a.compare(b) < 0) {
-        	c = a + " " + b;
-        }
-        //cout << c << endl;
-       	excluded_edges[c] = 1;
-    }
-    
-    data.close();
-    
-    return 0;
-}
-
-// Split a string at a certain delim
-void tokenize(const string& str, vector<string>& tokens, const string& delimiters = "\t") {
-    // Skip delimiters at beginning.
-    string::size_type lastPos = str.find_first_not_of(delimiters, 0);
-    // Find first "non-delimiter".
-    string::size_type pos = str.find_first_of(delimiters, lastPos);
-
-    while (string::npos != pos || string::npos != lastPos) {
-        // Found a token, add it to the vector.
-        tokens.push_back(str.substr(lastPos, pos - lastPos));
-        // Skip delimiters.  Note the "not_of"
-        lastPos = str.find_first_not_of(delimiters, pos);
-        // Find next "non-delimiter"
-        pos = str.find_first_of(delimiters, lastPos);
-    }
-}
-
diff --git a/proteinortho5_clustering.cpp b/proteinortho5_clustering.cpp
index a7bbba1..6e07ae5 100755
--- a/proteinortho5_clustering.cpp
+++ b/proteinortho5_clustering.cpp
@@ -3,7 +3,7 @@
  *	Reads edge list and splits connected components
  *	according to algebraic connectivity threshold
  *
- *	Last updated: 2014/07/07		
+ *	Last updated: 2016/04/22		
  *	Author: Marcus Lechner
  */
 
@@ -17,6 +17,7 @@
 #include <vector>
 #include <stack>
 #include <iomanip>
+#include <cstdlib>
 using namespace std;
 
 // Structs
@@ -252,7 +253,7 @@ void partition_graph() {
 		// Connectivity analysis
 		float connectivity = getConnectivity(current_group);
 
-		if (connectivity < param_con_threshold) {
+		if (connectivity < param_con_threshold && current_group.size() > 3) {
 			// Split groups is done in getConnectivity function
 			// Reset flags and repeat without incrementing protein counter
 			for (unsigned int i = 0; i < current_group.size(); i++) done[current_group[i]] = false;
@@ -260,7 +261,7 @@ void partition_graph() {
 		}
 	
 		// Output
-		print_group(current_group,connectivity);
+		if (connectivity >= param_con_threshold) {print_group(current_group,connectivity);}
 
 		// Print stats
 		stats(protein_id,protein_counter);
@@ -300,6 +301,20 @@ vector<unsigned int> get_deg_one (vector<unsigned int>& nodes) {
 vector<float> generate_random_vector(const unsigned int size) {
 	vector<float> x(size);
 	for (unsigned int i = 0; i < size; i++) {
+	  x[i] = (float)(rand() % 999+1)/1000;	// 1 bis 99
+		// at least one value must be different from the others but still within 0 and 1
+		if (i > 0 && x[i] == x[i-1]) {
+			x[i] /= 3;
+		}
+//		cerr << x[i] << endl;
+	}
+	return x;
+}
+
+// Generate random vector x of size size
+vector<float> generate_random_vector_old(const unsigned int size) {
+	vector<float> x(size);
+	for (unsigned int i = 0; i < size; i++) {
 	  x[i] = (float)rand()/RAND_MAX;
 		// at least one value must be different from the others but still within 0 and 1
 		if (i > 0 && x[i] == x[i-1]) {
@@ -361,8 +376,43 @@ vector<float> getY(float max_degree, vector<float> x_hat, vector<float> x_new, v
 	return x_hat;
 }
 
+
 // Remove edges connectiong two groups a and b
 void removeExternalEdges(map<unsigned int,bool>& a) {
+//		cerr << "+#" << endl;
+//		for (map<unsigned int,bool>::iterator it = a.begin(); it != a.end(); it++) {
+//			unsigned int protein = it->first;
+//			cerr << protein << endl;
+//		}
+//		cerr << "#-" << endl;
+		
+		// For each protein in a
+		for (map<unsigned int,bool>::iterator it = a.begin(); it != a.end(); it++) {
+			unsigned int protein = it->first;
+			// For each target
+			vector<unsigned int> cleaned_edges;
+			bool swap = false;
+			for (vector<unsigned int>::iterator ita = graph[protein].edges.begin(); ita != graph[protein].edges.end(); ita++) {
+				// If it is not present the own group, set flag
+				if (a.find(*ita) == a.end()) {
+					graph_clean << graph[protein].full_name << "\t" << graph[*ita ].full_name << endl; // Improved graph cleaning
+					swap = true;
+				}
+				// Otherwise, add it to the new edge list
+				else {
+					cleaned_edges.push_back(*ita);
+				}
+			}
+			// If changes were made, swap edge list with new one		
+			if (swap) {
+				cleaned_edges.swap(graph[protein].edges);
+			}
+		}
+}
+
+
+// Remove edges connectiong two groups a and b
+void removeExternalEdges_old(map<unsigned int,bool>& a) {
 		// For each protein in a
 		for (map<unsigned int,bool>::iterator it = a.begin(); it != a.end(); it++) {
 			unsigned int protein = it->first;
@@ -388,7 +438,7 @@ void removeExternalEdges(map<unsigned int,bool>& a) {
 }
 
 // Split connected component according to eigenvector
-void splitGroups(vector<float>& y, vector<unsigned int>& nodes){
+void splitGroups(vector<float>& y, vector<unsigned int>& nodes, map<unsigned int,unsigned int> mapping){
 	// Remove tree like structures in the beginning
 	vector<unsigned int> one = get_deg_one(nodes);
 	if (one.size() > 0) {
@@ -408,6 +458,8 @@ void splitGroups(vector<float>& y, vector<unsigned int>& nodes){
 		else              	{groupB[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#b01700}" << endl; }
 	}
 
+//	cerr << groupA.size() << " -- " << groupB.size() << endl;
+
 	// Catch error in laplacien calcs
 	if (groupA.size() == 0 || groupB.size() == 0) {
 		throw "Failed to partition subgraph! This might lead to an infinit loop. Please submit the .blastgraph file to lechner at staff.uni-marburg.de to help fixing this issue.";
@@ -418,6 +470,18 @@ void splitGroups(vector<float>& y, vector<unsigned int>& nodes){
 }
 
 float getConnectivity(vector<unsigned int>& nodes) {
+	// Special cases hotfix
+	if (nodes.size() == 2) {return 1;}
+
+	if (nodes.size() == 3) {
+		vector<unsigned int> min = get_deg_one(nodes);
+		// fully connected
+		if (min.size() == 0) {return 1;}
+		// not
+		else {return 0.667;}
+	}
+	// Hotfix end
+
 	// Get max degree of nodes
 	unsigned int max_degree = max_deg(nodes);
 	
@@ -445,17 +509,17 @@ float getConnectivity(vector<unsigned int>& nodes) {
 		x_hat = makeOrthogonal(y);
 		// Get lenght (lambda) & normalize vector
 		norm = nomalize(x_hat, &current_length);
-//		cerr << "IT: " << current_length << endl;
-		if (abs(current_length-last_length) < 0.000333 && iter >= 10) break;	// min 10 iterations (prevent convergence by chance)
+		if (abs(current_length-last_length) < 0.0001 && iter >= 20) break;	// min 20 iterations (prevent convergence by chance)
 	}
-
 	//	cerr << nodes.size() << " nodes done after " << iter << " iterations" << endl;
 
 	float connectivity = (-current_length+2*max_degree)/(nodes.size());
 
+//	cerr << nodes.size() << " " << connectivity << endl;
+
 	// Split groups if connectivity is too low, remove tree like structures that might have arosen
 	if (connectivity < param_con_threshold) {
-		splitGroups(x_hat, nodes);
+		splitGroups(x_hat, nodes, mapping);
 	}
 	
 	return connectivity;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/proteinortho.git