[med-svn] [proteinortho] 01/05: Imported Upstream version 5.13+dfsg
Andreas Tille
tille at debian.org
Wed May 4 13:01:28 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository proteinortho.
commit a6e05f7c0c82073170f52b8322ca933e2c160488
Author: Andreas Tille <tille at debian.org>
Date: Wed May 4 14:50:54 2016 +0200
Imported Upstream version 5.13+dfsg
---
Makefile | 17 +++---
manual.html | 6 +-
po2tree.pl | 4 +-
po_tree.c | 8 +--
proteinortho5.pl | 4 +-
proteinortho5_clean_edges.cpp | 136 ------------------------------------------
proteinortho5_clustering.cpp | 80 ++++++++++++++++++++++---
7 files changed, 90 insertions(+), 165 deletions(-)
diff --git a/Makefile b/Makefile
index 3c8b861..648850e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,21 +1,18 @@
INSTALLDIR=/usr/local/bin
-CPP = g++
-CPPFLAGS = -Wall -O3
+CPP = g++
+CPPFLAGS += -Wall -O3 -Wno-unused-result
-all: proteinortho5_clustering proteinortho5_clean_edges proteinortho5_tree
+all: proteinortho5_clustering proteinortho5_tree
proteinortho5_clustering: proteinortho5_clustering.cpp
- $(CPP) $(CPPFLAGS) -o $@ $<
+ $(CPP) $(CPPFLAGS) $(LDFLAGS) -o $@ $<
-proteinortho5_tree: proteinortho5_clustering.cpp
- $(CPP) $(CPPFLAGS) -o $@ $<
+proteinortho5_tree: po_tree.c
+ $(CPP) $(CPPFLAGS) $(LDFLAGS) -o $@ $<
-proteinortho5_clean_edges: proteinortho5_clean_edges.cpp
- $(CPP) $(CPPFLAGS) -o $@ $<
-
-install: proteinortho5.pl proteinortho5_clustering proteinortho5_singletons.pl proteinortho5_clean_edges2.pl ffadj_mcs.py po_tree po_tree.pl
+install: proteinortho5.pl proteinortho5_clustering proteinortho5_singletons.pl proteinortho5_clean_edges2.pl ffadj_mcs.py po2tree.pl proteinortho5_tree
install -v $^ $(INSTALLDIR)
test: proteinortho5.pl proteinortho5_clustering
diff --git a/manual.html b/manual.html
index fac0228..8ae8d68 100644
--- a/manual.html
+++ b/manual.html
@@ -4,7 +4,7 @@
</head>
<body>
<h1>Proteinortho Manual / PoFF Manual</h1>
-<small>This manual corresponds to version 5.12</small>
+<small>This manual corresponds to version 5.13</small>
<h2>Introduction</h2>
Proteinortho is a tool to detect orthologous genes within different species.
For doing so, it compares similarities of given gene sequences and clusters them to find significant groups.
@@ -38,8 +38,8 @@ The sources come with a precompiled version of Proteinortho for 64bit Linux. If
<h3>Building and installing from source</h3>
<ul>
<li> Fetch the latest source code archive from <a href="http://www.bioinf.uni-leipzig.de/Software/proteinortho/" target="_blank">www.bioinf.uni-leipzig.de/Software/proteinortho/</a>.
-<li> Extract the files e.g. via <code>tar -xzvf proteinortho_v5.06.tar.gz</code>
-<li> Change directory into the extracted folder e.g. via <code>cd proteinortho_v5.06</code>
+<li> Extract the files e.g. via <code>tar -xzvf proteinortho_v5.13.tar.gz</code>
+<li> Change directory into the extracted folder e.g. via <code>cd proteinortho_v5.13</code>
<li> You can now run <code>./proteinortho5.pl</code> directly
<li> If you want to recompile and install Proteinortho, type <code>make</code> followed <code>sudo make install</code> (requires root privileges).
<li> In any case, run <code>make test</code> to make sure Proteinortho works as expected
diff --git a/po2tree.pl b/po2tree.pl
index 77d5435..e1a0590 100755
--- a/po2tree.pl
+++ b/po2tree.pl
@@ -34,7 +34,7 @@
# @email lechner at staff.uni-marburg.de
# @company Bioinformatics, University of Leipzig
# @version 3.10
-# @date 08-12-2015
+# @date 16-03-2016
#
##########################################################################################
@@ -142,7 +142,7 @@ system("cat '$file.tmp.matrix2' >>'$file.tmp.matrix'");
# Run the main algorithm in C
print STDERR "Calculating tree...\n";
-my $run = $scriptpath."po_tree";
+my $run = $scriptpath."proteinortho5_tree";
my $out = qx($run $ARGV[0].tmp.matrix);
$out =~ s/\[.+?\]//g;
$out =~ s/\s:/:/g;
diff --git a/po_tree.c b/po_tree.c
index 8b6604d..769cf85 100644
--- a/po_tree.c
+++ b/po_tree.c
@@ -29,7 +29,7 @@
* @email marcus at bioinf.uni-leipzig.de
* @company Bioinformatics, University of Leipzig
* @version 1.10 (for PO5)
- * @date 2016-02-22
+ * @date 2016-03-16
*/
@@ -105,7 +105,7 @@ int main(int argc, char** argv) {
void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen) {
/* affinity matrix */
int** aff = (int**)malloc(sizeof(int*)*viecher);
- int i;
+ unsigned int i;
/* allocate further memory for affinity matrix */
for (i = 0; i<viecher; i++) {
aff[i] = (int*)malloc(sizeof(int)*viecher);
@@ -146,7 +146,7 @@ void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen) {
* returns: nothing, but all involved data-structures represent the merge afterwards correctly
*/
void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int** aff, char*** pnamen) {
- int i, counter = 0;
+ unsigned int i, counter = 0;
/* only keep components that are present in both species */
/* foreach component */
@@ -199,7 +199,7 @@ void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int**
baum = strcat(baum,(*pnamen)[max[1]]);
baum = strcat(baum,":");
baum = strcat(baum,length_1);
-/ baum = strcat(baum,")");
+ baum = strcat(baum,")");
// baum = strcat(baum,")[");
// baum = strcat(baum,anz);
// baum = strcat(baum,"]");
diff --git a/proteinortho5.pl b/proteinortho5.pl
index 719e64f..22bbe97 100755
--- a/proteinortho5.pl
+++ b/proteinortho5.pl
@@ -30,7 +30,7 @@
# @author Marcus Lechner
# @email lechner at staff.uni-marburg.de
# @company University of Maruburg
-# @date 2016-02-22
+# @date 2016-04-22
#
##########################################################################################
@@ -47,7 +47,7 @@ use Thread::Queue;
##########################################################################################
# Variables
##########################################################################################
-our $version = "5.12";
+our $version = "5.13";
our $step = 0; # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
our $verbose = 1; # 0/1 -> don't / be verbose
our $debug = 0; # 0/1 -> don't / show debug data
diff --git a/proteinortho5_clean_edges.cpp b/proteinortho5_clean_edges.cpp
deleted file mode 100644
index 0f81b3a..0000000
--- a/proteinortho5_clean_edges.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include <string>
-#include <map>
-#include <vector>
-
-using namespace std;
-
-int parse_excluded_edges(string filename, map<string, int> &excluded_edges);
-void tokenize(const string& str, vector<string>& tokens, const string& delimiters);
-int rewrite_graph(const string in, map<string, int> exclude);
-string chomp(string &str);
-
-
-int main(const int argc, char *argv[]) {
-
- string usage = "USAGE: "+string(argv[0])+" -e <EXCLUSION FILE> GRAPH1 GRAPH2 ...";
-
- vector<string> graphs;
- map<string, int> exclude;
- for(int paras = 1; paras < argc; paras++) {
- string arg = string(argv[paras]);
- if( argc < 4 || arg == "-h" || arg == "--help") {
- cout << usage;
- return 0;
- } else if(arg == "-e") {
- parse_excluded_edges(string(argv[++paras]), exclude);
- } else {
- graphs.push_back(arg);
- }
- }
-
- for(vector<string>::iterator it = graphs.begin(); it != graphs.end(); it++) {
- rewrite_graph(*it, exclude);
- }
-
- for(map<string, int>::iterator it = exclude.begin(); it != exclude.end(); it++) {
- //cout << it->first << "\n";
- }
-
-}
-
-int rewrite_graph(const string in, map<string, int> exclude) {
- ifstream graph_file(in.c_str());
- if(!graph_file.is_open()) {
- cerr << "could not read input file "+in;
- return -1;
- }
-
- int count = 0;
- string line;
- while (getline(graph_file, line)) {
- chomp(line);
- vector<string> fields;
- tokenize(line, fields, "\t");
- if(fields.size() > 1 && fields[0].substr(0, 1) != "#") {
- string a = fields[0];
- string b = fields[1];
- string c = b+" "+a;
- if(a.compare(b) < 0 ) {
- c = a+" "+b;
- }
- //cout << c << endl;
- if(exclude.count(c) == 1) {
- count++;
- continue;
- }
- }
-
- cout << line << endl;
- }
-
- graph_file.close();
-
- cerr << "# excluded " << count << " edges from the graph." << endl;
- return 0;
-}
-
-string chomp(string &str) {
- string::size_type pos = str.find_last_not_of("\n\r \t");
- if(pos != string::npos) {
- str = str.substr(0, pos+1);
- }
- return str;
-}
-
-int parse_excluded_edges(string filename, map<string, int> &excluded_edges) {
- ifstream data(filename.c_str());
- if(!data.is_open()) {
- cerr << "could not open file"+filename;
- }
-
- string line;
- while(getline(data,line)) {
-
- chomp(line);
- vector<string> v;
- tokenize(line, v, "\t");
-
- if(v.size() < 2) {
- continue;
- }
- string a = v[0];
- string b = v[1];
- string c = b + " " + a;
-
- if(a.compare(b) < 0) {
- c = a + " " + b;
- }
- //cout << c << endl;
- excluded_edges[c] = 1;
- }
-
- data.close();
-
- return 0;
-}
-
-// Split a string at a certain delim
-void tokenize(const string& str, vector<string>& tokens, const string& delimiters = "\t") {
- // Skip delimiters at beginning.
- string::size_type lastPos = str.find_first_not_of(delimiters, 0);
- // Find first "non-delimiter".
- string::size_type pos = str.find_first_of(delimiters, lastPos);
-
- while (string::npos != pos || string::npos != lastPos) {
- // Found a token, add it to the vector.
- tokens.push_back(str.substr(lastPos, pos - lastPos));
- // Skip delimiters. Note the "not_of"
- lastPos = str.find_first_not_of(delimiters, pos);
- // Find next "non-delimiter"
- pos = str.find_first_of(delimiters, lastPos);
- }
-}
-
diff --git a/proteinortho5_clustering.cpp b/proteinortho5_clustering.cpp
index a7bbba1..6e07ae5 100755
--- a/proteinortho5_clustering.cpp
+++ b/proteinortho5_clustering.cpp
@@ -3,7 +3,7 @@
* Reads edge list and splits connected components
* according to algebraic connectivity threshold
*
- * Last updated: 2014/07/07
+ * Last updated: 2016/04/22
* Author: Marcus Lechner
*/
@@ -17,6 +17,7 @@
#include <vector>
#include <stack>
#include <iomanip>
+#include <cstdlib>
using namespace std;
// Structs
@@ -252,7 +253,7 @@ void partition_graph() {
// Connectivity analysis
float connectivity = getConnectivity(current_group);
- if (connectivity < param_con_threshold) {
+ if (connectivity < param_con_threshold && current_group.size() > 3) {
// Split groups is done in getConnectivity function
// Reset flags and repeat without incrementing protein counter
for (unsigned int i = 0; i < current_group.size(); i++) done[current_group[i]] = false;
@@ -260,7 +261,7 @@ void partition_graph() {
}
// Output
- print_group(current_group,connectivity);
+ if (connectivity >= param_con_threshold) {print_group(current_group,connectivity);}
// Print stats
stats(protein_id,protein_counter);
@@ -300,6 +301,20 @@ vector<unsigned int> get_deg_one (vector<unsigned int>& nodes) {
vector<float> generate_random_vector(const unsigned int size) {
vector<float> x(size);
for (unsigned int i = 0; i < size; i++) {
+ x[i] = (float)(rand() % 999+1)/1000; // 1 bis 99
+ // at least one value must be different from the others but still within 0 and 1
+ if (i > 0 && x[i] == x[i-1]) {
+ x[i] /= 3;
+ }
+// cerr << x[i] << endl;
+ }
+ return x;
+}
+
+// Generate random vector x of size size
+vector<float> generate_random_vector_old(const unsigned int size) {
+ vector<float> x(size);
+ for (unsigned int i = 0; i < size; i++) {
x[i] = (float)rand()/RAND_MAX;
// at least one value must be different from the others but still within 0 and 1
if (i > 0 && x[i] == x[i-1]) {
@@ -361,8 +376,43 @@ vector<float> getY(float max_degree, vector<float> x_hat, vector<float> x_new, v
return x_hat;
}
+
// Remove edges connectiong two groups a and b
void removeExternalEdges(map<unsigned int,bool>& a) {
+// cerr << "+#" << endl;
+// for (map<unsigned int,bool>::iterator it = a.begin(); it != a.end(); it++) {
+// unsigned int protein = it->first;
+// cerr << protein << endl;
+// }
+// cerr << "#-" << endl;
+
+ // For each protein in a
+ for (map<unsigned int,bool>::iterator it = a.begin(); it != a.end(); it++) {
+ unsigned int protein = it->first;
+ // For each target
+ vector<unsigned int> cleaned_edges;
+ bool swap = false;
+ for (vector<unsigned int>::iterator ita = graph[protein].edges.begin(); ita != graph[protein].edges.end(); ita++) {
+ // If it is not present the own group, set flag
+ if (a.find(*ita) == a.end()) {
+ graph_clean << graph[protein].full_name << "\t" << graph[*ita ].full_name << endl; // Improved graph cleaning
+ swap = true;
+ }
+ // Otherwise, add it to the new edge list
+ else {
+ cleaned_edges.push_back(*ita);
+ }
+ }
+ // If changes were made, swap edge list with new one
+ if (swap) {
+ cleaned_edges.swap(graph[protein].edges);
+ }
+ }
+}
+
+
+// Remove edges connectiong two groups a and b
+void removeExternalEdges_old(map<unsigned int,bool>& a) {
// For each protein in a
for (map<unsigned int,bool>::iterator it = a.begin(); it != a.end(); it++) {
unsigned int protein = it->first;
@@ -388,7 +438,7 @@ void removeExternalEdges(map<unsigned int,bool>& a) {
}
// Split connected component according to eigenvector
-void splitGroups(vector<float>& y, vector<unsigned int>& nodes){
+void splitGroups(vector<float>& y, vector<unsigned int>& nodes, map<unsigned int,unsigned int> mapping){
// Remove tree like structures in the beginning
vector<unsigned int> one = get_deg_one(nodes);
if (one.size() > 0) {
@@ -408,6 +458,8 @@ void splitGroups(vector<float>& y, vector<unsigned int>& nodes){
else {groupB[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#b01700}" << endl; }
}
+// cerr << groupA.size() << " -- " << groupB.size() << endl;
+
// Catch error in laplacien calcs
if (groupA.size() == 0 || groupB.size() == 0) {
throw "Failed to partition subgraph! This might lead to an infinit loop. Please submit the .blastgraph file to lechner at staff.uni-marburg.de to help fixing this issue.";
@@ -418,6 +470,18 @@ void splitGroups(vector<float>& y, vector<unsigned int>& nodes){
}
float getConnectivity(vector<unsigned int>& nodes) {
+ // Special cases hotfix
+ if (nodes.size() == 2) {return 1;}
+
+ if (nodes.size() == 3) {
+ vector<unsigned int> min = get_deg_one(nodes);
+ // fully connected
+ if (min.size() == 0) {return 1;}
+ // not
+ else {return 0.667;}
+ }
+ // Hotfix end
+
// Get max degree of nodes
unsigned int max_degree = max_deg(nodes);
@@ -445,17 +509,17 @@ float getConnectivity(vector<unsigned int>& nodes) {
x_hat = makeOrthogonal(y);
// Get lenght (lambda) & normalize vector
norm = nomalize(x_hat, ¤t_length);
-// cerr << "IT: " << current_length << endl;
- if (abs(current_length-last_length) < 0.000333 && iter >= 10) break; // min 10 iterations (prevent convergence by chance)
+ if (abs(current_length-last_length) < 0.0001 && iter >= 20) break; // min 20 iterations (prevent convergence by chance)
}
-
// cerr << nodes.size() << " nodes done after " << iter << " iterations" << endl;
float connectivity = (-current_length+2*max_degree)/(nodes.size());
+// cerr << nodes.size() << " " << connectivity << endl;
+
// Split groups if connectivity is too low, remove tree like structures that might have arosen
if (connectivity < param_con_threshold) {
- splitGroups(x_hat, nodes);
+ splitGroups(x_hat, nodes, mapping);
}
return connectivity;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/proteinortho.git
More information about the debian-med-commit
mailing list