[med-svn] [proteinortho] 01/06: Imported Upstream version 5.15+dfsg

Andreas Tille tille at debian.org
Mon Aug 29 13:16:15 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository proteinortho.

commit 7c6519ff208154e9ca5ad7590f655e5c37838bb4
Author: Andreas Tille <tille at debian.org>
Date:   Mon Aug 29 12:17:50 2016 +0200

    Imported Upstream version 5.15+dfsg
---
 chk_test.pl                  |  13 ----
 manual.html                  |  16 +++--
 proteinortho5.pl             |  84 +++++++++++++---------
 proteinortho5_clustering.cpp | 161 ++++++++++++++++++++++++++++---------------
 4 files changed, 170 insertions(+), 104 deletions(-)

diff --git a/chk_test.pl b/chk_test.pl
deleted file mode 100755
index d0b0df1..0000000
--- a/chk_test.pl
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/perl
-
-use warnings;
-use strict;
-
-open(FILE,"<$ARGV[0]") || die("Error, could not open file $ARGV[0]: $!");
-my @in = <FILE>;
-close(FILE);
-
-if (scalar(@in) < 30) {
-	print STDERR "Something went wrong. Test failed!\n";
-	exit 1;
-}
diff --git a/manual.html b/manual.html
index 8ae8d68..fb03622 100644
--- a/manual.html
+++ b/manual.html
@@ -4,7 +4,7 @@
 	</head>
 <body>
 <h1>Proteinortho Manual / PoFF Manual</h1>
-<small>This manual corresponds to version 5.13</small>
+<small>This manual corresponds to version 5.15</small>
 <h2>Introduction</h2>
 Proteinortho is a tool to detect orthologous genes within different species.
 For doing so, it compares similarities of given gene sequences and clusters them to find significant groups.
@@ -170,9 +170,17 @@ They are:
 <ul>
 <li>blastn+ → sequences are given as nucleotides
 <li>blastp+ → sequences are given as amino acids
-<li>blastn → sequences are given as nucleotides and blast legacy must be used
-<li>blastp → sequences are given as amino acids and blast legacy must be used
-</ul></td></tr>
+<li>tblastx+ → sequences are given as nucleotides and will be interpreted as (translated) amino acids in all three reading frames
+</ul>
+
+in case you only have access to blast legacy:
+<ul>
+<li>blastn → sequences are given as nucleotides
+<li>blastp → sequences are given as amino acids
+<li>tblastx → sequences are given as nucleotides and will be interpreted as (translated) amino acids in all three reading frames
+</ul>
+
+</td></tr>
 
 <tr><td>-project=</td><td>myproject</td><td> prefix for all result file names</td></tr>
 <tr><td>-temp=</td><td>working directory</td><td> path for temporary files</td></tr>
diff --git a/proteinortho5.pl b/proteinortho5.pl
index 22bbe97..92a08b5 100755
--- a/proteinortho5.pl
+++ b/proteinortho5.pl
@@ -27,10 +27,10 @@
 # input fasta files with proteins
 # output matrix with orthologous proteins
 # 
-# @author Marcus Lechner
+# @authors Marcus Lechner, Clemens Elias Thoelken
 # @email lechner at staff.uni-marburg.de
 # @company University of Maruburg
-# @date 2016-04-22
+# @date 2016-08-26
 #
 ##########################################################################################
 
@@ -47,7 +47,7 @@ use Thread::Queue;
 ##########################################################################################
 # Variables
 ##########################################################################################
-our $version = "5.13";
+our $version = "5.15";
 our $step = 0;		# 0/1/2/3	-> do all / only apply step 1 / only apply step 2 / only apply step 3
 our $verbose = 1;	# 0/1		-> don't / be verbose
 our $debug = 0;		# 0/1		-> don't / show debug data
@@ -172,6 +172,14 @@ our $simtable = "$project.proteinortho";			# Output file graph
 our $syntable = "$project.poff";					# Output file synteny
 our $desctable = "$project.descriptions";			# Output file seq descriptions
 
+
+if (-e "$project.proteinortho") {
+	print STDERR "!!!\nWarning: Data files for project '$project' already exists. Previous output files might be overwritten.\n";
+	print STDERR "Press CTRL + C to prevent me from proceeding)\nWaiting for 10 seconds...\n!!!\n";
+	sleep(10);
+	print STDERR "\nWell then, proceeding...\n\n";
+}
+
 ##########################################################################################
 # Run
 ##########################################################################################
@@ -211,7 +219,7 @@ if ($step == 0 || $step == 3) {
 
 
 ##########################################################################################
-# Funktions
+# Functions
 ##########################################################################################
 sub clean {
 	print STDERR "Removing temporary files...\n";
@@ -273,8 +281,9 @@ sub cluster {
 }
 
 sub print_header {
-	print STDERR "Proteinortho with PoFF version $version - An orthology detection tool\n",
-	             "******************************************************************\n";
+	print STDERR "*****************************************************************\n",
+		     "Proteinortho with PoFF version $version - An orthology detection tool\n",
+	             "*****************************************************************\n";
 }
 
 sub print_usage {
@@ -282,8 +291,8 @@ print STDERR << "JUS";
 
 Usage: proteinortho5.pl [OPTIONS] FASTA1 FASTA2 [FASTA...]
 Options: -e=          E-value for blast [default: 1e-05]
-         -p=          blast program {blastn|blastp|blastn+|blastp+}
-                      [default: blastp+]
+         -p=          blast program [default: blastp+]
+                      {blastn|blastp|tblastx|blastn+|blastp+|tblastx+}
          -project=    prefix for all result file names [default: myproject]
          -temp=       path for temporary files [default: working directory]
          -synteny     activate PoFF extension to separate similar sequences
@@ -320,7 +329,7 @@ Options: -e=          E-value for blast [default: 1e-05]
          In case jobs should be distributed onto several machines, use
          -startat=    File number to start with (default: 0)
          -stopat=     File number to end with (default: -1)
- 
+
 JUS
 }
 
@@ -345,7 +354,7 @@ sub set_threads_per_process {
 	lock($jobs_done);
 	my $willdo = ($jobs_todo-$jobs_done+$_[0]);
 
-	if ($debug) {	
+	if ($debug) {
 		print STDERR "\nTODO: $jobs_todo DONE: $jobs_done Running: $_[0] -> $willdo\n";
 	}
 
@@ -411,7 +420,7 @@ sub workerthread {
 			# If there is nothing
 			unless (defined($job)) {
 				# Check if more jobs need to be done
-	
+
 				{
 				lock($jobs_done);							# 	Productive
 				if ($jobs_done >= $jobs_todo) {						# 	Productive
@@ -422,7 +431,7 @@ sub workerthread {
 				}}
 				# If so, wait
 					if ($debug) {print STDERR "Thread $thread_id\tis seeking work ($jobs_done / $jobs_todo)\n";}
-				sleep(1);			
+				sleep(1);
 			}
 			else {
 				# Parse job
@@ -440,7 +449,7 @@ sub workerthread {
 		# Work
 		&set_threads_per_process(scalar(threads->list()));
 		my $result_ij = &blast($file_i,$file_j);
-		
+
 		my $result_ji;
 		if ($file_i eq $file_j) {
 			# One run is enough (selfblast)
@@ -476,9 +485,9 @@ sub workerthread {
 			my $synt_stats = qx($cmd);
 			chomp $synt_stats;
 			$synt_stats =~ s/#.+\n//;
-			
+
 			# Reverse mapping of full gene ids, two seperate maps in case of overlapps in short ids
-			my %full_id_map_i;				
+			my %full_id_map_i;
 			my %full_id_map_j;
 			foreach (sort keys %reciprocal_matches) {
 				my ($a, $b) = split(" ",$_);
@@ -540,7 +549,7 @@ sub workerthread {
 
 			{
 			lock($syn_lock);
-			
+
 			open(SYN,">>$syngraph") || die("Could not open file '$syngraph': $!");
 			print SYN "# $short_file_j\t$short_file_i\n";
 			print SYN "# Scores: $synt_stats\n";
@@ -649,7 +658,7 @@ sub adaptive_best_blast_matches {
 
 sub synteny_matches {
 	my %reciprocal_matches = %{(shift)};
-	my $file_i = shift; 
+	my $file_i = shift;
 	my $file_j = shift;
 
 	# Get order for both species (same hash as ids are non overlapping)
@@ -660,11 +669,11 @@ sub synteny_matches {
 		my %coords = %{&read_details($file)};
 		my $counter = 0;
 		# Number them according to their order
-		foreach my $id (sort 
+		foreach my $id (sort
 			{
 				my @a = split("\t",$coords{$a});
 				my @b = split("\t",$coords{$b});
-	
+
 #				#chr strand pos
 #				if ($a[0] ne $b[0]) {return $a[0] cmp $b[0];}
 #				if ($a[1] ne $b[1]) {return $a[1] cmp $b[1];}
@@ -700,7 +709,7 @@ sub synteny_matches {
 		if ($a[1] eq $b[1]) 	{$output .= "1\t";}	# Same strand?
 		else 			{$output .= "-1\t";}
 		my $score = (&edgeweight($values[0])+&edgeweight($values[2]))/2;	# Score made from e-values
-		$output .= $score."\n";	
+		$output .= $score."\n";
 	}
 
 	# Check multis
@@ -755,7 +764,7 @@ sub print_blast_stats {
 		if ($jobs_todo == 0) {die("Nothing to do. This should not happen!");}
 		lock($jobs_done);
 		my $percent = int($jobs_done/$jobs_todo*10000)/100;
-		print STDERR "\r                                                                               ";		
+		print STDERR "\r                                                                               ";
 		print STDERR "\rRunning blast analysis: $percent% ($jobs_done/$jobs_todo)";
 	}
 }
@@ -764,16 +773,16 @@ sub match {
 	my %length = %{(shift)};
 	my @i = @{(shift)};
 	my @j = @{(shift)};
-	
+
 	my %legal_i = &get_legal_matches(\%length, at i);
 	my %legal_j = &get_legal_matches(\%length, at j);
-	
+
 	return &get_reciprocal_matches(\%legal_i,\%legal_j);
 }
 
 sub get_reciprocal_matches {
 	my %i = %{(shift)};
-	my %j = %{(shift)}; 
+	my %j = %{(shift)};
 	my %reciprocal;
 
 	foreach (keys %i) {
@@ -795,7 +804,10 @@ sub get_legal_matches {
 	my %result;
 	foreach (@_) {
 		my ($query_id,$subject_id,$local_identity,$alignment_length,$mismatches,$openings,$query_start,$query_end,$subject_start,$subject_end,$local_evalue,$local_bitscore) = split(/\s+/);
-	
+
+		if ($debug) {print STDERR "?$query_id -> $subject_id ($local_evalue,$local_bitscore)\n";}
+
+
 		# Bug tracking
 		unless (defined($length{$query_id})) 	{print STDERR "ERROR: Query gene ID '$query_id' is present in blast output but was not present in FASTA input! Skipping line.\n"; next;}
 		unless (defined($length{$subject_id})) 	{print STDERR "ERROR: Subject gene ID '$subject_id' is present in blast output but was not present in FASTA input! Skipping line.\n"; next;}
@@ -803,10 +815,12 @@ sub get_legal_matches {
 		## Check for criteria
 		# Well formatted
 		if (!defined($local_bitscore)) 							{next;}
+
 		# Percent identity
 		if (!$twilight && $local_identity < $identity) 					{next;}
 		if ( $twilight && $local_identity < &identitybylength($alignment_length))	{next;} 
 		# Min. length
+		if ($blastmode eq "tblastx+" || $blastmode eq "tblastx") {$alignment_length *= 3;}
 		if ($alignment_length < $length{$query_id}*($coverage/100)+0.5) 		{next;}
 		if ($alignment_length < $length{$subject_id}*($coverage/100)+0.5) 		{next;}
 		# It hit itself (only during selfblast)
@@ -815,11 +829,12 @@ sub get_legal_matches {
 		# Similar hits? Take the better one
 		if (defined($result{"$query_id $subject_id"})) {
 			my ($remote_evalue, $remote_bitscore) = split(" ",$result{"$query_id $subject_id"});
-			if ($local_evalue > $remote_evalue) {next;}			
+			if ($local_evalue > $remote_evalue) {next;}
 			if ($local_bitscore < $remote_bitscore) {next;}
 		}
 
 		# Store data
+		if ($debug) {print STDERR "!$query_id -> $subject_id ($local_evalue,$local_bitscore)\n";}
 		$result{"$query_id $subject_id"} = "$local_evalue $local_bitscore";
 	}
 
@@ -860,9 +875,13 @@ sub generate_indices {
 
 sub blast {
 	my $command = "";
-	if 	($blastmode eq "blastp" || $blastmode eq "blastn") 	{lock($threads_per_process); $command = $blastpath."blastall -a $threads_per_process -d '$_[0]' -i '$_[1]' -p $blastmode -m8 -e $evalue $blastOptions";}
+	if 	($blastmode eq "blastp" || $blastmode eq "blastn" || $blastmode eq "tblastx") {
+			lock($threads_per_process);
+			$command = $blastpath."blastall -a $threads_per_process -d '$_[0]' -i '$_[1]' -p $blastmode -m8 -e $evalue $blastOptions";
+	}
 	elsif	($blastmode eq "blastp+") 				{lock($threads_per_process); $command = $blastpath."blastp -num_threads $threads_per_process -db '$_[0]' -query '$_[1]' -evalue $evalue -outfmt 6 $blastOptions";}
 	elsif	($blastmode eq "blastn+") 				{lock($threads_per_process); $command = $blastpath."blastn -num_threads $threads_per_process -db '$_[0]' -query '$_[1]' -evalue $evalue -outfmt 6 $blastOptions";}
+	elsif	($blastmode eq "tblastx+") 				{lock($threads_per_process); $command = $blastpath."tblastx -num_threads $threads_per_process -db '$_[0]' -query '$_[1]' -evalue $evalue -outfmt 6 $blastOptions";}
 	else	{die("This should not happen!");}
 
 	my $a = $_[0];
@@ -894,7 +913,7 @@ sub readFile {
 }
 
 sub check_blast {
-	if ($blastmode eq "blastp+" || $blastmode eq "blastn+") {
+	if ($blastmode eq "blastp+" || $blastmode eq "blastn+" || $blastmode eq "tblastx+") {
 		my $tmp = $blastmode;
 		$tmp =~ s/\+//g;
 		my $cmd = $blastpath."$tmp -h";
@@ -905,15 +924,15 @@ sub check_blast {
 
 			# Commands
 			if 	($blastmode eq "blastp+") {$makedb = $blastpath."makeblastdb -dbtype prot -in";}
-			elsif 	($blastmode eq "blastn+") {$makedb = $blastpath."makeblastdb -dbtype nucl -in";}
+			elsif 	($blastmode eq "blastn+" || $blastmode eq "tblastx+") {$makedb = $blastpath."makeblastdb -dbtype nucl -in";}
 			else	{die("This should not happen!");}
-  
+
 			print STDERR "Detected NCBI BLAST version $blastversion\n";
 			return;
 		}
 		&Error("Failed to detect '$blastmode'! Tried to call '$tmp'.");
 	}
-	elsif ($blastmode eq "blastp" || $blastmode eq "blastn") {
+	elsif ($blastmode eq "blastp" || $blastmode eq "blastn" || $blastmode eq "tblastx") {
 		my $cmd = $blastpath."blastall";
 		my @blastv = qx($cmd);
 		foreach (@blastv) {
@@ -922,6 +941,7 @@ sub check_blast {
 				$blastversion = $1;
 				if 	($blastmode eq "blastp") {$makedb = $blastpath."formatdb -p T -o F -i";}
 				elsif 	($blastmode eq "blastn") {$makedb = $blastpath."formatdb -p F -o F -i";}
+				elsif 	($blastmode eq "tblastx") {$makedb = $blastpath."formatdb -p F -o F -i";}
 				else	{die("This should not happen!");}
 
 				print STDERR "Detected NCBI BLAST version $blastversion\n";
@@ -946,7 +966,7 @@ sub check_files {
 
 sub convertNCBI {
 	my $long_id = shift;
-	$long_id =~ s/\|$//g;					
+	$long_id =~ s/\|$//g;
 	my @tmp = split(/\|/,$long_id);	# take the last column for NCBI format like patterns (e.g. gi|158333234|ref|YP_001514406.1|)
 	return pop(@tmp);
 }
diff --git a/proteinortho5_clustering.cpp b/proteinortho5_clustering.cpp
index 6e07ae5..ccc1b12 100755
--- a/proteinortho5_clustering.cpp
+++ b/proteinortho5_clustering.cpp
@@ -24,31 +24,35 @@ using namespace std;
 struct protein {vector<unsigned int> edges; unsigned int species_id; string full_name;};
 
 // Functions
-float string2float(string);
+double string2double(string);
 void tokenize(const string& , vector<string>& , const string&);
 void parse_file(string);
 void remove_edge_between(const unsigned int, const unsigned int);
 void remove_edge(protein&, const unsigned int);
-float getConnectivity(vector<unsigned int>&);
+double getConnectivity(vector<unsigned int>&);
 void clear_edges(vector<unsigned int>&);
 void partition_graph(void);
-void print_group(vector<unsigned int>& , float );
+void print_group(vector<unsigned int>& , double );
 void print_header(void);
+void sort_species(void);
 vector<unsigned int> get_deg_one (vector<unsigned int>& );
 
 // Parameters
 bool param_verbose 		= false;
-float param_con_threshold 	= 0.1;
+double param_con_threshold 	= 0.1;
 string param_rmgraph            = "remove.graph";
+unsigned int min_iter 		= 100;		// min number of alg con iterations
+unsigned int max_iter 		= 10000;	// max number of alg con iterations
 
 // Globals
 unsigned int species_counter = 0;	// Species
 unsigned int protein_counter = 0;	// Proteins
 vector<string> species;			// Number -> Name
 vector<protein> graph;			// Graph containing all protein data
-float last_stat = 0;			// For progress stats
+double last_stat = 0;			// For progress stats
 unsigned int edges = 0;			// number of edges
 ofstream graph_clean;			// File to store graph data
+vector<int> reorder_table;		// Tells how proteins/species must be sorted
 
 // TMP Globals
 map<string,int> species2id;		// Name -> Number
@@ -65,7 +69,7 @@ void printHelp() {
 	cerr << "" << endl;
 	cerr << "Usage:   proteinortho5_clustering [OPTIONS] graph_files..." << endl;
 	cerr << "Options: -verbose        report progress" << endl;
-	cerr << "         -conn FLOAT     threshold for connectivity [0.1]" << endl;
+	cerr << "         -conn double     threshold for connectivity [0.1]" << endl;
 	cerr << "         -rmgraph STRING output file for graph" << endl;
 }
 
@@ -86,13 +90,13 @@ int main(int argc, char *argv[]) {
 		}
 		else if (parameter == "-verbose") {
 			paras++;
-			if (string2float(string(argv[paras]))) {
+			if (string2double(string(argv[paras]))) {
 				param_verbose = true;
 			}
 		}
 		else if (parameter == "-conn") {
 			paras++;
-			param_con_threshold = string2float(string(argv[paras]));
+			param_con_threshold = string2double(string(argv[paras]));
 		}
 		else if (parameter == "-rmgraph") {
 			paras++;
@@ -117,6 +121,9 @@ int main(int argc, char *argv[]) {
 	// Stats
 	if (param_verbose) cerr << species_counter << " species" << endl << protein_counter << " paired proteins" << endl << edges << " bidirectional edges" << endl;
 
+	// Prepare sort of output
+	sort_species();
+
 	// Write output header
 	print_header();							
 
@@ -157,10 +164,25 @@ void print_edgelist(protein& protein) {
 ///////////////////////////////////////////////////////////
 // Output
 ///////////////////////////////////////////////////////////
+// Sort
+void sort_species(void) {
+	reorder_table.reserve(species_counter);
+	vector<string> species_sorted (species_counter);
+	copy(species.begin(), species.end(), species_sorted.begin());
+	sort(species_sorted.begin(), species_sorted.end());
+	// find new locations (not efficent but list is small)	
+	for (unsigned int i = 0; i < species_counter; i++) {
+		for (unsigned int j = 0; j < species_counter; j++) {
+			if (species[i] == species_sorted[j]) {reorder_table[j] = i; continue;}
+		}
+	}
+}
+
+
 // Progress stats
-void stats(float i, float size) {
+void stats(double i, double size) {
 	if (!param_verbose) return;
-	float stat = float(i/size*100);
+	double stat = double(i/size*100);
 	if (last_stat * 1.01 < stat) {
 		last_stat = stat;
 		cerr << "\r" << "                          ";
@@ -173,13 +195,13 @@ void stats(float i, float size) {
 void print_header() {
 	cout << "# Species\tGenes\tAlg.-Conn.";
 	for (unsigned int i = 0; i < species_counter; i++) {
-		cout << "\t" << species[i];
+		cout << "\t" << species[reorder_table[i]];
 	}
 	cout << endl;
 }
 
 // Group formatting
-void print_group(vector<unsigned int>& nodes, float connectivity) {
+void print_group(vector<unsigned int>& nodes, double connectivity) {
 	vector<string> line(species_counter,"*");	// Output vector
 
 	unsigned int species_number = 0;
@@ -197,8 +219,23 @@ void print_group(vector<unsigned int>& nodes, float connectivity) {
 	}
 
 	cout << species_number << "\t" << nodes.size() << "\t" << setprecision (3) << connectivity;
-	for (vector<string>::iterator it=line.begin() ; it < line.end(); it++) {
-		cout << "\t" << *it;
+
+	// List group data
+	for (unsigned int i = 0; i < species_counter; i++) {
+		
+		// sort line if multi protein
+		vector<string> fields;
+		tokenize(line[reorder_table[i]], fields, ",");
+		if (fields.size() > 1) {
+			sort(fields.begin(), fields.end());
+			line[reorder_table[i]] = fields[0];
+			for (unsigned int k = 1; k < fields.size(); k++) {
+				line[reorder_table[i]].append(","+fields[k]);
+			}
+		}
+
+		// output
+		cout << "\t" << line[reorder_table[i]];
 	}
 	cout << endl;
 }
@@ -251,7 +288,7 @@ void partition_graph() {
 		}
 
 		// Connectivity analysis
-		float connectivity = getConnectivity(current_group);
+		double connectivity = getConnectivity(current_group);
 
 		if (connectivity < param_con_threshold && current_group.size() > 3) {
 			// Split groups is done in getConnectivity function
@@ -298,10 +335,10 @@ vector<unsigned int> get_deg_one (vector<unsigned int>& nodes) {
 }
 
 // Generate random vector x of size size
-vector<float> generate_random_vector(const unsigned int size) {
-	vector<float> x(size);
+vector<double> generate_random_vector(const unsigned int size) {
+	vector<double> x(size);
 	for (unsigned int i = 0; i < size; i++) {
-	  x[i] = (float)(rand() % 999+1)/1000;	// 1 bis 99
+	  x[i] = (double)(rand() % 999+1)/1000;	// 1 bis 99
 		// at least one value must be different from the others but still within 0 and 1
 		if (i > 0 && x[i] == x[i-1]) {
 			x[i] /= 3;
@@ -312,10 +349,10 @@ vector<float> generate_random_vector(const unsigned int size) {
 }
 
 // Generate random vector x of size size
-vector<float> generate_random_vector_old(const unsigned int size) {
-	vector<float> x(size);
+vector<double> generate_random_vector_old(const unsigned int size) {
+	vector<double> x(size);
 	for (unsigned int i = 0; i < size; i++) {
-	  x[i] = (float)rand()/RAND_MAX;
+	  x[i] = (double)rand()/RAND_MAX;
 		// at least one value must be different from the others but still within 0 and 1
 		if (i > 0 && x[i] == x[i-1]) {
 			x[i] += 0.1;
@@ -329,8 +366,8 @@ vector<float> generate_random_vector_old(const unsigned int size) {
 }
 
 // determine new X, Formula (1)
-vector<float> get_new_x(vector<float> x, vector<unsigned int>& nodes, map<unsigned int,unsigned int>& mapping) {
-	vector<float> x_new(x.size());
+vector<double> get_new_x(vector<double> x, vector<unsigned int>& nodes, map<unsigned int,unsigned int>& mapping) {
+	vector<double> x_new(x.size());
 	// go through all nodes (rows of A)
 	for (unsigned int i = 0; i < nodes.size(); i++) {
 		unsigned int node = nodes[i];	// node x is node y here
@@ -348,17 +385,17 @@ vector<float> get_new_x(vector<float> x, vector<unsigned int>& nodes, map<unsign
 }
 
 // Make vector x orthogonal to 1, Formula (2)
-vector<float> makeOrthogonal(vector<float> x) {
-	float sum = 0;
+vector<double> makeOrthogonal(vector<double> x) {
+	double sum = 0;
 	for (unsigned int i = 0; i < x.size(); i++) {sum += x[i];}
-	float average = sum/x.size();
+	double average = sum/x.size();
 	for (unsigned int i = 0; i < x.size(); i++) {x[i] -= average;}
 	return x;
 }
 
 // Normalize vector x, Formula (4)
-vector<float> nomalize(vector<float> x, float *length) {
-	float sum = 0;
+vector<double> nomalize(vector<double> x, double *length) {
+	double sum = 0;
 	for (unsigned int i = 0; i < x.size(); i++) {sum += x[i]*x[i];}
 	*length = sqrt(sum);
 	if (*length == 0) {*length = 0.000000001;}	// ATTENTION not 0!
@@ -367,7 +404,7 @@ vector<float> nomalize(vector<float> x, float *length) {
 }
 
 // Qx, Formula (5)
-vector<float> getY(float max_degree, vector<float> x_hat, vector<float> x_new, vector<unsigned int>& nodes){
+vector<double> getY(double max_degree, vector<double> x_hat, vector<double> x_new, vector<unsigned int>& nodes){
 	// (2*maxdeg - grad_node_i ) * x_hat_i + new_x_i
 	for (unsigned int i = 0; i < x_hat.size(); i++) {
 		x_hat[i] *= (2*max_degree - graph[nodes[i]].edges.size());
@@ -438,17 +475,27 @@ void removeExternalEdges_old(map<unsigned int,bool>& a) {
 }
 
 // Split connected component according to eigenvector
-void splitGroups(vector<float>& y, vector<unsigned int>& nodes, map<unsigned int,unsigned int> mapping){
+void splitGroups(vector<double>& y, vector<unsigned int>& nodes){
+
+//	cerr << "Mission to split groups:" << endl << "####################" << endl;		///
+
+//	cerr << "Vector (" << nodes.size() << ")" << endl;
+//	for (unsigned int i = 0; i < nodes.size(); i++) {
+//		cerr << graph[nodes[i]].full_name << "\t" << nodes[i] << "\t" << y[i] << endl;
+//	}
+
+
 	// Remove tree like structures in the beginning
-	vector<unsigned int> one = get_deg_one(nodes);
-	if (one.size() > 0) {
-		map<unsigned int,bool> tree;
-		for (unsigned int i = 0; i < one.size(); i++) {
-			{tree[one[i]] = true;}
-		}
-		removeExternalEdges(tree);
-		return;
-	}
+//	vector<unsigned int> one = get_deg_one(nodes);
+//	if (one.size() > 0) {
+//		cerr << "Tree " << endl;		///
+//		map<unsigned int,bool> tree;
+//		for (unsigned int i = 0; i < one.size(); i++) {
+//			{tree[one[i]] = true;}
+//		}
+//		removeExternalEdges(tree);
+//		return;
+//	}
 
 	// Store data about two groups
 	map<unsigned int,bool> groupA, groupB;
@@ -469,7 +516,7 @@ void splitGroups(vector<float>& y, vector<unsigned int>& nodes, map<unsigned int
 	removeExternalEdges(groupB);
 }
 
-float getConnectivity(vector<unsigned int>& nodes) {
+double getConnectivity(vector<unsigned int>& nodes) {
 	// Special cases hotfix
 	if (nodes.size() == 2) {return 1;}
 
@@ -478,7 +525,7 @@ float getConnectivity(vector<unsigned int>& nodes) {
 		// fully connected
 		if (min.size() == 0) {return 1;}
 		// not
-		else {return 0.667;}
+		else {return 1/3;}
 	}
 	// Hotfix end
 
@@ -490,36 +537,40 @@ float getConnectivity(vector<unsigned int>& nodes) {
 	for (unsigned int i = 0; i < nodes.size(); i++) {mapping[nodes[i]] = i;}
 	
 	// Init randomized x 
-	vector<float> x = generate_random_vector(nodes.size());
+	vector<double> x = generate_random_vector(nodes.size());
 
 	// Orthogonalize + normalize vector + get initial lenght
-	float last_length, current_length = 0;
-	vector<float> x_hat = makeOrthogonal(x);
-	vector<float> norm = nomalize(x_hat, &last_length);
+	double last_length, current_length = 0;
+	vector<double> x_hat = makeOrthogonal(x);
+	vector<double> norm = nomalize(x_hat, &last_length);
 
 	// Repeat until difference < epsilon
-	unsigned int iter = 1000;							// max 1000 iterations (catch huge clustering issues)
-	while(iter-- != 0) { 
+	unsigned int iter = 0;							// catch huge clustering issues by keeping track here
+	while(iter++ < max_iter) { 
 		last_length = current_length;
 		// Get a new x
 		x = get_new_x(norm, nodes, mapping);
 		// Get y
-		vector<float> y = getY(max_degree,norm,x,nodes);
+		vector<double> y = getY(max_degree,norm,x,nodes);
 		// Orthogonalize
 		x_hat = makeOrthogonal(y);
 		// Get lenght (lambda) & normalize vector
 		norm = nomalize(x_hat, &current_length);
-		if (abs(current_length-last_length) < 0.0001 && iter >= 20) break;	// min 20 iterations (prevent convergence by chance)
+///		cerr << "I " << iter << ": " << abs(current_length-last_length) << endl; 
+		if (abs(current_length-last_length) < 0.00001 && iter >= min_iter) break;	// min 100 iterations (prevent convergence by chance), converge to 1e-6
 	}
-	//	cerr << nodes.size() << " nodes done after " << iter << " iterations" << endl;
+///		cerr << nodes.size() << " nodes done after " << iter << " iterations" << endl;
 
-	float connectivity = (-current_length+2*max_degree)/(nodes.size());
+	double connectivity = (-current_length+2*max_degree)/(nodes.size());
 
 //	cerr << nodes.size() << " " << connectivity << endl;
 
+///	if (connectivity > 1) {cerr << "DIE" << endl; throw "XX";}
+
 	// Split groups if connectivity is too low, remove tree like structures that might have arosen
 	if (connectivity < param_con_threshold) {
-		splitGroups(x_hat, nodes, mapping);
+///		cerr << "Conn: " << connectivity << endl; ///
+		splitGroups(x_hat, nodes);
 	}
 	
 	return connectivity;
@@ -628,10 +679,10 @@ void parse_file(string file) {
 ///////////////////////////////////////////////////////////
 // Misc functions
 ///////////////////////////////////////////////////////////
-// Convert string to float
-float string2float(string str) {
+// Convert string to double
+double string2double(string str) {
 	istringstream buffer(str);
-	float value;
+	double value;
 	buffer >> value;
 	return value;
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/proteinortho.git



More information about the debian-med-commit mailing list