[med-svn] [proteinortho] 01/06: Imported Upstream version 5.15+dfsg
Andreas Tille
tille at debian.org
Mon Aug 29 13:16:15 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository proteinortho.
commit 7c6519ff208154e9ca5ad7590f655e5c37838bb4
Author: Andreas Tille <tille at debian.org>
Date: Mon Aug 29 12:17:50 2016 +0200
Imported Upstream version 5.15+dfsg
---
chk_test.pl | 13 ----
manual.html | 16 +++--
proteinortho5.pl | 84 +++++++++++++---------
proteinortho5_clustering.cpp | 161 ++++++++++++++++++++++++++++---------------
4 files changed, 170 insertions(+), 104 deletions(-)
diff --git a/chk_test.pl b/chk_test.pl
deleted file mode 100755
index d0b0df1..0000000
--- a/chk_test.pl
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/perl
-
-use warnings;
-use strict;
-
-open(FILE,"<$ARGV[0]") || die("Error, could not open file $ARGV[0]: $!");
-my @in = <FILE>;
-close(FILE);
-
-if (scalar(@in) < 30) {
- print STDERR "Something went wrong. Test failed!\n";
- exit 1;
-}
diff --git a/manual.html b/manual.html
index 8ae8d68..fb03622 100644
--- a/manual.html
+++ b/manual.html
@@ -4,7 +4,7 @@
</head>
<body>
<h1>Proteinortho Manual / PoFF Manual</h1>
-<small>This manual corresponds to version 5.13</small>
+<small>This manual corresponds to version 5.15</small>
<h2>Introduction</h2>
Proteinortho is a tool to detect orthologous genes within different species.
For doing so, it compares similarities of given gene sequences and clusters them to find significant groups.
@@ -170,9 +170,17 @@ They are:
<ul>
<li>blastn+ → sequences are given as nucleotides
<li>blastp+ → sequences are given as amino acids
-<li>blastn → sequences are given as nucleotides and blast legacy must be used
-<li>blastp → sequences are given as amino acids and blast legacy must be used
-</ul></td></tr>
+<li>tblastx+ → sequences are given as nucleotides and will be interpreted as (translated) amino acids in all three reading frames
+</ul>
+
+in case you only have access to blast legacy:
+<ul>
+<li>blastn → sequences are given as nucleotides
+<li>blastp → sequences are given as amino acids
+<li>tblastx → sequences are given as nucleotides and will be interpreted as (translated) amino acids in all three reading frames
+</ul>
+
+</td></tr>
<tr><td>-project=</td><td>myproject</td><td> prefix for all result file names</td></tr>
<tr><td>-temp=</td><td>working directory</td><td> path for temporary files</td></tr>
diff --git a/proteinortho5.pl b/proteinortho5.pl
index 22bbe97..92a08b5 100755
--- a/proteinortho5.pl
+++ b/proteinortho5.pl
@@ -27,10 +27,10 @@
# input fasta files with proteins
# output matrix with orthologous proteins
#
-# @author Marcus Lechner
+# @authors Marcus Lechner, Clemens Elias Thoelken
# @email lechner at staff.uni-marburg.de
# @company University of Maruburg
-# @date 2016-04-22
+# @date 2016-08-26
#
##########################################################################################
@@ -47,7 +47,7 @@ use Thread::Queue;
##########################################################################################
# Variables
##########################################################################################
-our $version = "5.13";
+our $version = "5.15";
our $step = 0; # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
our $verbose = 1; # 0/1 -> don't / be verbose
our $debug = 0; # 0/1 -> don't / show debug data
@@ -172,6 +172,14 @@ our $simtable = "$project.proteinortho"; # Output file graph
our $syntable = "$project.poff"; # Output file synteny
our $desctable = "$project.descriptions"; # Output file seq descriptions
+
+if (-e "$project.proteinortho") {
+ print STDERR "!!!\nWarning: Data files for project '$project' already exists. Previous output files might be overwritten.\n";
+ print STDERR "Press CTRL + C to prevent me from proceeding)\nWaiting for 10 seconds...\n!!!\n";
+ sleep(10);
+ print STDERR "\nWell then, proceeding...\n\n";
+}
+
##########################################################################################
# Run
##########################################################################################
@@ -211,7 +219,7 @@ if ($step == 0 || $step == 3) {
##########################################################################################
-# Funktions
+# Functions
##########################################################################################
sub clean {
print STDERR "Removing temporary files...\n";
@@ -273,8 +281,9 @@ sub cluster {
}
sub print_header {
- print STDERR "Proteinortho with PoFF version $version - An orthology detection tool\n",
- "******************************************************************\n";
+ print STDERR "*****************************************************************\n",
+ "Proteinortho with PoFF version $version - An orthology detection tool\n",
+ "*****************************************************************\n";
}
sub print_usage {
@@ -282,8 +291,8 @@ print STDERR << "JUS";
Usage: proteinortho5.pl [OPTIONS] FASTA1 FASTA2 [FASTA...]
Options: -e= E-value for blast [default: 1e-05]
- -p= blast program {blastn|blastp|blastn+|blastp+}
- [default: blastp+]
+ -p= blast program [default: blastp+]
+ {blastn|blastp|tblastx|blastn+|blastp+|tblastx+}
-project= prefix for all result file names [default: myproject]
-temp= path for temporary files [default: working directory]
-synteny activate PoFF extension to separate similar sequences
@@ -320,7 +329,7 @@ Options: -e= E-value for blast [default: 1e-05]
In case jobs should be distributed onto several machines, use
-startat= File number to start with (default: 0)
-stopat= File number to end with (default: -1)
-
+
JUS
}
@@ -345,7 +354,7 @@ sub set_threads_per_process {
lock($jobs_done);
my $willdo = ($jobs_todo-$jobs_done+$_[0]);
- if ($debug) {
+ if ($debug) {
print STDERR "\nTODO: $jobs_todo DONE: $jobs_done Running: $_[0] -> $willdo\n";
}
@@ -411,7 +420,7 @@ sub workerthread {
# If there is nothing
unless (defined($job)) {
# Check if more jobs need to be done
-
+
{
lock($jobs_done); # Productive
if ($jobs_done >= $jobs_todo) { # Productive
@@ -422,7 +431,7 @@ sub workerthread {
}}
# If so, wait
if ($debug) {print STDERR "Thread $thread_id\tis seeking work ($jobs_done / $jobs_todo)\n";}
- sleep(1);
+ sleep(1);
}
else {
# Parse job
@@ -440,7 +449,7 @@ sub workerthread {
# Work
&set_threads_per_process(scalar(threads->list()));
my $result_ij = &blast($file_i,$file_j);
-
+
my $result_ji;
if ($file_i eq $file_j) {
# One run is enough (selfblast)
@@ -476,9 +485,9 @@ sub workerthread {
my $synt_stats = qx($cmd);
chomp $synt_stats;
$synt_stats =~ s/#.+\n//;
-
+
# Reverse mapping of full gene ids, two seperate maps in case of overlapps in short ids
- my %full_id_map_i;
+ my %full_id_map_i;
my %full_id_map_j;
foreach (sort keys %reciprocal_matches) {
my ($a, $b) = split(" ",$_);
@@ -540,7 +549,7 @@ sub workerthread {
{
lock($syn_lock);
-
+
open(SYN,">>$syngraph") || die("Could not open file '$syngraph': $!");
print SYN "# $short_file_j\t$short_file_i\n";
print SYN "# Scores: $synt_stats\n";
@@ -649,7 +658,7 @@ sub adaptive_best_blast_matches {
sub synteny_matches {
my %reciprocal_matches = %{(shift)};
- my $file_i = shift;
+ my $file_i = shift;
my $file_j = shift;
# Get order for both species (same hash as ids are non overlapping)
@@ -660,11 +669,11 @@ sub synteny_matches {
my %coords = %{&read_details($file)};
my $counter = 0;
# Number them according to their order
- foreach my $id (sort
+ foreach my $id (sort
{
my @a = split("\t",$coords{$a});
my @b = split("\t",$coords{$b});
-
+
# #chr strand pos
# if ($a[0] ne $b[0]) {return $a[0] cmp $b[0];}
# if ($a[1] ne $b[1]) {return $a[1] cmp $b[1];}
@@ -700,7 +709,7 @@ sub synteny_matches {
if ($a[1] eq $b[1]) {$output .= "1\t";} # Same strand?
else {$output .= "-1\t";}
my $score = (&edgeweight($values[0])+&edgeweight($values[2]))/2; # Score made from e-values
- $output .= $score."\n";
+ $output .= $score."\n";
}
# Check multis
@@ -755,7 +764,7 @@ sub print_blast_stats {
if ($jobs_todo == 0) {die("Nothing to do. This should not happen!");}
lock($jobs_done);
my $percent = int($jobs_done/$jobs_todo*10000)/100;
- print STDERR "\r ";
+ print STDERR "\r ";
print STDERR "\rRunning blast analysis: $percent% ($jobs_done/$jobs_todo)";
}
}
@@ -764,16 +773,16 @@ sub match {
my %length = %{(shift)};
my @i = @{(shift)};
my @j = @{(shift)};
-
+
my %legal_i = &get_legal_matches(\%length, at i);
my %legal_j = &get_legal_matches(\%length, at j);
-
+
return &get_reciprocal_matches(\%legal_i,\%legal_j);
}
sub get_reciprocal_matches {
my %i = %{(shift)};
- my %j = %{(shift)};
+ my %j = %{(shift)};
my %reciprocal;
foreach (keys %i) {
@@ -795,7 +804,10 @@ sub get_legal_matches {
my %result;
foreach (@_) {
my ($query_id,$subject_id,$local_identity,$alignment_length,$mismatches,$openings,$query_start,$query_end,$subject_start,$subject_end,$local_evalue,$local_bitscore) = split(/\s+/);
-
+
+ if ($debug) {print STDERR "?$query_id -> $subject_id ($local_evalue,$local_bitscore)\n";}
+
+
# Bug tracking
unless (defined($length{$query_id})) {print STDERR "ERROR: Query gene ID '$query_id' is present in blast output but was not present in FASTA input! Skipping line.\n"; next;}
unless (defined($length{$subject_id})) {print STDERR "ERROR: Subject gene ID '$subject_id' is present in blast output but was not present in FASTA input! Skipping line.\n"; next;}
@@ -803,10 +815,12 @@ sub get_legal_matches {
## Check for criteria
# Well formatted
if (!defined($local_bitscore)) {next;}
+
# Percent identity
if (!$twilight && $local_identity < $identity) {next;}
if ( $twilight && $local_identity < &identitybylength($alignment_length)) {next;}
# Min. length
+ if ($blastmode eq "tblastx+" || $blastmode eq "tblastx") {$alignment_length *= 3;}
if ($alignment_length < $length{$query_id}*($coverage/100)+0.5) {next;}
if ($alignment_length < $length{$subject_id}*($coverage/100)+0.5) {next;}
# It hit itself (only during selfblast)
@@ -815,11 +829,12 @@ sub get_legal_matches {
# Similar hits? Take the better one
if (defined($result{"$query_id $subject_id"})) {
my ($remote_evalue, $remote_bitscore) = split(" ",$result{"$query_id $subject_id"});
- if ($local_evalue > $remote_evalue) {next;}
+ if ($local_evalue > $remote_evalue) {next;}
if ($local_bitscore < $remote_bitscore) {next;}
}
# Store data
+ if ($debug) {print STDERR "!$query_id -> $subject_id ($local_evalue,$local_bitscore)\n";}
$result{"$query_id $subject_id"} = "$local_evalue $local_bitscore";
}
@@ -860,9 +875,13 @@ sub generate_indices {
sub blast {
my $command = "";
- if ($blastmode eq "blastp" || $blastmode eq "blastn") {lock($threads_per_process); $command = $blastpath."blastall -a $threads_per_process -d '$_[0]' -i '$_[1]' -p $blastmode -m8 -e $evalue $blastOptions";}
+ if ($blastmode eq "blastp" || $blastmode eq "blastn" || $blastmode eq "tblastx") {
+ lock($threads_per_process);
+ $command = $blastpath."blastall -a $threads_per_process -d '$_[0]' -i '$_[1]' -p $blastmode -m8 -e $evalue $blastOptions";
+ }
elsif ($blastmode eq "blastp+") {lock($threads_per_process); $command = $blastpath."blastp -num_threads $threads_per_process -db '$_[0]' -query '$_[1]' -evalue $evalue -outfmt 6 $blastOptions";}
elsif ($blastmode eq "blastn+") {lock($threads_per_process); $command = $blastpath."blastn -num_threads $threads_per_process -db '$_[0]' -query '$_[1]' -evalue $evalue -outfmt 6 $blastOptions";}
+ elsif ($blastmode eq "tblastx+") {lock($threads_per_process); $command = $blastpath."tblastx -num_threads $threads_per_process -db '$_[0]' -query '$_[1]' -evalue $evalue -outfmt 6 $blastOptions";}
else {die("This should not happen!");}
my $a = $_[0];
@@ -894,7 +913,7 @@ sub readFile {
}
sub check_blast {
- if ($blastmode eq "blastp+" || $blastmode eq "blastn+") {
+ if ($blastmode eq "blastp+" || $blastmode eq "blastn+" || $blastmode eq "tblastx+") {
my $tmp = $blastmode;
$tmp =~ s/\+//g;
my $cmd = $blastpath."$tmp -h";
@@ -905,15 +924,15 @@ sub check_blast {
# Commands
if ($blastmode eq "blastp+") {$makedb = $blastpath."makeblastdb -dbtype prot -in";}
- elsif ($blastmode eq "blastn+") {$makedb = $blastpath."makeblastdb -dbtype nucl -in";}
+ elsif ($blastmode eq "blastn+" || $blastmode eq "tblastx+") {$makedb = $blastpath."makeblastdb -dbtype nucl -in";}
else {die("This should not happen!");}
-
+
print STDERR "Detected NCBI BLAST version $blastversion\n";
return;
}
&Error("Failed to detect '$blastmode'! Tried to call '$tmp'.");
}
- elsif ($blastmode eq "blastp" || $blastmode eq "blastn") {
+ elsif ($blastmode eq "blastp" || $blastmode eq "blastn" || $blastmode eq "tblastx") {
my $cmd = $blastpath."blastall";
my @blastv = qx($cmd);
foreach (@blastv) {
@@ -922,6 +941,7 @@ sub check_blast {
$blastversion = $1;
if ($blastmode eq "blastp") {$makedb = $blastpath."formatdb -p T -o F -i";}
elsif ($blastmode eq "blastn") {$makedb = $blastpath."formatdb -p F -o F -i";}
+ elsif ($blastmode eq "tblastx") {$makedb = $blastpath."formatdb -p F -o F -i";}
else {die("This should not happen!");}
print STDERR "Detected NCBI BLAST version $blastversion\n";
@@ -946,7 +966,7 @@ sub check_files {
sub convertNCBI {
my $long_id = shift;
- $long_id =~ s/\|$//g;
+ $long_id =~ s/\|$//g;
my @tmp = split(/\|/,$long_id); # take the last column for NCBI format like patterns (e.g. gi|158333234|ref|YP_001514406.1|)
return pop(@tmp);
}
diff --git a/proteinortho5_clustering.cpp b/proteinortho5_clustering.cpp
index 6e07ae5..ccc1b12 100755
--- a/proteinortho5_clustering.cpp
+++ b/proteinortho5_clustering.cpp
@@ -24,31 +24,35 @@ using namespace std;
struct protein {vector<unsigned int> edges; unsigned int species_id; string full_name;};
// Functions
-float string2float(string);
+double string2double(string);
void tokenize(const string& , vector<string>& , const string&);
void parse_file(string);
void remove_edge_between(const unsigned int, const unsigned int);
void remove_edge(protein&, const unsigned int);
-float getConnectivity(vector<unsigned int>&);
+double getConnectivity(vector<unsigned int>&);
void clear_edges(vector<unsigned int>&);
void partition_graph(void);
-void print_group(vector<unsigned int>& , float );
+void print_group(vector<unsigned int>& , double );
void print_header(void);
+void sort_species(void);
vector<unsigned int> get_deg_one (vector<unsigned int>& );
// Parameters
bool param_verbose = false;
-float param_con_threshold = 0.1;
+double param_con_threshold = 0.1;
string param_rmgraph = "remove.graph";
+unsigned int min_iter = 100; // min number of alg con iterations
+unsigned int max_iter = 10000; // max number of alg con iterations
// Globals
unsigned int species_counter = 0; // Species
unsigned int protein_counter = 0; // Proteins
vector<string> species; // Number -> Name
vector<protein> graph; // Graph containing all protein data
-float last_stat = 0; // For progress stats
+double last_stat = 0; // For progress stats
unsigned int edges = 0; // number of edges
ofstream graph_clean; // File to store graph data
+vector<int> reorder_table; // Tells how proteins/species must be sorted
// TMP Globals
map<string,int> species2id; // Name -> Number
@@ -65,7 +69,7 @@ void printHelp() {
cerr << "" << endl;
cerr << "Usage: proteinortho5_clustering [OPTIONS] graph_files..." << endl;
cerr << "Options: -verbose report progress" << endl;
- cerr << " -conn FLOAT threshold for connectivity [0.1]" << endl;
+ cerr << " -conn double threshold for connectivity [0.1]" << endl;
cerr << " -rmgraph STRING output file for graph" << endl;
}
@@ -86,13 +90,13 @@ int main(int argc, char *argv[]) {
}
else if (parameter == "-verbose") {
paras++;
- if (string2float(string(argv[paras]))) {
+ if (string2double(string(argv[paras]))) {
param_verbose = true;
}
}
else if (parameter == "-conn") {
paras++;
- param_con_threshold = string2float(string(argv[paras]));
+ param_con_threshold = string2double(string(argv[paras]));
}
else if (parameter == "-rmgraph") {
paras++;
@@ -117,6 +121,9 @@ int main(int argc, char *argv[]) {
// Stats
if (param_verbose) cerr << species_counter << " species" << endl << protein_counter << " paired proteins" << endl << edges << " bidirectional edges" << endl;
+ // Prepare sort of output
+ sort_species();
+
// Write output header
print_header();
@@ -157,10 +164,25 @@ void print_edgelist(protein& protein) {
///////////////////////////////////////////////////////////
// Output
///////////////////////////////////////////////////////////
+// Sort
+void sort_species(void) {
+ reorder_table.reserve(species_counter);
+ vector<string> species_sorted (species_counter);
+ copy(species.begin(), species.end(), species_sorted.begin());
+ sort(species_sorted.begin(), species_sorted.end());
+ // find new locations (not efficent but list is small)
+ for (unsigned int i = 0; i < species_counter; i++) {
+ for (unsigned int j = 0; j < species_counter; j++) {
+ if (species[i] == species_sorted[j]) {reorder_table[j] = i; continue;}
+ }
+ }
+}
+
+
// Progress stats
-void stats(float i, float size) {
+void stats(double i, double size) {
if (!param_verbose) return;
- float stat = float(i/size*100);
+ double stat = double(i/size*100);
if (last_stat * 1.01 < stat) {
last_stat = stat;
cerr << "\r" << " ";
@@ -173,13 +195,13 @@ void stats(float i, float size) {
void print_header() {
cout << "# Species\tGenes\tAlg.-Conn.";
for (unsigned int i = 0; i < species_counter; i++) {
- cout << "\t" << species[i];
+ cout << "\t" << species[reorder_table[i]];
}
cout << endl;
}
// Group formatting
-void print_group(vector<unsigned int>& nodes, float connectivity) {
+void print_group(vector<unsigned int>& nodes, double connectivity) {
vector<string> line(species_counter,"*"); // Output vector
unsigned int species_number = 0;
@@ -197,8 +219,23 @@ void print_group(vector<unsigned int>& nodes, float connectivity) {
}
cout << species_number << "\t" << nodes.size() << "\t" << setprecision (3) << connectivity;
- for (vector<string>::iterator it=line.begin() ; it < line.end(); it++) {
- cout << "\t" << *it;
+
+ // List group data
+ for (unsigned int i = 0; i < species_counter; i++) {
+
+ // sort line if multi protein
+ vector<string> fields;
+ tokenize(line[reorder_table[i]], fields, ",");
+ if (fields.size() > 1) {
+ sort(fields.begin(), fields.end());
+ line[reorder_table[i]] = fields[0];
+ for (unsigned int k = 1; k < fields.size(); k++) {
+ line[reorder_table[i]].append(","+fields[k]);
+ }
+ }
+
+ // output
+ cout << "\t" << line[reorder_table[i]];
}
cout << endl;
}
@@ -251,7 +288,7 @@ void partition_graph() {
}
// Connectivity analysis
- float connectivity = getConnectivity(current_group);
+ double connectivity = getConnectivity(current_group);
if (connectivity < param_con_threshold && current_group.size() > 3) {
// Split groups is done in getConnectivity function
@@ -298,10 +335,10 @@ vector<unsigned int> get_deg_one (vector<unsigned int>& nodes) {
}
// Generate random vector x of size size
-vector<float> generate_random_vector(const unsigned int size) {
- vector<float> x(size);
+vector<double> generate_random_vector(const unsigned int size) {
+ vector<double> x(size);
for (unsigned int i = 0; i < size; i++) {
- x[i] = (float)(rand() % 999+1)/1000; // 1 bis 99
+ x[i] = (double)(rand() % 999+1)/1000; // 1 bis 99
// at least one value must be different from the others but still within 0 and 1
if (i > 0 && x[i] == x[i-1]) {
x[i] /= 3;
@@ -312,10 +349,10 @@ vector<float> generate_random_vector(const unsigned int size) {
}
// Generate random vector x of size size
-vector<float> generate_random_vector_old(const unsigned int size) {
- vector<float> x(size);
+vector<double> generate_random_vector_old(const unsigned int size) {
+ vector<double> x(size);
for (unsigned int i = 0; i < size; i++) {
- x[i] = (float)rand()/RAND_MAX;
+ x[i] = (double)rand()/RAND_MAX;
// at least one value must be different from the others but still within 0 and 1
if (i > 0 && x[i] == x[i-1]) {
x[i] += 0.1;
@@ -329,8 +366,8 @@ vector<float> generate_random_vector_old(const unsigned int size) {
}
// determine new X, Formula (1)
-vector<float> get_new_x(vector<float> x, vector<unsigned int>& nodes, map<unsigned int,unsigned int>& mapping) {
- vector<float> x_new(x.size());
+vector<double> get_new_x(vector<double> x, vector<unsigned int>& nodes, map<unsigned int,unsigned int>& mapping) {
+ vector<double> x_new(x.size());
// go through all nodes (rows of A)
for (unsigned int i = 0; i < nodes.size(); i++) {
unsigned int node = nodes[i]; // node x is node y here
@@ -348,17 +385,17 @@ vector<float> get_new_x(vector<float> x, vector<unsigned int>& nodes, map<unsign
}
// Make vector x orthogonal to 1, Formula (2)
-vector<float> makeOrthogonal(vector<float> x) {
- float sum = 0;
+vector<double> makeOrthogonal(vector<double> x) {
+ double sum = 0;
for (unsigned int i = 0; i < x.size(); i++) {sum += x[i];}
- float average = sum/x.size();
+ double average = sum/x.size();
for (unsigned int i = 0; i < x.size(); i++) {x[i] -= average;}
return x;
}
// Normalize vector x, Formula (4)
-vector<float> nomalize(vector<float> x, float *length) {
- float sum = 0;
+vector<double> nomalize(vector<double> x, double *length) {
+ double sum = 0;
for (unsigned int i = 0; i < x.size(); i++) {sum += x[i]*x[i];}
*length = sqrt(sum);
if (*length == 0) {*length = 0.000000001;} // ATTENTION not 0!
@@ -367,7 +404,7 @@ vector<float> nomalize(vector<float> x, float *length) {
}
// Qx, Formula (5)
-vector<float> getY(float max_degree, vector<float> x_hat, vector<float> x_new, vector<unsigned int>& nodes){
+vector<double> getY(double max_degree, vector<double> x_hat, vector<double> x_new, vector<unsigned int>& nodes){
// (2*maxdeg - grad_node_i ) * x_hat_i + new_x_i
for (unsigned int i = 0; i < x_hat.size(); i++) {
x_hat[i] *= (2*max_degree - graph[nodes[i]].edges.size());
@@ -438,17 +475,27 @@ void removeExternalEdges_old(map<unsigned int,bool>& a) {
}
// Split connected component according to eigenvector
-void splitGroups(vector<float>& y, vector<unsigned int>& nodes, map<unsigned int,unsigned int> mapping){
+void splitGroups(vector<double>& y, vector<unsigned int>& nodes){
+
+// cerr << "Mission to split groups:" << endl << "####################" << endl; ///
+
+// cerr << "Vector (" << nodes.size() << ")" << endl;
+// for (unsigned int i = 0; i < nodes.size(); i++) {
+// cerr << graph[nodes[i]].full_name << "\t" << nodes[i] << "\t" << y[i] << endl;
+// }
+
+
// Remove tree like structures in the beginning
- vector<unsigned int> one = get_deg_one(nodes);
- if (one.size() > 0) {
- map<unsigned int,bool> tree;
- for (unsigned int i = 0; i < one.size(); i++) {
- {tree[one[i]] = true;}
- }
- removeExternalEdges(tree);
- return;
- }
+// vector<unsigned int> one = get_deg_one(nodes);
+// if (one.size() > 0) {
+// cerr << "Tree " << endl; ///
+// map<unsigned int,bool> tree;
+// for (unsigned int i = 0; i < one.size(); i++) {
+// {tree[one[i]] = true;}
+// }
+// removeExternalEdges(tree);
+// return;
+// }
// Store data about two groups
map<unsigned int,bool> groupA, groupB;
@@ -469,7 +516,7 @@ void splitGroups(vector<float>& y, vector<unsigned int>& nodes, map<unsigned int
removeExternalEdges(groupB);
}
-float getConnectivity(vector<unsigned int>& nodes) {
+double getConnectivity(vector<unsigned int>& nodes) {
// Special cases hotfix
if (nodes.size() == 2) {return 1;}
@@ -478,7 +525,7 @@ float getConnectivity(vector<unsigned int>& nodes) {
// fully connected
if (min.size() == 0) {return 1;}
// not
- else {return 0.667;}
+ else {return 1/3;}
}
// Hotfix end
@@ -490,36 +537,40 @@ float getConnectivity(vector<unsigned int>& nodes) {
for (unsigned int i = 0; i < nodes.size(); i++) {mapping[nodes[i]] = i;}
// Init randomized x
- vector<float> x = generate_random_vector(nodes.size());
+ vector<double> x = generate_random_vector(nodes.size());
// Orthogonalize + normalize vector + get initial lenght
- float last_length, current_length = 0;
- vector<float> x_hat = makeOrthogonal(x);
- vector<float> norm = nomalize(x_hat, &last_length);
+ double last_length, current_length = 0;
+ vector<double> x_hat = makeOrthogonal(x);
+ vector<double> norm = nomalize(x_hat, &last_length);
// Repeat until difference < epsilon
- unsigned int iter = 1000; // max 1000 iterations (catch huge clustering issues)
- while(iter-- != 0) {
+ unsigned int iter = 0; // catch huge clustering issues by keeping track here
+ while(iter++ < max_iter) {
last_length = current_length;
// Get a new x
x = get_new_x(norm, nodes, mapping);
// Get y
- vector<float> y = getY(max_degree,norm,x,nodes);
+ vector<double> y = getY(max_degree,norm,x,nodes);
// Orthogonalize
x_hat = makeOrthogonal(y);
// Get lenght (lambda) & normalize vector
norm = nomalize(x_hat, ¤t_length);
- if (abs(current_length-last_length) < 0.0001 && iter >= 20) break; // min 20 iterations (prevent convergence by chance)
+/// cerr << "I " << iter << ": " << abs(current_length-last_length) << endl;
+ if (abs(current_length-last_length) < 0.00001 && iter >= min_iter) break; // min 100 iterations (prevent convergence by chance), converge to 1e-6
}
- // cerr << nodes.size() << " nodes done after " << iter << " iterations" << endl;
+/// cerr << nodes.size() << " nodes done after " << iter << " iterations" << endl;
- float connectivity = (-current_length+2*max_degree)/(nodes.size());
+ double connectivity = (-current_length+2*max_degree)/(nodes.size());
// cerr << nodes.size() << " " << connectivity << endl;
+/// if (connectivity > 1) {cerr << "DIE" << endl; throw "XX";}
+
// Split groups if connectivity is too low, remove tree like structures that might have arosen
if (connectivity < param_con_threshold) {
- splitGroups(x_hat, nodes, mapping);
+/// cerr << "Conn: " << connectivity << endl; ///
+ splitGroups(x_hat, nodes);
}
return connectivity;
@@ -628,10 +679,10 @@ void parse_file(string file) {
///////////////////////////////////////////////////////////
// Misc functions
///////////////////////////////////////////////////////////
-// Convert string to float
-float string2float(string str) {
+// Convert string to double
+double string2double(string str) {
istringstream buffer(str);
- float value;
+ double value;
buffer >> value;
return value;
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/proteinortho.git
More information about the debian-med-commit
mailing list