[med-svn] [Git][med-team/proteinortho][upstream] New upstream version 6.1.2+dfsg
Andreas Tille (@tille)
gitlab at salsa.debian.org
Fri Nov 4 20:07:10 GMT 2022
Andreas Tille pushed to branch upstream at Debian Med / proteinortho
Commits:
9c332f8f by Andreas Tille at 2022-11-04T20:46:12+01:00
New upstream version 6.1.2+dfsg
- - - - -
14 changed files:
- .gitignore
- CHANGELOG
- CHANGEUID
- Makefile
- proteinortho6.pl
- src/.gitlab-ci.yml
- src/chk_test.pl
- src/proteinortho2html.pl
- src/proteinortho_clustering.cpp
- + test/K5.blast-graph
- + test/P3_K5.blast-graph
- + test/P4.blast-graph
- + test/myproject.blast-graph
- + test/ring4_K5.blast-graph
Changes:
=====================================
.gitignore
=====================================
@@ -1,3 +1,5 @@
+cluster_n*
+.nfs00*
mcl.*
src/lapack-3.8.0/
myproject.*
=====================================
CHANGELOG
=====================================
@@ -352,3 +352,16 @@
- proteinortho_summary.pl improved output logs
- improved README, --help output
- improved proteinortho_history including now the *.info parameters
+ 22. Sept (6030) v6.1.2
+ - new make test cases for step=3
+ - removed -purity from proteinortho6.pl
+ - proteinortho_clustering:
+ - removed purity (now purity is determined automatically),
+ - improved clustering based on purity threshold (groupZero is now added back to the QUEUE)
+ - BUGFIX: for the rare case that the +/- components of the Fiedler vector are not connected (~1%)
+ - BUGFIX: for clustering without purity (previously one cluster was omitted)
+ 24. Sept (6040) v6.1.2
+ - new: automatic purity detection (proteinortho_clustering)
+ - BUGFIX: remove.graph was missing edges removed from purity
+ 24. Oct (6041) v6.1.2
+ - BUGFIX: proteinortho2html sometimes got the ',' wrong in the pop-up window.
\ No newline at end of file
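The "automatic purity detection" entries above refer to the new logic in src/proteinortho_clustering.cpp further down in this diff: candidate purity thresholds are taken from the small |x_hat| entries of the Fiedler vector (capped at 0.1), every candidate is scored by the total weight of the edges it would cut away, and the cheapest non-trivial cut wins, otherwise purity stays disabled. A minimal standalone C++ sketch of that selection idea only, assuming a simplified graph given as a plain weighted edge list; choose_purity, Edge and the toy values are illustrative names, not the proteinortho_clustering internals:

// A minimal sketch of the purity-selection idea (assumed, simplified data structures;
// not the proteinortho_clustering code): pick the threshold that cuts the least total edge weight.
#include <cmath>
#include <cstdio>
#include <vector>

struct Edge { unsigned a, b; float w; };   // undirected edge with weight

// x_hat: Fiedler vector entry per node; edges: weighted edge list of the component.
float choose_purity(const std::vector<float>& x_hat, const std::vector<Edge>& edges) {
    // Baseline (purity disabled): cut only the edges crossing the sign boundary.
    float best_t = 0.0f, best_cut = 0.0f;
    for (const Edge& e : edges)
        if ((x_hat[e.a] < 0) != (x_hat[e.b] < 0)) best_cut += e.w;
    // Every small |x_hat| value (capped at 0.1, as in the diff) is a candidate threshold.
    for (float cand : x_hat) {
        float t = std::fabs(cand);
        if (t > 0.1f) continue;
        float cut = 0.0f; unsigned impure = 0;
        for (float v : x_hat) if (std::fabs(v) <= t) ++impure;
        for (const Edge& e : edges)
            if ((std::fabs(x_hat[e.a]) <= t) != (std::fabs(x_hat[e.b]) <= t)) cut += e.w;
        if (cut > 0 && cut < best_cut && impure > 1) { best_t = t; best_cut = cut; }
    }
    return best_t;   // 0 means purity stays disabled
}

int main() {
    // Toy path a-b-c: only one node sits near zero in the Fiedler vector
    // (cf. the param_sep_purity comment in proteinortho_clustering.cpp),
    // so the baseline wins and the returned threshold is 0 (disabled).
    std::vector<float> x_hat = { -0.707f, 2.3e-8f, 0.707f };
    std::vector<Edge>  edges = { {0, 1, 1.0f}, {1, 2, 1.0f} };
    std::printf("chosen purity threshold: %g\n", choose_purity(x_hat, edges));
    return 0;
}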
=====================================
CHANGEUID
=====================================
@@ -1 +1 @@
-6020
+6041
=====================================
Makefile
=====================================
@@ -3,7 +3,6 @@
# Run 'make' for compiling everything in the current directory (using the installed version of lapack in e.g. /usr/lib/, you can install lapack with e.g. apt-get install libatlas3-base or liblapack3)
# Run 'make STATIC=TRUE' for a static version
-# Run 'make USELAPACK=FALSE' for a version without(!) LAPACK (only power iteration is used)
# Run 'make USEPRECOMPILEDLAPACK=FALSE' for directly recompiling the provided lapack version 3.8.0 and linking dynamically
# Run 'make CXX=g++-7' to use the g++-7 compiler. See Flags below for more information
@@ -46,9 +45,9 @@ endif
ifdef installdir
INSTALLDIR=$(installdir)
endif
-ifdef LAPACK
-USELAPACK=$(LAPACK)
-endif
+#ifdef LAPACK
+#USELAPACK=$(LAPACK)
+#endif
# normalize true and TRUE
ifeq ($(MTUNEARCH),true)
@@ -57,9 +56,9 @@ endif
ifeq ($(STATIC),true)
STATIC=TRUE
endif
-ifeq ($(USELAPACK),true)
-USELAPACK=TRUE
-endif
+#ifeq ($(USELAPACK),true)
+#USELAPACK=TRUE
+#endif
ifeq ($(DEPLOY),true)
DEPLOY=TRUE
endif
@@ -165,8 +164,8 @@ echoENV:
@echo $(LDLIBS)
@echo -n "STATIC = "
@echo $(STATIC)
- @echo -n "USELAPACK = "
- @echo $(USELAPACK)
+# @echo -n "USELAPACK = "
+# @echo $(USELAPACK)
# 1. Try to compile statically with LAPACK
# 2. try to compile dynamically with the given lapack lib in src/
@@ -179,7 +178,6 @@ ifeq ($(DEPLOY),TRUE)
@echo "[ 20%] Building **proteinortho_clustering** with LAPACK (static linking, no mtune,march , for gitlab)";
@echo "$(CXX) -static $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -Wl,--allow-multiple-definition -llapack -lblas -lgfortran -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -lquadmath" && $(CXX) -static $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -Wl,--allow-multiple-definition -llapack -lblas -lgfortran -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -lquadmath && ([ $$? -eq 0 ] && $@ -test && [ $$? -eq 0 ] )
else
-ifeq ($(USELAPACK),TRUE)
ifeq ($(USEPRECOMPILEDLAPACK),TRUE)
ifeq ($(STATIC),TRUE)
@echo "[ 20%] Building **proteinortho_clustering** with LAPACK (static linking)";
@@ -223,13 +221,13 @@ else
@echo "[ 20%] Building **proteinortho_clustering** with LAPACK (dynamic linking)";
@echo "$(CXX) -std=c++11 -O3 -funroll-loops $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ $< -Isrc/lapack-3.8.0/build/include/ -Lsrc/lapack-3.8.0/build/lib/ -llapack -lblas $(LDFLAGS) $(LDLIBS) -lgfortran" && $(CXX) -std=c++11 -O3 -funroll-loops $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ $< -Isrc/lapack-3.8.0/build/include/ -Lsrc/lapack-3.8.0/build/lib/ -llapack -lblas $(LDFLAGS) $(LDLIBS) -lgfortran;
endif
-endif
-ifeq ($(USELAPACK),FALSE)
- @echo "[ 20%] Building **proteinortho_clustering** WITHOUT(!) LAPACK";
- @echo "$(CXX) -std=c++11 -O3 -funroll-loops $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ src/proteinortho_clustering_nolapack.cpp $(LDFLAGS) $(LDLIBS) -static" && $(CXX) -std=c++11 -O3 -funroll-loops $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ src/proteinortho_clustering_nolapack.cpp $(LDFLAGS) $(LDLIBS) -static && ([ $$? -eq 0 ] ) || ( \
- echo "......$(ORANGE)static linking failed of proteinortho_clustering_nolapack, now i switch to dynamic linking.$(NC)"; \
- echo "$(CXX) -std=c++11 -O3 -funroll-loops $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ src/proteinortho_clustering_nolapack.cpp $(LDFLAGS) $(LDLIBS)" && $(CXX) -std=c++11 -O3 -funroll-loops $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ src/proteinortho_clustering_nolapack.cpp $(LDFLAGS) $(LDLIBS) && echo "......OK dynamic linking was successful for proteinortho_clustering_nolapack!"; )
-endif
+#endif
+#ifeq ($(USELAPACK),FALSE)
+# @echo "[ 20%] Building **proteinortho_clustering** WITHOUT(!) LAPACK";
+# @echo "$(CXX) -std=c++11 -O3 -funroll-loops $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ src/proteinortho_clustering_nolapack.cpp $(LDFLAGS) $(LDLIBS) -static" && $(CXX) -std=c++11 -O3 -funroll-loops $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ src/proteinortho_clustering_nolapack.cpp $(LDFLAGS) $(LDLIBS) -static && ([ $$? -eq 0 ] ) || ( \
+# echo "......$(ORANGE)static linking failed of proteinortho_clustering_nolapack, now i switch to dynamic linking.$(NC)"; \
+# echo "$(CXX) -std=c++11 -O3 -funroll-loops $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ src/proteinortho_clustering_nolapack.cpp $(LDFLAGS) $(LDLIBS)" && $(CXX) -std=c++11 -O3 -funroll-loops $(CPPFLAGS) $(CXXFLAGS) $(MTUNEARCHFLAGS) -fopenmp -o $@ src/proteinortho_clustering_nolapack.cpp $(LDFLAGS) $(LDLIBS) && echo "......OK dynamic linking was successful for proteinortho_clustering_nolapack!"; )
+#endif
endif
$(BUILDDIR)/proteinortho_cleanupblastgraph: src/cleanupblastgraph.cpp
@@ -273,7 +271,7 @@ install: proteinortho6.pl proteinortho $(BUILDDIR)/proteinortho_extract_from_gra
@echo "If needed you can add $(INSTALLDIR) to \$$PATH with 'export PATH=\$$PATH:$(INSTALLDIR)'."
.PHONY: test
-test: proteinortho6.pl test_clean test_step2 test_step3 test_clean2
+test: proteinortho6.pl test_clean test_step2 test_clean2 test_step3 test_step3_ring4_K5_power test_step3_ring4_K5_lapack test_step3_P3_K5 test_step3_P4 test_step3_K5
@echo "[TEST] All tests $(GREEN)passed$(NC)"
.PHONY: test_step2
@@ -389,28 +387,65 @@ test_step2: proteinortho6.pl
fi
.PHONY: test_step3
-test_step3: proteinortho6.pl test_step2
+test_step3: proteinortho6.pl
+ ./proteinortho6.pl -silent -force -project=test_blastp -p=blastp test/*.faa;
@echo "[TEST] 2. -step=3 tests (proteinortho_clustering) "
- @echo -n " [1/2] various test functions of proteinortho_clustering (if this fails, try make clean first): "; \
+ @echo -n " [1/3] various test functions of proteinortho_clustering (if this fails, try make clean first): "; \
OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -test> /dev/null 2>&1
@echo "$(GREEN)passed$(NC)"
- @echo -n " [2/2] Compare results of 'with lapack' and 'without lapack': "; \
- OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -epsilon 0 test_blastp.blast-graph> /dev/null 2>&1; \
- sort remove.graph -o test.A> /dev/null 2>&1; \
- OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -epsilon 0 -lapack 0 test_blastp.blast-graph> /dev/null 2>&1; \
- sort remove.graph -o test.B> /dev/null 2>&1; \
- set -e ; diff test.A test.B;
+ @echo -n " [2/3] Test proteinortho_clustering using lapack: "; \
+ OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 2 test_blastp.blast-graph -debug 1 2>test_lapack.err >test_lapack.proteinortho.tsv ; \
+ set -e ; ./src/chk_test.pl test_lapack.proteinortho.tsv;
+ @echo "$(GREEN)passed$(NC)"
+ @perl -lne 'print join "\t", sort split("\t",$$_)' remove.graph | sort > test.A;
+ @echo -n " [3/3] Test proteinortho_clustering using power: "; \
+ OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 0 test_blastp.blast-graph -debug 1 2>test_power.err >test_power.proteinortho.tsv; \
+ set -e ; ./src/chk_test.pl test_power.proteinortho.tsv;
+ @echo "$(GREEN)passed$(NC)"
+ #@perl -lne 'print join "\t", sort split("\t",$$_)' remove.graph | sort > test.B;
+ #@echo -n " [4/4] Test differences between 'using lapack' and 'using power': "; \
+ #set -e ; diff test.A test.B;
+ #@echo "$(GREEN)passed$(NC)"
+
+.PHONY: test_step3_ring4_K5_power
+test_step3_ring4_K5_power: proteinortho6.pl
+ OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 0 test/ring4_K5.blast-graph >test_power_ring4_K5.proteinortho.tsv 2>/dev/null ; \
+ set -e ; ./src/chk_test.pl test_power_ring4_K5.proteinortho.tsv "ring4_K5";
+ @echo "$(GREEN)passed$(NC)"
+
+.PHONY: test_step3_ring4_K5_lapack
+test_step3_ring4_K5_lapack: proteinortho6.pl
+ OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 2 test/ring4_K5.blast-graph >test_lapack_ring4_K5.proteinortho.tsv 2>/dev/null ; \
+ set -e ; ./src/chk_test.pl test_lapack_ring4_K5.proteinortho.tsv "ring4_K5";
+ @echo "$(GREEN)passed$(NC)"
+
+.PHONY: test_step3_P3_K5
+test_step3_P3_K5: proteinortho6.pl
+ OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 2 test/P3_K5.blast-graph >test_lapack_P3_K5.proteinortho.tsv 2>/dev/null ; \
+ set -e ; ./src/chk_test.pl test_lapack_P3_K5.proteinortho.tsv "P3_K5";
+ @echo "$(GREEN)passed$(NC)"
+
+.PHONY: test_step3_P4
+test_step3_P4: proteinortho6.pl
+ OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 2 test/P4.blast-graph >test_lapack_P4.proteinortho.tsv 2>/dev/null ; \
+ set -e ; ./src/chk_test.pl test_lapack_P4.proteinortho.tsv "P4";
+ @echo "$(GREEN)passed$(NC)"
+
+.PHONY: test_step3_K5
+test_step3_K5: proteinortho6.pl
+ OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 2 test/K5.blast-graph >test_lapack_K5.proteinortho.tsv 2>/dev/null ; \
+ set -e ; ./src/chk_test.pl test_lapack_K5.proteinortho.tsv "K5";
@echo "$(GREEN)passed$(NC)"
.PHONY: test_clean
test_clean:
@echo "[TEST] Clean up all test files..."; \
- rm -rf proteinortho_cache_test_* test.* test_* test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*> /dev/null 2>&1;
+ rm -rf remove.graph proteinortho_cache_test_* test.* test_* test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*> /dev/null 2>&1;
.PHONY: test_clean2
test_clean2:
@echo "[TEST] Clean up all test files..."; \
- rm -rf proteinortho_cache_test_* test.* test_* test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*> /dev/null 2>&1;
+ rm -rf remove.graph proteinortho_cache_test_* test.* test_* test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*> /dev/null 2>&1;
.PHONY: clean
clean:
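For orientation: the new test_step3_* targets drive proteinortho_clustering directly on small synthetic graphs. Going by the file names and the expectations coded in src/chk_test.pl below, K5 is a complete graph on five nodes, P4 a path of four nodes, P3_K5 a path of three K5s and ring4_K5 a ring of four K5s (expected to split into 1, 1, 3 and 4 groups, respectively). The following C++ sketch only prints the assumed ring4_K5 topology as a plain tab-separated edge list to make the name concrete; the shipped test/*.blast-graph fixtures use proteinortho's own blast-graph format (header lines, per-direction scores), and the clique/bridge weights here are arbitrary illustrative values:

// Illustration only: emit a "ring of four K5s" as node_a <TAB> node_b <TAB> weight.
// This is NOT the blast-graph file format used by the real test fixtures.
#include <cstdio>

int main() {
    const int k = 5, copies = 4;
    // heavy edges inside each complete graph K5
    for (int c = 0; c < copies; ++c)
        for (int i = 0; i < k; ++i)
            for (int j = i + 1; j < k; ++j)
                std::printf("n%d_%d\tn%d_%d\t%.1f\n", c, i, c, j, 100.0);
    // one light bridge edge between consecutive K5s, closing the ring
    for (int c = 0; c < copies; ++c)
        std::printf("n%d_%d\tn%d_%d\t%.1f\n", c, 0, (c + 1) % copies, 0, 10.0);
    return 0;
}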
=====================================
proteinortho6.pl
=====================================
@@ -244,14 +244,11 @@ PoFF: weight of adjacencies vs. sequence similarity
report singleton genes without any hit
-=item B<--purity>=float (default: 1e-7)
-
-avoid spurious graph assignments
-
=item B<--conn>=float (default: 0.1)
min. algebraic connectivity
-This is the main parameter for the clustering step. Choose larger values then more splits are done, resulting in more and smaller clusters. (There are still cluster with an alg. conn. below this given threshold allowed if the protein to species ratio is good enough, see -minspecies option below)
+This is the main parameter for the clustering step.
+Larger values cause more splits, resulting in more and smaller clusters. (Clusters with an alg. conn. below this given threshold are still allowed if the protein-to-species ratio is good enough, see the -minspecies option below)
=item B<--minspecies>=float (default: 1, must be >=0)
@@ -492,6 +489,8 @@ Lechner, M., Findeisz, S., Steiner, L., Marz, M., Stadler, P. F., & Prohaska, S.
# enables the mcl algorithm for clustering instead of power iteration or lapacks routine. (needs mcl to be installed, call 'mcl' to test if you installed mcl correctly)
# =item B<--ram>=number (default: 75% of free memory)
# maximal used ram threshold for LAPACK and the input graph in MB. By default 75% of the free memory is used
+# =item B<--purity>=float (>0, default: choose value automatically)
+# avoid spurious graph assignments; the lower the purity threshold, the more uncertain edges are cut
##########################################################################################
# Imports
@@ -510,7 +509,7 @@ use POSIX;
##########################################################################################
# Variables
##########################################################################################
-our $version = "6.1.1";
+our $version = "6.1.2";
our $step = 0; # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
our $verbose = 1; # 0/1 -> don't / be verbose
our $core = 0;
@@ -527,7 +526,7 @@ our $alpha = 0.5; # Alpha value for src/proteinortho_ffadj_mcs.py
our $connectivity = "0.1"; # min algebraic connectivity (normal eigenvalue) for clustering step (-step=3), not the evalue cutoff for convergence
our $cpus = 0; # 0 = autodetect
our $evalue = "1e-05"; # evalue cutoff for blast step (-step=2), different to the clustering evalue cutoff (set in proteinortho_clustering.cpp)
-our $purity = "1e-07"; # for clustering step: if entries (absolute value) of the vector are blow the purity -> treated as '0'
+our $purity = "-1"; # for clustering step: if entries (absolute value) of the vector are below the purity -> treated as '0'
our $coverage = 50; # Percent coverage threshold for two proteins
our $identity = 25; # Percent identity threshold for two proteins
our $blastmode = "diamond";
@@ -652,7 +651,7 @@ foreach my $option (@ARGV) {
elsif ($option =~ m/^--?hmm_?evalue=([0-9\.eE+-]+)$/) { $hmm_search_evalue = $1; }
elsif ($option =~ m/^--?hmm_?e=([0-9\.eE+-]+)$/) { $hmm_search_evalue = $1; }
elsif ($option =~ m/^--?hmm_?minprots?=([0-9]+)$/) { $hmm_minprots = $1; }
- elsif ($option =~ m/^--?purity=([0-9\.]+)$/) { $purity = $1; }
+ elsif ($option =~ m/^--?purity=([0-9\.]+)$/) { print STDERR "$ORANGE"."[WARNING]$NC The --purity option is deprecated\n"; } # $purity = $1;
elsif ($option =~ m/^--?report=([0-9]+)$/) { $report = $1; }
elsif ($option =~ m/^--?minspecies=([0-9.]+)$/) { if($1>=0){$minspecies = $1;}else{&Error("the argument -minspecies=$1 is invalid.$RED minspecies needs to be >=0!$NC\nminspecies: the min. number of genes per species. If a group is found with up to (minspecies) genes/species, it wont be split again (regardless of the connectivity).");} }
elsif ($option =~ m/^--?conn=([0-9\.]+)$/) { $connectivity = $1; }
@@ -1047,9 +1046,9 @@ if ($step == 0 || $step == 2) {
}# ok all db files exists
if($verbose){print STDERR "\n$GREEN**Step 2**$NC using $blastmode ";
- if($synteny || $selfblast){print STDERR "with :";}
- if($synteny){print STDERR " synteny";}
- if($selfblast){print STDERR " selfblast";}
+ if($verbose && ($synteny || $selfblast)){print STDERR "with :";}
+ if($verbose && $synteny){print STDERR " synteny";}
+ if($verbose && $selfblast){print STDERR " selfblast";}
print STDERR "\n"; }
&init_graph_output; # Initiate Output file(s)
@@ -1345,19 +1344,14 @@ sub cluster {
#$ENV{'OMP_NUM_THREADS'}=1;
#$ENV{'OMP_PROC_BIND'}=$ompprocbind;
-
- my $cmd="OMP_NUM_THREADS=1 $po_path/proteinortho_clustering ".($core?"-core -coreMinSpecies $coreMinSpecies -coreMaxProts $coreMaxProts ":"")."$cluster_verbose_level -minspecies $minspecies -cpus $cpus -weighted 1 -conn $connectivity -purity $purity ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -rmgraph '$rm_simgraph' '$simgraph'* >'$simtable' ".($verbose >= 1 ? "" : "2>/dev/null");
+ #-purity $purity
+ my $cmd="OMP_NUM_THREADS=1 $po_path/proteinortho_clustering ".($core?"-core -coreMinSpecies $coreMinSpecies -coreMaxProts $coreMaxProts ":"")."$cluster_verbose_level -minspecies $minspecies -cpus $cpus -weighted 1 -conn $connectivity ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -rmgraph '$rm_simgraph' '$simgraph'* >'$simtable' ".($verbose >= 1 ? "" : "2>/dev/null");
if($debug>0){ print STDERR "$cmd\n"; }
system($cmd);
if ($? != 0) {
- #if($verbose){print STDERR "$ORANGE"."[NOTE]$NC I restart proteinortho_clustering, it seems like the OMP_PROC_BIND flag is not compatible with the system (this has no effect on the output)$NC\n";}
- #$ENV{'OMP_NUM_THREADS'}=1;
- #system ("OMP_NUM_THREADS=1 $po_path/proteinortho_clustering ".($core?"-core -coreMinSpecies $coreMinSpecies -coreMaxProts $coreMaxProts ":"")."$cluster_verbose_level -minspecies $minspecies -cpus $cpus -weighted 1 -conn $connectivity -purity $purity ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -rmgraph '$rm_simgraph' '$simgraph'* >'$simtable' ".($verbose >= 1 ? "" : "2>/dev/null"));
- #if ($? != 0) {
- &Error("'proteinortho_clustering' failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nMaybe your operating system does not support the statically compiled version, please try recompiling proteinortho with 'make clean' and 'make' (and 'make install PREFIX=...').");
- #}
+ &Error("'proteinortho_clustering' failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nMaybe your operating system does not support the statically compiled version, please try recompiling proteinortho with 'make clean' and 'make' (and 'make install PREFIX=...').");
}
}else{
my $stderrout = `$po_path/proteinortho_do_mcl.pl $cpus $simgraph*`;
@@ -1424,16 +1418,11 @@ sub cluster {
#$ENV{'OMP_NUM_THREADS'}=1;
#$ENV{'OMP_PROC_BIND'}=$ompprocbind;
- system ("OMP_NUM_THREADS=1 $po_path/proteinortho_clustering ".($core?"-core -coreMinSpecies $coreMinSpecies -coreMaxProts $coreMaxProts ":"")."$cluster_verbose_level ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -minspecies $minspecies -cpus $cpus -weighted 1 -conn $connectivity -purity $purity -rmgraph '$rm_syngraph' '$syngraph'* >'$syntable' ".($verbose >= 1 ? "" : "2>/dev/null"));
+ #-purity $purity
+ system ("OMP_NUM_THREADS=1 $po_path/proteinortho_clustering ".($core?"-core -coreMinSpecies $coreMinSpecies -coreMaxProts $coreMaxProts ":"")."$cluster_verbose_level ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -minspecies $minspecies -cpus $cpus -weighted 1 -conn $connectivity -rmgraph '$rm_syngraph' '$syngraph'* >'$syntable' ".($verbose >= 1 ? "" : "2>/dev/null"));
- #if($verbose){print STDERR "$ORANGE"."[NOTE]$NC I restart proteinortho_clustering, it seems like the OMP_PROC_BIND flag is not compatible with the system (this has no effect on the output)$NC\n";}
if ($? != 0) {
- #if($verbose){print STDERR "$ORANGE"."[NOTE]$NC I restart proteinortho_clustering, it seems like the OMP_PROC_BIND flag is not compatible with the system (this has no effect on the output)$NC\n";}
- #$ENV{'OMP_NUM_THREADS'}=1;
- #system ("OMP_NUM_THREADS=1 $po_path/proteinortho_clustering ".($core?"-core -coreMinSpecies $coreMinSpecies -coreMaxProts $coreMaxProts ":"")."$cluster_verbose_level ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -minspecies $minspecies -cpus $cpus -weighted 1 -conn $connectivity -purity $purity -rmgraph '$rm_syngraph' '$syngraph'* >'$syntable' ".($verbose >= 1 ? "" : "2>/dev/null"));
- #if ($? != 0) {
- &Error("proteinortho_clustering failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nDid you use a static version? Maybe your operating system does not support the static compiled version, please recompile 'make clean' and 'make' or 'make USEPRECOMPILEDLAPACK=FALSE'.");
- #}
+ &Error("proteinortho_clustering failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nDid you use a static version? Maybe your operating system does not support the static compiled version, please recompile 'make clean' and 'make' or 'make USEPRECOMPILEDLAPACK=FALSE'.");
}
}else{
system($po_path.'/proteinortho_do_mcl.pl '.$cpus.' '.$syngraph.'*');
@@ -1579,7 +1568,6 @@ Options:
Larger values cause more splits, resulting in more and smaller clusters.
(Clusters with an alg. conn. below this given threshold are still allowed if the protein-to-species ratio is good enough, see -minspecies option below)
-xml produces an OrthoXML formatted file of the *.proteinortho.tsv file.
- -purity= avoid spurious graph assignments, the higher the more uncertain edges are cut [0-1, default: 1e-07]
-nograph do not generate .proteinortho-graph file (pairwise orthology relations)
-core stop clustering if a split would result in groups that do not span across all species of the initial connected component. Overrules the -conn threshold.
@@ -1608,9 +1596,9 @@ Do you have suggestions or need more help: write a mail to lechner\@staff.uni-ma
";
}
# -mcl enables the mcl algorithm for clustering instead of power iteration or lapacks routine. (needs mcl to be installed)
-
# DEPRECATED -ram= maximal used ram threshold for LAPACK and the input graph in MB [default: 75% of the free memory]
# -memory= the amount of ram used for partitioning in MB [default 75% of the total memory]
+# -purity= avoid spurious graph assignments; the lower the purity threshold, the more uncertain edges are cut [>0, default: choose value automatically]
sub init_graph_output {
# if (-e $graph) {
@@ -2209,7 +2197,7 @@ sub get_legal_matches {
#}
if ($alignment_length < $length{$query_id}*($coverage/100)+0.5) {if ($debug>1) {print STDERR "!$query_id -> $subject_id is removed because of coverage threshold (alignment_length=$alignment_length<".( $length{$query_id}*($coverage/100)+0.5)."=$length{$query_id}*($coverage/100)+0.5)=length{query_id}*(coverage/100)+0.5,$query_id)\n";} next;}
- if ($alignment_length < $length{$subject_id}*($coverage/100)+0.5) {if ($debug>1) {print STDERR "!$query_id -> $subject_id is removed because of coverage threshold (alignment_length=$alignment_length<".( $length{$subject_id}*($coverage/100)+0.5)."=$length{$subject_id}*($coverage/100)+0.5)=length{subject_id}*(coverage/100)+0.5,$subject_id)\n";} next;}
+ if ($alignment_length < $length{$subject_id}*($coverage/100)+0.5) {if ($debug>1) {print STDERR "!$query_id -> $subject_id is removed because of coverage threshold (alignment_length=$alignment_length<".( $length{$subject_id}*($coverage/100)+0.5)."=$length{$subject_id}*($coverage/100)+0.5)=length{subject_id}*(coverage/100)+0.5,$subject_id)\n";} next;}
if($isoform ne ""){
=====================================
src/.gitlab-ci.yml
=====================================
@@ -107,23 +107,23 @@ gcc-latest-diamond:
- rm *info;
- for f in myproject.*; do if [ "$(grep -v '#' $f | wc -l | awk '{print $1}' | tr -d '\n')" -lt 10 ]; then echo "$f failed size check, output should be more than 10 lines..."; exit 1; fi; done
-nolapack-gcc-latest:
- image: gcc
- stage: test
- script:
- - echo "installing diamond"
- - wget http://github.com/bbuchfink/diamond/releases/download/v2.0.6/diamond-linux64.tar.gz 2>/dev/null
- - tar xzf diamond-linux64.tar.gz
- - cp diamond $HOME
- - export PATH="$PATH:$HOME"
- - echo "start proteinortho tests"
- - gcc --version
- - make clean
- - make LAPACK=FALSE
- - make test
- - perl proteinortho*pl test/*faa || exit 1
- - rm *info;
- - for f in myproject.*; do if [ "$(grep -v '#' $f | wc -l | awk '{print $1}' | tr -d '\n')" -lt 10 ]; then echo "$f failed size check, output should be more than 10 lines..."; exit 1; fi; done
+#nolapack-gcc-latest:
+# image: gcc
+# stage: test
+# script:
+# - echo "installing diamond"
+# - wget http://github.com/bbuchfink/diamond/releases/download/v2.0.6/diamond-linux64.tar.gz 2>/dev/null
+# - tar xzf diamond-linux64.tar.gz
+# - cp diamond $HOME
+# - export PATH="$PATH:$HOME"
+# - echo "start proteinortho tests"
+# - gcc --version
+# - make clean
+# - make LAPACK=FALSE
+# - make test
+# - perl proteinortho*pl test/*faa || exit 1
+# - rm *info;
+# - for f in myproject.*; do if [ "$(grep -v '#' $f | wc -l | awk '{print $1}' | tr -d '\n')" -lt 10 ]; then echo "$f failed size check, output should be more than 10 lines..."; exit 1; fi; done
#gcc5:
# image: gcc:5
=====================================
src/chk_test.pl
=====================================
@@ -3,11 +3,28 @@
use warnings;
use strict;
+my $check_case="";
+if(scalar @ARGV == 2){
+ $check_case=$ARGV[1];
+}
+
open(FILE,"<$ARGV[0]") || die("Error, could not open file $ARGV[0]: $!");
my @in = <FILE>;
close(FILE);
-if (scalar(@in) < 15) {
+if($check_case eq "ring4_K5"){
+ # check if there are 4 groups with 5 species and 5 proteins (ring of 4*K5)
+ exit( 4 != (scalar grep {my @arr=split("\t",$_); scalar @arr >2 && $arr[0] == 5 && $arr[1] == 5 } grep {$_=~m/^[^#]/} @in));
+}elsif($check_case eq "P3_K5"){
+ # check if there are 3 groups with 5 species and 5 proteins (Path of 3*K5)
+ exit( 3 != (scalar grep {my @arr=split("\t",$_); scalar @arr >2 && $arr[0] == 5 && $arr[1] == 5 } grep {$_=~m/^[^#]/} @in));
+}elsif($check_case eq "P4"){
+ # check if there is one group containing all entries (4 species, 4 proteins)
+ exit( 1 != (scalar grep {my @arr=split("\t",$_); scalar @arr >2 && $arr[0] == 4 && $arr[1] == 4 } grep {$_=~m/^[^#]/} @in));
+}elsif($check_case eq "K5"){
+ # check if there is one group containing all entries (5 species, 5 proteins)
+ exit( 1 != (scalar grep {my @arr=split("\t",$_); scalar @arr >2 && $arr[0] == 5 && $arr[1] == 5 } grep {$_=~m/^[^#]/} @in));
+}elsif (scalar(@in) < 15) {
print STDERR "Outputfile $ARGV[0] is smaller than expected. Test failed!\n";
exit 1;
}
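The new checks in chk_test.pl all follow the same pattern: skip '#' comment lines, split each remaining line of the *.proteinortho.tsv on tabs, and count the groups whose first two columns (number of species, number of genes) match the expectation of the named test case; the test passes only if the count is exact, for example exactly four 5/5 groups for ring4_K5. A C++ rendering of that ring4_K5 count, for illustration only and assuming the counts appear as plain integers in the first two columns:

// C++ rendering of the ring4_K5 branch above: count non-comment lines whose first
// two tab-separated columns are both "5" and require exactly 4 such groups.
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

int main(int argc, char** argv) {
    if (argc < 2) { std::cerr << "usage: check <file.proteinortho.tsv>\n"; return 2; }
    std::ifstream in(argv[1]);
    if (!in) { std::cerr << "cannot open " << argv[1] << "\n"; return 2; }
    std::string line;
    int groups_5_5 = 0;
    while (std::getline(in, line)) {
        if (line.empty() || line[0] == '#') continue;            // skip header/comments
        std::istringstream ls(line);
        std::string species, genes, rest;
        if (std::getline(ls, species, '\t') && std::getline(ls, genes, '\t')
            && std::getline(ls, rest, '\t')                      // require >2 columns
            && species == "5" && genes == "5")
            ++groups_5_5;
    }
    return groups_5_5 == 4 ? 0 : 1;                              // 0 = test passed
}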
=====================================
src/proteinortho2html.pl
=====================================
@@ -204,19 +204,19 @@ a!=c[b];c[b]=a;return d}};return p});
}
function showpopup(id) {
- var popup = document.getElementById("myPopupText"+id);
- if(popup.style.display=="inline-block"){
- popup.style.display="none";
- }else{
- popup.style.display="inline-block";
- }
+ var popup = document.getElementById("myPopupText"+id);
+ if(popup.style.display=="inline-block"){
+ popup.style.display="none";
+ }else{
+ popup.style.display="inline-block";
+ }
- var popupbutton = document.getElementById("myPopup"+id);
- if(popupbutton.innerHTML.includes(\'(+)\')){
- popupbutton.innerHTML = popupbutton.innerHTML.replace( /\(\+\)/,\'(-)\');
- }else{
- popupbutton.innerHTML = popupbutton.innerHTML.replace( /\(\-\)/,\'(+)\');
- }
+ var popupbutton = document.getElementById("myPopup"+id);
+ if(popupbutton.innerHTML.includes(\'(+)\')){
+ popupbutton.innerHTML = popupbutton.innerHTML.replace( /\(\+\)/,\'(-)\');
+ }else{
+ popupbutton.innerHTML = popupbutton.innerHTML.replace( /\(\-\)/,\'(+)\');
+ }
}
</script>
<style>
@@ -820,7 +820,7 @@ print '<script type="text/javascript">
if($coli>2){
my @ar=split(",",$k);
- print "<td>".$ar[0];
+ print "<td>".$ar[0];
if(scalar(@ar)>1){
@@ -829,8 +829,7 @@ print '<script type="text/javascript">
if($hiddentext ne ""){
$hiddentext.=", ";
}
- $hiddentext.=$ar[$ii];
-
+ $hiddentext.=$ar[$ii];
}
print '<div class="popup" onclick="showpopup(\\\''.$rowi."_".$coli.'\\\')" id="myPopup'.$rowi."_".$coli.'"><font color="blue">(+)</font></div><div class="popuptext" id="myPopupText'.$rowi."_".$coli.'"> '.$hiddentext.'</div>';
}
@@ -1119,7 +1118,7 @@ print '
if(genes!="" && genes.slice(-1)!=","){
genes+=",";
}
- genes+=tds[i].innerHTML.replace(/([<]([^>]+)[>])/ig,"").replace(/([(]([+-]+)[)])/ig,",").replace(/[^()]+[(]([^)]+)[)]/ig,"$1,").replace(/ /ig,"").replace(/,$/ig,"").replace(/,,+/ig,",").replace(/[()]/g,"");
+ genes+=tds[i].innerHTML.replace(/([<]([^>]+)[>])/ig,"").replace(/[^()]+[(]([^)]+)[)]/ig,"$1,").replace(/ /ig,"").replace(/,$/ig,"").replace(/([(]([+-]+)[)])/ig,",").replace(/,,+/ig,",").replace(/[()]/g,"");
}
}
=====================================
src/proteinortho_clustering.cpp
=====================================
@@ -3,7 +3,7 @@
* Reads edge list and splits connected components
* according to algebraic connectivity threshold
*
- * Last updated: 2022/06/28
+ * Last updated: 2022/09/21
* Author: Marcus Lechner, Paul Klemm
*/
@@ -11,7 +11,7 @@
#define _PROTEINORTHOCLUSTERING
//#define DEBUG
-//#define timeAnalysis //if set : need -std=c++11 for compiling
+//#define timeAnalysis
#include <iostream>
#include <fstream>
@@ -75,19 +75,16 @@ void parse_file(string);
// void remove_edge_index(const unsigned int, const unsigned int);
float getConnectivity_float(vector<unsigned int>*,bool,vector<float>*);
double getConnectivity_double(vector<unsigned int>*,bool,vector<double>*);
-void clear_edges(vector<unsigned int>&);
void partition_graph(void);
void print_header(void);
void sort_species(void);
void stats(unsigned int,unsigned int,bool);
string getTime(void);
-
-// Parameters (prefix param_*)
bool param_verbose = true;
bool param_core = false;
float param_con_threshold = 0.1; // as a reference: a chain a-b-c-d has 0.25
unsigned int debug_level = 0;
- float param_sep_purity = 1e-7; // as a reference: a-b-c will give +/-0.707107 and 2.34857e-08
+ float param_sep_purity = -1; // as a reference: a-b-c will give +/-0.707107 and 2.34857e-08
unsigned int param_max_nodes = 16777216; // 2^24
float param_min_species = 1;
string param_rmgraph = "remove.graph";
@@ -97,16 +94,14 @@ string getTime(void);
unsigned int param_coreMinSpecies = 0;
// bool param_useKmereHeuristic = false; // deprecated
// unsigned int param_maxRam_inKB = 16777216; // = 16 GB of memory, deprecated
- bool param_useLapack = true;
+ int param_useLapack = 1;
// min/max number of alg con iterations
unsigned int critical_min_nodes = 16777216;
const unsigned int min_iter = 16; // below this value, results may vary
unsigned int param_max_iter = 8192; // below this value, results may vary
float param_epsilon = 1e-8; // analog to http://people.sc.fsu.edu/~jburkardt/c_src/power_method/power_method_prb.c
-const unsigned int kmereHeuristic_minNodes = 1048576; //2^20
-const unsigned int kmereHeuristic_protPerSpecies = 1;
-const unsigned int kmereHeuristic_minNumberOfGroups = 3;
+int param_double = 1;
unsigned int param_max_nodes_weight = 1048576; //2^20
float param_lapack_power_threshold_d = -1;//2048; //2^8
@@ -314,10 +309,8 @@ void dispatch_queue::start(){
}
}
-dispatch_queue::~dispatch_queue()
-{
- if(debug_level>0)
- std::cerr << "Destructor: Destroying dispatch threads... threads:" << threads_.size() << " q:" << q_.size() << "\n";
+dispatch_queue::~dispatch_queue(){
+ if(debug_level>0) std::cerr << getTime() << " Destructor: Destroying dispatch threads... threads:" << threads_.size() << " q:" << q_.size() << "\n";
// Signal to dispatch threads that it's time to wrap up
std::unique_lock<std::mutex> lock(lock_);
@@ -333,16 +326,17 @@ dispatch_queue::~dispatch_queue()
}
}
if(debug_level>0)
- cerr << "~dispatch_queue done" << endl;
+ cerr << getTime() << " ~dispatch_queue done" << endl;
}
+
void dispatch_queue::waitTilDone(){
while(true){
- sleep(1);
bool all_idle=true;
for (map<size_t,bool>::iterator it=is_computing.begin() ; it != is_computing.end(); it++) {
if(it->second){all_idle=false;break;}
}
if(all_idle && q_.size()==0 && q_allCores_.size()==0){break;}
+ sleep(1);
}
}
@@ -351,6 +345,7 @@ void dispatch_queue::dispatch(fp_t&& op){
q_.push(std::move(op));
cv_.notify_one();
}
+
void dispatch_queue::dispatch_allCores(fp_t&& op){
std::unique_lock<std::mutex> lock(lock_);
q_allCores_.push(std::move(op));
@@ -430,7 +425,7 @@ void dispatch_queue::dispatch_thread_handler(size_t tid){
// Main
///////////////////////////////////////////////////////////
void printHelp() {
- cerr << "proteinortho_clustering - Spectral partitioning algorithm (last updated with proteinortho v6.1.1)" << "\n";
+ cerr << "proteinortho_clustering - Spectral partitioning algorithm (last updated with proteinortho v6.1.2)" << "\n";
cerr << "-----------------------------------------------------" << "\n";
cerr << "This tool is part of Proteinortho" << "\n";
cerr << "" << "\n";
@@ -441,17 +436,17 @@ void printHelp() {
cerr << " -core stop clustering if a split would result in groups that do not span across all species of the inital connected component (unless the connectivity is very low). This overrules the -conn threshold.\n";
cerr << " -rmgraph STRING output file name for the graph" << "\n";
cerr << " -seed int seed value for srand [current unix time]" << "\n";
- cerr << " -lapack bool use the lapack package for the computation of the algebraic connectivity ["<<param_useLapack<<"]" << "\n";
+ cerr << " -lapack int use the lapack package for the computation of the algebraic connectivity. 0=no, 1=yes if applicable, 2=always ["<<param_useLapack<<"]" << "\n";
cerr << " -cpus int the number of threads used for openMP ["<<num_cpus<<"]" << "\n";
cerr << " -coreMaxProts int the maximum number of proteins per species for -core ["<<param_coreMaxProteinsPerSpecies<<"]" << "\n";
cerr << " -coreMinSpecies int the minimum number of species for -core ["<<param_coreMinSpecies<<"]" << "\n";
cerr << "\ntechnical parameters:" << "\n";
cerr << " -test various test-functions are called first [not set]" << "\n";
- cerr << " -purity float threshold for purity: treat float values between -purity and purity as 0 ["<<param_sep_purity<<"]" << "\n";
cerr << " -maxnodes int only consider connected component with up to maxnodes nodes ["<<param_max_nodes<<"]" << "\n";
cerr << " -maxweight int only use the edge weights for connected components with maxweight nodes ["<<param_max_nodes_weight<<"]" << "\n";
cerr << " -epsilon float convergence threshold ["<<param_epsilon<<"]" << "\n";
cerr << " -weighted bool the spectral partition is calculated using the bitscores ["<<param_useWeights<<"]" << "\n";
+ cerr << " -double int use double precision. 0=no, 1=yes if applicable, 2=always ["<<param_double<<"]" << "\n";
cerr << " -minOpenmp int the minimum number of nodes for parallel power iteration ["<<param_minOpenmp<<"]" << "\n";
cerr << " -powLapD | -power_d float the maximal graph density for the power iteration method, lapacks (d|s)syevr is used otherwise [adaptively choose optimal cutoff]" << "\n";
cerr << " -maxRunsConvergence int the maximum number of runs for the calculation of the algebraic connectivity ["<<param_max_iter<<"]" << "\n";
@@ -459,6 +454,7 @@ void printHelp() {
// deprecated:
// cerr << " -ram int maximal used ram threshold for LAPACK and the input graph in MB [16384]" << "\n";
// cerr << " -kmere bool use the kmere-split heuristic ["<<param_useKmereHeuristic<<"]" << "\n";
+// cerr << " -purity float threshold for purity: treat float values between -purity and purity as 0. -1=adaptively choose ["<<param_sep_purity<<"]" << "\n";
int main(int argc, char *argv[]) {
@@ -509,11 +505,11 @@ int main(int argc, char *argv[]) {
param_con_threshold = string2float(string(argv[paras]));
if(param_con_threshold<0 || param_con_threshold>1){cerr << string("Error: invalid value '"+string(argv[paras])+" for argument "+parameter+"'!").c_str() << "\n";throw;}
}
- else if (parameter == "-purity") {
- paras++;
- param_sep_purity = string2float(string(argv[paras]));
- if(param_sep_purity<0){cerr << string("Error: invalid value '"+string(argv[paras])+" for argument "+parameter+"'!").c_str() << "\n";throw;}
- }
+// else if (parameter == "-purity") {
+// paras++;
+// param_sep_purity = string2float(string(argv[paras]));
+// if(param_sep_purity<0 && param_sep_purity!=-1){cerr << string("Error: invalid value '"+string(argv[paras])+" for argument "+parameter+"'!").c_str() << "\n";throw;}
+// }
// else if (parameter == "-ram") {
// paras++;
// param_maxRam_inKB = string2float(string(argv[paras]))*1e+3;
@@ -529,6 +525,7 @@ int main(int argc, char *argv[]) {
else if(parameter == "-lapack"){
paras++;
param_useLapack = int(string2float(string(argv[paras])));
+ if(param_useLapack!=0 && param_useLapack!=1 && param_useLapack!=2){cerr << string("Error: invalid value '"+string(argv[paras])+" for argument "+parameter+"'!").c_str() << "\n";throw;}
}
else if (parameter == "-maxnodes") {
paras++;
@@ -552,6 +549,11 @@ int main(int argc, char *argv[]) {
param_epsilon = string2float(string(argv[paras]));
if(param_epsilon<0||param_epsilon>1){cerr << string("Error: invalid value '"+string(argv[paras])+" for argument "+parameter+"'!").c_str() << "\n";throw;}
}
+ else if (parameter == "-double") {
+ paras++;
+ param_double = string2float(string(argv[paras]));
+ if(param_double!=0 && param_double!=1 && param_double!=2){cerr << string("Error: invalid value '"+string(argv[paras])+" for argument "+parameter+"'!").c_str() << "\n";throw;}
+ }
else if (parameter == "-minOpenmp") {
paras++;
param_minOpenmp = int(string2float(string(argv[paras])));
@@ -615,10 +617,11 @@ int main(int argc, char *argv[]) {
if(param_core){param_con_threshold=999;}
if(getEnvVar("OMP_NUM_THREADS") != "1"){
- cerr << "Error: Please set OMP_NUM_THREADS=1.\n";
- throw;
- // setenv("OMP_NUM_THREADS","1",1); // -> does not work...
- // -> setenv does not work, since the BLAS is not a direct child of this program
+ // restart with OMP_NUM_THREADS=1
+ string cmd="OMP_NUM_THREADS=1";
+ for (paras = 0; paras < argc; paras++)
+ cmd += " "+string(argv[paras]);
+ return system(cmd.c_str());
}
// Parse files
@@ -660,20 +663,19 @@ int main(int argc, char *argv[]) {
print_header();
// Open graph-removal file
- string allRMgraphNames="";
- for(unsigned int i = 0 ; i < num_cpus ; i ++){
- stringstream ss;
- ss << i;
- allRMgraphNames+=(param_rmgraph+ss.str())+" ";
+ // string allRMgraphNames="";
+ // for(unsigned int i = 0 ; i < num_cpus ; i ++){
+ // stringstream ss;
+ // ss << i;
+ // allRMgraphNames+=(param_rmgraph+ss.str())+" ";
// graph_clean.push_back(make_shared<ofstream>((param_rmgraph+ss.str()).c_str()));
- }
+ // }
// // Clustering
// if (debug_level > 0) cerr << getTime() << " [DEBUG] Clustering" << "\n";
partition_graph();
- if(debug_level>0)
- cerr << "outside partition" << endl;
+ if(debug_level>0) cerr << getTime() << "[DEBUG] done with clustering" << endl;
// concat the remove graph output files
ofstream OFS((param_rmgraph).c_str());
@@ -694,6 +696,8 @@ int main(int argc, char *argv[]) {
}
OFS.close();
+ if(debug_level>0) cerr << getTime() << "[DEBUG] done <ofstream>graph_clean" << endl;
+
for(map<size_t,shared_ptr<ofstream> > ::iterator it = proteinorthotmp_clean.begin() ; it != proteinorthotmp_clean.end() ; ++it){
it->second->close(); // close ofstream
// dump to STDOUT
@@ -711,9 +715,13 @@ int main(int argc, char *argv[]) {
int r = system(("rm '"+param_rmgraph+"_proteinortho_tmp_"+to_string(it->first)+"'").c_str());
}
+ if(debug_level>0) cerr << getTime() << "[DEBUG] done <ofstream>proteinorthotmp_clean" << endl;
+
for(map<size_t,shared_ptr<ofstream> > ::iterator it = tmp_debug.begin() ; it != tmp_debug.end() ; ++it){
it->second->close(); // close ofstream
}
+
+ if(debug_level>0) cerr << getTime() << "[DEBUG] done done" << endl;
// if(system(("cat "+allRMgraphNames+" >"+param_rmgraph).c_str())!=0 || system(("rm "+allRMgraphNames).c_str())!=0){
// cerr << "[ERROR] cannot concatenate remove graphs" << "\n";
@@ -739,6 +747,26 @@ unsigned int numberOfNodesToMemoryUsageLaplacian_inKB(unsigned int n){
return (unsigned int)n*(unsigned int)n*sizeof(float)/1e+3;
}
+string getCCid(vector<unsigned int> nodes){
+ unsigned int n = nodes.size();
+ double id=0;
+ if(n==0){return "0";}
+ map<unsigned int,unsigned int> mapping;
+ for (unsigned int i = 0; i < (unsigned int)n; i++) {mapping[nodes[i]] = i;}
+ for (unsigned int i = 0 ; i < (unsigned int)n ; i++){
+ unsigned int from = nodes[i];
+ if(!mapping.count(from)){continue;}
+ unsigned int sum = 0;
+ for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
+ unsigned int to = graph[from].edges[j].edge;
+ if(!mapping.count(to)){continue;}
+ //id+=to_string(from)+"-"+to_string(graph[from].edges[j].edge)+":"+to_string(graph[from].edges[j].weight)+",";
+ id+=(from+1)*(graph[from].edges[j].edge+1)*graph[from].edges[j].weight;
+ }
+ }
+ return to_string((unsigned int)id);
+}
+
class ConnectedComponent // A graph representation (as vector of idx of the induced subgraph of 'graph') with some graph attributes (graph density, sum of node degrees)
{
public:
@@ -792,12 +820,15 @@ class ConnectedComponent // A graph representation (as vector of idx of the indu
void push_back(unsigned int i) {
m_content_CC.push_back(i);
}
-};
+};
+void partition_CC(ConnectedComponent, dispatch_queue *, bool, unsigned int);
+void find_CCs(dispatch_queue*);
+void find_CCs_givenNodes(dispatch_queue*,vector<unsigned int>);
void print_group(ConnectedComponent& , float, size_t, bool);
float calc_group(vector<unsigned int>*);
-ConnectedComponent BFS(vector<bool> * done, unsigned int cur_node ){
+ConnectedComponent BFS(map<unsigned int, bool> * done, unsigned int cur_node ){
ConnectedComponent ret; //return vector
list<unsigned int> q;
@@ -812,6 +843,7 @@ ConnectedComponent BFS(vector<bool> * done, unsigned int cur_node ){
cur_node = *it;
ret.push_back(cur_node);
ret.d_sum+=graph[cur_node].edges.size();
+ (*done)[cur_node] = true;
for (unsigned int i = 0; i < graph[cur_node].edges.size(); i++) {
@@ -820,11 +852,8 @@ ConnectedComponent BFS(vector<bool> * done, unsigned int cur_node ){
if(adjacency_node > graph.size()){
cerr << string("[ERROR] : Input graph is invalid. The node "+graph[cur_node].full_name +" is reporting an edge/adjacent node, that is not present in the graph.").c_str() << "\n";throw;
}
- if(adjacency_node > (*done).size()){
- cerr << string("[ERROR] : Input graph is invalid. The node "+graph[cur_node].full_name +" is not present in done vector.").c_str() << "\n";throw;
- }
- if( !(*done)[adjacency_node] ){
+ if( !done->count(adjacency_node) || !(*done)[adjacency_node] ){
(*done)[adjacency_node] = true;
q_new.push_back(adjacency_node);
@@ -841,128 +870,87 @@ ConnectedComponent BFS(vector<bool> * done, unsigned int cur_node ){
return ret;
}
-unsigned int BFS_not_critical( vector<unsigned int> * done_withBacktrack, unsigned int start_node, unsigned int end_node ){
-
- ConnectedComponent ret; //return vector
- list<unsigned int> q;
- q.push_back(start_node);
- (*done_withBacktrack)[start_node]=true;
-
- while(q.size()>0){
-
- list<unsigned int> q_new;
-
- for(list<unsigned int>::iterator it = q.begin() ; it != q.end() ; ++it){
- unsigned int cur_node = *it;
- ret.push_back(cur_node);
-
- for (unsigned int i = 0; i < graph[cur_node].edges.size(); i++) {
-
- unsigned int adjacency_node = graph[cur_node].edges[i].edge;
-
- if(adjacency_node > graph.size()){
- cerr << string("[ERROR] : Input graph is invalid. The node "+graph[cur_node].full_name +" is reporting an edge/adjacent node, that is not present in the graph.").c_str() << "\n";throw;
- }
- if(adjacency_node > (*done_withBacktrack).size()){
- cerr << string("[ERROR] : Input graph is invalid. The node "+graph[cur_node].full_name +" is not present in done vector.").c_str() << "\n";throw;
- }
-
+struct compare_ConnectedComponents { //sort from large to small
+ bool operator() (const ConnectedComponent &a, const ConnectedComponent &b) const {
+ return a.density < b.density;
+ }
+};
- if(cur_node==start_node && adjacency_node == end_node)
- continue;
+void find_CCs_givenNodes(dispatch_queue *q, vector<unsigned int> todo_work ){
- if( !(*done_withBacktrack)[adjacency_node] ){
+ map<unsigned int, bool> done; // Keep track on what was done (for each node)
+ bool allNodesAreDone = false;
- (*done_withBacktrack)[adjacency_node] = cur_node;
- q_new.push_back(adjacency_node);
- }
+ vector<ConnectedComponent> CC; // vector of all connected components found
+ //unsigned int min_i=0;
- if(cur_node != start_node && adjacency_node == end_node )
- return adjacency_node;
- }
+ for (unsigned int i = 0 ; i < todo_work.size() ; i++) {
+ unsigned int from = todo_work[i];
+ done[ from ] = 0;
+ }
+ for (unsigned int i = 0 ; i < todo_work.size() ; i++) {
+ unsigned int from = todo_work[i];
+ for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
+ if(!done.count(graph[from].edges[j].edge))
+ done[ graph[from].edges[j].edge ]=1;
}
-
- q=q_new;
}
- return start_node;
-}
-bool criticalHeuristic(ConnectedComponent *cur_cc){
+ while( true ){ // CC.size() < num_cpus / gather up to num_cpus connected components
- bool foundSomeCriticalEdge = false;
- map<pair<unsigned int,unsigned int>, bool> done_edges;
- unsigned int ce_counter=0;
+ allNodesAreDone = true;
- //if((*cur_cc).size()==283)throw 1;else return false;
+ for (unsigned int i = 0 ; i < todo_work.size() ; i++) {
+
+ unsigned int protein_id=todo_work[i];
+ if ( done.count(protein_id) && done[protein_id] ){continue;}// We were here already
- for (unsigned int i = 0 ; i < ( (*cur_cc).size()<100?(*cur_cc).size():100 ) ; i++) { //(*cur_cc).size()
- unsigned int node_a = (*cur_cc)[(rand() % (*cur_cc).size())];
+ //min_i=protein_id;
- if(graph[node_a].edges.size() < 3)continue; // dont test simple pathgraph (subgraph)
- for (unsigned int j = 0 ; j < graph[node_a].edges.size() ; j++) {
- unsigned int node_b = graph[node_a].edges[j].edge;
+ done[protein_id]=true; // mark this node
+ ConnectedComponent cur_cc = BFS(&done,protein_id); // get the CC of the current node (protein_id)
- if(graph[node_b].edges.size() < 3)continue; // dont test simple pathgraph (subgraph)
- pair<unsigned int,unsigned int> key_cur_edge;
- if(node_a < node_b) key_cur_edge = make_pair(node_a,node_b);
- else key_cur_edge = make_pair(node_b,node_a);
+ // Do not report singles
+ if (cur_cc.size() < 2) {continue;} // singletons are from no interest
- if(!done_edges.count(key_cur_edge)){
- done_edges[key_cur_edge] = true;
+ if(debug_level>0) cerr << "ConnectedComponent: @"<< getCCid(todo_work) << "=>" << cur_cc.size() << "@" << getCCid(cur_cc.m_content_CC) << endl;
- vector<unsigned int> done_nodes_BFS_withBacktrack = vector<unsigned int> (graph.size(), 0); // Keep track on what was done (for each node)
- done_nodes_BFS_withBacktrack[node_a]=node_a; // mark this node
+ // Skip those that are too large (try critical edge heuristic)
+ if (cur_cc.size() > param_max_nodes || cur_cc.size() > critical_min_nodes) {
- unsigned int ret = BFS_not_critical(&done_nodes_BFS_withBacktrack,node_a,node_b);
+ if(cur_cc.size() > param_max_nodes)
+ cerr << " [WARNING] Found a very large connected component that contains " << cur_cc.size() << ">" << param_max_nodes << " (maxnodes) elements. This behavior can be adjusted using -maxnodes; still, you should consider stricter e-value or coverage thresholds. Now using the Critical-Heuristic: try to identify and remove critical edges." << "\n";
+ else
+ cerr << " [WARNING] Found a rather large connected component that contains " << cur_cc.size() << ">" << critical_min_nodes << " (critical_min_nodes) elements. Now using the Critical-Heuristic: try to identify and remove critical edges." << "\n";
- if(ret == node_a){ // == IT IS CRITICAL
-
- vector<wedge>::iterator it = graph[node_a].edges.begin();
- for(it = graph[node_a].edges.begin() ; it != graph[node_a].edges.end() ; ++it){
- if((*it).edge == node_b){
- graph[node_a].edges.erase(it);
- break;
- }
- }
- it = graph[node_b].edges.begin();
- for(it = graph[node_b].edges.begin() ; it != graph[node_b].edges.end() ; ++it){
- if((*it).edge == node_a){
- graph[node_b].edges.erase(it);
- break;
- }
- }
-
- ce_counter++;
- foundSomeCriticalEdge = true;
- }else{
-
- //backtrack all edges in the loop
-
- unsigned int v=ret;
- while(done_nodes_BFS_withBacktrack[v]!=v){
- if(v < done_nodes_BFS_withBacktrack[v]) key_cur_edge = make_pair(v,done_nodes_BFS_withBacktrack[v]);
- else key_cur_edge = make_pair(done_nodes_BFS_withBacktrack[v],v);
- done_edges[key_cur_edge] = true;
-
- done_nodes_BFS_withBacktrack[v]=v;
- }
+ if(cur_cc.size() > param_max_nodes){
+ cerr << " [WARNING] The very large connected component that contains " << cur_cc.size() << " elements did not have any critical edges." << "\n";
+ cerr << " [WARNING] Skipping this connected component. This behavior can be adjusted using -maxnodes; still, you should consider stricter e-value or coverage thresholds." << "\n";
+ continue;
}
}
- }
- }
- if(ce_counter>0){
- cerr << " [WARNING] (critical edge heuristic) Found and removed "<<ce_counter<<" critical edge(s)." << "\n";
- //throw 1;
- }
- return foundSomeCriticalEdge;
-}
+ if(param_core && cur_cc.species_num==0){
+ if(cur_cc.size()<param_coreMinSpecies){allNodesAreDone=false;break;} // if there are less nodes than wanted species -> skip
+ map<unsigned int,bool>cc_species;
+ for (int i = 0; i < cur_cc.size(); ++i){ cc_species[graph[cur_cc[i]].species_id]=true; }
+ cur_cc.species_num=cc_species.size();
+ if(cc_species.size()<param_coreMinSpecies){allNodesAreDone=false;break;} // fewer species than wanted -> skip
+ }
+
+ cur_cc.calc_dsum();
+ cur_cc.calc_density();
+ if(cur_cc.density > 1){ cerr << "[WARNING] : The input graph has duplicated edges, this leads to an invalid graph density of " << cur_cc.density << " (should be <1). Please clean the .blast-graph with 'proteinortho.pl --cleanblast --step=3 --project=...' or use the cleanupblastgraph tool in src/ to remove the duplicated edges." << "\n"; throw; }
+ if (debug_level > 0) cerr << getTime() << " [DEBUG] Found connected component: " << cur_cc.size() << " proteins (ID: " << protein_id << "), graph density="<< cur_cc.density << ", sum of degrees="<< cur_cc.d_sum << "\n";
-struct compare_ConnectedComponents { //sort from large to small
- bool operator() (const ConnectedComponent &a, const ConnectedComponent &b) const {
- return a.density < b.density;
+ q->dispatch([cur_cc,q]{ partition_CC(cur_cc,q,true,false); });
+
+ allNodesAreDone=false;
+ break;
+ }
+ if(allNodesAreDone)break; // no additional CC can be found -> done
}
-};
+}
void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack, unsigned int restarted){
size_t tid=std::hash<std::thread::id>{}(std::this_thread::get_id());
@@ -970,6 +958,9 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
vector<float> x_hat;
float connectivity;
+ if( param_double == 2 && restarted==0 ){restarted=1;} // force double
+ if( param_double == 0 && restarted>0 ){return;} // force float
+
if(cur_cc.size()==0){return;} // sanity check
if(cur_cc.size()==1){
@@ -1031,6 +1022,8 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
}
if(param_core && cur_cc.species_num < param_coreMinSpecies){return;}
+ if(debug_level>0) cerr << "ConnectedComponent: @"<< getCCid(cur_cc.m_content_CC) << " connectivity=" << connectivity << endl;
+
if (connectivity < 0 || connectivity > param_con_threshold) {
// cerr << " [DEBUG] done "<<connectivity<<"\n";
print_group(cur_cc,connectivity,tid,false);
@@ -1057,35 +1050,246 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
// Store data about two groups (Zero cannot be assigned with certainty)
ConnectedComponent groupA, groupB, groupZero;
map<unsigned int,bool> groupB_set;
+ map<unsigned int,bool> groupZero_set;
- for (unsigned int i = 0; i < x_hat.size(); i++) {
- if(abs(x_hat[i]) < param_sep_purity) {
- groupZero.m_content_CC.push_back(cur_cc[i]);
- //groupZero[nodes[i]] = true;
+ if(param_sep_purity==-1){
+ // automatic purity detection
+
+ unsigned int n = cur_cc.size();
+ map<unsigned int,unsigned int> mapping;
+ for(unsigned int i = 0; i < (unsigned int)n; i++){ mapping[cur_cc[i]] = i; }
+
+ /*
+ * Automatic purity detection (v6.1.2)
+ * purity = threshold for nodes that are not clearly assigned to either side of the current bisection (negative vs. positive x_hat entries)
+ *
+ * purity is important if there are multiple clusters that can be split or if there is no good split
+ * identified nodes (within the purity boundary) are put back on the QUEUE (=groupZero)
+ *
+ * maximal purity = 0.1 -> consider all nodes with |x_hat(v)|<0.1
+ * identify all possible purity thresholds
+ * each purity threshold is evaluated with the total sum of edge weights that would be removed by this cut
+ * choose the optimal purity cut-off
+ */
+
+ float new_purity=0;
+ float best_cut=0;
+ float cut_value=0;
+ for (unsigned int i = 0; i < x_hat.size(); i++) {
+ if(x_hat[i] < 0) {
+ unsigned int from=cur_cc[i];
+ for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
+ unsigned int to = graph[from].edges[j].edge;
+ if(!mapping.count(to)){continue;}
+ if(x_hat[mapping[to]] > 0) {
+ best_cut += graph[from].edges[j].weight;
+ }
+ }
+ }
}
- else if (x_hat[i] < 0) {
- groupA.m_content_CC.push_back(cur_cc[i]);
- //groupA[nodes[i]] = true;
+ if(debug_level>0) cerr << " initial purity=0 best_cut=" << best_cut << endl;
+ for(unsigned int pi = 0; pi < (unsigned int)n; pi++){
+ unsigned int from = cur_cc[pi];
+ if(!mapping.count(from) || abs(x_hat[ mapping[from] ])>0.1 ){continue;}
+ float test_purity = abs(x_hat[ mapping[from] ]);
+ float cut_value=0;
+ unsigned int num_nodes=0;
+ for (unsigned int i = 0; i < x_hat.size(); i++) {
+ if(abs(x_hat[i]) <= test_purity) {
+ num_nodes++;
+ unsigned int from=cur_cc[i];
+ for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
+ unsigned int to = graph[from].edges[j].edge;
+ if(!mapping.count(to)){continue;}
+ if(abs(x_hat[mapping[to]]) > test_purity) {
+ cut_value+=graph[from].edges[j].weight;
+ }
+ }
+ }
+ }
+ if( cut_value>0 && (best_cut==-1 || best_cut>cut_value) && num_nodes>1){
+ new_purity=test_purity;
+ best_cut=cut_value;
+ if(debug_level>0) cerr << " initial purity="<<new_purity<<" best_cut=" << best_cut << endl;
+ }
}
- else { // x_hat[i] > 0
- groupB.m_content_CC.push_back(cur_cc[i]);
- groupB_set[cur_cc[i]] = true;
+
+ if(new_purity>0.1){new_purity=0;} // if there are no impure nodes -> disable
+ unsigned int num_impure_nodes=0;
+ for (unsigned int i = 0; i < x_hat.size(); i++) { if(abs(x_hat[i]) < new_purity) { num_impure_nodes++; } }
+ if(debug_level>0) cerr << " num_impure_nodes" << num_impure_nodes << endl;
+ if(num_impure_nodes<2 && new_purity > 0.001){new_purity=0; } // disable if there is only one node that is impure with a high purity threshold
+
+ if(debug_level>0) cerr << " [DEBUG] detected a purity of "<< (new_purity==0 ? "<disabled>":to_string(new_purity)) << "\n";
+
+ for (unsigned int i = 0; i < x_hat.size(); i++) {
+ if(abs(x_hat[i]) <= new_purity) {
+ groupZero.m_content_CC.push_back(cur_cc[i]);
+ groupZero_set[cur_cc[i]] = true;
+ }
+ else if (x_hat[i] < 0) {
+ groupA.m_content_CC.push_back(cur_cc[i]);
+ }
+ else { // x_hat[i] > 0
+ groupB.m_content_CC.push_back(cur_cc[i]);
+ groupB_set[cur_cc[i]] = true;
+ }
}
- }
- if( (groupA.size() == 0 && groupB.size() == 0) ){
-
- cerr << " [WARNING] All nodes are below the purity threshold. Continue without purity." << "\n";
- for (unsigned int i = 0; i < groupZero.size(); i++) {
- if (groupZero[i] < 0) {
- groupA.m_content_CC.push_back(groupZero[i]);
+ if( (groupA.size() == 0 && groupB.size() == 0) ){
+
+ cerr << " [WARNING] All nodes are below the purity threshold. Continue without purity." << "\n";
+
+ for (unsigned int i = 0; i < groupZero.size(); i++) {
+ if (x_hat[ groupZero[i] ] < 0) {
+ groupA.m_content_CC.push_back(groupZero[i]);
+ }
+ else { // x_hat[i] > 0
+ groupB.m_content_CC.push_back(groupZero[i]);
+ }
+ }
+ }
+ }else{
+ for (unsigned int i = 0; i < x_hat.size(); i++) {
+ if(abs(x_hat[i]) < param_sep_purity) {
+ groupZero.m_content_CC.push_back(cur_cc[i]);
+ groupZero_set[cur_cc[i]] = true;
+ }
+ else if (x_hat[i] < 0) {
+ groupA.m_content_CC.push_back(cur_cc[i]);
}
else { // x_hat[i] > 0
- groupB.m_content_CC.push_back(groupZero[i]);
+ groupB.m_content_CC.push_back(cur_cc[i]);
+ groupB_set[cur_cc[i]] = true;
+ }
+ }
+ if( (groupA.size() == 0 && groupB.size() == 0) ){
+
+ cerr << " [WARNING] All nodes are below the purity threshold. Continue with purity/2." << "\n";
+
+ for (unsigned int i = 0; i < x_hat.size(); i++) {
+ if(abs(x_hat[i]) < param_sep_purity/2) {
+ groupZero.m_content_CC.push_back(cur_cc[i]);
+ groupZero_set[cur_cc[i]] = true;
+ }
+ else if (x_hat[i] < 0) {
+ groupA.m_content_CC.push_back(cur_cc[i]);
+ }
+ else { // x_hat[i] > 0
+ groupB.m_content_CC.push_back(cur_cc[i]);
+ groupB_set[cur_cc[i]] = true;
+ }
+ }
+
+ if( (groupA.size() == 0 && groupB.size() == 0) ){
+
+ cerr << " [WARNING] All nodes are below the purity threshold. Continue with purity/10." << "\n";
+
+ for (unsigned int i = 0; i < x_hat.size(); i++) {
+ if(abs(x_hat[i]) < param_sep_purity/10) {
+ groupZero.m_content_CC.push_back(cur_cc[i]);
+ groupZero_set[cur_cc[i]] = true;
+ }
+ else if (x_hat[i] < 0) {
+ groupA.m_content_CC.push_back(cur_cc[i]);
+ }
+ else { // x_hat[i] > 0
+ groupB.m_content_CC.push_back(cur_cc[i]);
+ groupB_set[cur_cc[i]] = true;
+ }
+ }
+
+ if( (groupA.size() == 0 && groupB.size() == 0) ){
+
+ cerr << " [WARNING] All nodes are below the purity threshold. Continue purity/100." << "\n";
+
+ for (unsigned int i = 0; i < x_hat.size(); i++) {
+ if(abs(x_hat[i]) < param_sep_purity/100) {
+ groupZero.m_content_CC.push_back(cur_cc[i]);
+ groupZero_set[cur_cc[i]] = true;
+ }
+ else if (x_hat[i] < 0) {
+ groupA.m_content_CC.push_back(cur_cc[i]);
+ }
+ else { // x_hat[i] > 0
+ groupB.m_content_CC.push_back(cur_cc[i]);
+ groupB_set[cur_cc[i]] = true;
+ }
+ }
+
+ if( (groupA.size() == 0 && groupB.size() == 0) ){
+
+ cerr << " [WARNING] All nodes are below the purity threshold. Continue without purity." << "\n";
+
+ for (unsigned int i = 0; i < groupZero.size(); i++) {
+ if (x_hat[ groupZero[i] ] < 0) {
+ groupA.m_content_CC.push_back(groupZero[i]);
+ }
+ else { // x_hat[i] > 0
+ groupB.m_content_CC.push_back(groupZero[i]);
+ }
+ }
+ }
+ }
}
}
}
- if (debug_level > 0) cerr << getTime() << " [DEBUG] splitting into ("<<groupA.size()<<","<<groupB.size()<<","<<groupZero.size()<<") sized groups!"<< "\n";
+
+ string id;
+ string idA;
+ string idB;
+ string idZ;
+ if(debug_level==15){
+ unsigned int n = cur_cc.size();
+ id=getCCid(cur_cc.m_content_CC);
+ idA=getCCid(groupA.m_content_CC);
+ idB=getCCid(groupB.m_content_CC);
+ idZ=getCCid(groupZero.m_content_CC);
+ map<unsigned int,unsigned int> mapping;
+ for (unsigned int i = 0; i < (unsigned int)n; i++) {mapping[cur_cc[i]] = i;}
+ {
+ ofstream OFS(("cluster_n"+to_string(n)+"_id"+id+".edgelist").c_str());
+ OFS << "source\ttarget\tweight"<<endl;
+
+ map<pair<unsigned int,unsigned int>,unsigned int> knownedges;
+ for(unsigned int i = 0 ; i < (unsigned int)n ; i++){
+ unsigned int from = cur_cc[i];
+ unsigned int sum = 0;
+ if(!mapping.count(from)){continue;}
+ for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
+ unsigned int to = graph[from].edges[j].edge;
+ if(!mapping.count(to)){continue;}
+ pair<unsigned int,unsigned int> key_cur_edge;
+ if(from < to) key_cur_edge = make_pair(from,to);
+ else key_cur_edge = make_pair(to,from);
+ if(!knownedges.count(key_cur_edge)){
+ knownedges[key_cur_edge] = true;
+ OFS << graph[from].full_name << "\t" << graph[to].full_name << "\t" << graph[from].edges[j].weight << endl;
+ }
+ }
+ }
+ OFS.close();
+ string id=getCCid(cur_cc.m_content_CC);
+ }
+ {
+ ofstream OFS(("cluster_n"+to_string(n)+"_id"+id+"_ci0_conn"+to_string(connectivity)+".nodeweight").c_str());
+ OFS << "id\teigenvector"<<endl;
+ for(unsigned int i = 0 ; i < (unsigned int)n ; i++){
+ OFS << graph[cur_cc[i]].full_name << "\t" << x_hat[i] << endl;
+ }
+ OFS.close();
+ }
+ {
+ ofstream OFS(("cluster_n"+to_string(n)+"_id"+id+"_ci0_conn"+to_string(connectivity)+".split").c_str());
+ OFS << "id\tsplit"<<endl;
+ for (unsigned int i = 0; i < groupZero.size(); i++) { OFS << graph[groupZero[i]].full_name << "\t0" << endl; }
+ for (unsigned int i = 0; i < groupA.size(); i++) { OFS << graph[groupA[i]].full_name << "\t1" << endl; }
+ for (unsigned int i = 0; i < groupB.size(); i++) { OFS << graph[groupB[i]].full_name << "\t-1" << endl; }
+ OFS.close();
+ }
+ }
+
+ if (debug_level > 0) cerr << getTime() << " [DEBUG] splitting CC with "<<cur_cc.size()<<" nodes "<<(debug_level==15 ? "@"+id+"" : "")<< " into ("<<groupA.size()<<""<<(debug_level==15 ? " @"+idA+"" : "")<< ","<<groupB.size()<<""<<(debug_level==15 ? " @"+idB+"" : "")<< ","<<groupZero.size()<<""<<(debug_level==15 ? " @"+idZ+"" : "")<< ") sized groups!"<< "\n";
// Catch error in laplacian calcs
if ( (groupA.size() == 0 && groupB.size() == 0) ||
@@ -1110,10 +1314,20 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
graph_clean[tid]=make_shared<ofstream>((param_rmgraph+to_string(tid)).c_str());
for(unsigned int i = 0 ; i < groupA.size() ; i++){
- protein node_i = graph[i];
+ protein node_i = graph[groupA[i]];
for(unsigned int j = 0 ; j < graph[groupA[i]].edges.size() ; j++){
protein node_j = graph[graph[groupA[i]].edges[j].edge];
- if(groupB_set.count(graph[groupA[i]].edges[j].edge)){
+ if(groupB_set.count(graph[groupA[i]].edges[j].edge) ||
+ groupZero_set.count(graph[groupA[i]].edges[j].edge) ){
+ (*graph_clean[tid]) << node_i.full_name << "\t" << species[node_i.species_id] << "\t" << node_j.full_name << "\t" << species[node_j.species_id] << "\n";
+ }
+ }
+ }
+ for(unsigned int i = 0 ; i < groupB.size() ; i++){
+ protein node_i = graph[groupB[i]];
+ for(unsigned int j = 0 ; j < graph[groupB[i]].edges.size() ; j++){
+ protein node_j = graph[graph[groupB[i]].edges[j].edge];
+ if(groupZero_set.count(graph[groupB[i]].edges[j].edge)){
(*graph_clean[tid]) << node_i.full_name << "\t" << species[node_i.species_id] << "\t" << node_j.full_name << "\t" << species[node_j.species_id] << "\n";
}
}
@@ -1132,40 +1346,44 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
param_coreMaxProteinsPerSpecies*cur_cc.species_num > cur_cc.size()){
print_group(cur_cc,connectivity,tid,false);
}else{
- if(groupA.species_num >= param_coreMinSpecies)
+ if(groupA.species_num >= param_coreMinSpecies && groupA.size()>1)
q->dispatch([groupA,q]{ partition_CC(groupA,q,true,0); });
- if(groupB.species_num >= param_coreMinSpecies)
+ if(groupB.species_num >= param_coreMinSpecies && groupB.size()>1)
q->dispatch([groupB,q]{ partition_CC(groupB,q,true,0); });
}
-
return;
}
- if(groupA.size()>0)
- q->dispatch([groupA,q]{ partition_CC(groupA,q,true,false); });
- if(groupB.size()>0)
- q->dispatch([groupB,q]{ partition_CC(groupB,q,true,false); });
+ if(groupA.size()>1)
+ //q->dispatch([groupA,q]{ partition_CC(groupA,q,true,false); });
+ q->dispatch([groupA,q]{ find_CCs_givenNodes(q,groupA.m_content_CC); });
+ if(groupB.size()>1)
+ //q->dispatch([groupB,q]{ partition_CC(groupB,q,true,false); });
+ q->dispatch([groupB,q]{ find_CCs_givenNodes(q,groupB.m_content_CC); });
+ if(groupZero.size()>1)
+ q->dispatch([groupZero,q]{ find_CCs_givenNodes(q,groupZero.m_content_CC); });
}
}
void find_CCs(dispatch_queue *q){
- vector<bool> done; // Keep track on what was done
- done = vector<bool> (graph.size(), false); // Keep track on what was done (for each node)
+ map<unsigned int, bool> done; // Keep track of what was done (for each node)
bool allNodesAreDone = false;
vector<ConnectedComponent> CC; // vector of all connected components found
- unsigned int min_i=0;
+ //unsigned int min_i=0;
while( true ){ // CC.size() < num_cpus / gather up to num_cpus connected components
allNodesAreDone = true;
- for (unsigned int protein_id = min_i ; protein_id < graph.size() ; protein_id++) {
+ for (unsigned int protein_id = 0 ; protein_id < graph.size() ; protein_id++) {
- if (done[protein_id]){continue;}// We were here already
+ if (done.count(protein_id) && done[protein_id]){continue;}// We were here already
- min_i=protein_id;
+ if(debug_level>0) cerr << "find_CCs:start at "<< protein_id << "<" << graph.size() << endl;
+
+ //min_i=protein_id;
done[protein_id]=true; // mark this node
ConnectedComponent cur_cc = BFS(&done,protein_id); // get the CC of the current node (protein_id)
@@ -1184,7 +1402,6 @@ void find_CCs(dispatch_queue *q){
if(cur_cc.size() > param_max_nodes){
cerr << " [WARNING] The very large connected component that contains " << cur_cc.size() << " elements, did not have any critical edges." << "\n";
cerr << " [WARNING] Skipping this connected component. This behavior can be adjusted using -maxnodes, still you better consider a higher e-value or coverage thresholds." << "\n";
- //clear_edges(cur_cc.m_content_CC);
continue;
}
}
@@ -1200,11 +1417,9 @@ void find_CCs(dispatch_queue *q){
cur_cc.calc_dsum();
cur_cc.calc_density();
if(cur_cc.density > 1){ cerr << "[WARNING] : The input graph has duplicated edges, this lead to an invalid graph density of " << cur_cc.density << " (should be <1). Please clean the .blast-graph with 'proteinortho.pl --cleanblast --step=3 --project=...' or use the cleanupblastgraph tool in src/ to remove the duplicated edges." << "\n"; throw; }
- if (debug_level > 0) cerr << getTime() << " [DEBUG] Found connected component: " << cur_cc.size() << " proteins (ID: " << protein_id << "), graph density="<< cur_cc.density << ", sum of degrees="<< cur_cc.d_sum << "\n";
-
+ if (debug_level > 0) cerr << getTime() << " [DEBUG:find_CCs] Found connected component: " << cur_cc.size() << " proteins (ID: " << protein_id << "), graph density="<< cur_cc.density << ", sum of degrees="<< cur_cc.d_sum << " ini from " << graph[protein_id].full_name<< "\n";
q->dispatch([cur_cc,q]{ partition_CC(cur_cc,q,true,false); });
-
allNodesAreDone=false;
break;
}
@@ -1212,7 +1427,6 @@ void find_CCs(dispatch_queue *q){
}
}
-
///////////////////////////////////////////////////////////
// Major partitioning algorithm
///////////////////////////////////////////////////////////
@@ -1226,37 +1440,15 @@ void partition_graph() {
q.dispatch([&q]{ find_CCs(&q); });
q.start();
- if(debug_level>0)
- cerr << "done with ini" << endl;
+ if(debug_level>0) cerr << getTime() << " [DEBUG] done with ini dispatch_queue" << endl;
q.waitTilDone();
- if(debug_level>0)
- cerr << "quit" << endl;
+ if(debug_level>0) cerr << getTime() << " [DEBUG] quit dispatch_queue" << endl;
return;
}
-///////////////////////////////////////////////////////////
-// Basic Graph functions
-///////////////////////////////////////////////////////////
-// Remove all edges from the given list of protein ids
-void clear_edges(vector<unsigned int>& nodes) {
- #ifdef timeAnalysis
- auto t_tmp = std::chrono::steady_clock::now( );
- #endif
-
- for (unsigned int i = 0; i < nodes.size(); i++) {
- graph[nodes[i]].edges.clear();
- vector<wedge>().swap(graph[nodes[i]].edges);
- }
-
- #ifdef timeAnalysis
- if(!t_master.count("clear_edges")) t_master["clear_edges"]=0;
- t_master["clear_edges"] += (float)std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::steady_clock::now( ) - t_tmp ).count()/1e+9;
- #endif
-}
-
///////////////////////////////////////////////////////////
// File parser
///////////////////////////////////////////////////////////
@@ -1326,6 +1518,8 @@ void parse_file(string file) {
// 5.16 do not point to yourself
if (!fields[0].compare(fields[1])) {continue;}
+ //if(debug_level>0) cerr << "parse_file:(" << fields[0]<<","<<fields[1] << ") -> " << (protein2id.find(fields[0]) == protein2id.end()) << endl;
+
// A new protein
if (protein2id.find(fields[0]) == protein2id.end()) {
protein a;
@@ -1390,6 +1584,8 @@ void parse_file(string file) {
unsigned int a_id = protein2id[fields[0]];
unsigned int b_id = protein2id[fields[1]];
+ if(!param_useWeights){bitscore_avg=1;}
+
// 5.17, add weight
wedge w;
w.edge=b_id;
@@ -1414,7 +1610,6 @@ void parse_file(string file) {
}
// graph_ram_total_inKB += graph_ram_total/1e+3;
-
// if (debug_level > 0) cerr << getTime() << " [DEBUG] Expected Memory Usage of the current input graph: " << graph_ram_total_inKB << " KB = " << graph_ram_total_inKB/1e+3 << " MB. (current input graph are all the currently loaded files)" << "\n";
if(species_counter==0){species.push_back("0");species_counter++;}
@@ -1461,7 +1656,7 @@ void stats(unsigned int lapack,unsigned int power,bool active_status) {
last_stat_act = active_status;
if(debug_level==0) cerr << '\r';
else cerr << endl;
- cerr << "Clustering: working on (" << lapack << (lapack==1 ? "@find_CC" : "@lapack")<<(active_status?"*":"")<<" + " << power << "@power"<<(!active_status?"*":"")<<") connected component"<< (lapack+power>1?"s":"") <<" " << std::flush ;
+ cerr << "Clustering: working on (" << lapack << (lapack==1 ? "@BFS/lapack" : "@lapack")<<(active_status?"*":"")<<" + " << power << "@power"<<(!active_status?"*":"")<<") connected component"<< (lapack+power>1?"s":"") <<" " << std::flush ;
if(lapack==0 && power==0){ cerr << '\r'<<"Clustering: done \n"; }
}
}
@@ -1475,10 +1670,8 @@ void print_header() {
cout << "\n";
}
-struct protein_degree
-{
- inline bool operator() (const pair<string,int> & p1, const pair<string,int>& p2)
- {
+struct protein_degree{
+ inline bool operator() (const pair<string,int> & p1, const pair<string,int>& p2){
return (p1.second > p2.second);
}
};
@@ -1490,6 +1683,26 @@ void print_group(ConnectedComponent& nodes, float connectivity, size_t tid, bool
auto t_tmp = std::chrono::steady_clock::now( );
#endif
+ if(debug_level>0) cerr << getTime()<< " [DEBUG] print_group start"<< endl;
+
+ if(nodes.size()<2){return;}
+
+ map<unsigned int, bool> done; // Keep track of what was done (for each node)
+ for (unsigned int i = 0 ; i < nodes.size() ; i++) {
+ unsigned int from = nodes[i];
+ done[ from ] = 0;
+ }
+ for (unsigned int i = 0 ; i < nodes.size() ; i++) {
+ unsigned int from = nodes[i];
+ for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
+ if(!done.count(graph[from].edges[j].edge))
+ done[ graph[from].edges[j].edge ]=1;
+ }
+ }
+ ConnectedComponent cur_cc = BFS(&done,nodes[0]); // get the CC containing nodes[0]
+ if((int)cur_cc.size() - (int)nodes.size() != 0){return;}
+
+
if(!proteinorthotmp_clean.count(tid)){
proteinorthotmp_clean[tid]=make_shared<ofstream>((param_rmgraph+"_proteinortho_tmp_"+to_string(tid)).c_str());
}
@@ -1507,8 +1720,8 @@ void print_group(ConnectedComponent& nodes, float connectivity, size_t tid, bool
line[current_species].push_back(make_pair( graph[current_protein].full_name , graph[current_protein].edges.size() ));
}
- (*proteinorthotmp_clean[tid]) << species_number << "\t" << nodes.size() << "\t" << setprecision (3) << (connectivity < 0 ? -connectivity : connectivity) << (failed ? "*": "");
-// cerr << species_number << "\t" << nodes.size() << "\t" << setprecision (3) << connectivity;
+ (*proteinorthotmp_clean[tid]) << species_number << "\t" << nodes.size() << "\t" << (debug_level>0 ? "@"+getCCid(nodes.m_content_CC)+":":"") << setprecision (3) << (connectivity < 0 ? -connectivity : connectivity) << (failed ? "*": "");
+ // cerr << species_number << "\t" << nodes.size() << "\t" << setprecision (3) << connectivity;
// List group data
for (unsigned int i = 0; i < species_counter; i++) {
@@ -1531,11 +1744,13 @@ void print_group(ConnectedComponent& nodes, float connectivity, size_t tid, bool
// output
(*proteinorthotmp_clean[tid]) << "\t" << return_line;
-// cerr << "\t" << return_line;
+ // cerr << "\t" << return_line;
}
(*proteinorthotmp_clean[tid]) << "\n";
-// cerr << "\n";
+ // cerr << "\n";
+
+ if(debug_level>0) cerr << getTime()<< " [DEBUG] print_group @"<<getCCid(nodes.m_content_CC)<<" ERROR="<< ((int)cur_cc.size() - (int)nodes.size()) << endl;
//os.close();
@@ -1556,7 +1771,7 @@ float calc_group(vector<unsigned int>* nodes) {
}
return speciesids.size()==0 ? 99999 : ((float)nodes->size())/((float)speciesids.size()); // 99999 such that if the species information is missing, then the criterion always fails and the splits are only made based on the alg. connectivity
-/* SEG FAIL:
+ /* SEG FAIL:
vector<unsigned int> line(species_counter,0); // Species vector
unsigned int species_number = 0;
// For each protein in group, count species
@@ -1621,7 +1836,7 @@ void tokenize(const string& str, vector<string>& tokens, const string& delimiter
// Return maximum degree of given protein_ids -- openMP A ML
unsigned int max_of_diag(vector<unsigned int>& nodes, vector<unsigned int>& diag) {
- if(debug_level==9) cerr << getTime() << " [debug:9] max_of_diag start" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] max_of_diag start" << endl;
unsigned int max = 0;
@@ -1632,7 +1847,7 @@ unsigned int max_of_diag(vector<unsigned int>& nodes, vector<unsigned int>& diag
if (diag[i] > max) max = diag[i] ;
}
- if(debug_level==9) cerr << getTime() << " [debug:9] max_of_diag end" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] max_of_diag end" << endl;
return max;
}
@@ -1642,7 +1857,7 @@ template<class T> vector<T> generate_random_vector(const unsigned int size) {
auto t_tmp = std::chrono::steady_clock::now( );
#endif
- if(debug_level==9) cerr << getTime() << " [debug:9] generate_random_vector start" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] generate_random_vector start" << endl;
vector<T> x(size);
@@ -1652,7 +1867,7 @@ template<class T> vector<T> generate_random_vector(const unsigned int size) {
if (x[i] == x[i-1]) x[i] /= 3; // Check: at least one value must be different from the others but still within 0 and 1
}
- if(debug_level==9) cerr << getTime() << " [debug:9] generate_random_vector end" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] generate_random_vector end" << endl;
#ifdef timeAnalysis
if(!t_master.count("partition_graph::convergence::generate_random_vector")) t_master["partition_graph::convergence::generate_random_vector"]=0;
@@ -1669,22 +1884,22 @@ template<class T> vector<T> get_new_x(vector<T>*x, vector<unsigned int>& nodes,
auto t_tmp = std::chrono::steady_clock::now( );
#endif
- if(debug_level==9) cerr << getTime() << " [debug:9] get_new_x start" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] get_new_x start" << endl;
vector<T> x_new(x->size(),0);
bool useOpenMpFlag = (nodes.size() > param_minOpenmp);
- if(debug_level==9) cerr << getTime() << " [debug:9] get_new_x done ini " << useOpenMpFlag << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] get_new_x done ini " << useOpenMpFlag << endl;
if(isWeighted){
#pragma omp parallel for if (useOpenMpFlag)
for (unsigned int i = 0; i < nodes.size(); i++) {
-
// go through adjacency list of node
for (unsigned int j = 0; j < graph[nodes[i]].edges.size(); j++) {
// y points to z, so take entry z from x
unsigned int abs_target = graph[nodes[i]].edges[j].edge;
+ if(!mapping.count(abs_target)){continue;}
unsigned int rel_target = mapping[abs_target];
x_new[i] += (*x)[rel_target]*(T)graph[nodes[i]].edges[j].weight;
@@ -1700,6 +1915,7 @@ template<class T> vector<T> get_new_x(vector<T>*x, vector<unsigned int>& nodes,
for (unsigned int j = 0; j < graph[nodes[i]].edges.size(); j++) {
// y points to z, so take entry z from x
unsigned int abs_target = graph[nodes[i]].edges[j].edge;
+ if(!mapping.count(abs_target)){continue;}
unsigned int rel_target = mapping[abs_target];
x_new[i] += (*x)[rel_target];
@@ -1707,7 +1923,7 @@ template<class T> vector<T> get_new_x(vector<T>*x, vector<unsigned int>& nodes,
}
}
- if(debug_level==9) cerr << getTime() << " [debug:9] get_new_x end" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] get_new_x end" << endl;
#ifdef timeAnalysis
if(!t_master.count("partition_graph::convergence::get_new_x")) t_master["partition_graph::convergence::get_new_x"]=0;
@@ -1723,7 +1939,7 @@ template<class T> vector<T> makeOrthogonal(vector<T> x) {
auto t_tmp = std::chrono::steady_clock::now( );
#endif
- if(debug_level==9) cerr << getTime() << " [debug:9] makeOrthogonal start" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] makeOrthogonal start" << endl;
T sum = 0;
@@ -1742,7 +1958,7 @@ template<class T> vector<T> makeOrthogonal(vector<T> x) {
t_master["partition_graph::convergence::makeOrthogonal"] += (T)std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::steady_clock::now( ) - t_tmp ).count()/1e+9;
#endif
- if(debug_level==9) cerr << getTime() << " [debug:9] makeOrthogonal end" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] makeOrthogonal end" << endl;
return x;
}
@@ -1753,7 +1969,7 @@ template<class T> void normalize(vector<T> *x, T *length) {
auto t_tmp = std::chrono::steady_clock::now( );
#endif
- if(debug_level==9) cerr << getTime() << " [debug:9] normalize start" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] normalize start" << endl;
T sum = 0;
bool useOpenMpFlag = (x->size() > param_minOpenmp);
@@ -1772,14 +1988,14 @@ template<class T> void normalize(vector<T> *x, T *length) {
t_master["partition_graph::convergence::normalize"] += (T)std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::steady_clock::now( ) - t_tmp ).count()/1e+9;
#endif
- if(debug_level==9) cerr << getTime() << " [debug:9] normalize end" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] normalize end" << endl;
//return x;
}
// Qx, Formula (5) -- openMP A ML
template<class T> vector<T> getY(T max_degree, vector<T> x_hat, vector<T>* x_new, vector<unsigned int>& nodes, vector<unsigned int>& diag){
- if(debug_level==9) cerr << getTime() << " [debug:9] getY start" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] getY start" << endl;
#ifdef timeAnalysis
auto t_tmp = std::chrono::steady_clock::now( );
#endif
@@ -1788,34 +2004,19 @@ template<class T> vector<T> getY(T max_degree, vector<T> x_hat, vector<T>* x_new
#pragma omp parallel for if (useOpenMpFlag)
for (unsigned int i = 0; i < x_hat.size(); i++) {
-
x_hat[i] *= ((T)2*max_degree - (T)diag[i]);
x_hat[i] += (*x_new)[i];
-
}
#ifdef timeAnalysis
if(!t_master.count("partition_graph::convergence::getY")) t_master["partition_graph::convergence::getY"]=0;
t_master["partition_graph::convergence::getY"] += (T)std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::steady_clock::now( ) - t_tmp ).count()/1e+9;
#endif
- if(debug_level==9) cerr << getTime() << " [debug:9] getY end" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] getY end" << endl;
return x_hat;
}
-unsigned int sumOutDegs(const vector<unsigned int>& nodes) {
-
- if(debug_level==9) cerr << getTime() << " [debug:9] sumOutDegs start" << endl;
- unsigned int sum = 0;
- for (unsigned int a = 0; a < nodes.size(); a++) {
- unsigned int from = nodes[a];
- sum += graph[from].edges.size();
- }
-
- if(debug_level==9) cerr << getTime() << " [debug:9] sumOutDegs end" << endl;
- return sum/2;
-}
-
double getConnectivity_double(vector<unsigned int> *nodes, bool useLapack, vector<double> *x_hat) {
bool useWeights = (param_useWeights && nodes->size() <= param_max_nodes_weight); // param_useWeights = user input on whether weights should be used. useWeights = the effective value: true if param_useWeights is true and the maximum number of nodes for the weighted algorithm (param_max_nodes_weight) is not exceeded
@@ -1833,7 +2034,8 @@ double getConnectivity_double(vector<unsigned int> *nodes, bool useLapack, vecto
double connectivity = -1;
*x_hat = vector<double>(n);
- if( n < 32768 && useLapack ){
+ if( ( n < 32768 && useLapack && param_useLapack == 1 ) || param_useLapack == 2 ){
+ if (debug_level > 0) cerr << getTime() << " [DEBUG]@double using LAPACK" << endl;
// maximal number of nodes 32768 (2^15 = SHRT_MAX) -> laplace matrix 2^15*2^15=2^30 (max vector size) entries
// max vector size = std::vector<int> myvector; cout << myvector.max_size() << "\n"; -> 2^30
@@ -1934,14 +2136,15 @@ double getConnectivity_double(vector<unsigned int> *nodes, bool useLapack, vecto
connectivity = eigenvalues[0]/((double)n);
}
for(unsigned int i = 0 ; i < (unsigned int)n ; i++)
- (*x_hat)[i]=eigenvectors[i];
+ (*x_hat)[i]=-eigenvectors[i];
// deallocate
delete [] eigenvectors;
}else{
+ if (debug_level > 0) cerr << getTime() << " [DEBUG]@double using POWER" << endl;
- if(debug_level==9) cerr << getTime() << " [debug:9] start" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] start" << endl;
if(param_useWeights && !useWeights){
cerr << " [INFO] The maximum number of nodes for the weighted algorithm is exeeded. Continue now with the faster unweighted algorithm." << "\n";
@@ -1952,22 +2155,29 @@ double getConnectivity_double(vector<unsigned int> *nodes, bool useLapack, vecto
if(useWeights){
for (unsigned int i = 0; i < (unsigned int)n; i++) {
diag[i]=0;
+ unsigned int from = (*nodes)[i];
+ if(!mapping.count(from)){continue;}
+
for (unsigned int j = 0; j < graph[(*nodes)[i]].edges.size(); j++) {
+
+ unsigned int to = graph[from].edges[j].edge;
+ if(!mapping.count(to)){continue;}
diag[i] += graph[(*nodes)[i]].edges[j].weight;
if(useWeights && maxWeight<graph[(*nodes)[i]].edges[j].weight)maxWeight=graph[(*nodes)[i]].edges[j].weight;
}
}
- }else{
- for (unsigned int i = 0; i < (unsigned int)n; i++) {
- diag[i]=graph[(*nodes)[i]].edges.size();
- if(useWeights)
- for (unsigned int j = 0; j < graph[(*nodes)[i]].edges.size(); j++) {
- if(maxWeight<graph[(*nodes)[i]].edges[j].weight)maxWeight=graph[(*nodes)[i]].edges[j].weight;
- }
- }
}
+ // else{
+ // for (unsigned int i = 0; i < (unsigned int)n; i++) {
+ // diag[i]=graph[(*nodes)[i]].edges.size();
+ // if(useWeights)
+ // for (unsigned int j = 0; j < graph[(*nodes)[i]].edges.size(); j++) {
+ // if(maxWeight<graph[(*nodes)[i]].edges[j].weight)maxWeight=graph[(*nodes)[i]].edges[j].weight;
+ // }
+ // }
+ // }
- if(debug_level==9) cerr << getTime() << " [debug:9] ini done" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] ini done" << endl;
// Get max degree / sum of weights of nodes
unsigned int max_d = max_of_diag((*nodes),diag);
@@ -1991,7 +2201,7 @@ double getConnectivity_double(vector<unsigned int> *nodes, bool useLapack, vecto
while(++iter < param_max_iter) {
- if(debug_level==9) cerr << getTime() << " [debug:9] iter:" << iter << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] iter:" << iter << endl;
last_length = current_length;
@@ -2009,7 +2219,7 @@ double getConnectivity_double(vector<unsigned int> *nodes, bool useLapack, vecto
if ( abs(current_length-last_length) < param_epsilon && iter >= min_iter ) break; // prevent convergence by chance, converge to param_epsilon
}
- if(debug_level==9) cerr << getTime() << " [debug:9] post while" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9] post while" << endl;
#ifdef timeAnalysis
if(!t_master.count("partition_graph::convergence")) t_master["partition_graph::convergence"]=0;
@@ -2027,9 +2237,8 @@ double getConnectivity_double(vector<unsigned int> *nodes, bool useLapack, vecto
}else{
connectivity = (-current_length+(double)2*max_d)/((double)n);
}
- normalize(x_hat, &current_length);
-
- if(debug_level==9) cerr << getTime() << " [debug:9] post last norm" << endl;
+ // normalize(x_hat, &current_length);
+ //if(debug_level==9) cerr << getTime() << " [DEBUG:9] post last norm" << endl;
// 5.17 catch hardly-converging groups
if (iter >= param_max_iter) {
@@ -2037,7 +2246,6 @@ double getConnectivity_double(vector<unsigned int> *nodes, bool useLapack, vecto
}
}
-
if (debug_level > 1){
cerr << getTime() << " [DEBUG] Connectivity score:" << connectivity;
if ( (debug_level > 1 && (unsigned int)n<100 ) || debug_level > 2 ){
@@ -2069,7 +2277,7 @@ double getConnectivity_double(vector<unsigned int> *nodes, bool useLapack, vecto
return connectivity;
}
-float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<float> *x_hat) {
+float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<float> *x_hat){
if (debug_level > 0) cerr << getTime() << " getConnectivity_float"<< endl;
@@ -2088,7 +2296,9 @@ float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<
float connectivity = -1;
*x_hat = vector<float>(n);
- if( n < 32768 && useLapack ){
+ if( ( n < 32768 && useLapack && param_useLapack == 1 ) || param_useLapack == 2 ){
+
+ if (debug_level > 0) cerr << getTime() << " [DEBUG:9]@float using LAPACK" << endl;
// maximal number of nodes 32768 (2^15 = SHRT_MAX) -> laplace matrix 2^15*2^15=2^30 (max vector size) entries
// max vector size = std::vector<int> myvector; cout << myvector.max_size() << "\n"; -> 2^30
@@ -2096,30 +2306,6 @@ float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<
float * laplacian = (float*)calloc( (unsigned int)n*(unsigned int)n,sizeof(float) );
- if(debug_level==15){
- unsigned int id=0;
- for(unsigned int i = 0 ; i < (unsigned int)n ; i++){
- unsigned int from = (*nodes)[i];
- if(!mapping.count(from)){continue;}
- unsigned int sum = 0;
- for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
- id+=graph[from].edges[j].weight;
- }
- }
- ofstream OFS(("cluster_n"+to_string(n)+"_id"+to_string(id)+".edgelist").c_str());
- OFS << "source\ttarget\tweight"<<endl;
- for(unsigned int i = 0 ; i < (unsigned int)n ; i++){
- unsigned int from = (*nodes)[i];
- if(!mapping.count(from)){continue;}
- unsigned int sum = 0;
- for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
- unsigned int to = graph[from].edges[j].edge;
- OFS << from << "\t" << to << "\t" << graph[from].edges[j].weight << endl;
- }
- }
- OFS.close();
- }
-
bool fill_laplacian_return=1; // return value of the fill algorithm -> true : all fine, false : ERROR
// fill laplacian
for(unsigned int i = 0 ; i < (unsigned int)n ; i++){
@@ -2131,7 +2317,6 @@ float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<
for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
unsigned int to = graph[from].edges[j].edge;
-
if(!mapping.count(to)){continue;}
unsigned int vector_idx = mapping[from] + mapping[to]*n; // transform the 2-d coordinate (i,j) of the nxn matrix to the 1-d coordinate i+j*n of the n*n vector
@@ -2171,7 +2356,7 @@ float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<
float eigenvalues[(unsigned int)n]; // need only 1 eigenvalue
il = 2; //that is the second one (il=1 -> the first one, il=2 the second one)
iu = 2;
- if(debug_level==15){ iu = 5; } // generate more vectors for debug 15
+ //if(debug_level==15){ iu = 5; } // generate more vectors for debug 15
float * eigenvectors = (float*)malloc( (unsigned int)ldz*(unsigned int)m*(iu-il+1)*sizeof(float) );
float eps=param_epsilon;
@@ -2213,35 +2398,15 @@ float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<
connectivity = eigenvalues[0]/((float)n);
}
for(unsigned int i = 0 ; i < (unsigned int)n ; i++)
- (*x_hat)[i]=eigenvectors[i];
+ (*x_hat)[i]=-eigenvectors[i];
- if(debug_level==15){
- unsigned int id=0;
- for(unsigned int i = 0 ; i < (unsigned int)n ; i++){
- unsigned int from = (*nodes)[i];
- if(!mapping.count(from)){continue;}
- unsigned int sum = 0;
- for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
- id+=graph[from].edges[j].weight;
- }
- }
-
- for(unsigned int ci = 0 ; ci < 5 ; ci++){
- ofstream OFS(("cluster_n"+to_string(n)+"_id"+to_string(id)+"_ci"+to_string(ci)+".nodeweight").c_str());
- OFS << "id\teigenvector"<<endl;
- for(unsigned int i = 0 ; i < (unsigned int)n ; i++){
- OFS << (*nodes)[i] <<"\t" << eigenvectors[i+n*ci] << endl;
- }
- OFS.close();
- }
- }
-
- // deallocate
delete [] eigenvectors;
}else{
- if(debug_level==9) cerr << getTime() << " [debug:9]@float start" << endl;
+ if (debug_level > 0) cerr << getTime() << " [DEBUG:9]@float using POWER" << endl;
+
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9]@float start" << endl;
if(param_useWeights && !useWeights){
cerr << " [INFO] The maximum number of nodes for the weighted algorithm is exeeded. Continue now with the faster unweighted algorithm." << "\n";
@@ -2250,27 +2415,31 @@ float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<
vector<unsigned int> diag(n);
if(useWeights){
-
- #pragma omp parallel for reduction(max: maxWeight)
for (unsigned int i = 0; i < (unsigned int)n; i++) {
diag[i]=0;
+ unsigned int from = (*nodes)[i];
+ if(!mapping.count(from)){continue;}
+
for (unsigned int j = 0; j < graph[(*nodes)[i]].edges.size(); j++) {
+
+ unsigned int to = graph[from].edges[j].edge;
+ if(!mapping.count(to)){continue;}
diag[i] += graph[(*nodes)[i]].edges[j].weight;
if(useWeights && maxWeight<graph[(*nodes)[i]].edges[j].weight)maxWeight=graph[(*nodes)[i]].edges[j].weight;
}
}
- }else{
-
- #pragma omp parallel for reduction(max: maxWeight)
- for (unsigned int i = 0; i < (unsigned int)n; i++) {
- diag[i]=graph[(*nodes)[i]].edges.size();
- if(useWeights)
- for (unsigned int j = 0; j < graph[(*nodes)[i]].edges.size(); j++) {
- if(maxWeight<graph[(*nodes)[i]].edges[j].weight)maxWeight=graph[(*nodes)[i]].edges[j].weight;
- }
- }
}
- if(debug_level==9) cerr << getTime() << " [debug:9]@float ini done" << endl;
+ // else{
+ // #pragma omp parallel for reduction(max: maxWeight)
+ // for (unsigned int i = 0; i < (unsigned int)n; i++) {
+ // diag[i]=graph[(*nodes)[i]].edges.size();
+ // if(useWeights)
+ // for (unsigned int j = 0; j < graph[(*nodes)[i]].edges.size(); j++) {
+ // if(maxWeight<graph[(*nodes)[i]].edges[j].weight)maxWeight=graph[(*nodes)[i]].edges[j].weight;
+ // }
+ // }
+ // }
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9]@float ini done" << endl;
// Get max degree / sum of weights of nodes
unsigned int max_d = max_of_diag((*nodes),diag);
@@ -2294,7 +2463,7 @@ float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<
while(++iter < param_max_iter) {
- if(debug_level==9) cerr << getTime() << " [debug:9]@float iter:" << iter << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9]@float iter:" << iter << endl;
last_length = current_length;
@@ -2312,7 +2481,7 @@ float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<
if ( abs(current_length-last_length) < param_epsilon && iter >= min_iter ) break; // prevent convergence by chance, converge to param_epsilon
}
- if(debug_level==9) cerr << getTime() << " [debug:9]@float post while" << endl;
+ if(debug_level==9) cerr << getTime() << " [DEBUG:9]@float post while" << endl;
#ifdef timeAnalysis
if(!t_master.count("partition_graph::convergence")) t_master["partition_graph::convergence"]=0;
@@ -2330,16 +2499,18 @@ float getConnectivity_float(vector<unsigned int> *nodes, bool useLapack, vector<
}else{
connectivity = (-current_length+(float)2*max_d)/((float)n);
}
- normalize(x_hat, &current_length);
-
- if(debug_level==9) cerr << getTime() << " [debug:9]@float post last norm" << endl;
+ //normalize(x_hat, &current_length);
+ //if(debug_level==9) cerr << getTime() << " [DEBUG:9]@float post last norm" << endl;
// 5.17 catch hardly-converging groups
if (iter >= param_max_iter) {
if (debug_level > 0) cerr << getTime() << " [DEBUG] Connectivity score of connected component with " << n << " elements did not converge perfectly in time." << "\n";
}
+ if(debug_level>0) cerr << getTime() << " [DEBUG] length:" << current_length << endl;
}
+ //float cl = 0;
+ //normalize(x_hat, &cl);
if (debug_level > 1){
cerr << getTime() << " [DEBUG] Connectivity score:" << connectivity;
@@ -2720,5 +2891,4 @@ bool test__getY(){
}
-
-#endif /* _PROTEINORTHOCLUSTERING */
+#endif /* _PROTEINORTHOCLUSTERING */
\ No newline at end of file
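
The purity-selection block added earlier in the proteinortho_clustering.cpp hunks chooses the threshold automatically: every node whose Fiedler-vector entry x_hat has a small absolute value (<= 0.1) proposes its own |x_hat| as a candidate threshold, the edge weight crossing from the nodes at or below that candidate to the nodes above it is summed, and the candidate with the smallest positive cut that still covers more than one node is kept; if no candidate qualifies, the purity is set to 0 and thereby disabled. The following is a minimal, self-contained sketch of that selection step only, not the patched function itself; the Edge struct, the adjacency list, and the toy component in main() are illustrative and stand in for the graph/mapping bookkeeping of the real code.

// Sketch of the automatic purity detection added in this patch (illustration only,
// simplified from the diff above; not upstream code).
#include <cmath>
#include <cstdio>
#include <vector>

struct Edge { unsigned int to; float weight; };   // stand-in for the real edge type

// pick a purity threshold from the (approximate) Fiedler vector x_hat of one component
float detect_purity(const std::vector<std::vector<Edge> >& adjacency,
                    const std::vector<float>& x_hat) {
    float best_purity = 0, best_cut = -1;
    for (unsigned int c = 0; c < x_hat.size(); c++) {
        float test_purity = std::fabs(x_hat[c]);
        if (test_purity > 0.1f) continue;                 // only near-zero entries are candidates
        float cut = 0; unsigned int inside = 0;
        for (unsigned int i = 0; i < x_hat.size(); i++) {
            if (std::fabs(x_hat[i]) > test_purity) continue;
            inside++;
            for (unsigned int j = 0; j < adjacency[i].size(); j++)   // edges leaving the "impure" set
                if (std::fabs(x_hat[adjacency[i][j].to]) > test_purity)
                    cut += adjacency[i][j].weight;
        }
        if (cut > 0 && inside > 1 && (best_cut < 0 || cut < best_cut)) {
            best_cut = cut;                               // keep the smallest positive cut
            best_purity = test_purity;
        }
    }
    return best_purity;                                   // 0 means: purity disabled
}

int main() {
    // toy component: a path 0-1-2-3 whose middle nodes sit near zero in x_hat
    std::vector<std::vector<Edge> > adj(4);
    adj[0].push_back({1, 1.0f});
    adj[1].push_back({0, 1.0f}); adj[1].push_back({2, 1.0f});
    adj[2].push_back({1, 1.0f}); adj[2].push_back({3, 1.0f});
    adj[3].push_back({2, 1.0f});
    std::vector<float> x_hat;
    x_hat.push_back(-0.7f); x_hat.push_back(-0.03f);
    x_hat.push_back(0.02f); x_hat.push_back(0.7f);
    std::printf("detected purity: %g\n", detect_purity(adj, x_hat));  // prints 0.03
    return 0;
}
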
=====================================
test/K5.blast-graph
=====================================
@@ -0,0 +1,32 @@
+# file_a file_b
+# a b evalue_ab bitscore_ab evalue_ba bitscore_ba
+# A B
+# 7.9e-34 138.3 7.9e-34 138.3
+a1 b1 7.9e-34 138.3 7.9e-34 138.3
+# A C
+# 7.9e-34 138.3 7.9e-34 138.3
+a1 c1 7.9e-34 138.3 7.9e-34 138.3
+# A D
+# 7.9e-34 138.3 7.9e-34 138.3
+a1 d1 7.9e-34 138.3 7.9e-34 138.3
+# A E
+# 7.9e-34 138.3 7.9e-34 138.3
+a1 e1 7.9e-34 138.3 7.9e-34 138.3
+# B C
+# 7.9e-34 138.3 7.9e-34 138.3
+b1 c1 7.9e-34 138.3 7.9e-34 138.3
+# B D
+# 7.9e-34 138.3 7.9e-34 138.3
+b1 d1 7.9e-34 138.3 7.9e-34 138.3
+# B E
+# 7.9e-34 138.3 7.9e-34 138.3
+b1 e1 7.9e-34 138.3 7.9e-34 138.3
+# C D
+# 7.9e-34 138.3 7.9e-34 138.3
+c1 d1 7.9e-34 138.3 7.9e-34 138.3
+# C E
+# 7.9e-34 138.3 7.9e-34 138.3
+c1 e1 7.9e-34 138.3 7.9e-34 138.3
+# D E
+# 7.9e-34 138.3 7.9e-34 138.3
+d1 e1 7.9e-34 138.3 7.9e-34 138.3
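
The comment lines of these test graphs double as format documentation: a line starting with '#' names the file pair or carries summary scores for that pair, and every other line is one reciprocal edge of the form 'a b evalue_ab bitscore_ab evalue_ba bitscore_ba'. Below is a minimal sketch of reading such edge lines, assuming whitespace-separated fields as shown above; it is an illustration, not the parser used in proteinortho_clustering.cpp.

// Sketch: read the edge lines of a *.blast-graph file (illustration only).
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

int main(int argc, char** argv) {
    if (argc < 2) { std::cerr << "usage: " << argv[0] << " <file.blast-graph>\n"; return 1; }
    std::ifstream in(argv[1]);
    std::string line;
    while (std::getline(in, line)) {
        if (line.empty() || line[0] == '#') continue;     // skip file-pair / summary lines
        std::istringstream fields(line);
        std::string a, b;
        double evalue_ab, bitscore_ab, evalue_ba, bitscore_ba;
        if (fields >> a >> b >> evalue_ab >> bitscore_ab >> evalue_ba >> bitscore_ba)
            std::cout << a << " -- " << b << "\taverage bitscore "
                      << (bitscore_ab + bitscore_ba) / 2 << "\n";
    }
    return 0;
}
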
=====================================
test/P3_K5.blast-graph
=====================================
@@ -0,0 +1,54 @@
+# file_a file_b
+# a b evalue_ab bitscore_ab evalue_ba bitscore_ba
+# A B
+# 2.3e-31 132.9 7.9e-34 138.3
+a1 b1 3.2e-54 208.8 3.4e-57 216.1
+a2 b2 3.2e-54 208.8 3.4e-57 216.1
+a3 b3 3.2e-54 208.8 3.4e-57 216.1
+# A C
+# 2.3e-31 132.9 7.9e-34 138.3
+a1 c1 3.2e-54 208.8 3.4e-57 216.1
+a2 c2 3.2e-54 208.8 3.4e-57 216.1
+a3 c3 3.2e-54 208.8 3.4e-57 216.1
+# A D
+# 2.3e-31 132.9 7.9e-34 138.3
+a1 d1 3.2e-54 208.8 3.4e-57 216.1
+a1 d2 3.2e-54 208.8 3.4e-57 216.1
+a2 d2 3.2e-54 208.8 3.4e-57 216.1
+a2 d3 3.2e-54 208.8 3.4e-57 216.1
+a3 d3 3.2e-54 208.8 3.4e-57 216.1
+# A E
+# 2.3e-31 132.9 7.9e-34 138.3
+a1 e1 3.2e-54 208.8 3.4e-57 216.1
+a2 e2 3.2e-54 208.8 3.4e-57 216.1
+a3 e3 3.2e-54 208.8 3.4e-57 216.1
+# B C
+# 2.3e-31 132.9 7.9e-34 138.3
+b1 c1 3.2e-54 208.8 3.4e-57 216.1
+b2 c2 3.2e-54 208.8 3.4e-57 216.1
+b3 c3 3.2e-54 208.8 3.4e-57 216.1
+# B D
+# 2.3e-31 132.9 7.9e-34 138.3
+b1 d1 3.2e-54 208.8 3.4e-57 216.1
+b2 d2 3.2e-54 208.8 3.4e-57 216.1
+b3 d3 3.2e-54 208.8 3.4e-57 216.1
+# B E
+# 2.3e-31 132.9 7.9e-34 138.3
+b1 e1 3.2e-54 208.8 3.4e-57 216.1
+b2 e2 3.2e-54 208.8 3.4e-57 216.1
+b3 e3 3.2e-54 208.8 3.4e-57 216.1
+# C D
+# 2.3e-31 132.9 7.9e-34 138.3
+c1 d1 3.2e-54 208.8 3.4e-57 216.1
+c2 d2 3.2e-54 208.8 3.4e-57 216.1
+c3 d3 3.2e-54 208.8 3.4e-57 216.1
+# C E
+# 2.3e-31 132.9 7.9e-34 138.3
+c1 e1 3.2e-54 208.8 3.4e-57 216.1
+c2 e2 3.2e-54 208.8 3.4e-57 216.1
+c3 e3 3.2e-54 208.8 3.4e-57 216.1
+# D E
+# 2.3e-31 132.9 7.9e-34 138.3
+d1 e1 3.2e-54 208.8 3.4e-57 216.1
+d2 e2 3.2e-54 208.8 3.4e-57 216.1
+d3 e3 3.2e-54 208.8 3.4e-57 216.1
=====================================
test/P4.blast-graph
=====================================
@@ -0,0 +1,11 @@
+# file_a file_b
+# a b evalue_ab bitscore_ab evalue_ba bitscore_ba
+# A B
+# 7.9e-34 138.3 7.9e-34 138.3
+a1 b1 7.9e-34 138.3 7.9e-34 138.3
+# B C
+# 7.9e-34 138.3 7.9e-34 138.3
+b1 c1 7.9e-34 138.3 7.9e-34 138.3
+# C D
+# 7.9e-34 138.3 7.9e-34 138.3
+c1 d1 7.9e-34 138.3 7.9e-34 138.3
=====================================
test/myproject.blast-graph
=====================================
@@ -0,0 +1,169 @@
+# file_a file_b
+# a b evalue_ab bitscore_ab evalue_ba bitscore_ba
+# E.faa C.faa
+# 3.60017e-123 437.95 3.80065e-124 440.45
+E_10 C_10 7.2e-123 430.6 7.6e-124 434.1
+E_11 C_11 1.7e-50 189.1 1.6e-49 186.0
+E_13 C_13 1.5e-26 109.0 7.9e-26 106.7
+E_14 C_14 8.0e-162 560.5 6.7e-162 560.8
+E_15 C_15 7.8e-99 350.5 2.0e-100 355.9
+E_16 C_16 2.7e-45 171.8 6.0e-46 174.1
+E_17 C_17 5.2e-184 634.4 2.8e-180 622.1
+E_18 C_64 1.3e-62 229.9 7.1e-64 234.2
+E_19 C_22 1.4e-29 118.6 6.8e-30 119.8
+E_19 C_63 1.9e-31 124.8 6.1e-31 123.2
+E_313 C_12 1.6e-130 456.4 2.3e-129 452.6
+E_315 C_12 3.4e-127 445.3 1.3e-127 446.8
+E_317 C_1 9.4e-134 467.2 6.1e-134 468.0
+E_366 C_1 2.6e-128 449.1 6.4e-131 458.0
+E_368 C_1 3.6e-133 465.3 1.5e-132 463.4
+E_437 C_1 3.3e-129 452.2 1.7e-128 449.9
+# C2.faa L.faa
+# 5.5e-128 446.8 1.9e-130 449.9
+C_10 L_10 5.5e-128 446.8 1.9e-130 449.9
+# C2.faa E.faa
+# 7.6e-124 434.1 1e-124 430.6
+C_10 E_10 7.6e-124 434.1 1.0e-124 430.6
+# C2.faa C.faa
+# 0 1382.5 0 1382.5
+C_10 C_10 0.0e+00 1382.5 0.0e+00 1382.5
+# C2.faa M.faa
+# 3.6e-127 444.1 3.4e-129 445.7
+C_10 M_10 3.6e-127 444.1 3.4e-129 445.7
+# L.faa C.faa
+# 5.5e-94 360.75 1.05e-94 362.85
+L_10 C_10 1.3e-128 449.9 5.5e-128 446.8
+L_11 C_11 1.8e-58 215.7 3.0e-57 210.7
+L_14 C_14 3.6e-121 425.2 2.3e-123 431.8
+L_15 C_15 4.3e-86 308.1 2.1e-87 311.6
+L_16 C_16 1.1e-41 159.8 3.4e-42 160.6
+L_17 C_17 6.0e-169 584.3 1.3e-167 578.9
+L_18 C_64 1.7e-64 236.1 1.4e-65 238.8
+L_19 C_63 8.0e-28 112.8 1.8e-28 114.0
+L_2 C_164 3.3e-93 331.6 3.8e-93 330.5
+L_2 C_166 2.2e-89 318.9 1.0e-90 322.4
+L_2 C_167 9.8e-90 320.1 1.5e-89 318.5
+L_2 C_2 1.1e-93 333.2 2.1e-94 334.7
+L_20 C_20 8.1e-171 590.5 2.2e-170 588.2
+L_313 C_12 1.8e-115 406.4 2.3e-114 401.7
+L_313 C_21 5.1e-110 388.3 4.2e-111 391.0
+L_323 C_1 5.1e-135 471.5 3.0e-134 468.0
+L_336 C_1 1.9e-131 459.5 3.1e-131 458.0
+L_621 C_13 1.9e-34 135.2 1.5e-33 131.3
+L_627 C_1 3.9e-135 471.9 2.3e-134 468.4
+L_631 C_1 6.0e-136 474.6 2.6e-138 481.5
+# M.faa C.faa
+# 1.9e-117 412.9 2.3e-117 411.8
+M_10 C_10 2.4e-127 445.7 3.6e-127 444.1
+M_11 C_11 4.5e-57 211.1 1.3e-55 205.3
+M_14 C_14 3.6e-127 445.3 2.8e-129 451.4
+M_15 C_15 2.7e-83 298.9 3.3e-85 304.3
+M_16 C_16 1.1e-35 139.8 8.1e-36 139.4
+M_17 C_17 5.4e-154 534.6 1.7e-154 535.4
+M_18 C_64 1.5e-68 249.6 8.1e-69 249.6
+M_19 C_63 3.1e-24 100.9 1.6e-24 100.9
+M_2 C_164 2.7e-92 328.6 4.7e-91 323.6
+M_2 C_166 1.1e-90 323.2 1.4e-90 322.0
+M_2 C_167 1.7e-94 335.9 3.8e-93 330.5
+M_2 C_2 2.5e-93 332.0 8.8e-93 329.3
+M_20 C_20 1.1e-170 590.1 2.2e-170 588.2
+M_313 C_12 1.9e-117 412.9 2.3e-117 411.8
+M_323 C_1 9.5e-134 467.2 2.2e-132 461.8
+M_331 C_1 9.3e-129 450.7 4.5e-130 454.1
+M_336 C_1 1.0e-132 463.8 8.8e-134 466.5
+M_621 C_13 6.4e-38 146.7 3.7e-37 143.3
+M_630 C_1 8.7e-127 444.1 1.6e-127 445.7
+M_632 C_1 2.3e-132 462.6 1.6e-130 455.7
+M_636 C_1 4.1e-129 451.8 6.5e-129 450.3
+# M.faa L.faa
+# 0 1370.1 0 1372.1
+M_10 L_10 0.0e+00 1219.1 0.0e+00 1220.7
+M_11 L_11 1.9e-163 563.5 3.2e-163 562.8
+M_14 L_14 0.0e+00 1298.9 0.0e+00 1296.6
+M_15 L_15 4.8e-262 891.7 6.3e-262 891.3
+M_16 L_16 4.0e-135 469.2 2.6e-135 469.9
+M_17 L_17 0.0e+00 1630.2 0.0e+00 1638.2
+M_18 L_18 4.4e-213 728.8 2.2e-212 726.5
+M_19 L_19 2.5e-73 263.1 7.3e-73 261.5
+M_2 L_2 9.4e-230 784.3 1.1e-228 780.8
+M_20 L_20 0.0e+00 1698.3 0.0e+00 1697.6
+M_3 L_3 3.6e-212 725.7 4.8e-212 725.3
+M_313 L_313 0.0e+00 1370.1 0.0e+00 1372.1
+M_317 L_317 0.0e+00 1414.1 0.0e+00 1412.5
+M_319 L_319 0.0e+00 1478.0 0.0e+00 1473.0
+M_323 L_323 0.0e+00 1545.0 0.0e+00 1542.3
+M_328 L_328 0.0e+00 1474.5 0.0e+00 1473.8
+M_331 L_331 0.0e+00 1559.7 0.0e+00 1560.8
+M_333 L_333 0.0e+00 1568.5 0.0e+00 1566.6
+M_336 L_336 0.0e+00 1464.5 0.0e+00 1469.5
+M_4 L_4 0.0e+00 1321.6 0.0e+00 1315.8
+M_5 L_5 0.0e+00 1607.4 0.0e+00 1607.8
+M_6 L_6 0.0e+00 1629.4 0.0e+00 1625.9
+M_617 L_617 0.0e+00 1484.9 0.0e+00 1483.4
+M_619 L_619 2.1e-82 293.5 6.3e-82 292.0
+M_621 L_621 2.0e-99 350.1 1.2e-99 350.9
+M_623 L_623 4.9e-109 382.1 2.2e-109 383.3
+M_627 L_627 0.0e+00 1313.1 0.0e+00 1323.1
+M_632 L_631 0.0e+00 1248.4 0.0e+00 1235.3
+M_632 L_633 0.0e+00 1232.6 0.0e+00 1226.1
+M_634 L_635 0.0e+00 1309.7 0.0e+00 1314.7
+M_636 L_635 0.0e+00 1347.0 0.0e+00 1352.8
+M_636 L_637 0.0e+00 1297.3 0.0e+00 1297.3
+M_638 L_637 0.0e+00 1344.7 0.0e+00 1343.6
+M_638 L_639 0.0e+00 1343.2 0.0e+00 1338.9
+M_640 L_641 0.0e+00 1488.0 0.0e+00 1488.0
+M_640 L_643 0.0e+00 1479.9 0.0e+00 1481.5
+M_642 L_641 0.0e+00 1505.7 0.0e+00 1496.1
+M_642 L_643 0.0e+00 1528.5 0.0e+00 1521.5
+M_644 L_645 0.0e+00 1691.8 0.0e+00 1692.2
+M_644 L_647 0.0e+00 1627.8 0.0e+00 1630.9
+M_646 L_645 0.0e+00 1702.6 0.0e+00 1700.6
+M_646 L_647 0.0e+00 1662.9 0.0e+00 1663.7
+M_649 L_641 0.0e+00 1444.1 0.0e+00 1442.6
+M_8 L_8 0.0e+00 1667.1 0.0e+00 1672.5
+M_9 L_9 2.0e-103 363.6 5.0e-102 359.0
+# L.faa E.faa
+# 1.70000000085e-145 520.8 3.500145e-149 523.85
+L_10 E_10 9.0e-128 447.2 5.6e-127 443.4
+L_11 E_11 2.1e-65 238.8 8.4e-65 235.7
+L_14 E_14 2.6e-162 562.0 9.9e-164 565.8
+L_15 E_15 4.2e-103 364.8 4.6e-103 363.6
+L_16 E_16 1.6e-59 219.2 1.3e-59 218.4
+L_17 E_17 1.2e-199 686.4 1.4e-201 691.8
+L_18 E_18 7.0e-80 287.3 2.6e-80 287.7
+L_19 E_19 5.3e-30 120.2 3.9e-31 122.9
+L_313 E_313 8.0e-178 613.6 1.2e-177 612.1
+L_317 E_437 1.7e-154 536.2 2.9e-153 531.2
+L_323 E_368 1.4e-180 622.9 3.5e-180 620.5
+L_617 E_432 3.4e-145 505.4 7.0e-149 516.5
+L_621 E_13 1.5e-37 145.6 3.7e-37 143.3
+L_623 E_13 3.7e-39 151.0 7.5e-38 145.6
+L_627 E_368 6.4e-184 634.0 2.3e-184 634.4
+L_631 E_317 1.5e-169 586.3 2.3e-171 591.3
+L_631 E_368 1.4e-175 606.3 1.9e-178 614.8
+L_635 E_317 3.1e-175 605.1 3.1e-173 597.4
+# M.faa E.faa
+# 2e-163 565.8 2.3e-163 564.7
+M_10 E_10 2.6e-127 445.7 2.1e-126 441.4
+M_11 E_11 5.8e-63 230.7 2.3e-62 227.6
+M_14 E_14 5.1e-164 567.8 1.5e-164 568.5
+M_15 E_15 1.2e-102 363.2 3.6e-103 364.0
+M_16 E_16 2.2e-50 188.7 1.5e-50 188.3
+M_17 E_17 9.4e-197 676.8 2.1e-200 688.0
+M_18 E_18 1.8e-80 289.3 3.5e-80 287.3
+M_19 E_19 1.3e-28 115.5 3.6e-29 116.3
+M_313 E_313 1.8e-177 612.5 6.9e-178 612.8
+M_319 E_367 6.9e-156 540.8 2.6e-156 541.2
+M_323 E_317 2.3e-170 589.0 2.0e-167 578.2
+M_323 E_368 6.9e-175 604.0 1.9e-173 598.2
+M_328 E_432 2.7e-152 528.9 2.0e-151 525.0
+M_331 E_368 3.0e-170 588.6 1.6e-172 595.1
+M_336 E_317 2.0e-163 565.8 4.6e-164 567.0
+M_336 E_366 2.9e-165 572.0 8.9e-165 569.3
+M_336 E_368 1.6e-171 592.8 3.2e-173 597.4
+M_621 E_13 3.5e-42 161.0 5.0e-42 159.5
+M_627 E_368 2.1e-176 609.0 2.3e-179 617.8
+M_632 E_368 1.6e-176 609.4 4.5e-175 603.6
+M_634 E_317 4.2e-164 568.2 2.3e-163 564.7
+M_636 E_317 3.5e-163 565.1 2.5e-162 561.2
+M_636 E_366 2.0e-158 549.3 9.0e-157 542.7
=====================================
test/ring4_K5.blast-graph
=====================================
@@ -0,0 +1,66 @@
+# file_a file_b
+# a b evalue_ab bitscore_ab evalue_ba bitscore_ba
+# A B
+# 2.3e-31 132.9 7.9e-34 138.3
+a1 b1 3.2e-54 208.8 3.4e-57 216.1
+a2 b2 3.2e-54 208.8 3.4e-57 216.1
+a3 b3 3.2e-54 208.8 3.4e-57 216.1
+a4 b4 3.2e-54 208.8 3.4e-57 216.1
+# A C
+# 2.3e-31 132.9 7.9e-34 138.3
+a1 c1 3.2e-54 208.8 3.4e-57 216.1
+a2 c2 3.2e-54 208.8 3.4e-57 216.1
+a3 c3 3.2e-54 208.8 3.4e-57 216.1
+a4 c4 3.2e-54 208.8 3.4e-57 216.1
+# A D
+# 2.3e-31 132.9 7.9e-34 138.3
+a1 d1 3.2e-54 208.8 3.4e-57 216.1
+a1 d2 3.2e-54 208.8 3.4e-57 216.1
+a2 d2 3.2e-54 208.8 3.4e-57 216.1
+a2 d3 3.2e-54 208.8 3.4e-57 216.1
+a3 d3 3.2e-54 208.8 3.4e-57 216.1
+a3 d4 3.2e-54 208.8 3.4e-57 216.1
+a4 d4 3.2e-54 208.8 3.4e-57 216.1
+a4 d1 3.2e-54 208.8 3.4e-57 216.1
+# A E
+# 2.3e-31 132.9 7.9e-34 138.3
+a1 e1 3.2e-54 208.8 3.4e-57 216.1
+a2 e2 3.2e-54 208.8 3.4e-57 216.1
+a3 e3 3.2e-54 208.8 3.4e-57 216.1
+a4 e4 3.2e-54 208.8 3.4e-57 216.1
+# B C
+# 2.3e-31 132.9 7.9e-34 138.3
+b1 c1 3.2e-54 208.8 3.4e-57 216.1
+b2 c2 3.2e-54 208.8 3.4e-57 216.1
+b3 c3 3.2e-54 208.8 3.4e-57 216.1
+b4 c4 3.2e-54 208.8 3.4e-57 216.1
+# B D
+# 2.3e-31 132.9 7.9e-34 138.3
+b1 d1 3.2e-54 208.8 3.4e-57 216.1
+b2 d2 3.2e-54 208.8 3.4e-57 216.1
+b3 d3 3.2e-54 208.8 3.4e-57 216.1
+b4 d4 3.2e-54 208.8 3.4e-57 216.1
+# B E
+# 2.3e-31 132.9 7.9e-34 138.3
+b1 e1 3.2e-54 208.8 3.4e-57 216.1
+b2 e2 3.2e-54 208.8 3.4e-57 216.1
+b3 e3 3.2e-54 208.8 3.4e-57 216.1
+b4 e4 3.2e-54 208.8 3.4e-57 216.1
+# C D
+# 2.3e-31 132.9 7.9e-34 138.3
+c1 d1 3.2e-54 208.8 3.4e-57 216.1
+c2 d2 3.2e-54 208.8 3.4e-57 216.1
+c3 d3 3.2e-54 208.8 3.4e-57 216.1
+c4 d4 3.2e-54 208.8 3.4e-57 216.1
+# C E
+# 2.3e-31 132.9 7.9e-34 138.3
+c1 e1 3.2e-54 208.8 3.4e-57 216.1
+c2 e2 3.2e-54 208.8 3.4e-57 216.1
+c3 e3 3.2e-54 208.8 3.4e-57 216.1
+c4 e4 3.2e-54 208.8 3.4e-57 216.1
+# D E
+# 2.3e-31 132.9 7.9e-34 138.3
+d1 e1 3.2e-54 208.8 3.4e-57 216.1
+d2 e2 3.2e-54 208.8 3.4e-57 216.1
+d3 e3 3.2e-54 208.8 3.4e-57 216.1
+d4 e4 3.2e-54 208.8 3.4e-57 216.1
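
The new test graphs have a known structure: K5 is the complete graph on five species with one protein each, P4 is the single path a1-b1-c1-d1, and P3_K5/ring4_K5 repeat the five-species clique per protein index and couple the copies through extra A-D edges (a path-like coupling in P3_K5, a ring in ring4_K5). The clustering code earlier in this diff warns when a connected component's density exceeds 1, which indicates duplicated input edges. A minimal sketch of such a density computation follows, assuming density means the sum of node degrees relative to n*(n-1); under that assumption K5 has density exactly 1 and P4 has density 0.5. The adjacency-list representation is illustrative, not the ConnectedComponent type from the patch.

// Sketch: graph density of one connected component (illustration only).
#include <cstdio>
#include <vector>

// adjacency[i] holds the neighbours of node i; every undirected edge appears twice
double component_density(const std::vector<std::vector<unsigned int> >& adjacency) {
    std::size_t n = adjacency.size();
    if (n < 2) return 0.0;
    std::size_t degree_sum = 0;
    for (std::size_t i = 0; i < n; i++) degree_sum += adjacency[i].size();
    return (double)degree_sum / ((double)n * (double)(n - 1));
}

int main() {
    // K5: every node connected to every other node -> density 1
    std::vector<std::vector<unsigned int> > k5(5);
    for (unsigned int i = 0; i < 5; i++)
        for (unsigned int j = 0; j < 5; j++)
            if (i != j) k5[i].push_back(j);
    // P4: the path a1-b1-c1-d1 -> density 0.5
    std::vector<std::vector<unsigned int> > p4(4);
    p4[0].push_back(1);
    p4[1].push_back(0); p4[1].push_back(2);
    p4[2].push_back(1); p4[2].push_back(3);
    p4[3].push_back(2);
    std::printf("K5 density: %.2f, P4 density: %.2f\n",
                component_density(k5), component_density(p4));
    return 0;
}
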
View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/commit/9c332f8fb3030cad69929f9a8e45403d037d877a