[med-svn] [Git][med-team/proteinortho][master] 5 commits: Fix watch file
Andreas Tille (@tille)
gitlab at salsa.debian.org
Sat Nov 26 05:32:44 GMT 2022
Andreas Tille pushed to branch master at Debian Med / proteinortho
Commits:
24cd3c07 by Andreas Tille at 2022-11-26T06:28:27+01:00
Fix watch file
- - - - -
bec02a9c by Andreas Tille at 2022-11-26T06:28:54+01:00
New upstream version 6.1.4+dfsg
- - - - -
3f615c18 by Andreas Tille at 2022-11-26T06:28:54+01:00
routine-update: New upstream version
- - - - -
9acfc5be by Andreas Tille at 2022-11-26T06:28:54+01:00
Update upstream source from tag 'upstream/6.1.4+dfsg'
Update to upstream version '6.1.4+dfsg'
with Debian dir a2621cec7bbd46d26e3d676155a87a14b4b73fb9
- - - - -
85c9aa9e by Andreas Tille at 2022-11-26T06:31:44+01:00
routine-update: Ready to upload to unstable
- - - - -
9 changed files:
- CHANGELOG
- CHANGEUID
- Makefile
- debian/changelog
- debian/watch
- proteinortho6.pl
- src/po_tree.c
- src/proteinortho_clustering.cpp
- src/proteinortho_summary.pl
Changes:
=====================================
CHANGELOG
=====================================
@@ -364,4 +364,15 @@
- new: automatic purity detection (proteinortho_clustering)
- BUGFIX: remove.graph was missing edges removed from purity
24. Okt (6041) v6.1.2
- - BUGFIX: proteinortho2html sometimes got the , wrong in the pop-up window.
\ No newline at end of file
+ - BUGFIX: proteinortho2html sometimes got the , wrong in the pop-up window.
+ 4. Nov (6042) v6.1.3
+ - small bugfix: when cleaning data on symlinks, the new "X_clean" files are generated in abs_path which may not be in the input directory!
+ - removed the precompiled proteinortho_treeBuilderCore as virustotal is not happy with this file (false positive hit: #65, thanks to Björn for pointing this out)
+ - small bugfix: subparaBlast with diamond and e.g. fast did not work previously
+ 15. Nov (6045) v6.1.3
+ - small adjustments to the upper limit of the automatic purity detection system (0.1 -> 0.12)
+ - added the with and without lapack test back to the Makefile (make test or make test_step3)
+ - refined feature for proteinortho_clustering -maxnodes: if max-nodes are exceeded, remove the worst 10% of edges of this CC (by weight) until the CC breaks into smaller CC
+ 23. Nov (6051) v6.1.4
+ - small bugfix: newlines of the output of proteinortho_summary.pl are now printed to STDOUT instead of STDERR, thanks to Matthias Bernt
+ - bugfix: for large projects the step=1 could soft lock itself sometimes, caused by the log_worker (if used with many cores)
=====================================
CHANGEUID
=====================================
@@ -1 +1 @@
-6041
+6051
=====================================
Makefile
=====================================
@@ -96,7 +96,7 @@ endif
dir_guard=@if [ ! -d $(BUILDDIR) ]; then echo "Creating build directory ..."; mkdir -p $(BUILDDIR); fi
.PHONY: all
-all:echoENV $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_summary.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_history.pl $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_treeBuilderCore
+all:echoENV $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_summary.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_history.pl $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_cleanupblastgraph
@echo "[100%] $(GREEN)Everything is compiled with no errors.$(NC)"
$(BUILDDIR)/proteinortho_extract_from_graph.pl: src/proteinortho_extract_from_graph.pl
@@ -264,7 +264,7 @@ else
endif
.PHONY: install
-install: proteinortho6.pl proteinortho $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho_history.pl $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_treeBuilderCore $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_summary.pl $(BUILDDIR)/proteinortho_history.pl
+install: proteinortho6.pl proteinortho $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho_history.pl $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_summary.pl $(BUILDDIR)/proteinortho_history.pl
@echo "INSTALLING everything to $(INSTALLDIR)"
@install -v $^ $(INSTALLDIR);
@echo "$(GREEN)Everything installed successfully to $(INSTALLDIR).$(NC)"
@@ -390,22 +390,22 @@ test_step2: proteinortho6.pl
test_step3: proteinortho6.pl
./proteinortho6.pl -silent -force -project=test_blastp -p=blastp test/*.faa;
@echo "[TEST] 2. -step=3 tests (proteinortho_clustering) "
- @echo -n " [1/3] various test functions of proteinortho_clustering (if this fails, try make clean first): "; \
+ @echo -n " [1/4] various test functions of proteinortho_clustering (if this fails, try make clean first): "; \
OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -test> /dev/null 2>&1
@echo "$(GREEN)passed$(NC)"
- @echo -n " [2/3] Test proteinortho_clustering using lapack: "; \
- OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 2 test_blastp.blast-graph -debug 1 2>test_lapack.err >test_lapack.proteinortho.tsv ; \
+ @echo -n " [2/4] Test proteinortho_clustering using lapack: "; \
+ OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 2 -debug 1 test_blastp.blast-graph 2>test_lapack.err >test_lapack.proteinortho.tsv ; \
set -e ; ./src/chk_test.pl test_lapack.proteinortho.tsv;
@echo "$(GREEN)passed$(NC)"
@perl -lne 'print join "\t", sort split("\t",$$_)' remove.graph | sort > test.A;
- @echo -n " [3/3] Test proteinortho_clustering using power: "; \
- OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 0 test_blastp.blast-graph -debug 1 2>test_power.err >test_power.proteinortho.tsv; \
+ @echo -n " [3/4] Test proteinortho_clustering using power: "; \
+ OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 0 -debug 1 test_blastp.blast-graph 2>test_power.err >test_power.proteinortho.tsv; \
set -e ; ./src/chk_test.pl test_power.proteinortho.tsv;
@echo "$(GREEN)passed$(NC)"
- #@perl -lne 'print join "\t", sort split("\t",$$_)' remove.graph | sort > test.B;
- #@echo -n " [4/4] Test differences between 'using lapack' and 'using power': "; \
- #set -e ; diff test.A test.B;
- #@echo "$(GREEN)passed$(NC)"
+ @perl -lne 'print join "\t", sort split("\t",$$_)' remove.graph | sort > test.B;
+ @echo -n " [4/4] Test differences between 'using lapack' and 'using power': "; \
+ set -e ; diff test.A test.B;
+ @echo "$(GREEN)passed$(NC)"
.PHONY: test_step3_ring4_K5_power
test_step3_ring4_K5_power: proteinortho6.pl
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+proteinortho (6.1.4+dfsg-1) unstable; urgency=medium
+
+ * Fix watch file
+ * New upstream version
+
+ -- Andreas Tille <tille at debian.org> Sat, 26 Nov 2022 06:29:10 +0100
+
proteinortho (6.1.2+dfsg-1) unstable; urgency=medium
[ Andreas Tille ]
=====================================
debian/watch
=====================================
@@ -1,4 +1,4 @@
version=4
-opts="repacksuffix=+dfsg,dversionmangle=auto,repack,compression=xz" \
- https://gitlab.com/paulklemm_PHD/proteinortho/tags?sort=updated_desc .*/archive/.*/proteinortho-v at ANY_VERSION@\.tar\.gz
+opts="repacksuffix=+dfsg,dversionmangle=auto,filenamemangle=s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%@PACKAGE at -$1.tar.gz%" \
+ https://gitlab.com/paulklemm_PHD/proteinortho/tags?sort=updated_desc .*/proteinortho-v at ANY_VERSION@\.tar\.gz
=====================================
proteinortho6.pl
=====================================
@@ -509,7 +509,7 @@ use POSIX;
##########################################################################################
# Variables
##########################################################################################
-our $version = "6.1.2";
+our $version = "6.1.4";
our $step = 0; # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
our $verbose = 1; # 0/1 -> don't / be verbose
our $core = 0;
@@ -705,6 +705,9 @@ foreach my $option (@ARGV) {
}
if($inproject eq ""){$inproject=$project} # if --inproject is not specified then set it to --project
+# for no input -> print usage
+if(scalar @ARGV == 0 && $step == 0){ &print_header;print_usage_more(); exit 1; }
+
if($jobnumber != -1 && $tmp_path eq "" ){print STDERR "Please consider to use the -tmp option in combination with -jobs if you use proteinortho on a computing cluster system. Usually these clusters have dedicated local scratch directories.\n";}
if($selfblast){$checkblast=1;}
@@ -943,7 +946,7 @@ if( $keep && $cpus > 8 ){ print STDERR "!!!$ORANGE\n[Warning]:$NC The '-keep' op
# 6.0.32 check if there are _clean files available -> use it instead.
sub search_clean_file{
my $file = shift;
- my $file_clean = abs_path $file;
+ my $file_clean = $file;
$file_clean=~s/(\.[^.]+)$/_clean$1/;
if(-e $file_clean){return $file_clean}
return $file;
@@ -999,10 +1002,25 @@ if ($step == 0 || $step == 1) {
if($verbose){print STDERR "\n$GREEN**Step 1**$NC\n";}
$do_log=1;
+ $_->kill('KILL')->detach foreach threads->list();
threads->create('log_Worker');
foreach my $file (@files) { $step1_QUEUE->enqueue($file); }
- for (my $i = 0; $i < (scalar @files > $cpus+10 || $debug ? $cpus : 1) ; $i++) { threads->create('generate_indices_Worker') } # spawn a thread for each core
+ if($debug){print STDERR "Creating ".(scalar @files > $cpus+10 || $debug ? $cpus : 1)." generate_indices_Worker (cpus=$cpus) for ".$step1_QUEUE->pending." jobs\n"}
+ if($debug){print STDERR "currently running ".(threads->list())." threads (pre spawn)\n";}
+ threads->create('generate_indices_Worker');
+
+ # INFO:
+ # here we use only one thread to generate the db files !
+ # -> diamond will fail if multiple makedb instances are running in parallel,
+ # prducing un-joinable running threads (stuck within the `$cmd 2>&1` in the generate_indices)
+ # approx 1 in 10 runs fail with a infinte loop see issue #69 (https://gitlab.com/paulklemm_PHD/proteinortho/-/issues/69)
+ # work-around in 6.1.4
+
+ # code for multiple threads (does not work sometimes for diamond):
+ #for (my $i = 0; $i < ((scalar @files > 500 && scalar @files > $cpus+10) || $debug ? $cpus : 1) ; $i++) { my $tmp = threads->create('generate_indices_Worker') } # spawn a thread for each core
+ if($debug){print STDERR "currently running ".(threads->list())." threads (post spawn)\n";}
$_->join() foreach threads->list();
+# $_->kill('KILL')->detach foreach threads->list();
#&generate_indices(@files); # Generate index files for blast
if ($desc) {
@@ -1297,6 +1315,8 @@ sub hmmWorker {
}
}
close($FHret);
+
+ threads->exit;
}
@@ -1696,6 +1716,8 @@ sub workerthread {
unlink("$temp_file.log");
unlink("$temp_file.tmp.matching");
+ my $last_jobs_time=time();
+
while (1) {
my ($i, $j);
while (1) {
@@ -1704,20 +1726,20 @@ sub workerthread {
# If there is nothing
unless (defined($job)) {
# Check if more jobs need to be done
-
{
lock($jobs_done); # Productive
- if ($jobs_done >= $jobs_todo || ($part != -1 && $jobs_done >= $part)) { # Productive
-# lock($all_jobs_submitted); # DEBUGGING
-# if ($all_jobs_submitted) { # DEBUGGING
+ if ($jobs_done >= $jobs_todo || ($part != -1 && $jobs_done >= $part) || time()-$last_jobs_time > 3600) { # Productive
+ # lock($all_jobs_submitted); # DEBUGGING
+ # if ($all_jobs_submitted) { # DEBUGGING
if ($debug) {print STDERR "Thread $thread_id\tis leaving\n";}
- return;
+ threads->exit;
}}
# If so, wait
- if ($debug) {print STDERR "Thread $thread_id\tis seeking work ($jobs_done / $jobs_todo)\n";}
+ if ($debug) {print STDERR "Thread $thread_id\tis seeking work ($jobs_done / $jobs_todo)\n";}
sleep(1);
}
else {
+ $last_jobs_time=time();
# Parse job
($i, $j) = split(" ",$job);
# Break the fetch loop
@@ -2274,27 +2296,34 @@ sub auto_cpus {
sub log_Worker {
local $SIG{KILL} = sub { threads->exit };
my $tid = threads->tid();
- if($debug){ print STDERR "log_Worker : spawn\n"; }
+ if($debug){ print STDERR "log_Worker : $tid spawn\n"; }
while($do_log){
while($log_QUEUE->pending()){
+ if($debug){ print STDERR "log_Worker inside : $tid active threads=".(scalar grep { $_->is_running() } threads->list())." log_QUEUE=".$log_QUEUE->pending()." step1_QUEUE=".$step1_QUEUE->pending()." check_files_QUEUE=".$check_files_QUEUE->pending()."\n"; }
my $log = $log_QUEUE->dequeue();
if($verbose){ print STDERR $log; }
}
- if($debug){ print STDERR "log_Worker : active threads=".(scalar grep { $_->is_running() } threads->list())."\n"; }
- if(!$log_QUEUE->pending() && !$step1_QUEUE->pending() && !$check_files_QUEUE->pending() && (scalar grep { $_->is_running() } threads->list()) == 0){$do_log=0}
+ if($debug){ print STDERR "log_Worker : $tid active threads=".(scalar grep { $_->is_running() } threads->list())." log_QUEUE=".$log_QUEUE->pending()." step1_QUEUE=".$step1_QUEUE->pending()." check_files_QUEUE=".$check_files_QUEUE->pending()."\n"; }
+ if(!$log_QUEUE->pending() && !$step1_QUEUE->pending() && !$check_files_QUEUE->pending() && (scalar grep { $_->is_running() } threads->list()) <= 0){$do_log=0}
sleep 1;
}
- if($debug){ print STDERR "log_Worker : dead\n"; }
- return;
+ if($debug){ print STDERR "log_Worker : $tid dead\n"; }
+ threads->exit;
}
sub generate_indices_Worker {
- local $SIG{KILL} = sub { threads->exit };
my $tid = threads->tid();
+ local $SIG{KILL} = sub { threads->exit };
+ if($debug){print STDERR "generate_indices_Worker:$tid spawn\n"}
while($step1_QUEUE->pending()){
+ if($debug){print STDERR "generate_indices_Worker:$tid while\n"}
my $file = $step1_QUEUE->dequeue();
+ if(!defined($file)){last;}
+ if($debug){print STDERR "generate_indices_Worker:$tid working on $file\n"}
&generate_indices($file);
+ if($debug){print STDERR "generate_indices_Worker:$tid ok with $file\n"}
}
- return;
+ if($debug){print STDERR "generate_indices_Worker:$tid dead\n"}
+ threads->exit;
}
sub generate_indices {
my $file=$_[0];
@@ -2302,6 +2331,7 @@ sub generate_indices {
my $oldkeep=$keep;
my $cmd="";
my $msg="";
+ if($debug){print STDERR "generate_indices:$file start\n"}
if($verbose){$msg.= "Generating indices";if($force){$msg.= " anyway (forced).\n"}else{$msg.= ".\n";}}
if ($blastmode eq "rapsearch") {
@@ -2330,16 +2360,20 @@ sub generate_indices {
}else{
if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
$cmd="$makedb '$file' -d '$file.$blastmode' --quiet";
+ if($debug){print STDERR "generate_indices:$file $cmd\n"}
my $makedb_ret = `$cmd 2>&1`;
+ if($debug){print STDERR "generate_indices:$file $cmd DONE\n"}
if ($? != 0 && $makedb_ret =~/--ignore-warnings/) {
$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' -d '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now retry with the '--ignore-warnings' option!\n");
$cmd="$makedb '$file' -d '$file.$blastmode' --quiet";
+ if($debug){print STDERR "$cmd\n"}
my $makedb_ret = `$cmd --ignore-warnings 2>&1`;
if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
}elsif($? != 0){$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' -d '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";
mkdir("$tmp_path/DB");
if($step==1){$oldkeep=$keep;$keep=1;}
$cmd = "$makedb '$file' -d '$tmp_path/DB/".basename($file).".$blastmode' --quiet";
+ if($debug){print STDERR "$cmd\n"}
$makedb_ret = `$cmd 2>&1`;
if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");
@@ -2535,7 +2569,9 @@ sub generate_indices {
unlink('formatdb.log') if -e 'formatdb.log';
}
##MARK_FOR_NEW_BLAST_ALGORITHM
+ if($debug){print STDERR "generate_indices:$file almost end\n"}
$log_QUEUE->enqueue($msg);
+ if($debug){print STDERR "generate_indices:$file end\n"}
return;
}
@@ -2557,7 +2593,11 @@ sub blast {
my $blastOptions_diamond=''; #additional option --sensitive (if gene length is larger than 40)
my $max_genlen = 1;
if($blastmode eq "diamond"){
- if (index($blastOptions, "--more-sensitive") == -1 && index($blastOptions, "--sensitive") == -1) {
+ if (index($blastOptions, "--ultra-sensitive") == -1 &&
+ index($blastOptions, "--very-sensitive") == -1 &&
+ index($blastOptions, "--mid-sensitive") == -1 &&
+ index($blastOptions, "--fast") == -1 &&
+ index($blastOptions, "--more-sensitive") == -1 && index($blastOptions, "--sensitive") == -1) {
if( $max_gene_length_diamond{$a} > 40 || $max_gene_length_diamond{$b} > 40 ){ $blastOptions_diamond = "--sensitive" }
}
}
@@ -3024,8 +3064,10 @@ sub check_files {
if($debug){print STDERR "Checking input files queue:".($check_files_QUEUE->pending())."\n"}
$do_log=1;
+ $_->kill('KILL')->detach foreach threads->list();
threads->create('log_Worker');
- for (my $i = 0; $i < (scalar @files > $cpus+10 || $debug ? $cpus : 1) ; $i++) { my $tmp = threads->create('check_files_Worker') }
+ if($debug){print STDERR "Creating ".(scalar @files > $cpus+10 || $debug ? $cpus : 1)." check_files_Worker (cpus=$cpus)\n"}
+ for (my $i = 0; $i < ((scalar @files > 500 && scalar @files > $cpus+10) || $debug ? $cpus : 1) ; $i++) { my $tmp = threads->create('check_files_Worker') }
$_->join() foreach threads->list();
my $all_only_one_entry=1;
@@ -3047,13 +3089,14 @@ sub check_files_Worker {
my %seen_files;
while($check_files_QUEUE->pending()){
my $file = $check_files_QUEUE->dequeue();
+ if(!defined($file)){last;}
if(exists $seen_files{abs_path($file)}){next}
$seen_files{abs_path($file)}=1;
if($debug){print STDERR "check_files_Worker:$tid working on $file\n"}
&read_details($file,0);
}
- if($debug){print STDERR "check_files_Worker:dead\n"}
- return;
+ if($debug){print STDERR "check_files_Worker:$tid dead\n"}
+ threads->exit;
}
sub convertUniprotAndNCBI {
@@ -3171,7 +3214,7 @@ sub read_details {
exit 1;
}elsif(index($curLine, ",") != -1){
# 6.0.32 : check if gene name contains a comma -> this will cause problems with the proteinortho.tsv output (gene cluster separator)
- my $file_clean = abs_path $file;
+ my $file_clean = $file;
$file_clean=~s/(\.[^.]+)$/_clean$1/;
$found_comma_in_file=1;
$msg.="\n$ORANGE [WARNING]$NC input '$file' contains a gene-name with a comma, this causes problems with the proteinortho.tsv output, I will clean the file ('$file_clean') and restart the analysis !!!\nThe line is:\n$curLine\n";
@@ -3216,7 +3259,7 @@ sub read_details {
if($found_comma_in_file && !$from_synteny_call){
# 6.0.32 : replace the , with ; -> write output in a *_clean.* file
- my $file_clean = abs_path $file;
+ my $file_clean = $file; #abs_path $file;
$file_clean=~s/(\.[^.]+)$/_clean$1/;
open(FASTA,"<$file") || &Error("Could not open '$file': $!");
open(FASTA_CLEAN,">$file_clean") || &Error("Could not open '$file_clean': $!");
=====================================
src/po_tree.c
=====================================
@@ -43,7 +43,7 @@ void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen);
void merge (Uint* max, Uint css, Uint viecher, BOOL* away, BOOL** matrix, int** aff, char*** pnamen);
BOOL getmax(BOOL** matrix, BOOL* away, Uint viecher, Uint ccs, int** aff, char*** pnamen);
int sim(BOOL** matrix, Uint a, Uint b, Uint ccs);
-char* myitoa(int zahl, int base);
+char* integer2string(int zahl, int base);
int get_number(char* word);
int main(int argc, char** argv) {
@@ -177,11 +177,11 @@ void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int**
/* transform the number to text
the difference with their affinity is the new branch length (= distance) */
- char* length_0 = myitoa(gemein_0-counter,10);
- char* length_1 = myitoa(gemein_1-counter,10);
+ char* length_0 = integer2string(gemein_0-counter,10);
+ char* length_1 = integer2string(gemein_1-counter,10);
/* transform the number of common genes (= affinity) to text */
- char* anz = myitoa(counter,10);
+ char* anz = integer2string(counter,10);
/* calculate how many place is needed for the new string
sum of both species strings (= subtree) + new numbers + parenteses + : + \0 */
int newsize = strlen((*pnamen)[max[0]])+strlen((*pnamen)[max[1]])+strlen(anz)+strlen(length_0)+strlen(length_1)+8;
@@ -213,7 +213,7 @@ void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int**
/* free */
free(length_0);
free(length_1);
- /* pointer gotten from myitoa */
+ /* pointer gotten from integer2string */
free(anz);
}
@@ -266,7 +266,7 @@ BOOL getmax(BOOL** matrix, BOOL* away, Uint viecher, Uint ccs, int** aff, char**
/* if both entries are the same (as initiated), we are done */
if (bestpair[0] == bestpair[1]) {
/* get the last branch length of the tree */
- char* anz = myitoa(get_number((*pnamen)[0]),10);
+ char* anz = integer2string(get_number((*pnamen)[0]),10);
/* recalculate size of the tree */
int newsize = strlen((*pnamen)[0])+strlen(anz)+2;
/* realloc */
@@ -274,7 +274,7 @@ BOOL getmax(BOOL** matrix, BOOL* away, Uint viecher, Uint ccs, int** aff, char**
/* add the information about the root's branchlength */
(*pnamen)[0] = strcat((*pnamen)[0],":");
(*pnamen)[0] = strcat((*pnamen)[0],anz);
- /* free myitoa's string */
+ /* free integer2string's string */
free(anz);
/* return finished */
return 0;
@@ -431,7 +431,7 @@ int get_number(char* word) {
* returns - char array representing number zahl to base base as text
* Attention: returnvalue needs to be freed separately
*/
-char* myitoa(int zahl, int base) {
+char* integer2string(int zahl, int base) {
int i = 0, k;
char reverse[64];
=====================================
src/proteinortho_clustering.cpp
=====================================
@@ -97,7 +97,7 @@ string getTime(void);
int param_useLapack = 1;
// min/max number of alg con iterations
-unsigned int critical_min_nodes = 16777216;
+// unsigned int critical_min_nodes = 16777216; // replaced
const unsigned int min_iter = 16; // below this value, results may vary
unsigned int param_max_iter = 8192; // below this value, results may vary
float param_epsilon = 1e-8; // analog to http://people.sc.fsu.edu/~jburkardt/c_src/power_method/power_method_prb.c
@@ -442,7 +442,7 @@ void printHelp() {
cerr << " -coreMinSpecies int the minimum number of species for -core ["<<param_coreMinSpecies<<"]" << "\n";
cerr << "\ntechnical parameters:" << "\n";
cerr << " -test various test-functions are called first [not set]" << "\n";
- cerr << " -maxnodes int only consider connected component with up to maxnodes nodes ["<<param_max_nodes<<"]" << "\n";
+ cerr << " -maxnodes int only consider connected component with up to maxnodes nodes. If exceeded, greedily remove the worst 10 percent of edges (by weight) until satisfied ["<<param_max_nodes<<"]" << "\n";
cerr << " -maxweight int only use the edge weights for connected components with maxweight nodes ["<<param_max_nodes_weight<<"]" << "\n";
cerr << " -epsilon float convergence threshold ["<<param_epsilon<<"]" << "\n";
cerr << " -weighted bool the spectral partition is calculated using the bitscores ["<<param_useWeights<<"]" << "\n";
@@ -876,6 +876,47 @@ struct compare_ConnectedComponents { //sort from large to small
}
};
+void removeLowQualityEdges( ConnectedComponent &cur_cc ){
+
+ /*
+ * remove low quality edges
+ *
+ * find the range of values of this CC -> define a cut-off = cut_off=min_w+0.1*(max_w-min_w);
+ * remove all edges below that value
+ * redo BFS and start over again for this cluster until -maxnodes are satisfied
+ *
+ */
+
+ // find the lowest 10% of edge values (min+0.1*(max-min))
+ float min_w = -1;
+ float max_w = -1;
+ for (unsigned int i = 0; i < cur_cc.size(); i++) {
+ unsigned int from=cur_cc[i];
+ for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
+ unsigned int to = graph[from].edges[j].edge;
+ unsigned int weight = graph[from].edges[j].weight;
+ if(min_w == -1 || weight < min_w){min_w=weight;}
+ if(min_w == -1 || weight > max_w){max_w=weight;}
+ }
+ }
+ float cut_off=min_w+0.1*(max_w-min_w);
+ if(debug_level>0) cerr << "cut_off="<<cut_off << " min_w="<<min_w<<" max_w="<<max_w<< endl;
+ for (unsigned int i = 0; i < cur_cc.size(); i++) {
+ unsigned int from=cur_cc[i];
+ auto it = std::remove_if(
+ graph[from].edges.begin(),
+ graph[from].edges.end(),
+ [cut_off,from](wedge cur_el)->bool
+ {
+ if(debug_level>0 && cur_el.weight <= cut_off) cerr << "[WARNING] found bad-edge "<< from << "-"<< cur_el.edge << " weight=" << cur_el.weight << endl;
+ return cur_el.weight <= cut_off;
+ }
+ );
+ graph[from].edges.erase(it, graph[from].edges.end());
+ }
+
+}
+
void find_CCs_givenNodes(dispatch_queue *q, vector<unsigned int> todo_work ){
map<unsigned int, bool> done; // Keep track on what was done (for each node)
@@ -901,7 +942,7 @@ void find_CCs_givenNodes(dispatch_queue *q, vector<unsigned int> todo_work ){
allNodesAreDone = true;
for (unsigned int i = 0 ; i < todo_work.size() ; i++) {
-
+
unsigned int protein_id=todo_work[i];
if ( done.count(protein_id) && done[protein_id] ){continue;}// We were here already
@@ -915,19 +956,14 @@ void find_CCs_givenNodes(dispatch_queue *q, vector<unsigned int> todo_work ){
if(debug_level>0) cerr << "ConnectedComponent: @"<< getCCid(todo_work) << "=>" << cur_cc.size() << "@" << getCCid(cur_cc.m_content_CC) << endl;
- // Skip those that are too large (try critical edge heuristic)
- if (cur_cc.size() > param_max_nodes || cur_cc.size() > critical_min_nodes) {
+ // Skip those that are too large (try heuristic)
+ if (cur_cc.size() > param_max_nodes) {
- if(cur_cc.size() > param_max_nodes)
- cerr << " [WARNING] Found a very large connected component that contains " << cur_cc.size() << ">" << param_max_nodes << " (maxnodes) elements. This behavior can be adjusted using -maxnodes, still you better consider a higher e-value or coverage thresholds. Now using the Critical-Heuristic: try to identify and remove critical edges." << "\n";
- else
- cerr << " [WARNING] Found a rather large connected component that contains " << cur_cc.size() << ">" << critical_min_nodes << " (critical_min_nodes) elements. Now using the Critical-Heuristic: try to identify and remove critical edges." << "\n";
-
- if(cur_cc.size() > param_max_nodes){
- cerr << " [WARNING] The very large connected component that contains " << cur_cc.size() << " elements, did not have any critical edges." << "\n";
- cerr << " [WARNING] Skipping this connected component. This behavior can be adjusted using -maxnodes, still you better consider a higher e-value or coverage thresholds." << "\n";
- continue;
- }
+ // reset done vector
+ for (int i = 0; i < cur_cc.size(); ++i){ done[cur_cc[i]]=false; }
+ if(debug_level>0) cerr << " [WARNING] Found a very large connected component that contains " << cur_cc.size() << ">" << param_max_nodes << " (maxnodes) elements. This behavior can be adjusted using -maxnodes. Now using a slow heuristic: try to identify and remove edges." << "\n";
+ removeLowQualityEdges(cur_cc); i--;
+ continue;
}
if(param_core && cur_cc.species_num==0){
@@ -1066,7 +1102,7 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
* purity is important if there are multiple clusters that can be split or there is no good split
* identified nodes (in purity boundary) are put back to the queue stack (=groupZero)
*
- * maximal purity = 0.1 -> get all nodes with |x_hat(v)|<0.1
+ * maximal purity = 0.12 -> get all nodes with |x_hat(v)|<0.12
* identify all possible purity thresholds
* each purity threshold is evaluated with the total sum of edge weights that are removed with this cut
* find most optimal purity cut-off
@@ -1090,7 +1126,7 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
if(debug_level>0) cerr << " initial purity=0 best_cut=" << best_cut << endl;
for(unsigned int pi = 0; pi < (unsigned int)n; pi++){
unsigned int from = cur_cc[pi];
- if(!mapping.count(from) || abs(x_hat[ mapping[from] ])>0.1 ){continue;}
+ if(!mapping.count(from) || abs(x_hat[ mapping[from] ])>0.12 ){continue;}
float test_purity = abs(x_hat[ mapping[from] ]);
float cut_value=0;
unsigned int num_nodes=0;
@@ -1114,7 +1150,7 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
}
}
- if(new_purity>0.1){new_purity=0;} // if there are no impure nodes -> disable
+ if(new_purity>0.12){new_purity=0;} // if there are no impure nodes -> disable
unsigned int num_impure_nodes=0;
for (unsigned int i = 0; i < x_hat.size(); i++) { if(abs(x_hat[i]) < new_purity) { num_impure_nodes++; } }
if(debug_level>0) cerr << " num_impure_nodes" << num_impure_nodes << endl;
@@ -1391,19 +1427,14 @@ void find_CCs(dispatch_queue *q){
// Do not report singles
if (cur_cc.size() < 2) {continue;} // singletons are from no interest
- // Skip those that are too large (try critical edge heuristic)
- if (cur_cc.size() > param_max_nodes || cur_cc.size() > critical_min_nodes) {
+ // Skip those that are too large (try heuristic)
+ if (cur_cc.size() > param_max_nodes) {
- if(cur_cc.size() > param_max_nodes)
- cerr << " [WARNING] Found a very large connected component that contains " << cur_cc.size() << ">" << param_max_nodes << " (maxnodes) elements. This behavior can be adjusted using -maxnodes, still you better consider a higher e-value or coverage thresholds. Now using the Critical-Heuristic: try to identify and remove critical edges." << "\n";
- else
- cerr << " [WARNING] Found a rather large connected component that contains " << cur_cc.size() << ">" << critical_min_nodes << " (critical_min_nodes) elements. Now using the Critical-Heuristic: try to identify and remove critical edges." << "\n";
-
- if(cur_cc.size() > param_max_nodes){
- cerr << " [WARNING] The very large connected component that contains " << cur_cc.size() << " elements, did not have any critical edges." << "\n";
- cerr << " [WARNING] Skipping this connected component. This behavior can be adjusted using -maxnodes, still you better consider a higher e-value or coverage thresholds." << "\n";
- continue;
- }
+ // reset done vector
+ for (int i = 0; i < cur_cc.size(); ++i){ done[cur_cc[i]]=false; }
+ if(debug_level>0) cerr << " [WARNING] Found a very large connected component that contains " << cur_cc.size() << ">" << param_max_nodes << " (maxnodes) elements. This behavior can be adjusted using -maxnodes. Now using a slow heuristic: try to identify and remove edges." << "\n";
+ removeLowQualityEdges(cur_cc); protein_id--;
+ continue;
}
if(param_core && cur_cc.species_num==0){
=====================================
src/proteinortho_summary.pl
=====================================
@@ -238,7 +238,7 @@ my @keys=sort keys %species_matrix;
$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();
-print STDERR "\n";
+print "\n";
my $ret= "# The adjacency matrix, the number of edges between 2 species\n";
processLine($ret);
$ret= "# file\t";
@@ -269,7 +269,7 @@ $maxNumOfCharsInOneLine=`tput cols`;
chomp($maxNumOfCharsInOneLine);$maxNumOfCharsInOneLine/=2;
if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
-print STDERR "\n";
+print "\n";
$ret= "# file\taverage number of edges\n";
processLine($ret);
for(my $i = 0 ; $i < scalar @keys; $i++){
@@ -289,7 +289,7 @@ $maxNumOfCharsInOneLine=`tput cols`;
chomp($maxNumOfCharsInOneLine);
if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
-print STDERR "\n";
+print "\n";
$ret= "# The 2-path matrix, the number of paths between 2 species of length 2\n";
processLine($ret);
$ret= "# file\t";
@@ -320,7 +320,7 @@ $maxNumOfCharsInOneLine=`tput cols`;
chomp($maxNumOfCharsInOneLine);$maxNumOfCharsInOneLine/=2;
if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
-print STDERR "\n";
+print "\n";
processLine("# file\taverage number of 2-paths\n");
for(my $i = 0 ; $i < scalar @keys; $i++){
View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/compare/427c970d6dc7f60f2e664a405045ea9af4a8fe2e...85c9aa9ec8f6a1e761239bffa81ccefe3ba3779a
--
View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/compare/427c970d6dc7f60f2e664a405045ea9af4a8fe2e...85c9aa9ec8f6a1e761239bffa81ccefe3ba3779a
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221126/b0027fe5/attachment-0001.htm>
More information about the debian-med-commit
mailing list