[med-svn] [Git][med-team/proteinortho][master] 5 commits: Fix watch file

Andreas Tille (@tille) gitlab@salsa.debian.org
Sat Nov 26 05:32:44 GMT 2022



Andreas Tille pushed to branch master at Debian Med / proteinortho


Commits:
24cd3c07 by Andreas Tille at 2022-11-26T06:28:27+01:00
Fix watch file

- - - - -
bec02a9c by Andreas Tille at 2022-11-26T06:28:54+01:00
New upstream version 6.1.4+dfsg
- - - - -
3f615c18 by Andreas Tille at 2022-11-26T06:28:54+01:00
routine-update: New upstream version

- - - - -
9acfc5be by Andreas Tille at 2022-11-26T06:28:54+01:00
Update upstream source from tag 'upstream/6.1.4+dfsg'

Update to upstream version '6.1.4+dfsg'
with Debian dir a2621cec7bbd46d26e3d676155a87a14b4b73fb9
- - - - -
85c9aa9e by Andreas Tille at 2022-11-26T06:31:44+01:00
routine-update: Ready to upload to unstable

- - - - -


9 changed files:

- CHANGELOG
- CHANGEUID
- Makefile
- debian/changelog
- debian/watch
- proteinortho6.pl
- src/po_tree.c
- src/proteinortho_clustering.cpp
- src/proteinortho_summary.pl


Changes:

=====================================
CHANGELOG
=====================================
@@ -364,4 +364,15 @@
 		- new: automatic prurity detetection (proteinortho_clustering)
 		- BUGFIX: remove.graph was missing edges removed from purity
 	24. Okt (6041) v6.1.2
-		- BUGFIX: proteinortho2html some times got the , wrong in the pop-up window.
\ No newline at end of file
+		- BUGFIX: proteinortho2html some times got the , wrong in the pop-up window.
+	4. Nov (6042) v6.1.3
+		- small bugfix: when cleaning data on symlinks, the new "X_clean" files are generated in abs_path which may not be in the input directory! 
+		- removed the precompiled proteinortho_treeBuilderCore as virustotal is not happy with this file (false positive hit: #65, thanks to Björn for pointing this out)
+		- small bugfix: subparaBlast with diamond and e.g. fast did not work previously
+	15. Nov (6045) v6.1.3
+		- small adjustments to the upper limit of the automatic purity detection system (0.1 -> 0.12)
+		- added the with and without lapack test back to the Makefile (make test or make test_step3)
+		- refined feature for proteinortho_clustering -maxnodes: if max-nodes are exceeded, remove the worst 10% of edges of this CC (by weight) until the CC breaks into smaller CC
+	23. Nov (6051) v6.1.4
+		- small bugfix: newlines of the output of proteinortho_summary.pl are now printed to STDOUT instead of STDERR, thanks to Matthias Bernt
+		- bugfix: for large projects, step=1 could sometimes soft-lock itself, caused by the log_worker (if used with many cores)


=====================================
CHANGEUID
=====================================
@@ -1 +1 @@
-6041
+6051


=====================================
Makefile
=====================================
@@ -96,7 +96,7 @@ endif
 dir_guard=@if [ ! -d $(BUILDDIR) ]; then echo "Creating build directory ..."; mkdir -p $(BUILDDIR); fi
 
 .PHONY: all
-all:echoENV $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_summary.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_history.pl $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_treeBuilderCore
+all:echoENV $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_summary.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_history.pl $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_cleanupblastgraph
 	@echo "[100%] $(GREEN)Everything is compiled with no errors.$(NC)"
 
 $(BUILDDIR)/proteinortho_extract_from_graph.pl: src/proteinortho_extract_from_graph.pl
@@ -264,7 +264,7 @@ else
 endif
 
 .PHONY: install
-install: proteinortho6.pl proteinortho $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho_history.pl $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_treeBuilderCore $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_summary.pl $(BUILDDIR)/proteinortho_history.pl 
+install: proteinortho6.pl proteinortho $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho_history.pl $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_summary.pl $(BUILDDIR)/proteinortho_history.pl 
 	@echo "INSTALLING everything to $(INSTALLDIR)"
 	@install -v $^ $(INSTALLDIR);
 	@echo "$(GREEN)Everything installed successfully to $(INSTALLDIR).$(NC)"
@@ -390,22 +390,22 @@ test_step2: proteinortho6.pl
 test_step3: proteinortho6.pl
 	./proteinortho6.pl -silent -force -project=test_blastp -p=blastp test/*.faa;
 	@echo "[TEST] 2. -step=3 tests (proteinortho_clustering) "
-	@echo -n " [1/3] various test functions of proteinortho_clustering (if this fails, try make clean first): "; \
+	@echo -n " [1/4] various test functions of proteinortho_clustering (if this fails, try make clean first): "; \
 	OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -test> /dev/null 2>&1
 	@echo "$(GREEN)passed$(NC)"
-	@echo -n " [2/3] Test proteinortho_clustering using lapack: "; \
-	OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 2 test_blastp.blast-graph -debug 1 2>test_lapack.err >test_lapack.proteinortho.tsv ; \
+	@echo -n " [2/4] Test proteinortho_clustering using lapack: "; \
+	OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 2 -debug 1 test_blastp.blast-graph 2>test_lapack.err >test_lapack.proteinortho.tsv ; \
 	set -e ; ./src/chk_test.pl test_lapack.proteinortho.tsv; 
 	@echo "$(GREEN)passed$(NC)"
 	@perl -lne 'print join "\t", sort split("\t",$$_)' remove.graph | sort > test.A; 
-	@echo -n " [3/3] Test proteinortho_clustering using power: "; \
-	OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 0 test_blastp.blast-graph -debug 1 2>test_power.err >test_power.proteinortho.tsv; \
+	@echo -n " [3/4] Test proteinortho_clustering using power: "; \
+	OMP_NUM_THREADS=1 $(BUILDDIR)/proteinortho_clustering -lapack 0 -debug 1 test_blastp.blast-graph 2>test_power.err >test_power.proteinortho.tsv; \
 	set -e ; ./src/chk_test.pl test_power.proteinortho.tsv;
 	@echo "$(GREEN)passed$(NC)"
-	#@perl -lne 'print join "\t", sort split("\t",$$_)' remove.graph | sort > test.B; 
-	#@echo -n " [4/4] Test differences between 'using lapack' and 'using power': "; \
-	#set -e ; diff test.A test.B;
-	#@echo "$(GREEN)passed$(NC)"
+	@perl -lne 'print join "\t", sort split("\t",$$_)' remove.graph | sort > test.B; 
+	@echo -n " [4/4] Test differences between 'using lapack' and 'using power': "; \
+	set -e ; diff test.A test.B;
+	@echo "$(GREEN)passed$(NC)"
 
 .PHONY: test_step3_ring4_K5_power
 test_step3_ring4_K5_power: proteinortho6.pl
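
Note on the re-enabled [4/4] check above: the perl -lne one-liner sorts the tab-separated fields of each remove.graph line and the outer sort orders the lines, so test.A (from the lapack run) and test.B (from the power-iteration run) are normalized before the final diff; the check should therefore only fail when the two clustering modes actually removed different edges, not when they merely listed the same edges in a different order.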


=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+proteinortho (6.1.4+dfsg-1) unstable; urgency=medium
+
+  * Fix watch file
+  * New upstream version
+
+ -- Andreas Tille <tille@debian.org>  Sat, 26 Nov 2022 06:29:10 +0100
+
 proteinortho (6.1.2+dfsg-1) unstable; urgency=medium
 
   [ Andreas Tille ]


=====================================
debian/watch
=====================================
@@ -1,4 +1,4 @@
 version=4
 
-opts="repacksuffix=+dfsg,dversionmangle=auto,repack,compression=xz" \
-  https://gitlab.com/paulklemm_PHD/proteinortho/tags?sort=updated_desc .*/archive/.*/proteinortho-v@ANY_VERSION@\.tar\.gz
+opts="repacksuffix=+dfsg,dversionmangle=auto,filenamemangle=s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%@PACKAGE@-$1.tar.gz%" \
+  https://gitlab.com/paulklemm_PHD/proteinortho/tags?sort=updated_desc .*/proteinortho-v@ANY_VERSION@\.tar\.gz
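
For context: the rewritten watch line matches the release tarballs listed on the GitLab tags page, and the added filenamemangle rewrites whatever filename matched into the <package>-<version>.tar.gz form that uscan expects (in a watch file, @PACKAGE@ expands to the source package name and @ANY_VERSION@ to uscan's version-matching group). A minimal sketch of that substitution in plain Perl, using a hypothetical matched filename rather than anything taken from this diff:

  # hypothetical upstream filename, for illustration only
  my $href = 'proteinortho-v6.1.4.tar.gz';
  (my $local = $href) =~ s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%proteinortho-$1.tar.gz%;
  print "$local\n";    # prints: proteinortho-6.1.4.tar.gz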


=====================================
proteinortho6.pl
=====================================
@@ -509,7 +509,7 @@ use POSIX;
 ##########################################################################################
 # Variables
 ##########################################################################################
-our $version = "6.1.2";
+our $version = "6.1.4";
 our $step = 0;    # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
 our $verbose = 1; # 0/1   -> don't / be verbose
 our $core = 0; 
@@ -705,6 +705,9 @@ foreach my $option (@ARGV) {
 }
 if($inproject eq ""){$inproject=$project} # if --inproject is not specified then set it to --project
 
+# for no input -> print usage 
+if(scalar @ARGV == 0 && $step == 0){ &print_header;print_usage_more(); exit 1; }
+
 if($jobnumber != -1 && $tmp_path eq "" ){print STDERR "Please consider to use the -tmp option in combination with -jobs if you use proteinortho on a computing cluster system. Usually these clusters have dedicated local scratch directories.\n";}
 if($selfblast){$checkblast=1;}
 
@@ -943,7 +946,7 @@ if( $keep && $cpus > 8 ){ print STDERR "!!!$ORANGE\n[Warning]:$NC The '-keep' op
 # 6.0.32 check if there are _clean files available -> use it instead. 
 sub search_clean_file{
   my $file = shift;
-  my $file_clean = abs_path $file;
+  my $file_clean = $file;
   $file_clean=~s/(\.[^.]+)$/_clean$1/;
   if(-e $file_clean){return $file_clean}
   return $file;
@@ -999,10 +1002,25 @@ if ($step == 0 || $step == 1) {
   if($verbose){print STDERR "\n$GREEN**Step 1**$NC\n";}
 
   $do_log=1;
+  $_->kill('KILL')->detach foreach threads->list(); 
   threads->create('log_Worker');
   foreach my $file (@files) { $step1_QUEUE->enqueue($file); }
-  for (my $i = 0; $i < (scalar @files > $cpus+10 || $debug ? $cpus : 1) ; $i++) { threads->create('generate_indices_Worker') } # spawn a thread for each core
+  if($debug){print STDERR "Creating ".(scalar @files > $cpus+10 || $debug ? $cpus : 1)." generate_indices_Worker (cpus=$cpus) for ".$step1_QUEUE->pending." jobs\n"}
+  if($debug){print STDERR "currently running ".(threads->list())." threads (pre spawn)\n";}
+  threads->create('generate_indices_Worker');
+  
+  # INFO:
+  # here we use only one thread to generate the db files !
+  # -> diamond will fail if multiple makedb instances are running in parallel, 
+  #    producing un-joinable running threads (stuck within the `$cmd 2>&1` in the generate_indices)
+  # approx. 1 in 10 runs fail with an infinite loop, see issue #69 (https://gitlab.com/paulklemm_PHD/proteinortho/-/issues/69)
+  # work-around in 6.1.4
+  
+  # code for multiple threads (does not work sometimes for diamond): 
+  #for (my $i = 0; $i < ((scalar @files > 500 && scalar @files > $cpus+10) || $debug ? $cpus : 1) ; $i++) { my $tmp = threads->create('generate_indices_Worker') } # spawn a thread for each core
+  if($debug){print STDERR "currently running ".(threads->list())." threads (post spawn)\n";}
   $_->join() foreach threads->list(); 
+#  $_->kill('KILL')->detach foreach threads->list(); 
 
   #&generate_indices(@files);  # Generate index files for blast
   if ($desc) {
@@ -1297,6 +1315,8 @@ sub hmmWorker {
     }
   }
   close($FHret);
+
+  threads->exit;
 }
 
 
@@ -1696,6 +1716,8 @@ sub workerthread {
   unlink("$temp_file.log");
   unlink("$temp_file.tmp.matching");
 
+  my $last_jobs_time=time();
+
   while (1) {
     my ($i, $j);
     while (1) {
@@ -1704,20 +1726,20 @@ sub workerthread {
       # If there is nothing
       unless (defined($job)) {
         # Check if more jobs need to be done
-
         {
         lock($jobs_done);             #   Productive
-        if ($jobs_done >= $jobs_todo || ($part != -1 && $jobs_done >= $part)) { #   Productive
-#       lock($all_jobs_submitted);            #   DEBUGGING
-#       if ($all_jobs_submitted) {            #   DEBUGGING
+        if ($jobs_done >= $jobs_todo || ($part != -1 && $jobs_done >= $part) || time()-$last_jobs_time > 3600) { #   Productive
+          # lock($all_jobs_submitted);            #   DEBUGGING
+          # if ($all_jobs_submitted) {            #   DEBUGGING
           if ($debug) {print STDERR "Thread $thread_id\tis leaving\n";}
-          return;
+          threads->exit;
         }}
         # If so, wait
-          if ($debug) {print STDERR "Thread $thread_id\tis seeking work ($jobs_done / $jobs_todo)\n";}
+        if ($debug) {print STDERR "Thread $thread_id\tis seeking work ($jobs_done / $jobs_todo)\n";}
         sleep(1);
       }
       else {
+        $last_jobs_time=time();
         # Parse job
         ($i, $j) = split(" ",$job);
         # Break the fetch loop
@@ -2274,27 +2296,34 @@ sub auto_cpus {
 sub log_Worker {
   local $SIG{KILL} = sub { threads->exit };
   my $tid = threads->tid();
-  if($debug){ print STDERR "log_Worker : spawn\n"; }
+  if($debug){ print STDERR "log_Worker : $tid spawn\n"; }
   while($do_log){
     while($log_QUEUE->pending()){
+      if($debug){ print STDERR "log_Worker inside : $tid active threads=".(scalar grep { $_->is_running() } threads->list())." log_QUEUE=".$log_QUEUE->pending()." step1_QUEUE=".$step1_QUEUE->pending()." check_files_QUEUE=".$check_files_QUEUE->pending()."\n"; }
       my $log = $log_QUEUE->dequeue();
       if($verbose){ print STDERR $log; }
     }
-    if($debug){ print STDERR "log_Worker : active threads=".(scalar grep { $_->is_running() } threads->list())."\n"; }
-    if(!$log_QUEUE->pending() && !$step1_QUEUE->pending() && !$check_files_QUEUE->pending() && (scalar grep { $_->is_running() } threads->list()) == 0){$do_log=0}
+    if($debug){ print STDERR "log_Worker : $tid active threads=".(scalar grep { $_->is_running() } threads->list())." log_QUEUE=".$log_QUEUE->pending()." step1_QUEUE=".$step1_QUEUE->pending()." check_files_QUEUE=".$check_files_QUEUE->pending()."\n"; }
+    if(!$log_QUEUE->pending() && !$step1_QUEUE->pending() && !$check_files_QUEUE->pending() && (scalar grep { $_->is_running() } threads->list()) <= 0){$do_log=0}
     sleep 1;
   }
-  if($debug){ print STDERR "log_Worker : dead\n"; }
-  return;
+  if($debug){ print STDERR "log_Worker : $tid dead\n"; }
+  threads->exit;
 }
 sub generate_indices_Worker {
-  local $SIG{KILL} = sub { threads->exit };
   my $tid = threads->tid();
+  local $SIG{KILL} = sub { threads->exit };
+  if($debug){print STDERR "generate_indices_Worker:$tid spawn\n"}
   while($step1_QUEUE->pending()){
+    if($debug){print STDERR "generate_indices_Worker:$tid while\n"}
     my $file = $step1_QUEUE->dequeue();
+    if(!defined($file)){last;}
+    if($debug){print STDERR "generate_indices_Worker:$tid working on $file\n"}
     &generate_indices($file);
+    if($debug){print STDERR "generate_indices_Worker:$tid ok with $file\n"}
   }
-  return;
+  if($debug){print STDERR "generate_indices_Worker:$tid dead\n"}
+  threads->exit;
 }
 sub generate_indices {
   my $file=$_[0];
@@ -2302,6 +2331,7 @@ sub generate_indices {
   my $oldkeep=$keep;
   my $cmd="";
   my $msg="";
+  if($debug){print STDERR "generate_indices:$file start\n"}
   
   if($verbose){$msg.= "Generating indices";if($force){$msg.= " anyway (forced).\n"}else{$msg.= ".\n";}}
   if ($blastmode eq "rapsearch") {
@@ -2330,16 +2360,20 @@ sub generate_indices {
       }else{
         if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
         $cmd="$makedb '$file' -d '$file.$blastmode' --quiet";
+        if($debug){print STDERR "generate_indices:$file $cmd\n"}
         my $makedb_ret = `$cmd 2>&1`;
+        if($debug){print STDERR "generate_indices:$file $cmd DONE\n"}
         if ($? != 0 && $makedb_ret =~/--ignore-warnings/) {
           $msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' -d '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now retry with the '--ignore-warnings' option!\n");
           $cmd="$makedb '$file' -d '$file.$blastmode' --quiet";
+          if($debug){print STDERR "$cmd\n"}
           my $makedb_ret = `$cmd --ignore-warnings 2>&1`;
           if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
         }elsif($? != 0){$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' -d '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";
           mkdir("$tmp_path/DB");
           if($step==1){$oldkeep=$keep;$keep=1;}
           $cmd = "$makedb '$file' -d '$tmp_path/DB/".basename($file).".$blastmode' --quiet";
+          if($debug){print STDERR "$cmd\n"}
           $makedb_ret = `$cmd 2>&1`;
           if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
           system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");
@@ -2535,7 +2569,9 @@ sub generate_indices {
     unlink('formatdb.log') if -e 'formatdb.log';
   }
 ##MARK_FOR_NEW_BLAST_ALGORITHM
+  if($debug){print STDERR "generate_indices:$file almost end\n"}
   $log_QUEUE->enqueue($msg);
+  if($debug){print STDERR "generate_indices:$file end\n"}
   return;
 }
 
@@ -2557,7 +2593,11 @@ sub blast {
   my $blastOptions_diamond=''; #additional option --sensitive (if gene length is larger than 40)
   my $max_genlen = 1;
     if($blastmode eq "diamond"){
-      if (index($blastOptions, "--more-sensitive") == -1 && index($blastOptions, "--sensitive") == -1) {
+      if (index($blastOptions, "--ultra-sensitive") == -1 && 
+          index($blastOptions, "--very-sensitive") == -1 &&  
+          index($blastOptions, "--mid-sensitive") == -1 &&  
+          index($blastOptions, "--fast") == -1 &&  
+          index($blastOptions, "--more-sensitive") == -1 && index($blastOptions, "--sensitive") == -1) {
       if( $max_gene_length_diamond{$a} > 40 || $max_gene_length_diamond{$b} > 40 ){ $blastOptions_diamond = "--sensitive" }
     }
   }
@@ -3024,8 +3064,10 @@ sub check_files {
   if($debug){print STDERR "Checking input files queue:".($check_files_QUEUE->pending())."\n"}
 
   $do_log=1;
+  $_->kill('KILL')->detach foreach threads->list(); 
   threads->create('log_Worker');
-  for (my $i = 0; $i < (scalar @files > $cpus+10 || $debug ? $cpus : 1) ; $i++) { my $tmp = threads->create('check_files_Worker') } 
+  if($debug){print STDERR "Creating ".(scalar @files > $cpus+10 || $debug ? $cpus : 1)." check_files_Worker (cpus=$cpus)\n"}
+  for (my $i = 0; $i < ((scalar @files > 500 && scalar @files > $cpus+10) || $debug ? $cpus : 1) ; $i++) { my $tmp = threads->create('check_files_Worker') } 
   $_->join() foreach threads->list(); 
 
   my $all_only_one_entry=1;
@@ -3047,13 +3089,14 @@ sub check_files_Worker {
   my %seen_files;
   while($check_files_QUEUE->pending()){
     my $file = $check_files_QUEUE->dequeue();
+    if(!defined($file)){last;}
     if(exists $seen_files{abs_path($file)}){next}
     $seen_files{abs_path($file)}=1;
     if($debug){print STDERR "check_files_Worker:$tid working on $file\n"}
     &read_details($file,0);
   }
-  if($debug){print STDERR "check_files_Worker:dead\n"}
-  return;
+  if($debug){print STDERR "check_files_Worker:$tid dead\n"}
+  threads->exit;
 }
 
 sub convertUniprotAndNCBI {
@@ -3171,7 +3214,7 @@ sub read_details {
         exit 1;
       }elsif(index($curLine, ",") != -1){
         # 6.0.32 : check if gene name contains a comma -> this will cause problems with the proteinortho.tsv output (gene cluster speparator)
-        my $file_clean = abs_path $file;
+        my $file_clean = $file;
         $file_clean=~s/(\.[^.]+)$/_clean$1/;
         $found_comma_in_file=1;
         $msg.="\n$ORANGE [WARNING]$NC input '$file' contains a gene-name with a comma, this causes problems with the proteinortho.tsv output, I will clean the file ('$file_clean') and restart the analysis !!!\nThe line is:\n$curLine\n";
@@ -3216,7 +3259,7 @@ sub read_details {
 
   if($found_comma_in_file && !$from_synteny_call){
     # 6.0.32 : replace the , with ; -> write output in a *_clean.* file
-    my $file_clean = abs_path $file;
+    my $file_clean = $file; #abs_path $file;
     $file_clean=~s/(\.[^.]+)$/_clean$1/;
     open(FASTA,"<$file") || &Error("Could not open '$file': $!");
     open(FASTA_CLEAN,">$file_clean") || &Error("Could not open '$file_clean': $!");
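
The three "abs_path $file" -> "$file" changes above all touch the same _clean mechanism: the cleaned copy is now named from the path the user actually passed in, so it is written next to the input file even when that file is a symlink (previously abs_path resolved the symlink first, so the X_clean file could end up in the link target's directory, outside the input directory). A minimal sketch of the renaming step, with a hypothetical path for illustration:

  my $file = 'input/species1.faa';                 # hypothetical input path, possibly a symlink
  (my $file_clean = $file) =~ s/(\.[^.]+)$/_clean$1/;
  print "$file_clean\n";                           # prints: input/species1_clean.faa
  # with the old "abs_path $file" the name would instead be derived from the
  # resolved target, e.g. /somewhere/else/species1_clean.faa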


=====================================
src/po_tree.c
=====================================
@@ -43,7 +43,7 @@ void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen);
 void merge (Uint* max, Uint css, Uint viecher, BOOL* away, BOOL** matrix, int** aff, char*** pnamen);
 BOOL getmax(BOOL** matrix, BOOL* away, Uint viecher, Uint ccs, int** aff, char*** pnamen);
 int sim(BOOL** matrix, Uint a, Uint b, Uint ccs);
-char* myitoa(int zahl, int base);
+char* integer2string(int zahl, int base);
 int get_number(char* word);
 
 int main(int argc, char** argv) {
@@ -177,11 +177,11 @@ void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int**
 
 	/* transform the number to text 
 	   the difference with their affinity is the new branch length (= distance) */
-	char* length_0 = myitoa(gemein_0-counter,10);
-	char* length_1 = myitoa(gemein_1-counter,10);
+	char* length_0 = integer2string(gemein_0-counter,10);
+	char* length_1 = integer2string(gemein_1-counter,10);
 
 	/* transform the number of common genes (= affinity) to text */
-	char* anz = myitoa(counter,10);
+	char* anz = integer2string(counter,10);
 	/* calculate how many place is needed for the new string 
          sum of both species strings (= subtree) + new numbers + parenteses + : + \0 */
 	int newsize = strlen((*pnamen)[max[0]])+strlen((*pnamen)[max[1]])+strlen(anz)+strlen(length_0)+strlen(length_1)+8;
@@ -213,7 +213,7 @@ void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int**
 	/* free */
 	free(length_0);
 	free(length_1);
-	/* pointer gotten from myitoa */
+	/* pointer gotten from integer2string */
 	free(anz);
 
 }
@@ -266,7 +266,7 @@ BOOL getmax(BOOL** matrix, BOOL* away, Uint viecher, Uint ccs, int** aff, char**
 	/* if both entries are the same (as initiated), we are done */
 	if (bestpair[0] == bestpair[1]) {
 		/* get the last branch length of the tree */
-		char* anz = myitoa(get_number((*pnamen)[0]),10);
+		char* anz = integer2string(get_number((*pnamen)[0]),10);
 		/* recalculate size of the tree */
 		int newsize = strlen((*pnamen)[0])+strlen(anz)+2;
 		/* realloc */
@@ -274,7 +274,7 @@ BOOL getmax(BOOL** matrix, BOOL* away, Uint viecher, Uint ccs, int** aff, char**
 		/* add the information about the root's branchlength */
 		(*pnamen)[0] = strcat((*pnamen)[0],":");
 		(*pnamen)[0] = strcat((*pnamen)[0],anz);
-		/* free myitoa's string */
+		/* free integer2string's string */
 		free(anz);
 		/* return finished */
 		return 0;
@@ -431,7 +431,7 @@ int get_number(char* word) {
  *  returns - char array representing number zahl to base base as text
  *            Attention: returnvalue needs to be freed separately
  */
-char* myitoa(int zahl, int base) {
+char* integer2string(int zahl, int base) {
 	int i = 0, k;
 	char reverse[64];
 


=====================================
src/proteinortho_clustering.cpp
=====================================
@@ -97,7 +97,7 @@ string getTime(void);
 	int param_useLapack = 1;
 
 // min/max number of alg con iterations
-unsigned int critical_min_nodes = 16777216;
+// unsigned int critical_min_nodes = 16777216; // replaced
 const unsigned int min_iter = 16;			// below this value, results may vary
 unsigned int param_max_iter = 8192;			// below this value, results may vary
 float param_epsilon = 1e-8; // analog to http://people.sc.fsu.edu/~jburkardt/c_src/power_method/power_method_prb.c
@@ -442,7 +442,7 @@ void printHelp() {
 	cerr << "         -coreMinSpecies int  the minimum number of species for -core ["<<param_coreMinSpecies<<"]" << "\n";
 	cerr << "\ntechnical parameters:" << "\n";
 	cerr << "         -test             various test-functions are called first [not set]" << "\n";
-	cerr << "         -maxnodes int     only consider connected component with up to maxnodes nodes ["<<param_max_nodes<<"]" << "\n";
+	cerr << "         -maxnodes int     only consider connected component with up to maxnodes nodes. If exceeded, greedily remove the worst 10 percent of edges (by weight) until satisfied ["<<param_max_nodes<<"]" << "\n";
 	cerr << "         -maxweight int    only use the edge weights for connected components with maxweight nodes ["<<param_max_nodes_weight<<"]" << "\n";
 	cerr << "         -epsilon float    convergence threshold ["<<param_epsilon<<"]" << "\n";
 	cerr << "         -weighted bool    the spectral partition is calculated using the bitscores ["<<param_useWeights<<"]" << "\n";
@@ -876,6 +876,47 @@ struct compare_ConnectedComponents { //sort from large to small
 	}
 };
 
+void removeLowQualityEdges( ConnectedComponent &cur_cc ){
+
+		/*
+		 * remove low quality edges
+		 * 
+		 * find the range of values of this CC -> define a cut-off = cut_off=min_w+0.1*(max_w-min_w);
+		 * remove all edges below that value
+		 * redo BFS and start over again for this cluster until -maxnodes are satisfied
+		 * 
+		 */
+
+		// find the lowest 10% of edge values (min+0.1*(max-min))
+		float min_w = -1; 
+		float max_w = -1;
+		for (unsigned int i = 0; i < cur_cc.size(); i++) {
+			unsigned int from=cur_cc[i];
+			for(unsigned int j = 0 ; j < graph[from].edges.size() ; j++){
+				unsigned int to = graph[from].edges[j].edge;
+				unsigned int weight = graph[from].edges[j].weight;
+				if(min_w == -1 || weight < min_w){min_w=weight;} 
+				if(min_w == -1 || weight > max_w){max_w=weight;}
+			}
+		}
+		float cut_off=min_w+0.1*(max_w-min_w);
+		if(debug_level>0) cerr << "cut_off="<<cut_off << " min_w="<<min_w<<" max_w="<<max_w<< endl;
+		for (unsigned int i = 0; i < cur_cc.size(); i++) {
+			unsigned int from=cur_cc[i];
+			auto it = std::remove_if(
+				graph[from].edges.begin(), 
+				graph[from].edges.end(), 
+				[cut_off,from](wedge cur_el)->bool 
+				{
+					if(debug_level>0 && cur_el.weight <= cut_off) cerr << "[WARNING] found bad-edge "<< from << "-"<< cur_el.edge << " weight=" << cur_el.weight << endl;
+					return cur_el.weight <= cut_off;
+				}
+			);
+			graph[from].edges.erase(it, graph[from].edges.end());
+		}
+
+}
+
 void find_CCs_givenNodes(dispatch_queue *q, vector<unsigned int> todo_work ){
 
 	map<unsigned int, bool> done;	// Keep track on what was done (for each node)
@@ -901,7 +942,7 @@ void find_CCs_givenNodes(dispatch_queue *q, vector<unsigned int> todo_work ){
 		allNodesAreDone = true;
 
 		for (unsigned int i = 0 ; i < todo_work.size() ; i++) {
-			
+
 			unsigned int protein_id=todo_work[i];
 			if ( done.count(protein_id) && done[protein_id] ){continue;}// We were here already
 
@@ -915,19 +956,14 @@ void find_CCs_givenNodes(dispatch_queue *q, vector<unsigned int> todo_work ){
 
 			if(debug_level>0) cerr << "ConnectedComponent: @"<< getCCid(todo_work) << "=>" << cur_cc.size() << "@" << getCCid(cur_cc.m_content_CC) << endl;
 
-			// Skip those that are too large (try critical edge heuristic)
-			if (cur_cc.size() > param_max_nodes || cur_cc.size() > critical_min_nodes) {
+			// Skip those that are too large (try heuristic)
+			if (cur_cc.size() > param_max_nodes) {
 
-				if(cur_cc.size() > param_max_nodes)
-					cerr << " [WARNING]  Found a very large connected component that contains " << cur_cc.size() << ">" << param_max_nodes << " (maxnodes) elements. This behavior can be adjusted using -maxnodes, still you better consider a higher e-value or coverage thresholds. Now using the Critical-Heuristic: try to identify and remove critical edges." << "\n";
-				else
-					cerr << " [WARNING]  Found a rather large connected component that contains " << cur_cc.size() << ">" << critical_min_nodes << " (critical_min_nodes) elements. Now using the Critical-Heuristic: try to identify and remove critical edges." << "\n";
-				
-				if(cur_cc.size() > param_max_nodes){
-					cerr << " [WARNING]  The very large connected component that contains " << cur_cc.size() << " elements, did not have any critical edges." << "\n";
-					cerr << " [WARNING]  Skipping this connected component. This behavior can be adjusted using -maxnodes, still you better consider a higher e-value or coverage thresholds." << "\n";
-					continue;	
-				}
+				// reset done vector
+				for (int i = 0; i < cur_cc.size(); ++i){ done[cur_cc[i]]=false; }
+				if(debug_level>0) cerr << " [WARNING]  Found a very large connected component that contains " << cur_cc.size() << ">" << param_max_nodes << " (maxnodes) elements. This behavior can be adjusted using -maxnodes. Now using a slow heuristic: try to identify and remove edges." << "\n";
+				removeLowQualityEdges(cur_cc); i--;
+				continue;
 			}
 
 			if(param_core && cur_cc.species_num==0){
@@ -1066,7 +1102,7 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
 		 * purity is imporatant if there are multiple clusters that can be split or there is no good split
 		 * identified nodes (in purity boundary) are put back to the queue stack (=groupZero)
 		 * 
-		 * maximal purity = 0.1 -> get all nodes with |x_hat(v)|<0.1
+		 * maximal purity = 0.12 -> get all nodes with |x_hat(v)|<0.12
 		 * identify all possible purity thresholds 
 		 * each purity threshold is evaluated with the total sum of edge weights that are removed with this cut
 		 * find most optimal purity cut-off
@@ -1090,7 +1126,7 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
 		if(debug_level>0)	cerr << " initial purity=0 best_cut=" << best_cut << endl;
 		for(unsigned int pi = 0; pi < (unsigned int)n; pi++){
 			unsigned int from = cur_cc[pi];
-			if(!mapping.count(from) || abs(x_hat[ mapping[from] ])>0.1 ){continue;}
+			if(!mapping.count(from) || abs(x_hat[ mapping[from] ])>0.12 ){continue;}
 			float test_purity = abs(x_hat[ mapping[from] ]);
 			float cut_value=0;
 			unsigned int num_nodes=0;
@@ -1114,7 +1150,7 @@ void partition_CC(ConnectedComponent cur_cc, dispatch_queue *q, bool do_lapack,
 			}
 		}
 
-		if(new_purity>0.1){new_purity=0;} // if there are no impure nodes -> disable
+		if(new_purity>0.12){new_purity=0;} // if there are no impure nodes -> disable
 		unsigned int num_impure_nodes=0;
 		for (unsigned int i = 0; i < x_hat.size(); i++) { if(abs(x_hat[i]) < new_purity) { num_impure_nodes++; } }
 		if(debug_level>0)	cerr << " num_impure_nodes" << num_impure_nodes << endl;
@@ -1391,19 +1427,14 @@ void find_CCs(dispatch_queue *q){
 			// Do not report singles
 			if (cur_cc.size() < 2) {continue;} // singletons are from no interest
 
-			// Skip those that are too large (try critical edge heuristic)
-			if (cur_cc.size() > param_max_nodes || cur_cc.size() > critical_min_nodes) {
+			// Skip those that are too large (try heuristic)
+			if (cur_cc.size() > param_max_nodes) {
 
-				if(cur_cc.size() > param_max_nodes)
-					cerr << " [WARNING]  Found a very large connected component that contains " << cur_cc.size() << ">" << param_max_nodes << " (maxnodes) elements. This behavior can be adjusted using -maxnodes, still you better consider a higher e-value or coverage thresholds. Now using the Critical-Heuristic: try to identify and remove critical edges." << "\n";
-				else
-					cerr << " [WARNING]  Found a rather large connected component that contains " << cur_cc.size() << ">" << critical_min_nodes << " (critical_min_nodes) elements. Now using the Critical-Heuristic: try to identify and remove critical edges." << "\n";
-				
-				if(cur_cc.size() > param_max_nodes){
-					cerr << " [WARNING]  The very large connected component that contains " << cur_cc.size() << " elements, did not have any critical edges." << "\n";
-					cerr << " [WARNING]  Skipping this connected component. This behavior can be adjusted using -maxnodes, still you better consider a higher e-value or coverage thresholds." << "\n";
-					continue;	
-				}
+				// reset done vector
+				for (int i = 0; i < cur_cc.size(); ++i){ done[cur_cc[i]]=false; }
+				if(debug_level>0) cerr << " [WARNING]  Found a very large connected component that contains " << cur_cc.size() << ">" << param_max_nodes << " (maxnodes) elements. This behavior can be adjusted using -maxnodes. Now using a slow heuristic: try to identify and remove edges." << "\n";
+				removeLowQualityEdges(cur_cc); protein_id--;
+				continue;
 			}
 
 			if(param_core && cur_cc.species_num==0){
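
A worked example of the new -maxnodes heuristic, with illustrative numbers not taken from the diff: if a connected component exceeds -maxnodes and its edge weights range from min_w = 40 to max_w = 240, removeLowQualityEdges computes cut_off = 40 + 0.1 * (240 - 40) = 60 and drops every edge with weight <= 60; the component's nodes are then marked as not done and the BFS runs again, repeating until the remaining components fit under -maxnodes. This replaces the old critical_min_nodes / "critical edge" path, which warned and then skipped components larger than -maxnodes.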


=====================================
src/proteinortho_summary.pl
=====================================
@@ -238,7 +238,7 @@ my @keys=sort keys %species_matrix;
 
 $noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();
 
-print STDERR "\n";
+print "\n";
 my $ret= "# The adjacency matrix, the number of edges between 2 species\n";
 processLine($ret);
 $ret= "# file\t";
@@ -269,7 +269,7 @@ $maxNumOfCharsInOneLine=`tput cols`;
 chomp($maxNumOfCharsInOneLine);$maxNumOfCharsInOneLine/=2;
 if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
 
-print STDERR "\n";
+print "\n";
 $ret= "# file\taverage number of edges\n";
 processLine($ret);
 for(my $i = 0 ; $i < scalar @keys; $i++){
@@ -289,7 +289,7 @@ $maxNumOfCharsInOneLine=`tput cols`;
 chomp($maxNumOfCharsInOneLine);
 if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
 
-print STDERR "\n";
+print "\n";
 $ret= "# The 2-path matrix, the number of paths between 2 species of length 2\n";
 processLine($ret);
 $ret= "# file\t";
@@ -320,7 +320,7 @@ $maxNumOfCharsInOneLine=`tput cols`;
 chomp($maxNumOfCharsInOneLine);$maxNumOfCharsInOneLine/=2;
 if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
 
-print STDERR "\n";
+print "\n";
 processLine("# file\taverage number of 2-paths\n");
 for(my $i = 0 ; $i < scalar @keys; $i++){
 	



View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/compare/427c970d6dc7f60f2e664a405045ea9af4a8fe2e...85c9aa9ec8f6a1e761239bffa81ccefe3ba3779a




