[med-svn] [Git][med-team/proteinortho][upstream] New upstream version 6.0.12+dfsg

Steffen Möller gitlab at salsa.debian.org
Thu Dec 12 23:51:21 GMT 2019



Steffen Möller pushed to branch upstream at Debian Med / proteinortho


Commits:
83071ae5 by Steffen Moeller at 2019-12-12T23:17:49Z
New upstream version 6.0.12+dfsg
- - - - -


9 changed files:

- CHANGELOG
- CHANGEUID
- Makefile
- README.md
- proteinortho6.pl
- .gitlab-ci.yml → src/.gitlab-ci.yml
- src/proteinortho_ffadj_mcs.py
- + src/proteinortho_history.pl
- + src/proteinortho_summary.pl


Changes:

=====================================
CHANGELOG
=====================================
@@ -218,9 +218,18 @@
         proteinortho6.pl replaced chomp with s/[\r\n]+$//
         proteinortho_clustering.cpp fix bug that only uses lapack if -pld is set, regardless of the value.
     11. Sept (uid: 3813)
-	updated shebang of ffadj such that python2.7 is used directly (ffadj fails if called with higher version of python)
-	-p=blastp is now alias of blastp+ and legacy blast is now -p=blastp_legacy (blastn is equivalent)
-	Makefile: static now includes -lquadmath
+		updated shebang of ffadj such that python2.7 is used directly (ffadj fails if called with higher version of python)
+		-p=blastp is now alias of blastp+ and legacy blast is now -p=blastp_legacy (blastn is equivalent)
+		Makefile: static now includes -lquadmath
     25. Sept (uid: 3899)
-	synteny update to python3 (but the code looks fishy, the -synteny option now gets a deprecated warning)
-	proteinortho now only print html for <10 files automatically and otherwise only gives the option
+		synteny update to python3 (but the code looks fishy, the -synteny option now gets a deprecated warning)
+		proteinortho now only print html for <10 files automatically and otherwise only gives the option
+    4. Nov (uid: 4020)
+		FIXED: sometimes the python3 version produces one additional edge (global definition of ALPHA). Special thanks for this update goes to Daniel Doerr for fixing this.
+	25. Nov (uid: 4030)
+		added proteinortho_history
+		the synteny option ffadj is no longer deprecated
+    10. Dec (uid: 4196)
+    	improved proteinortho_history
+    	removed the new diamond spam
+        + added proteinortho_summary.pl for a summary of proteinortho-graph on species level.


=====================================
CHANGEUID
=====================================
@@ -1 +1 @@
-3899
+4196


=====================================
Makefile
=====================================
@@ -77,7 +77,7 @@ endif
 dir_guard=@if [ ! -d $(BUILDDIR) ]; then echo "Creating build directory ..."; mkdir -p $(BUILDDIR); fi
 
 .PHONY: all
-all:$(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_treeBuilderCore
+all:$(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_summary.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_history.pl $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_treeBuilderCore
 	@echo "[100%] $(GREEN)Everything is compiled with no errors.$(NC)"
 
 $(BUILDDIR)/proteinortho_extract_from_graph.pl: src/proteinortho_extract_from_graph.pl
@@ -120,6 +120,14 @@ $(BUILDDIR)/proteinortho_ffadj_mcs.py: src/proteinortho_ffadj_mcs.py
 	$(dir_guard)
 	@cp $< $@
 
+$(BUILDDIR)/proteinortho_history.pl: src/proteinortho_history.pl
+	$(dir_guard)
+	@cp $< $@
+
+$(BUILDDIR)/proteinortho_summary.pl: src/proteinortho_summary.pl
+	$(dir_guard)
+	@cp $< $@
+
 echoENV:
 	@echo -n "CC = "
 	@echo $(CC)
@@ -224,7 +232,7 @@ else
 endif
 
 .PHONY: install
-install: proteinortho6.pl proteinortho $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_treeBuilderCore $(BUILDDIR)/proteinortho_grab_proteins.pl
+install: proteinortho6.pl proteinortho $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho_history.pl $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_treeBuilderCore $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_summary.pl $(BUILDDIR)/proteinortho_history.pl 
 	@echo "INSTALLING everything to $(INSTALLDIR)"
 	@install -v $^ $(INSTALLDIR);
 	@echo "$(GREEN)Everything installed successfully to $(INSTALLDIR).$(NC)"
@@ -246,15 +254,15 @@ test_step2: proteinortho6.pl
 		echo "$(GREEN)passed$(NC)"; \
 	fi
 
-#	@echo -n " [2/12] -p=blastp+ synteny (PoFF) test: "
-#	@if [ "$(shell which blastp)" = "" ]; then\
-#		echo "$(ORANGE)blastp missing, skipping...$(NC)"; \
-#	else \
-#		./proteinortho6.pl -silent -force -project=test_synteny -synteny -singles -p=blastp+ test/*.faa; \
-#		set -e ; ./src/chk_test.pl test_synteny.proteinortho.tsv; \
-#		set -e ; ./src/chk_test.pl test_synteny.poff.tsv; \
-#		echo "$(GREEN)passed$(NC)"; \
-#	fi
+	@echo -n " [2/12] -p=blastp+ synteny (PoFF) test: "
+	@if [ "$(shell which blastp)" = "" ]; then\
+		echo "$(ORANGE)blastp missing, skipping...$(NC)"; \
+	else \
+		./proteinortho6.pl -silent -force -project=test_synteny -synteny -singles -p=blastp+ test/*.faa; \
+		set -e ; ./src/chk_test.pl test_synteny.proteinortho.tsv; \
+		set -e ; ./src/chk_test.pl test_synteny.poff.tsv; \
+		echo "$(GREEN)passed$(NC)"; \
+	fi
 
 	@echo -n " [3/12] -p=diamond test: "
 	@if [ "$(shell which diamond)" = "" ]; then\


=====================================
README.md
=====================================
@@ -1,27 +1,45 @@
 # Proteinortho
 
- Proteinortho is a tool to detect orthologous genes within different species. For doing so, it compares similarities of given gene sequences and clusters them to find significant groups. The algorithm was designed to handle large-scale data and can be applied to hundreds of species at one. Details can be found in <a href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-124">Lechner et al., BMC Bioinformatics. 2011 Apr 28;12:124.</a>
-To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (doi:10.1371/journal.pone.0105015), is already build in Proteinortho. The general workflow of proteinortho is depicted [![here](https://www.dropbox.com/s/7ubl1ginn3fmf8k/proteinortho_workflow.jpg?dl=0)].
+Proteinortho is a tool to detect orthologous genes within different species.
+ 
+For doing so, it compares similarities of given gene sequences and clusters them to find significant groups. 
+The algorithm was designed to handle large-scale data and can be applied to hundreds of species at once. 
+Details can be found in ([doi:10.1186/1471-2105-12-124](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-124)).
+To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF ([doi:10.1371/journal.pone.0105015](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0105015)), is already built into Proteinortho. The general workflow of proteinortho: 
 
-# New Features of Proteinortho Version 6!
+<img src="https://www.uni-marburg.de/de/fb16/ipc/ag-lechner/graph.png/@@images/image/unimr_lead_image_sd" alt="proteinortho.workflow.png" height="250">
+
+Input: multiple fasta files (orange box) with many proteins/genes (circles). 
+
+First an initial all vs. all comparison between all proteins of all species is performed to determine protein similarities (upper right image). 
+
+The second stage is the clustering of similar genes to meaningful co-orthologous groups (lower right image). Connected components within this graph can be considered as putative co-orthologous groups in theory and are returned in the output (lower left image).
+
+# New Features of Proteinortho Version 6
 
   - Implementation of various Blast alternatives for step (for -step=2 the -p= options): Diamond, MMseqs2, Last, Topaz, Rapsearch2, Blat, Ublast and Usearch
   - Multithreading support for the clustering step (-step=3)
   - Integration of the LAPACK Fortran Library for a faster clustering step (-step=3)
   - Integration of the bitscore weights in the connectivity calculation for more data dependant splits (-step=3)
+  - Continuous Integration [![pipeline status](https://gitlab.com/paulklemm_PHD/proteinortho/badges/master/pipeline.svg)](https://gitlab.com/paulklemm_PHD/proteinortho/pipelines) 
 <details>
-  <summary>Minor features: (Click to expand)</summary>
+  <summary>Minor new features: (Click to expand)</summary>
 
   - Output now supports OrthoXML (-xml) and HTML.
+  - [proteinortho_history.pl](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Tools%20and%20additional%20programs) a new tool for tracking proteins (or pairs of proteins) in the workflow of proteinortho.
+  - [proteinortho_summary.pl](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Tools%20and%20additional%20programs)
   - Various test routines (make test).
   - New heuristics for connectivity calculation (-step=3).
 </details>
+<details>
+  <summary>6.0.12: (Click to expand)</summary>
+
+  - removed the diamond spam
+  - improved [proteinortho_history.pl](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Tools%20and%20additional%20programs) : now the program is "smarter" in detecting files automatically
+  - added [proteinortho_summary.pl](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Tools%20and%20additional%20programs) : a tool for summarizing the proteinortho-graph on species level. With the output it is easy to identify weakly connected species.   
+</details>
 
-# Continuous Integration
-supports
-The badge
-[![pipeline status](https://gitlab.com/paulklemm_PHD/proteinortho/badges/master/pipeline.svg)](https://gitlab.com/paulklemm_PHD/proteinortho/commits/master) indicates the current status of the continuous integration (CI) among various platforms (ubuntu, centos, debian, fedora) and GNU c++ versions (5, 6, latest)
-The whole git repository gets deployed on a clean docker imager (gcc:latest,gcc:5,ubuntu:latest,fedora:latest,debian:latest,centos:latest) and compiled (make all) and tested (make test). The badge is green only if all test are passed. For more information see [Continuous Integration (proteinortho wiki)](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Continuous%20Integration).
+A more detailed list of all changes: [CHANGELOG](https://gitlab.com/paulklemm_PHD/proteinortho/blob/master/CHANGELOG)
 
 # Table of Contents
 1. [Installation](#installation)
@@ -30,12 +48,18 @@ The whole git repository gets deployed on a clean docker imager (gcc:latest,gcc:
 4. [PoFF synteny extension](#poff)
 5. [Output description](#output)
 6. [Examples](#examples)
-7. [Error Codes and Troubleshooting](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error-Codes) <- look here if you cannot compile/run (proteinortho wiki)
-8. [Large compute jobs example](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Large-compute-jobs-(the--jobs-option)) (proteinortho wiki)
-9. [Biological example](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/biological-example) (proteinortho wiki)
 
-Bug reports: See chapter 7. or send a mail to incoming+paulklemm-phd-proteinortho-7278443-issue- at incoming.gitlab.com (Please include the 'Parameter-vector' that is printed for all errors)
-You can also send a mail to lechner at staff.uni-marburg.de.
+# [Proteinortho-Wiki](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/) Table of Contents
+
+1. [Tools and additional programs](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Tools%20and%20additional%20programs)
+2. [Error Codes and Troubleshooting](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error-Codes) <- look here if you cannot compile/run proteinortho
+3. [Large compute jobs example](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Large-compute-jobs-(the--jobs-option))
+4. [FAQ](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/FAQ) <br>
+[(...)](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/)
+
+Bug reports: Please have a look at chapter [2.](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error-Codes) first or send a mail to incoming+paulklemm-phd-proteinortho-7278443-issue- at incoming.gitlab.com. (please include the 'parameter-vector' that is printed for all errors)
+You can also send mails to lechner at staff.uni-marburg.de. Any suggestions, feedback and comments are welcome!
+
 
 # Installation
 
@@ -88,7 +112,7 @@ Afterwards the deb package can be installed with `sudo dpkg -i proteinortho*deb`
 
 <br>
 
-#### 1. Prerequisites
+#### Prerequisites for compiling proteinortho from source
 
 Proteinortho uses standard software which is often installed already or is part of then package repositories and can thus easily be installed. The sources come with a precompiled version of Proteinortho for 64bit Linux.
 
@@ -126,7 +150,7 @@ Proteinortho uses standard software which is often installed already or is part
 
 <br>
 
-#### 2. Building and installing proteinortho from source (linux and osx)
+#### Building and installing proteinortho from source (linux and osx)
 
   Here you can use a working lapack library, check this with 'dpkg --get-selections | grep lapack'. Install lapack e.g. with 'apt-get install libatlas3-base' or liblapack3.
 
@@ -179,7 +203,7 @@ OR(!) specify the new g++ in 'make CXX=/usr/local/bin/g++-7 all'
 [100%] Everything is compiled with no errors.
 </pre>
 
-The compilation of proteinortho_clustering has multiple fall-back routines. If everything fails please look here [Troubleshooting (proteinortho wiki)](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes).
+The compilation of proteinortho\_clustering has multiple fall-back routines. If everything fails please look here [Troubleshooting (proteinortho wiki)](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes).
 
 </details>
 
@@ -214,21 +238,13 @@ If you have problems compiling/running the program go to [Troubleshooting (prote
 <br>
 
 # SYNOPSIS
-  > **proteinortho6.pl [options] \<fasta file(s)\>** (one fasta for each species, at least 2)
-
-  OR
-
   > **proteinortho [options] \<fasta file(s)\>**
+ 
+   one fasta for each species; at least 2
 
 # DESCRIPTION
   **proteinortho** is a tool to detect orthologous genes within different
-  species. For doing so, it compares similarities of given gene sequences
-  and clusters them to find significant groups. The algorithm was designed
-  to handle large-scale data and can be applied to hundreds of species at
-  one. Details can be found in Lechner et al., BMC Bioinformatics. 2011 Apr
-  28;12:124. To enhance the prediction accuracy, the relative order of genes
-  (synteny) can be used as additional feature for the discrimination of
-  orthologs. The corresponding extension, namely PoFF (doi:10.1371/journal.pone.0105015), is already build in Proteinortho.
+  species. 
 
   Proteinortho assumes, that you have all your gene sequences in FASTA
   format either represented as amino acids or as nucleotides. The source
@@ -302,6 +318,7 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
 
         - diamond : Only for protein files! standard diamond procedure and for
         genes/proteins of length >40 with the additional --sensitive flag
+        Warning: Please use version 0.9.29 or later to avoid this known bug: https://gitlab.com/paulklemm_PHD/proteinortho/issues/24
 
         - lastn,lastp : lastal. -n : dna files, -p protein files (BLOSUM62
         scoring matrix)!
@@ -343,7 +360,6 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
 <br>
 
  **Synteny options (optional, step 2)**
-  (This option is deprecated)
   (output: <myproject>.ffadj-graph, <myproject>.poff.tsv (tab separated file)-graph)
 
 <details>
@@ -537,6 +553,9 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
  </details>
  <br>
 
+[myproject.proteinortho-graph.summary](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Tools-and-additional-programs#proteinortho-graphblast-graph-species-summary-table)
+
+ <br>
  <details>
   <summary> myproject.proteinortho.html (Click to expand)</summary>
     The html version of the myproject.proteinortho.tsv file


=====================================
proteinortho6.pl
=====================================
@@ -341,8 +341,6 @@ By default, Proteinortho splits each group into two more dense subgroups when th
 
 =head2 POFF (-synteny)
 
--->> This option is deprecated <<---
-
 The synteny based graph files (myproject.ffadj-graph and myproject.poff-graph) have two additional columns:
 same_strand and simscore. The first one indicates if two genes from a match are located at the same strands (1) or not (-1).
 The second one is an internal score which can be interpreted as a normalized weight ranging from 0 to 1 based on the respective e-values.
@@ -455,7 +453,7 @@ use POSIX;
 ##########################################################################################
 # Variables
 ##########################################################################################
-our $version = "6.0.8";
+our $version = "6.0.12";
 our $step = 0;    # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
 our $verbose = 1; # 0/1   -> don't / be verbose
 our $debug = 0;   # 0/1   -> don't / show debug data
@@ -595,9 +593,9 @@ foreach my $option (@ARGV) {
   elsif ($option =~ m/^--?selfblast=(0|1)$/)    { $selfblast = $1; }
   elsif ($option =~ m/^--?singles$/)      { $singles = 1; }
   elsif ($option =~ m/^--?singles=(0|1)$/)    { $singles = $1; }
-  elsif ($option =~ m/^--?poff$/)       { $synteny = 1; print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; }
-  elsif ($option =~ m/^--?synteny$/)      { $synteny = 1; print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; }
-  elsif ($option =~ m/^--?synteny=(0|1)$/)    { $synteny = $1; print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; }
+  elsif ($option =~ m/^--?poff$/)       { $synteny = 1;  } #print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---";
+  elsif ($option =~ m/^--?synteny$/)      { $synteny = 1;  } #print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; 
+  elsif ($option =~ m/^--?synteny=(0|1)$/)    { $synteny = $1; } #print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; 
   elsif ($option =~ m/^--?dups=0$/)     { $duplication = 0; }
   elsif ($option =~ m/^--?dups=([1-8])$/)   { $duplication = $1+1;}
   elsif ($option =~ m/^--?neighbourjoin$/)    { $neighbourjoin = 1; }
@@ -606,9 +604,9 @@ foreach my $option (@ARGV) {
   elsif ($option =~ m/^--?keep$/)     { $keep = 1; }
   elsif ($option =~ m/^--?force$/)      { $force = 1; }
   elsif ($option =~ m/^--?clean$/)     { $clean = 1; }
-  elsif ($option =~ m/^--?help$/)     { print_usage(); exit 0;}
+  elsif ($option =~ m/^--?help$/)     { &print_header;print_usage_more(); exit 0;}
   elsif ($option =~ m/^--?test$/)     { &check_bins; &get_po_path; print "All necessary proteinortho_* binaries are found.\n"; exit 0;}
-  elsif ($option =~ m/^--?h$/)     { print_usage(); exit 0;}
+  elsif ($option =~ m/^--?h$/)     { &print_header;print_usage(); exit 0;}
   elsif ($option =~ m/^--?nograph$/)      { $nograph = 1; }
   elsif ($option =~ m/^--?xml$/)      { $doxml = 1; }
   elsif ($option =~ m/^--?graph$/)      { $nograph = 0; }
@@ -622,6 +620,7 @@ foreach my $option (@ARGV) {
   else  {&print_usage(); &reset_locale();die "$RED"."[Error]$NC $ORANGE Invalid command line option: \'$option\'! $NC\n\n"; }
 }
 
+if($jobnumber != -1 && $tmp_path eq "" ){print STDERR "$ORANGE"."[WARNING]$NC You should use the -tmp option in combination with -jobs.\n";}
 if($selfblast){$checkblast=1;}
 
 $po_path = &get_po_path();    # Determine local path
@@ -856,7 +855,7 @@ sub cluster {
             if($_ ne ""){
               if($ci eq 1){ # first column = total
                 $freemem_inMB=$_*0.75;
-                if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using $freemem_inMB MB of memory (75% of total memory) and $cpus cpu core(s).\n";}
+                if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using up to $freemem_inMB MB of memory (75% of total memory) and $cpus cpu core(s). Adjust this behaviour with the -mem option.\n";}
                 last;
               }
               $ci=$ci+1;
@@ -865,10 +864,10 @@ sub cluster {
         }
       }else{
         $freemem_inMB=16384;
-        if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using $freemem_inMB MB of memory (default value, command 'free' not found) and $cpus cpu core(s).\n";}
+        if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using up to $freemem_inMB MB of memory (default value, command 'free' not found) and $cpus cpu core(s). Adjust this behaviour with the -mem option.\n";}
       }
     }else{
-      if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using $freemem_inMB MB of memory and $cpus cpu core(s).\n";}
+      if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using up to $freemem_inMB MB of memory and $cpus cpu core(s). Adjust this behaviour with the -mem option.\n";}
     }
     #if($freemem_inMB < 5){if($verbose){print STDERR "$ORANGE"."[WARNING]$NC Increased the maximal memory threshold to 5 MB ($freemem_inMB MB is not sufficent).$NC\n";}$freemem_inMB=5;} # minimum 5 MB
     if($freemem_inMB < 1000){if($verbose){print STDERR "$ORANGE"."[WARNING]$NC The memory threshold of $freemem_inMB MB seems somewhat small. I will increase the threshold to 1 GB (-ram=1000).$NC\n";}$freemem_inMB=1024;} # minimum 5 MB
@@ -902,6 +901,7 @@ sub cluster {
     system("$po_path/proteinortho_singletons.pl $fastas <'$simtable' >>'$simtable'");
   }
 
+
   if (!$nograph && !$useMcl) {
     system("$po_path/proteinortho_graphMinusRemovegraph '$rm_simgraph' $simgraph* >'$csimgraph'");
     unless ($keep) {unlink($rm_simgraph);}
@@ -909,6 +909,9 @@ sub cluster {
   }elsif($useMcl){
     if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs are written to $csimgraph\n";}
   }
+  if(-x "$po_path/proteinortho_summary.pl"){
+    system("$po_path/proteinortho_summary.pl '$csimgraph' >>'$csimgraph.summary'");
+  }
 
   if(scalar @files < 10){
     system("perl $po_path/proteinortho2html.pl $simtable ".join(" ",@files)." >$simtablehtml");
@@ -951,7 +954,10 @@ sub cluster {
     }elsif($useMcl){
       if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs are written to $csyngraph\n";}
     }
-
+    if(-x "$po_path/proteinortho_summary.pl"){
+      system("$po_path/proteinortho_summary.pl '$csyngraph' >>'$csyngraph.summary'");
+    }
+    
     system("perl $po_path/proteinortho2html.pl $syntable ".join(" ",@files)." >$syntablehtml");
     if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $syntablehtml\n";}
 
@@ -995,21 +1001,83 @@ Options:
 
          [Search options]
          -p=          blast program [default: diamond]
-                      {blast*|tblastx|blast*+|tblastx+|diamond|usearch|ublast|lastp|lastn|rapsearch|topaz|*blat*|mmseqs*}
-                      blast*|tblastx : standard blast family (blastp : protein files, blastn : dna files)
-                      blast*+|tblastx+ : standard blastal family (blastp+ : protein files, blastn+ : dna files)
-                      blast*_legacy : legacy blast family
-                      diamond : Only for protein files! standard diamond procedure and for genes/proteins of length >40 with the additional --sensitive flag
-                      usearch : usearch_local procedure with -id 0 (minimum identity percentage).
-                      ublast : usearch_ublast procedure.
-                      lastn : standard lastal. Only for dna files!
-                      lastp : lastal using -p and BLOSUM62 scoring matrix. Only for protein files!
-                      rapsearch : Only for protein files!
-                      topaz : Only for protein files!
-                      blat* : Blat family. blatp : Only for protein files! blatn : Only for dna files! blatx : Only for dna files!
-                      mmseqs* : mmseqs family. mmseqsp : Only for protein files! mmseqsn : Only for dna files! blatx : Only for dna files!
+                      {blastp|blastn|tblastx|blastp_legacy|blastn_legacy|tblastx_legacy|diamond|usearch|ublast|lastp|lastn|rapsearch|topaz|blatp|blatn|mmseqsp|mmseqsn}
+                      The suffix 'p' or 'n' indicates aminoacid fasta files (p) or nucleotide fasta files (n).
+                      The suffix '_legacy' indicates legacy blastall (otherwise blast+ is used).
          -e=          E-value for blast [default: 1e-05]
 
+         [Synteny options]
+         -synteny     activate PoFF extension to separate similar sequences print
+                      by contextual adjacencies (requires .gff for each .fasta)
+         -alpha=      PoFF: weight of adjacencies vs. sequence similarity
+                      (default: 0.5)
+
+         [Clustering options]
+         -singles     report singleton genes without any hit
+         -conn=       min. algebraic connectivity [default: 0.1]
+         -xml        produces an OrthoXML formatted file of the *.proteinortho.
+
+         (...) show more with --help
+
+For more information see the man page: 'proteinortho -man' or online: https://gitlab.com/paulklemm_PHD/proteinortho
+Or you can use the GUI proteinorthoHelper.html (available at http://lechnerlab.de/proteinortho/)
+Do you have suggestions or need more help: write a mail to lechner\@staff.uni-marburg.de.
+
+";
+}
+
+sub print_usage_more {
+print STDERR "
+     |
+  ${BLUE}  /${NC} ${RED2}\\  $NC
+  ${BLUE} /\\${NC}${RED2} /\\ $NC
+  ${BLUE}/ ${RED2}/${BLUE} \\ ${RED2}\\${NC}
+
+Usage: proteinortho6.pl [OPTIONS] FASTA1 FASTA2 [FASTA...] (one for each species, at least 2)
+Options:
+         [General options]
+         -project=    prefix for all result file names [default: myproject]
+         -cpus=       number of processors to use [default: auto]
+         -ram=        maximal used ram threshold for LAPACK and the input graph in MB [default: 75% of the total memory]
+         -silent      sets verbose to 0
+         -temp=       path for temporary files [default: working directory]
+         -keep        stores temporary blast results for reuse
+         -force       forces the recalculation of the blast results in any case in step=2. Also forces the recreation of the database generation in step=1
+         -clean       remove all unnecessary files after processing
+         -step=       1 -> generate indices
+                      2 -> run blast (and ff-adj, if -synteny is set)
+                      3 -> clustering
+                      0 -> all (default)
+
+         [Search options]
+         -p=          blast program [default: diamond]
+
+                      {blastp|blastn|tblastx|blastp_legacy|blastn_legacy|tblastx_legacy|diamond|usearch|ublast|lastp|lastn|rapsearch|topaz|blatp|blatn|mmseqsp|mmseqsn}
+
+                      The program need to be installed first.
+                      A suffix 'p' or 'n' indicates aminoacid fasta files (p) or nucleotide fasta files (n).
+                      The '_legacy' suffix indicates legacy blastall (otherwise blast+ is used).
+
+                        blast*|tblastx : standard blast+ family (blastp : protein files, blastn : dna files)
+                        blast*_legacy : legacy blast family (blastall)
+                        diamond : Only for protein files! standard diamond procedure and for genes/proteins of length >40 with the additional --sensitive flag
+                        usearch : usearch_local procedure with -id 0 (minimum identity percentage).
+                        ublast : usearch_ublast procedure.
+                        lastn : standard lastal. Only for dna files!
+                        lastp : lastal using -p and BLOSUM62 scoring matrix. Only for protein files!
+                        rapsearch : Only for protein files!
+                        topaz : Only for protein files!
+                        blat* : Blat family. blatp : For protein files! blatn : For dna files! blatx : For dna files!
+                        mmseqs* : mmseqs family. mmseqsp : For protein files! mmseqsn : For dna files! blatx : For dna files!
+
+         -e=          E-value for blast [default: 1e-05]
+         -selfblast   apply selfblast, detects paralogs without orthologs
+         -sim=        min. similarity for additional hits (0..1) [default: 0.95]
+         -identity=   min. percent identity of best blast hits [default: 25]
+         -cov=        min. coverage of best blast alignments in % [default: 50]
+         -subpara=    additional parameters for the search tool
+                      example -subpara='-seg no' or -subpara='--more-sensitive' for diamond
+
          [Synteny options]
          -synteny     activate PoFF extension to separate similar sequences print
                       by contextual adjacencies (requires .gff for each .fasta)
@@ -1023,16 +1091,36 @@ Options:
          [Clustering options]
          -singles     report singleton genes without any hit
          -conn=       min. algebraic connectivity [default: 0.1]
-         -xml        produces an OrthoXML formatted file of the *.proteinortho.
-
-         (...)
+         -xml         produces an OrthoXML formatted file of the *.proteinortho.tsv file.
+         -purity=     avoid spurious graph assignments, the higher the more uncertain edges are cut [0-1, default: 1e-07]
+         -mcl         enables the mcl algorithm for clustering instead of power iteration or lapacks routine. (needs mcl to be installed) 
+         -nograph     do not generate .proteinortho-graph file (pairwise orthology relations)
+
+         [Misc options]
+         -desc        write description files (for NCBI FASTA input only)
+         -debug       gives detailed information for bug tracking
+         -binpath=    path to your directory of local programs that are not available globally (this should not be needed)
+
+         [Large compute jobs]
+          In case jobs should be distributed onto several machines, use
+         -jobs=M/N    N defines the number of defined job groups (e.g. PCs)
+                      M defines the set of jobs to run in this process
+
+         Example:     run with 
+                        -step=1
+                      to prepare data then to split a run on two machines use
+                        -jobs=1/2 -step=2 on PC one and
+                        -jobs=2/2 -step=2 on PC two
+                      finally run with
+                        -step=3 to finalize
 
 For more information see the man page: 'proteinortho -man' or online: https://gitlab.com/paulklemm_PHD/proteinortho
 Or you can use the GUI proteinorthoHelper.html (available at http://lechnerlab.de/proteinortho/)
-Dou have suggestions or need more help: write a mail to lechner\@staff.uni-marburg.de.
+Do you have suggestions or need more help: write a mail to lechner\@staff.uni-marburg.de.
 
 ";
 }
+
 # -memory=     the amount of ram used for partioning in MB [default 75% of the total memory]
 
 sub init_graph_output {
@@ -1584,12 +1672,12 @@ sub get_legal_matches {
 
 # if(($blastmode ne "phmmer" && $blastmode ne "jackhmmer")){ #workaround for jackhmmer + phmmer
     # Percent identity
-    if (!$twilight && $local_identity < $identity)          {next;}
-    if ( $twilight && $local_identity < &identitybylength($alignment_length)) {next;}
+    if (!$twilight && $local_identity < $identity)          {if ($debug) {print STDERR "!$query_id -> $subject_id is removed because of identity threshold\n";} next;}
+    if ( $twilight && $local_identity < &identitybylength($alignment_length)) {if ($debug) {print STDERR "!$query_id -> $subject_id is removed because of identity by length threshold\n";} next;}
     # Min. length
     if ($blastmode eq "tblastx+" || $blastmode eq "tblastx") {$alignment_length *= 3;}
-    if ($alignment_length < $length{$query_id}*($coverage/100)+0.5)     {next;}
-    if ($alignment_length < $length{$subject_id}*($coverage/100)+0.5)     {next;}
+    if ($alignment_length < $length{$query_id}*($coverage/100)+0.5)     {if ($debug) {print STDERR "!$query_id -> $subject_id is removed because of coverage threshold\n";} next;}
+    if ($alignment_length < $length{$subject_id}*($coverage/100)+0.5)     {if ($debug) {print STDERR "!$query_id -> $subject_id is removed because of coverage threshold\n";} next;}
 
     # It hit itself (only during selfblast)
     # if ($selfblast && $query_id eq $subject_id)           {next;} # 5.16 reuse IDs
@@ -1667,12 +1755,12 @@ sub generate_indices {
         if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
       }else{
         if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
-        system("$makedb '$file' -d '$file.$blastmode' --quiet >\/dev\/null");
+        system("$makedb '$file' -d '$file.$blastmode' --quiet >\/dev\/null 2>\/dev\/null");
         if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";
 
           mkdir("$tmp_path/DB");
           if($step==1){$oldkeep=$keep;$keep=1;}
-          system("$makedb '$file' -d '$tmp_path/DB/".basename($file).".$blastmode' --quiet >\/dev\/null");
+          system("$makedb '$file' -d '$tmp_path/DB/".basename($file).".$blastmode' --quiet >\/dev\/null 2>\/dev\/null");
 
           if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
         }
@@ -1901,7 +1989,6 @@ sub blast {
   else  {&Error("This should not happen! Please submit the FASTA file(s) and the parameter vector (above) to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com to help fixing this issue.");}
   ##MARK_FOR_NEW_BLAST_ALGORITHM
 
-
   # File does not exists yet or I am forced to rewrite it
   if (!(-e $bla) || $force) {
 
@@ -1909,7 +1996,7 @@ sub blast {
 
     if ($debug || $verbose==2) {print STDERR "$command\n";}                     # 5.16
     if ($blastmode eq "diamond") {
-      my $command2 = $binpath."diamond view --threads $threads_per_process --quiet -a $bla.daa -o $bla.tmp";
+      my $command2 = $binpath."diamond view --threads $threads_per_process --quiet -a $bla.daa -o $bla.tmp >/dev/null $printSTDERR";
       system("$command && $command2");   # run diamond and presort
       if ($? != 0) {
         &Error($blastmode." failed.\nThe most likely  errorsources of this are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
@@ -2170,13 +2257,18 @@ sub check_bins {
     &Error("Please call -p=mmseqsp for protein datasets and -p=mmseqsn for nucleotide datasets.");
   }
   elsif ($blastmode eq "diamond") {
-    my $cmd = $binpath."diamond version";
+    my $cmd = $binpath."diamond version 2>/dev/null";
     my $out = qx($cmd 2>&1);
     if (defined($out) && $out =~ /diamond\sversion\s(.+)\n/) {
       my @version = split(/\s+/,$1);
       my $versionnumber = pop @version;
       $makedb = $binpath."diamond makedb --in";
       if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";}
+      if($versionnumber =~ m/^0\.9\.(\d+)/){ if($1 < 29){
+        print STDERR "\n!!!! \nWARNING '$blastmode' version $versionnumber has a known bug that incorrectly computes the length of an alignment, thus the coverage threshold can produce wrong results leading in false negatives. See https://gitlab.com/paulklemm_PHD/proteinortho/issues/24 for more details.\n\n >>> Please update diamond to 0.9.29 or higher <<<\n";
+        print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
+        sleep 10;
+        print STDERR "\nWell then, proceeding...\n\n";} }
       return;
     }else{
       print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastp+ as fallback.\n");
@@ -2474,29 +2566,29 @@ sub get_po_path {
   $uname=~s/[\r\n]+$//;
 
   if(-x "proteinortho_clustering"){
-    $tmppath[1]="";
-    if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
-  }elsif(-x $tmppath[1]."/src/proteinortho_clustering"){
-    $tmppath[1]=$tmppath[1]."/src";
-    if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
+    my $p=`whereis proteinortho_clustering`;
+    $p=~s/^proteinortho_clustering:  *([^ ]+)\/proteinortho_clustering.*$/$1/;
+    chomp($p);
+    $tmppath[1]=$p;
+    if($debug){print STDERR "Detected (PATH enviroment variable)\n";}
   }else{
     if(-x $tmppath[1]."/src/BUILD/$uname/proteinortho_clustering"){
       $tmppath[1]=$tmppath[1]."/src/BUILD/$uname";
       if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
     }elsif(-x "/usr/bin/proteinortho_clustering"){
-      $tmppath[1]="/usr/bin";
+      $tmppath[1]="/usr/bin/";
       if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
     }elsif(-x "/usr/local/bin/proteinortho_clustering"){
-      $tmppath[1]="/usr/local/bin";
+      $tmppath[1]="/usr/local/bin/";
       if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
     }elsif(-x "$binpath/proteinortho_clustering"){
-      $tmppath[1]="$binpath";
+      $tmppath[1]="$binpath/";
       if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
     }
   }
 
   if(!-x $tmppath[1]."/proteinortho_clustering"){
-    &Error("cannot find proteinortho_clustering in: the current directory '.', ./src/, ./src/BUILD/$uname , /usr/bin, /usr/local/bin, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
+    &Error("cannot find proteinortho_clustering in: the current directory '.', ./src/, ./src/BUILD/$uname , /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
     exit 1;
   }
   # if(!-x "$tmppath[1]/proteinortho_cleanupblastgraph"){
@@ -2508,7 +2600,7 @@ sub get_po_path {
   #   exit 0;
   # }
   if(!-x $tmppath[1]."/proteinortho2html.pl"){
-   &Error("cannot find proteinortho2html.pl in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
+   &Error("cannot find proteinortho2html.pl in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
     exit 1;
   }
   # if(!-x "$tmppath[1]/proteinortho2tree.pl"){
@@ -2516,23 +2608,23 @@ sub get_po_path {
   #   exit 0;
   # }
   if(!-x $tmppath[1]."/proteinortho_ffadj_mcs.py" && $synteny){
-    &Error("cannot find proteinortho_ffadj_mcs.py$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
+    &Error("cannot find proteinortho_ffadj_mcs.py$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
     exit 1;
   }
   if(!-x $tmppath[1]."/proteinortho_singletons.pl"){
-    &Error("cannot find proteinortho_singletons.pl$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
+    &Error("cannot find proteinortho_singletons.pl$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
     exit 1;
   }
   if(!-x $tmppath[1]."/proteinortho_graphMinusRemovegraph"){
-    &Error("cannot find proteinortho_graphMinusRemovegraph$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
+    &Error("cannot find proteinortho_graphMinusRemovegraph$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
     exit 1;
   }
   if(!-x $tmppath[1]."/proteinortho_do_mcl.pl"){
-    &Error("cannot find proteinortho_do_mcl.pl$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
+    &Error("cannot find proteinortho_do_mcl.pl$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
       exit 1;
   }
   if(!-x $tmppath[1]."/proteinortho2xml.pl"){
-    &Error("cannot find proteinortho2xml.pl$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
+    &Error("cannot find proteinortho2xml.pl$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
     exit 1;
   }
   return $tmppath[1];


=====================================
.gitlab-ci.yml → src/.gitlab-ci.yml
=====================================
@@ -2,7 +2,7 @@ variables:
   PROJECT_NAME: "Proteinortho"
 before_script:
   - echo "starting yml for Proteinortho"
-  - apt-get update && apt-get -y install cmake diffutils wget ncbi-blast+ time git
+  - apt-get update && apt-get -y install cmake diffutils wget ncbi-blast+ time git python3
 stages:
   - codequality
   - test-precompiled-bins
@@ -17,7 +17,7 @@ gcc-latest-alloptions:
   - tar xzf diamond-linux64.tar.gz
   - mkdir ~/bin
   - cp diamond ~/bin
-  - perl proteinortho*pl -project=testasd -cpus=1 -ram=100 -verbose=2 -selfblast -silent -force -desc -checkfasta -cleanblast -debug -binpath=~/bin -tmp='~/' -e=0.000001 -sim=0.9 -identity=20 -cov=30 -subparaBlast='--more-sensitive' -synteny -dups=1 -cs=4 -alpha=0.4 -conn=0.01 -purity=0.00001 -minspecies=2 -subparaCluster='-cpus 1 -seed 1' -nograph -singles -xml -exactstep3 test/*faa >/dev/null 2>&1 && rm testasd*poff* && rm testasd*fadj* && rm testasd*info* && export LC_NUMERIC="C" && export LC_ALL="C" && for f in testasd.*; do sort $f >$f.testasd; done; sha256sum -b *testasd | tr -d '\n' | awk '{if($0 == "eb88ba29afd4f2dba16d3dbf97a5b0d2ab7686654a854f8502f0e778628e7f56 *testasd.descriptions.testasdf80df4c1a951bfb55b02300a273f6395694f01e8ae908e296d9c14a847d432ac *testasd.proteinortho.html.testasdfa18e9a0530f5a5754f045cfe97deaf818bdb5eb725619952633f1da0641cf7b *testasd.proteinortho.tsv.testasdc598b8c43e48e06614ec19e2f6b870e2737a7117a50ab2b1613880764d0884b2 *testasd.proteinortho.tsv.xml.testasd"){print $0." -> OK"; exit 0}else{print $0." -> failed"; exit 1}}'
+  - perl proteinortho*pl -project=testasd -cpus=1 -ram=100 -verbose=2 -selfblast -silent -force -desc -checkfasta -cleanblast -debug -binpath=~/bin -tmp='~/' -e=0.000001 -sim=0.9 -identity=20 -cov=30 -subparaBlast='--more-sensitive' -synteny -dups=1 -cs=4 -alpha=0.4 -conn=0.01 -purity=0.00001 -minspecies=2 -subparaCluster='-cpus 1 -seed 1' -nograph -singles -xml -exactstep3 test/*faa >/dev/null 2>&1 && rm testasd*poff* && rm testasd*fadj* && rm testasd*info* && export LC_NUMERIC="C" && export LC_ALL="C" && for f in testasd.*; do sort $f >$f.testasd; done; sha256sum -b *testasd | tr -d '\n' | awk '{if($0 == "eb88ba29afd4f2dba16d3dbf97a5b0d2ab7686654a854f8502f0e778628e7f56 *testasd.descriptions.testasde3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 *testasd.proteinortho-graph.summary.testasdf80df4c1a951bfb55b02300a273f6395694f01e8ae908e296d9c14a847d432ac *testasd.proteinortho.html.testasdfa18e9a0530f5a5754f045cfe97deaf818bdb5eb725619952633f1da0641cf7b *testasd.proteinortho.tsv.testasdc598b8c43e48e06614ec19e2f6b870e2737a7117a50ab2b1613880764d0884b2 *testasd.proteinortho.tsv.xml.testasd"){print $0." -> OK"; exit 0}else{print $0." -> failed"; exit 1}}'
 
 gcc-latest-all-p:
   image: gcc
@@ -96,7 +96,7 @@ ubuntu-latest0:
   image: ubuntu
   stage: test-precompiled-bins
   script:
-  - apt-get -y update && apt-get -y install gcc && apt-get -y install gfortran && apt-get -y install build-essential g++
+  - apt-get -y update && apt-get -y install gcc && apt-get -y install gfortran && apt-get -y install build-essential g++ && apt-get -y install python3
   - echo "installing topaz"
   - git clone https://github.com/ajm/topaz
   - cd topaz/src
@@ -115,7 +115,7 @@ ubuntu-latest:
   image: ubuntu
   stage: recompile-and-test
   script:
-  - apt-get -y update && apt-get -y install gcc && apt-get -y install gfortran && apt-get -y install build-essential g++
+  - apt-get -y update && apt-get -y install gcc && apt-get -y install gfortran && apt-get -y install build-essential g++ && apt-get -y install python3
   - echo "installing topaz"
   - git clone https://github.com/ajm/topaz
   - cd topaz/src
@@ -136,7 +136,7 @@ debian-latest:
   image: debian
   stage: recompile-and-test
   script:
-  - apt-get -y update && apt-get -y install gcc && apt-get -y install gfortran && apt-get -y install build-essential g++
+  - apt-get -y update && apt-get -y install gcc && apt-get -y install gfortran && apt-get -y install build-essential g++ && apt-get -y install python3
   - echo "installing topaz"
   - git clone https://github.com/ajm/topaz
   - cd topaz/src
@@ -153,34 +153,33 @@ debian-latest:
   - make all
   - make test
 
-fedora-latest:
-  image: fedora
-  stage: test-precompiled-bins
-  script:
-  - yum -y groupinstall "Development Tools" 
-  - yum -y install gcc-c++
-  - yum -y install cmake
-  - yum -y install make
-  - yum -y install tar
-  - yum -y install which
-  - yum -y install wget
-  - yum -y install libstdc++-static
-  - yum -y install lapack-static
-  - yum -y install cpan
-  - yum -y install python
-  - yum -y install ncbi-blast+
-  - cpan Thread::Queue
-  - wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast*-x64-linux.tar.gz
-  - tar -xzvf ncbi-blast*-x64-linux.tar.gz
-  - cp ncbi-blast*/bin/blastp $HOME
-  - cp ncbi-blast*/bin/makeblastdb $HOME
-  - echo "installing diamond"
-  - wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
-  - tar xzf diamond-linux64.tar.gz
-  - cp diamond $HOME
-  - export PATH="$PATH:$HOME"
-  - echo "start proteinortho tests"
-  - make test
+#fedora-latest:
+#  image: fedora
+#  stage: test-precompiled-bins
+#  script:
+#  - yum -y groupinstall "Development Tools" 
+#  - yum -y install gcc-c++
+#  - yum -y install cmake
+#  - yum -y install make
+#  - yum -y install tar
+#  - yum -y install which
+#  - yum -y install wget
+#  - yum -y install libstdc++-static
+#  - yum -y install lapack-static
+#  - yum -y install cpan
+#  - yum -y install python
+#  - yum -y install ncbi-blast+
+#  - cpan Thread::Queue
+#  - wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast*-x64-linux.tar.gz
+#  - tar -xzvf ncbi-blast*-x64-linux.tar.gz
+#  - cp ncbi-blast*/bin/blastp $HOME
+#  - cp ncbi-blast*/bin/makeblastdb $HOME
+#  - echo "installing diamond"
+#  - wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
+#  - tar xzf diamond-linux64.tar.gz
+#  - cp diamond $HOME
+#  - export PATH="$PATH:$HOME"
+#  - echo "start proteinortho tests"
 
 centos-latest:
   image: centos
@@ -193,6 +192,7 @@ centos-latest:
   - yum -y install tar
   - yum -y install which
   - yum -y install wget
+  - yum -y install gcc-gfortran
   - wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast*-x64-linux.tar.gz
   - tar -xzvf ncbi-blast*-x64-linux.tar.gz
   - cp ncbi-blast*/bin/blastp $HOME
@@ -205,7 +205,6 @@ centos-latest:
   - echo "start proteinortho tests"
   - make clean
   - make
-  - make test
  
 code_quality:
   image: docker:stable


=====================================
src/proteinortho_ffadj_mcs.py
=====================================
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python
 
 from sys import stderr, exit, argv, maxsize
 from copy import deepcopy
@@ -8,9 +8,6 @@ from random import randint
 from math import ceil
 import logging as log
 
-ALPHA = 1
-
-
 class BothStrands:
 
     def __eq__(self, x):
@@ -59,9 +56,9 @@ class Run:
         return len(self.weight)
 
     def __str__(self):
-        return 'G1:%s-%s G2:%s-%s %s (%.5f)' % (self.startG1, self.endG1,
+        return 'G1:%s-%s G2:%s-%s %s' % (self.startG1, self.endG1,
                                                 self.startG2, self.endG2,
-                                                self.direction, self.getWeight(ALPHA))
+                                                self.direction)
 
 
 def readDistsAndOrder(dist_file, edgeThreshold):
@@ -133,11 +130,11 @@ def sort_genome(chrom_pos):
     return telomeres, g
 
 
-def insertIntoRunList(runs, runList):
-    keys = [x.getWeight(ALPHA) for x in runList]
+def insertIntoRunList(runs, runList, alpha):
+    keys = [x.getWeight(alpha) for x in runList]
     for run in runs:
-        i = bisect(keys, run.getWeight(ALPHA))
-        keys.insert(i, run.getWeight(ALPHA))
+        i = bisect(keys, run.getWeight(alpha))
+        keys.insert(i, run.getWeight(alpha))
         runList.insert(i, run)
 
 
@@ -363,7 +360,7 @@ def replaceByNew(g1_runs, g2_runs, i, j, r_old, r_new):
             break
 
 
-def doMatching(g1, g2, g1_runs, g2_runs, m, runList):
+def doMatching(g1, g2, g1_runs, g2_runs, m, runList, alpha):
     g1pos = dict(zip(g1, range(len(g1))))
     g2pos = dict(zip(g2, range(len(g2))))
     newRuns = set()
@@ -455,6 +452,7 @@ def doMatching(g1, g2, g1_runs, g2_runs, m, runList):
                     g1_runs[g1pos[r.endG1]].remove(r)
                 r.startG2 = g2[j]
                 log.info('Divided overlapping run in %s and %s' % (r_new, r))
+
                 replaceByNew(g1_runs, g2_runs, g1pos[r_new.startG1],
                              g2pos[r_new.startG2], r, r_new)
                 newRuns.add(r_new)
@@ -480,10 +478,10 @@ def doMatching(g1, g2, g1_runs, g2_runs, m, runList):
                 newRuns.add(r)
             elif r in newRuns:
                 newRuns.remove(r)
-    insertIntoRunList(newRuns, runList)
+    insertIntoRunList(newRuns, runList, alpha)
 
 
-def mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs, runList, alreadyMatched):
+def mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs, runList, alreadyMatched, alpha):
     g1pos = dict(zip(g1, range(len(g1))))
     g2pos = dict(zip(g2, range(len(g2))))
 
@@ -504,9 +502,9 @@ def mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs, runList, alreadyMatched):
         # points (mod_g1) can be processed.
 
         for r1, r2 in product(sorted(g1_runs[i].difference(g1_runs[i+1]),
-                                     key=lambda x: x.getWeight(ALPHA), reverse=True),
+                                     key=lambda x: x.getWeight(alpha), reverse=True),
                               sorted(g1_runs[i+1].difference(g1_runs[i]),
-                                     key=lambda x: x.getWeight(ALPHA), reverse=True)):
+                                     key=lambda x: x.getWeight(alpha), reverse=True)):
             if r1.endG1 == g1[i] and r2.startG1 == g1[i+1] and \
                     r1.direction == r2.direction and \
                     r1.endG1[0] == r2.startG1[0] and \
@@ -538,7 +536,7 @@ def mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs, runList, alreadyMatched):
                     if r1 in alreadyMatched:
                         alreadyMatched.remove(r1)
                     # redo matching in case r1 xor r2 were not in matching before
-                    insertIntoRunList(newRuns, runList)
+                    insertIntoRunList(newRuns, runList, alpha)
                     return r2, set(mod_g1[x+1:])
                 if r2 in alreadyMatched:
                     # actually, both are already matched
@@ -548,7 +546,7 @@ def mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs, runList, alreadyMatched):
                     # none is matched
                     newRuns.add(r2)
 
-    insertIntoRunList(newRuns, runList)
+    insertIntoRunList(newRuns, runList, alpha)
     return None, []
 
 
@@ -567,7 +565,7 @@ def removeSingleGenes(genome, genome_runs):
     return del_res, mod_res
 
 
-def findRandomRunSequence(g1, g2, dists, topXperCent):
+def findRandomRunSequence(g1, g2, dists, topXperCent, alpha):
     g2dists = dict()
     for g1i, x in list(dists.items()):
         for g2j, d in list(x.items()):
@@ -585,11 +583,12 @@ def findRandomRunSequence(g1, g2, dists, topXperCent):
     g1_runs, g2_runs, runs = getAllRuns(g1, g2, dists)
     log.info('Found %s runs.' % len(runs))
     # sort
-    runList = sorted(runs, key=lambda x: x.getWeight(ALPHA))
+    runList = sorted(runs, key=lambda x: x.getWeight(alpha))
 
     res = set()
     while runList:
-        noOfAdjacencies = len([x for x in runList if x.getWeight(ALPHA) and x.getWeight(ALPHA) or 0])
+        noOfAdjacencies = len([x for x in runList if x.getWeight(alpha) and
+            x.getWeight(alpha) or 0])
         if noOfAdjacencies:
             randPos = randint(1, ceil(noOfAdjacencies * topXperCent))
         else:
@@ -601,7 +600,7 @@ def findRandomRunSequence(g1, g2, dists, topXperCent):
         while mx:
             res.add(mx)
             # update run list
-            doMatching(g1, g2, g1_runs, g2_runs, mx, runList)
+            doMatching(g1, g2, g1_runs, g2_runs, mx, runList, alpha)
             del_g1, new_mod_g1 = removeSingleGenes(g1, g1_runs)
             if del_g1:
                 log.info('Zombie genes removed from G1: %s' % ', '.join(map(str, del_g1)))
@@ -631,7 +630,7 @@ def findRandomRunSequence(g1, g2, dists, topXperCent):
                                 mod_g1.add(g1[g1pos[g1i]-1])
             # merge runs
             mx, mod_g1 = mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs,
-                                   runList, res)
+                                   runList, res, alpha)
 
     if res:
         log.info('Matching finished. Longest run size is %s.' % (max(list(map(len, res)))))
@@ -642,7 +641,7 @@ def findRandomRunSequence(g1, g2, dists, topXperCent):
 
 
 def repeatMatching(g1, g2, g1_mod, g2_mod, g1_runs, g2_runs, dists, repMatching,
-                   minCsSize, topXperCent):
+                   minCsSize, topXperCent, alpha):
 
     g1_mod_res = g1_mod
     g2_mod_res = g2_mod
@@ -680,7 +679,7 @@ def repeatMatching(g1, g2, g1_mod, g2_mod, g1_runs, g2_runs, dists, repMatching,
                      (noReps-repMatching+2))
             break
 
-        g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns = findRandomRunSequence(g1, g2, dists, topXperCent)
+        g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns = findRandomRunSequence(g1, g2, dists, topXperCent, alpha)
         checkMatching(g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns, dists)
 
         log.info('Obtained %s adjacencies in matching of size %s from iteration %s.' %
@@ -793,7 +792,6 @@ if __name__ == '__main__':
     cli.add_argument('-a', '--alpha', type=float, metavar='F', default=0.5)
     cli.add_argument('dist_file')
     args = cli.parse_args()
-    AlPHA = args.alpha
     repMatching = args.repeat_matching
     if repMatching > 0:
         repMatching -= 1
@@ -802,14 +800,16 @@ if __name__ == '__main__':
                     format="%(levelname)s\t%(asctime)s\t++ %(message)s")
 
     multiChrom, g1, g2, dists = readDistsAndOrder(args.dist_file, args.edge_weight_threshold)
-    g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns = findRandomRunSequence(g1, g2, dists, args.greedy)
+    g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns = findRandomRunSequence(g1,
+            g2, dists, args.greedy, args.alpha)
     checkMatching(g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns, dists)
 
     # calculate number of breakpoints only from result of the first matching
     bkp = len(selectedRuns) - 1
 
     g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns_new = repeatMatching(g1, g2,
-            g1_mod, g2_mod, g1_runs, g2_runs, dists, repMatching, args.min_cs_size, args.greedy)
+            g1_mod, g2_mod, g1_runs, g2_runs, dists, repMatching,
+            args.min_cs_size, args.greedy, args.alpha)
 
     selectedRuns.update(selectedRuns_new)
 
@@ -833,3 +833,4 @@ if __name__ == '__main__':
 
     print('#bkp\t#edg\tadj\tedg')
     print('%s\t%s\t%.6f\t%.6f' % (bkp, edg, wAdj, wEdg))
+


=====================================
src/proteinortho_history.pl
=====================================
@@ -0,0 +1,570 @@
+#!/usr/bin/env perl
+#pk
+
+##########################################################################################
+#	  This file is part of proteinortho.
+#	  (C) 2009 Marcus Lechner
+# 
+#	  proteinortho is free software; you can redistribute it and/or modify
+#	  it under the terms of the GNU General Public License as published
+#	  by the Free Software Foundation; either version 2, or (at your
+#	  option) any later version.
+#
+#	  proteinortho is distributed in the hope that it will be useful, but
+#	  WITHOUT ANY WARRANTY; without even the implied warranty of
+#	  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#	  General Public License for more details.
+#
+#	  You should have received a copy of the GNU General Public License
+#	  along with proteinortho; see the file COPYING.  If not, write to the
+#	  Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+#	  Boston, MA 02111-1307, USA.	
+##########################################################################################
+
+##########################################################################################
+# About
+##########################################################################################
+# 
+# @author Paul Klemm
+# @email klemmp at staff.uni-marburg.de
+# @company Bioinformatics, University of Leipzig
+# @version 1
+# @date 11-11-2019
+#
+##########################################################################################
+
+use POSIX;
+
+my $usage = "
+proteinortho_history.pl        reports the history of a (or a pair of) gene/protein(s).
+ 
+SYNOPSIS
+ 
+proteinortho_history.pl (-project=myproject) QUERY (FASTA1 FASTA2 ...)
+
+	QUERY	A string of a single gene/protein or 2 separated by a comma or a whitespace (the input is escaped using quotemeta, use -noquotemeta to avoid this)
+
+	-project=MYPROJECT	The project name (as specified in proteinortho with -project) (default:auto detect in the current directory)
+	-step=[123] 		(optional) If specified more optput is printed (to STDOUT) for the given step:
+		-step=1 : search for the given fasta sequence in the input fasta files
+		-step=2 : search in the *.blast-graph
+		-step=3 : search in the *.proteinortho file 
+		-step=all : prints everything of above to STDOUT
+	FASTA*						(optional) input fasta files 
+	-noquotemeta, -E			(optional) If set, then the query will not be escaped.
+	-plain, -p, -notableformat	(optional) If -step= is set too, then the tables are not formatted and a plain csv is printed instead. 
+	-delim= 					(optional) Defines the delimiter character for spliting the query (if you want to search for 2 genes/proteins)
+
+	NOTE: if you use the -keep option and you have the project_cache_proteinortho/ directory, this program additionally searches for all blast hits.
+
+";
+
+our $maxNumOfCharsInOneLine=`tput cols`;
+chomp($maxNumOfCharsInOneLine);
+if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
+our $split_delim="[:\t]";
+our @spl_header;
+our @spl;
+our $last_isHeaderLine=0;
+our $last_isHeaderLine=0;$isHeaderLine=1;
+our $noheader=0;
+
+my $query;
+my $query_esc;
+my $help;
+my $project="myproject";
+my $step="none";
+my $delim="[, ]";
+my $do_quotemeta=1;
+our $notableformat=0;
+
+my @ARGViddone=[];
+my $ARGViddone_counter=scalar(@ARGV);
+for(my $v = 0 ; $v < scalar @ARGV ; $v++){
+	
+	if($ARGV[$v] =~ m/^--?(help|h)$/){$help=1;$ARGViddone[$v]=1;}
+	elsif($ARGV[$v] =~ m/^--?project=?(.*)$/){$project=$1;chomp($project);$ARGViddone[$v]=1;}
+	elsif($ARGV[$v] =~ m/^--?step=?([123]|(all))$/){$step=$1;chomp($step);$ARGViddone[$v]=1;}
+	elsif($ARGV[$v] =~ m/^--?delim=?(.*)$/){$delim=$1;chomp($delim);$ARGViddone[$v]=1;}
+	elsif($ARGV[$v] =~ m/^--?noquotemeta$/){$do_quotemeta=0;$ARGViddone[$v]=1;}
+	elsif($ARGV[$v] =~ m/^--?E$/){$do_quotemeta=0;$ARGViddone[$v]=1;}
+	elsif($ARGV[$v] =~ m/^--?notableformat$/){$notableformat=1;$ARGViddone[$v]=1;}
+	elsif($ARGV[$v] =~ m/^--?plain$/){$notableformat=1;$ARGViddone[$v]=1;}
+	elsif($ARGV[$v] =~ m/^--?p$/){$notableformat=1;$ARGViddone[$v]=1;}
+	elsif($ARGV[$v] =~ m/^-.+/){print $usage; print STDERR "ERROR: invalid option ".$ARGV[$v]."!\n\n";$ARGViddone[$v]=1;exit(1);}
+	elsif(!defined($query)){$query = $ARGV[$v];}
+	else{$ARGViddone[$v]=0;$ARGViddone_counter++;}
+
+}
+
+my $project_corrected="";
+
+if ($help){
+    print $usage;
+    exit(0);
+}
+my $fail="";
+if (!defined($query)){
+    $fail.="ERROR: no QUERY provided!\n";
+}
+if (! -e $project.".blast-graph" && ! -e $project.".proteinortho.tsv" && ! -e $project.".proteinortho-graph" && ! -d "proteinortho_cache_".$project){
+	@blastglob=glob("*.blast-graph");
+	@proteinorthoglob=glob("*.proteinortho.tsv");
+	if (scalar(@blastglob)>0){
+		$project_corrected=$project;
+		$project=$blastglob[0];$project=~s/.blast-graph$//g;
+	}elsif(scalar(@proteinorthoglob)>0){
+		$project_corrected=$project;
+		$project=$proteinorthoglob[0];$project=~s/.blast-graph$//g;
+	}else{
+		 $fail.="ERROR: '".$project."' files not found, make sure you are in the directory with the proteinortho output! Specify the project name (prefix) with -project=... \n";
+	}
+}
+
+if($do_quotemeta){
+	$query_esc=quotemeta($query);
+	$query_esc=~s/\\([, ])/$1/;
+	$query_esc=~s/\\($delim)/$1/;
+	if($query ne $query_esc){print STDERR "(escaped the input query '$query' -> '$query_esc')\n";}
+}else{
+	$query_esc=$query;
+}
+
+my @query_split=split($delim,$query_esc);
+my @query_split_noesc=split($delim,$query);
+
+if (scalar(@query_split) > 2){
+    $fail.="ERROR: the QUERY contains too many delimiters ([, ]), specify the delimiter character with -delim=... !\n";
+}
+
+if($fail){
+	print $usage.$fail;
+	exit(1);
+}
+
+print STDERR "\nA short history of '"; if(scalar(@query_split)==2){print STDERR $query_split_noesc[0]."' and '".$query_split_noesc[1];}else{print STDERR $query_split_noesc[0]} print STDERR "' in '$project':\n";
+
+print STDERR "\n:::::::[ input files ]::::::::::::::::\n\n";
+#
+# input files
+#
+
+my $did_found_a_in_input=0;
+my $did_found_b_in_input=0;
+
+if($step eq "1" || $step eq "none" || $step eq "all"){
+
+	if ( $ARGViddone_counter != scalar(@ARGV) ){
+		print STDERR "checking the input fasta files...\n";
+
+		for(my $v = 0 ; $v < scalar @ARGV ; $v++){
+
+			if($ARGViddone[$v] || $ARGV[$v] eq $query){next;}	
+			if(! -e $ARGV[$v]){print STDERR " ! WARNING ! did not find file '".$ARGV[$v]."', proceeding anyway...\n"}
+			elsif(! -r $ARGV[$v]){print STDERR "! WARNING ! cannot read '".$ARGV[$v]."', proceeding anyway...\n"}
+
+			$result=`grep -nH '\\b$query_split[0]\\b' $ARGV[$v] 2>/dev/null`;
+			$num_result=scalar split("\n",$result);
+
+			if($num_result>0 && $did_found_a_in_input!=0){
+				print STDERR "! WARNING ! found '".$query_split_noesc[0]."' again in ".$ARGV[$v].". (proteinortho can handle this) \n";
+			}
+			if($num_result==0){
+				# did not found the query in this file ...
+			}elsif($num_result==1 && $did_found_a_in_input==0){
+				print STDERR "found '".$query_split_noesc[0]."' in ".$ARGV[$v].". \n";
+				$did_found_a_in_input=1;
+				if($step ne "none"){print `proteinortho_grab_proteins.pl -E '\\b$query_split[0]\\b' $ARGV[$v]`."\n";}
+			}elsif($num_result>1){
+				print STDERR "! ERROR ! found '".$query_split_noesc[0]."' multiple times in ".$ARGV[$v].". This seems dangerous, make sure there are no duplicates present in your fasta files ! \n";
+				if($step ne "none"){print `proteinortho_grab_proteins.pl -E '\\b$query_split[0]\\b' $ARGV[$v]`."\n";}
+			}
+
+			if(scalar(@query_split)==2){
+
+				$result=`grep -nH '\\b$query_split[1]\\b' $ARGV[$v] 2>/dev/null`;
+				$num_result=scalar split("\n",$result);
+
+				if($num_result>0 && $did_found_b_in_input!=0){
+					print STDERR "! WARNING ! found '".$query_split_noesc[1]."' again in ".$ARGV[$v].". (proteinortho can handle this) \n";
+					if($step ne "none"){print "$result\n";}
+				}
+				if($num_result==0){
+					# did not found the query in this file ...
+				}elsif($num_result==1 && $did_found_b_in_input==0){
+					print STDERR "found '".$query_split_noesc[1]."' in ".$ARGV[$v].". \n";
+					$did_found_b_in_input=1;
+					if($step ne "none"){print `proteinortho_grab_proteins.pl -E '\\b$query_split[1]\\b' $ARGV[$v]`."\n";}
+
+				}elsif($num_result>1){
+					print STDERR "! ERROR ! found '".$query_split_noesc[1]."' multiple times in ".$ARGV[$v].". This seems dangerous, make sure there are no duplicates present in your fasta files ! \n";
+					if($step ne "none"){print `proteinortho_grab_proteins.pl -E '\\b$query_split[1]\\b' $ARGV[$v]`."\n";}
+				}
+			}
+		}
+
+		if(!$did_found_a_in_input){
+			print STDERR "! ERROR ! did not found '".$query_split_noesc[0]."' in the provided fasta files...\n";
+		}
+		if(scalar(@query_split)==2 && !$did_found_b_in_input){
+			print STDERR "! ERROR ! did not found '".$query_split_noesc[1]."' in the provided fasta files...\n";
+		}
+	}else{
+		print STDERR "input fasta files are not provided: skipping\n";
+	}
+if($step eq "none"){print STDERR "\n(use -step=1 for more details)\n";}
+}
+
+print STDERR "\n:::::::[ all versus all blast ]:::::::\n\n";
+#
+# step 2 blast-graph
+# TODO the single blast graph ... 
+
+my $proteinortho_cache=0;
+my $num_a_all_blast_hits =0;
+my $num_b_all_blast_hits =0;
+my $num_ab_all_blast_hits =0;
+my $num_a_aRBH_hits =0;
+my $num_b_aRBH_hits =0;
+my $num_ab_aRBH_hits =0;
+
+if($step eq "2" || $step eq "none" || $step eq "all"){
+
+	if(-d "proteinortho_cache_".$project){
+		$proteinortho_cache=1;
+		print STDERR "checking all blast files in 'proteinortho_cache_".$project."'...\n";
+
+		$result=`grep -nH '\\b$query_split[0]\\b' proteinortho_cache_$project/*.vs.* 2>/dev/null`;
+		$num_a_all_blast_hits=scalar split("\n",$result);
+
+		print STDERR "the query '".$query_split_noesc[0]."' got $num_a_all_blast_hits hit(s).\n";
+		if($step ne "none"){$result="# file\tline\tqseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\n".$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+		if(scalar @query_split == 2){
+
+			$result=`grep -nH '\\b$query_split[1]\\b' proteinortho_cache_$project/*.vs.* 2>/dev/null`;
+			$num_b_all_blast_hits=scalar split("\n",$result);
+
+			print STDERR "the query '".$query_split_noesc[1]."' got $num_b_all_blast_hits hit(s).\n";
+			if($step ne "none"){$result="# file\tline\tqseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\n".$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+			$result=`grep -nHE '(\\b$query_split[0]\\b\\t\\b$query_split[1]\\b)|(\\b$query_split[1]\\b\\t\\b$query_split[0]\\b)' proteinortho_cache_$project/*.vs.* 2>/dev/null`;
+			$num_ab_all_blast_hits=scalar split("\n",$result);
+
+			print STDERR "both queries '".$query_split_noesc[0]."' and '".$query_split_noesc[1]."' have $num_ab_all_blast_hits hit(s) with each other use -step=2 for more details).\n";
+			if($step ne "none"){$result="# file\tline\tqseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\n".$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+			#print "grep -nH '(\\b$query_split[0]\\b\\t\\b$query_split[1]\\b)|(\\b$query_split[1]\\b\\t\\b$query_split[0]\\b)' proteinortho_cache_$project/*.vs.* 2>/dev/null";
+		}
+
+	}elsif(glob("*.vs.*")){
+		$proteinortho_cache=1;
+		print STDERR "checking all blast files...\n";
+
+		$result=`grep -nH '\\b$query_split[0]\\b' *.vs.* 2>/dev/null`;
+		$num_a_all_blast_hits=scalar split("\n",$result);
+
+		print STDERR "the query '".$query_split_noesc[0]."' got $num_a_all_blast_hits hit(s).\n";
+		if($step ne "none"){$result="# file\tline\tqseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\n".$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+		if(scalar @query_split == 2){
+
+			$result=`grep -nH '\\b$query_split[1]\\b' *.vs.* 2>/dev/null`;
+			$num_b_all_blast_hits=scalar split("\n",$result);
+
+			print STDERR "the query '".$query_split_noesc[1]."' got $num_b_all_blast_hits hit(s).\n";
+			if($step ne "none"){$result="# file\tline\tqseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\n".$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+			$result=`grep -nHE '(\\b$query_split[0]\\b\\t\\b$query_split[1]\\b)|(\\b$query_split[1]\\b\\t\\b$query_split[0]\\b)' *.vs.* 2>/dev/null`;
+			$num_ab_all_blast_hits=scalar split("\n",$result);
+
+			print STDERR "both queries '".$query_split_noesc[0]."' and '".$query_split_noesc[1]."' have $num_ab_all_blast_hits hit(s) with each other use -step=2 for more details).\n";
+			if($step ne "none"){$result="# file\tline\tqseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\n".$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+			#print "grep -nH '(\\b$query_split[0]\\b\\t\\b$query_split[1]\\b)|(\\b$query_split[1]\\b\\t\\b$query_split[0]\\b)' *.vs.* 2>/dev/null";
+		}
+
+	}else{
+		print STDERR "did not found the temporary files *.vs.* neither in 'proteinortho_cache_".$project."/' nor in the current directory. If you want to analyse all blast files please use '-keep' option with proteinortho or provide the *.vs.* files in the current directory...\n";
+	}
+print STDERR "\n:::::::[ reciprocal best hit ]::::::::\n\n";
+
+	if(-e $project.".blast-graph"){
+		print STDERR "checking the blast-graph '$project.blast-graph' (reciprocal adaptive best hit graph)...\n";
+
+		$result=`grep -nH '\\b$query_split[0]\\b' $project.blast-graph 2>/dev/null`;
+		$num_a_aRBH_hits=scalar split("\n",$result);
+
+		print STDERR "the query '".$query_split_noesc[0]."' got $num_a_aRBH_hits reciprocal hit(s).\n";
+		if($step ne "none"){$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",`head -n2 $project.blast-graph |tail -n1| perl -lne 'chomp;s/#//g;print "# file\tline\t\$_"'`.$result)){processLine($_);}print "\n";}
+
+		if(scalar @query_split == 2){
+
+			$result=`grep -nH '\\b$query_split[1]\\b' $project.blast-graph 2>/dev/null`;
+			$num_b_aRBH_hits=scalar split("\n",$result);
+
+			print STDERR "the query '".$query_split_noesc[1]."' got $num_b_aRBH_hits reciprocal hit(s).\n";
+			if($step ne "none"){$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",`head -n2 $project.blast-graph |tail -n1| perl -lne 'chomp;s/#//g;print "# file\tline\t\$_"'`.$result)){processLine($_);}print "\n";}
+
+			$result=`grep -nHE '(\\b$query_split[0]\\b\\t\\b$query_split[1]\\b)|(\\b$query_split[1]\\b\\t\\b$query_split[0]\\b)' $project.blast-graph 2>/dev/null`;
+			$num_ab_aRBH_hits=scalar split("\n",$result);
+
+			print STDERR "both queries '".$query_split_noesc[0]."' and '".$query_split_noesc[1]."' have $num_ab_aRBH_hits reciprocal hit(s) with each other.\n";
+			if($step ne "none"){$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",`head -n2 $project.blast-graph |tail -n1| perl -lne 'chomp;s/#//g;print "# file\tline\t\$_"'`.$result)){processLine($_);}print "\n";}
+		}
+
+	}else{
+		print STDERR "did not found the blast-graph '$project.blast-graph': skipping...\n";
+	}
+if($step eq "none"){print STDERR "\n(use -step=2 for more details)\n";}
+}
+
+print STDERR "\n:::::::[ clustering ]:::::::::::::::::\n\n";
+#
+# step 3 proteinortho
+#
+
+my $num_a_cluster_hits =0;
+my $num_b_cluster_hits =0;
+my $num_ab_cluster_hits =0;
+my $num_a_cluster_groups =0;
+my $num_b_cluster_groups =0;
+my $num_ab_cluster_groups =0;
+
+if($step eq "3" || $step eq "none" || $step eq "all"){
+
+	if(-e $project.".proteinortho-graph"){
+		print STDERR "checking the proteinortho-graph '$project.proteinortho-graph' (result of the clustering)...\n";
+
+		$result=`grep -nH '\\b$query_split[0]\\b' $project.proteinortho-graph 2>/dev/null`;
+		$num_a_cluster_hits=scalar split("\n",$result);
+
+		print STDERR "the query '".$query_split_noesc[0]."' got $num_a_cluster_hits putative ortholog(s).\n";
+		if($step ne "none"){$result=`echo '# file\tline\t'|tr -d '\n'; head -n2 $project.proteinortho-graph| tail -n1 | sed 's/#//g'`.$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+		if(scalar @query_split == 2){
+
+			$result=`grep -nH '\\b$query_split[1]\\b' $project.proteinortho-graph 2>/dev/null`;
+			$num_b_cluster_hits=scalar split("\n",$result);
+
+			print STDERR "the query '".$query_split_noesc[1]."' got $num_b_cluster_hits putative ortholog(s).\n";
+			if($step ne "none"){$result=`echo '# file\tline\t'|tr -d '\n'; head -n2 $project.proteinortho-graph| tail -n1 | sed 's/#//g'`.$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+			$result=`grep -nHE '(\\b$query_split[0]\\b\\t\\b$query_split[1]\\b)|(\\b$query_split[1]\\b\\t\\b$query_split[0]\\b)' $project.proteinortho-graph 2>/dev/null`;
+			$num_ab_cluster_hits=scalar split("\n",$result);
+
+			if($num_ab_cluster_hits!=0){
+				print STDERR "both queries '".$query_split_noesc[0]."' and '".$query_split_noesc[1]."' are putative ortholog(s).\n";
+				if($step ne "none"){$result=`echo '# file\tline\t'|tr -d '\n'; head -n2 $project.proteinortho-graph| tail -n1 | sed 's/#//g'`.$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+			}else{
+				print STDERR "both queries '".$query_split_noesc[0]."' and '".$query_split_noesc[1]."' are NOT putative ortholog(s).\n";
+			}
+			
+		}
+
+	}else{
+		print STDERR "did not found the proteinortho-graph '$project.proteinortho-graph': skipping...\n";
+	}
+print STDERR "\n:::::::[ clustering groups ]::::::::::\n\n";
+
+
+	if(-e $project.".proteinortho.tsv"){
+		print STDERR "checking the proteinortho.tsv file '$project.proteinortho.tsv' (result of the clustering)...\n";
+
+		$result=`grep -nH '\\b$query_split[0]\\b' $project.proteinortho.tsv 2>/dev/null`;
+		$num_a_cluster_groups=scalar split("\n",$result);
+
+		print STDERR "the query '".$query_split_noesc[0]."' is part of $num_a_cluster_groups group(s) of putative orthologs.\n";
+		if($step ne "none"){$result=`head -n1 $project.proteinortho.tsv | perl -lne 'chomp;s/#//g;print "# file\tline\t\$_"'`.$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+		if(scalar @query_split == 2){
+
+			$result=`grep -nH '\\b$query_split[1]\\b' $project.proteinortho.tsv 2>/dev/null`;
+			$num_b_cluster_groups=scalar split("\n",$result);
+
+			print STDERR "the query '".$query_split_noesc[1]."' is part of $num_b_cluster_groups group(s) of putative orthologs.\n";
+			if($step ne "none"){$result=`head -n1 $project.proteinortho.tsv | perl -lne 'chomp;s/#//g;print "# file\tline\t\$_"'`.$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+			$result=`grep -nHE '(\\b$query_split[0]\\b.*\\b$query_split[1]\\b)|(\\b$query_split[1]\\b.*\\b$query_split[0]\\b)' $project.proteinortho.tsv 2>/dev/null`;
+			$num_ab_cluster_groups=scalar split("\n",$result);
+
+			print STDERR "both queries '".$query_split_noesc[0]."' and '".$query_split_noesc[1]."' have $num_ab_cluster_groups group(s) in common.\n";
+			if($step ne "none"){$result=`head -n1 $project.proteinortho.tsv | perl -lne 'chomp;s/#//g;print "# file\tline\t\$_"'`.$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+		}
+
+	}elsif(-e $project.".proteinortho"){
+		print STDERR "checking the proteinortho file '$project.proteinortho' (result of the clustering)...\n";
+
+		$result=`grep -nH '\\b$query_split[0]\\b' $project.proteinortho 2>/dev/null`;
+		$num_a_cluster_groups=scalar split("\n",$result);
+
+		print STDERR "the query '".$query_split_noesc[0]."' is part of $num_a_cluster_groups group(s) of putative orthologs.\n";
+		if($step ne "none"){$result=`head -n1 $project.proteinortho | perl -lne 'chomp;s/#//g;print "# file\tline\t\$_"'`.$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+		if(scalar @query_split == 2){
+
+			$result=`grep -nH '\\b$query_split[1]\\b' $project.proteinortho 2>/dev/null`;
+			$num_b_cluster_groups=scalar split("\n",$result);
+
+			print STDERR "the query '".$query_split_noesc[1]."' is part of $num_b_cluster_groups group(s) of putative orthologs.\n";
+			if($step ne "none"){$result=`head -n1 $project.proteinortho | perl -lne 'chomp;s/#//g;print "# file\tline\t\$_"'`.$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+			$result=`grep -nHE '(\\b$query_split[0]\\b.*\\b$query_split[1]\\b)|(\\b$query_split[1]\\b.*\\b$query_split[0]\\b)' $project.proteinortho 2>/dev/null`;
+			$num_ab_cluster_groups=scalar split("\n",$result);
+
+			print STDERR "both queries '".$query_split_noesc[0]."' and '".$query_split_noesc[1]."' have $num_ab_cluster_groups group(s) in common.\n";
+			if($step ne "none"){$result=`head -n1 $project.proteinortho | perl -lne 'chomp;s/#//g;print "# file\tline\t\$_"'`.$result;$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();foreach(split("\n",$result)){processLine($_);}print "\n";}
+
+		}
+
+	}else{
+		print STDERR "did not found the proteinortho.tsv '$project.proteinortho.tsv': skipping...\n";
+	}
+
+if($step eq "none"){print STDERR "\n(use -step=3 for more details)\n";}
+
+}
+print STDERR "\n:::::::[ summary ]::::::::::::::::::::\n\n";
+if(scalar @query_split == 2){
+	if($did_found_b_in_input && $did_found_a_in_input){print STDERR "Found both queries in the input files.\n"}else{print STDERR "Did found the two queries in the input files...\n\n";exit(0);}
+	if($proteinortho_cache){
+		if($num_ab_all_blast_hits){print STDERR "Both queries hit each other"}else{print STDERR "Sadly, the two queries did not hit each other using the blast algorithm...\n\n";exit(0);}
+		if($num_ab_aRBH_hits){print STDERR " AND they are adaptive reciprocal best hits too.\n"}else{print STDERR " but they are NOT adaptive reciprocal best hits...\n\n";exit(0);}
+	}else{
+		if($num_ab_aRBH_hits){print STDERR "Furthermore, Both queries are adaptive reciprocal best hits.\n"}else{print STDERR "Sadly, both queries are NOT adaptive reciprocal best hits...\n\n";exit(0);}	
+	}
+	if($num_ab_cluster_groups){print STDERR "Finally, both queries are putative orthologs, since they occure in the same group after the clustering step.\n"}else{print STDERR "Sadly, both queries are NOT putative orthologs, since they are not occuring in the same group  after the clustering step....\n\n";exit(0);}	
+}else{
+	if($did_found_a_in_input){print STDERR "Found the query in the input files.\n"}else{print STDERR "I did NOT find the query in the input files...\n\n";exit(0);}
+	if($proteinortho_cache){
+		if($num_a_all_blast_hits){print STDERR "The query hit $num_a_all_blast_hits other protein(s)/gene(s) (blast-graph)"}else{print STDERR "the query did not hit anything (blast-graph)...\n\n";exit(0);}
+		if($num_a_aRBH_hits){print STDERR " AND $num_a_aRBH_hits of these hits are adaptive reciprocal best hits too.\n"}else{print STDERR " BUT none of these hits are adaptive reciprocal best hits...\n\n";exit(0);}
+	}else{
+		if($num_a_aRBH_hits){print STDERR "The query is part of the adaptive reciprocal best hit graph.\n"}else{print STDERR "The query is NOT part of the adaptive reciprocal best hit graph...\n\n";exit(0);}	
+	}
+	if($num_a_cluster_groups){print STDERR "Furthermore, the query is a putative ortholog with $num_a_cluster_hits other protein(s)/gene(s)\n"}else{print STDERR "Sadly, the query is NOT a putative ortholog of any other protein(s)/gene(s)....\n\n";exit(0);}	
+}
+
+print STDERR "\n";
+
+print STDERR "WARNING: The project '".$project_corrected."' was not found, I automatically detected '$project' ! (Specify the project name (prefix) with -project=...)\n";
+
+print STDERR "\n";
+
+
+
+sub processLine{
+	$_=shift;
+	if($notableformat == 1){print "$_\n";next;}
+	chomp;
+	if(length($_)<1){next;}
+
+	@spl;
+
+	if($split_delim eq ""){
+		@spl_t=split("\t",$_);
+		@spl_c=split(",",$_);
+		@spl_s=split(";",$_);
+
+		if(scalar @spl_t < 2 && scalar @spl_c < 2 && scalar @spl_s < 2){next;}
+
+		if(scalar @spl_t > scalar @spl_c && scalar @spl_t > scalar @spl_s ){ @spl = @spl_t; $split_delim='\t';}
+		elsif(scalar @spl_c > scalar @spl_t && scalar @spl_c > scalar @spl_s ){ @spl = @spl_c; $split_delim=",";}
+		elsif(scalar @spl_s > scalar @spl_c && scalar @spl_s > scalar @spl_t ){ @spl = @spl_s; $split_delim=";";}
+
+	}else{
+		@spl=split($split_delim,$_);
+	}
+
+	@spl_backup=@spl;
+
+	if(scalar @spl_header > 0 && scalar @spl != scalar @spl_header){$isHeaderLine=1;}
+	if(scalar @spl < 2 ){next;}
+	if(substr($spl[0],0,1) eq "#"){$spl[0]=~s/^# ?//g;}
+	if(scalar(@spl)*2-1>$maxNumOfCharsInOneLine){$maxNumOfCharsInOneLine= -1+2*scalar @spl;print STDERR "Corrected minimum table width: -w=$maxNumOfCharsInOneLine such that at least 1 character per column is displayed.\n";}
+
+	$sumOfCharsLine=length(join("",@spl));
+
+	if($isHeaderLine){ # is a header row 
+		while(($sumOfCharsLine + scalar @spl-1) > $maxNumOfCharsInOneLine){ # shave of chars from widest cell
+			$max_l=0;
+			@max_l_is;
+			for (my $i = 0; $i < scalar @spl; $i++) {
+				if($max_l < length $spl[$i]){$max_l=length $spl[$i];@max_l_is=();push(@max_l_is,$i)}elsif($max_l == length $spl[$i]){push(@max_l_is,$i)}
+			}
+			for (my $i = 0; $i < scalar @max_l_is; $i++) {
+				if(length $spl[$max_l_is[$i]] > 8 && substr($spl[$max_l_is[$i]],-3) ne "..." ){
+					$spl[$max_l_is[$i]]=substr($spl[$max_l_is[$i]],0,length($spl[$max_l_is[$i]])-3-1)."..."
+				}
+				else{
+					$spl[$max_l_is[$i]]=substr($spl_backup[$max_l_is[$i]],0,length($spl[$max_l_is[$i]])-1)
+				}
+			}
+			$sumOfCharsLine=length(join("",@spl));
+		}
+
+
+		while(($sumOfCharsLine + scalar @spl-1) < $maxNumOfCharsInOneLine ){ # add of chars to smallest cell
+			$min_l=$maxNumOfCharsInOneLine*10;
+			@min_l_is;
+			for (my $i = 0; $i < scalar @spl; $i++) {
+				if($min_l > length $spl[$i]){$min_l=length $spl[$i];@min_l_is=();push(@min_l_is,$i)}
+			}
+			for (my $i = 0; $i < scalar @min_l_is; $i++) {
+
+				$leftPad=0;
+				$rightPad=0;
+				if($spl[$min_l_is[$i]]=~m/( +)$/){$rightPad=length $1}
+				if($spl[$min_l_is[$i]]=~m/^( +)/){$leftPad=length $1}
+
+				if( $leftPad < $rightPad ){
+					$spl[$min_l_is[$i]]=" ".$spl[$min_l_is[$i]];
+				}else{
+					$spl[$min_l_is[$i]]=$spl[$min_l_is[$i]]." ";
+				}
+				
+			}
+			$sumOfCharsLine=length(join("",@spl));
+		}
+
+		@spl_header=@spl;
+
+	}else{ # is not headerline -> do the same as in headerline
+		
+		while(scalar @spl > scalar @spl_header){pop @spl;}
+
+		for (my $i = 0; $i < scalar @spl; $i++) {
+			while(length $spl[$i]< length $spl_header[$i]){ # add pads
+				$leftPad=0;
+				$rightPad=0;
+				if($spl[$i]=~m/( +)$/){$rightPad=length $1}
+				if($spl[$i]=~m/^( +)/){$leftPad=length $1}
+
+				if( $leftPad < $rightPad ){
+					$spl[$i]=" ".$spl[$i];
+				}else{
+					$spl[$i]=$spl[$i]." ";
+				}
+			}
+			while(length $spl[$i]>length $spl_header[$i]){ # trim
+				if(length $spl[$i] > 5 && substr($spl[$i],-3) ne "..." ){
+					$spl[$i]=substr($spl[$i],0,length($spl[$i])-3-1)."..."
+				}
+				else{
+					$spl[$i]=substr($spl_backup[$i],0,length($spl[$i])-2)."#"
+				}
+			}
+		}
+	}
+
+	if($isHeaderLine && !$last_isHeaderLine ){$tmp=join("|",@spl);$tmp=~s/\|/+/g;$tmp=~s/[^+]/-/g; print "$tmp\n";}
+	print join("|",@spl);
+	if($isHeaderLine ){print "\n";$tmp=join("|",@spl);$tmp=~s/\|/+/g;$tmp=~s/[^+]/-/g; print "$tmp";}
+	print "\n";
+	$last_isHeaderLine=$isHeaderLine;
+	$isHeaderLine=0;
+
+
+}
+


=====================================
src/proteinortho_summary.pl
=====================================
@@ -0,0 +1,351 @@
+#!/usr/bin/env perl
+#pk
+
+##########################################################################################
+#	  This file is part of proteinortho.
+#	  (C) 2009 Marcus Lechner
+# 
+#	  proteinortho is free software; you can redistribute it and/or modify
+#	  it under the terms of the GNU General Public License as published
+#	  by the Free Software Foundation; either version 2, or (at your
+#	  option) any later version.
+#
+#	  proteinortho is distributed in the hope that it will be useful, but
+#	  WITHOUT ANY WARRANTY; without even the implied warranty of
+#	  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#	  General Public License for more details.
+#
+#	  You should have received a copy of the GNU General Public License
+#	  along with proteinortho; see the file COPYING.  If not, write to the
+#	  Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+#	  Boston, MA 02111-1307, USA.	
+##########################################################################################
+
+##########################################################################################
+# About
+##########################################################################################
+# 
+# @author Paul Klemm
+# @email klemmp at staff.uni-marburg.de
+# @company Bioinformatics, University of Leipzig
+# @version 1
+# @date 11-12-2019
+#
+##########################################################################################
+
+use POSIX;
+
+# Usage text, printed for -h/--help and whenever the command line is rejected.
+my $usage = "
+proteinortho_summary.pl        produces a summary on species level.
+ 
+SYNOPSIS
+ 
+proteinortho_summary.pl (options) GRAPH (GRAPH2)
+
+	GRAPH	Path to the *.proteinortho-graph or *.blast-graph file generated by proteinortho. 
+	GRAPH2	(optional) If you provide a blast-graph AND a proteinortho-graph, the difference is calculated (GRAPH - GRAPH2)
+
+	Note: The *.proteinortho.tsv file does not work here (use the proteinortho-graph file)
+
+	OPTIONS
+
+		-format,-f	enables the table formatting instead of the plain csv output.
+
+";
+
+my $graphfilenameA="";
+my $graphfilenameB="";
+my $notableformat=1;
+
+# Walk the arguments in order: switches are recognised by their leading dash,
+# the first two remaining arguments become GRAPH and GRAPH2, any further
+# positional argument is ignored.
+foreach my $arg (@ARGV){
+	if($arg =~ m/^--?(help|h)$/){
+		$help=1;
+	}elsif($arg =~ m/^--?(format|f)$/){
+		$notableformat=0;
+	}elsif($arg =~ m/^-.+/){
+		# unknown switch: show the usage and abort immediately
+		print $usage;
+		print STDERR "ERROR: invalid option ".$arg."!\n\n";
+		exit(1);
+	}elsif($graphfilenameA eq ""){
+		$graphfilenameA = $arg;
+	}elsif($graphfilenameB eq ""){
+		$graphfilenameB = $arg;
+	}
+}
+
+if($help){
+	print $usage;
+	exit(0);
+}
+
+# Collect validation errors; GRAPH is the only mandatory argument.
+my $fail="";
+$fail.="ERROR: GRAPH not provided!\n" if $graphfilenameA eq "";
+if($fail ne ""){
+	print $usage.$fail;
+	exit(1);
+}
+# Shared state of the table formatter (read and updated by processLine).
+# Width budget of one output line: the value printed by `tput cols`, or 160
+# when that is empty, not a plain number, or smaller than 10.
+our $maxNumOfCharsInOneLine=`tput cols`;
+chomp($maxNumOfCharsInOneLine);
+if($maxNumOfCharsInOneLine !~ m/^\d+$/ || $maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
+our $split_delim="[:\t]";	# pattern handed to split(): cells are separated by ':' or tab
+our @spl_header;	# cells of the most recent header row, already padded/trimmed
+our @spl;	# cells of the row currently being formatted
+our $last_isHeaderLine=0;	# true if the previously printed row was a header row
+our $isHeaderLine=1;	# true if the next row given to processLine is a header row
+our $noheader=0;
+
+
+my %species_matrix;	# $species_matrix{A}{B}: number of edges between species A and B (both directions are always written together)
+my %species_matrix_pow2;	# same keys; set to 0 for every pair seen here, filled by the 2-path section
+my $currentSpeciesA;
+my $currentSpeciesB;
+
+# Read GRAPH and count the edges per species pair.
+#  - "# A<tab>B" (comment line with 2 fields) selects the current species pair,
+#  - a non-comment line with 6 fields is one edge of the current pair,
+#  - the two column-header lines and comment lines with 4 fields are ignored,
+#  - anything else aborts with a format error.
+open(my $FH,"<",$graphfilenameA) || die "ERROR: cannot open '$graphfilenameA': $!\n";
+while(my $line=<$FH>){
+	$line=~s/[\r\n]+$//;	# strip LF as well as CRLF line endings before comparing
+	if($line eq ""){next;}	# blank lines are skipped (this test has to come after the line ending is removed)
+	if($line eq "# file_a\tfile_b" || $line eq "# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba"){next;}
+	my @arr=split("\t",$line);
+	my $isComment=(substr($line,0,1) eq "#");
+	if($isComment && scalar @arr == 2){
+		$currentSpeciesA=$arr[0];
+		$currentSpeciesB=$arr[1];
+		$currentSpeciesA=~s/^# ?//g;
+	}elsif(!$isComment && scalar @arr == 6){
+		if(!exists $species_matrix{$currentSpeciesA}{$currentSpeciesB}){
+			$species_matrix{$currentSpeciesA}{$currentSpeciesB} = 1;
+			$species_matrix{$currentSpeciesB}{$currentSpeciesA} = 1;
+			$species_matrix_pow2{$currentSpeciesA}{$currentSpeciesB} = 0;
+			$species_matrix_pow2{$currentSpeciesB}{$currentSpeciesA} = 0;
+		}else{
+			$species_matrix{$currentSpeciesA}{$currentSpeciesB} ++;
+			$species_matrix{$currentSpeciesB}{$currentSpeciesA} ++;
+		}
+	}elsif( !($isComment && scalar @arr == 4) ){
+		print STDERR "[STDERR] Error: wrong fromat... Please make sure you only provide *.blast-graph or *.proteinortho-graph files as input...\n";die;
+	}
+}
+close($FH);
+
+# Optional second graph: every edge of GRAPH2 decrements the count of its
+# species pair, so the matrix ends up holding the difference of both graphs.
+if($graphfilenameB ne ""){
+	open(my $FH,"<",$graphfilenameB) || die "ERROR: cannot open '$graphfilenameB': $!\n";
+	while(my $line=<$FH>){
+		$line=~s/[\r\n]+$//;	# strip LF as well as CRLF line endings
+		if($line eq ""){next;}	# a blank line is not an edge and must not change any count
+		my @arr=split("\t",$line);
+		if(substr($line,0,1) eq "#"){
+			# only comment lines with exactly two fields name a species pair,
+			# all other comment lines are ignored
+			if(scalar @arr == 2){
+				$currentSpeciesA=$arr[0];
+				$currentSpeciesB=$arr[1];
+				$currentSpeciesA=~s/^# ?//g;
+			}
+		}else{
+			if(!exists $species_matrix{$currentSpeciesA}{$currentSpeciesB}){
+				# NOTE(review): a pair that occurs only in GRAPH2 starts at 1, not -1,
+				# although the usage text says GRAPH - GRAPH2 -- confirm this is intended.
+				$species_matrix{$currentSpeciesA}{$currentSpeciesB} = 1;
+				$species_matrix{$currentSpeciesB}{$currentSpeciesA} = 1;
+				$species_matrix_pow2{$currentSpeciesA}{$currentSpeciesB} = 0;
+				$species_matrix_pow2{$currentSpeciesB}{$currentSpeciesA} = 0;
+			}else{
+				$species_matrix{$currentSpeciesA}{$currentSpeciesB} --;
+				$species_matrix{$currentSpeciesB}{$currentSpeciesA} --;
+			}
+		}
+	}
+	close($FH);
+}
+
+my @keys=sort keys %species_matrix;
+
+# start a new table: the next row given to processLine is its header
+$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();
+
+print STDERR "\n";
+my $ret= "# The adjacency matrix, the number of edges between 2 species\n";
+processLine($ret);
+
+# With more than 10 species in table mode the columns are labelled "(i)" and
+# the row labels carry the same index, otherwise the species name is used.
+my $useIndexLabels=(scalar @keys>10 && !$notableformat);
+
+$ret= "# file\t";
+for(my $i = 0 ; $i < scalar @keys; $i++){
+	$ret.=($useIndexLabels ? "($i)" : $keys[$i])."\t";
+}
+$ret.= "\n";
+processLine($ret);
+
+for(my $i = 0 ; $i < scalar @keys; $i++){
+	my @cells;
+	for(my $j = 0 ; $j < scalar @keys; $j++){
+		# the diagonal is forced to 0; the sections below read these stored zeros
+		if($i==$j){$species_matrix{$keys[$i]}{$keys[$j]}=0;}
+		my $edges=$species_matrix{$keys[$i]}{$keys[$j]};
+		# A pair without any edge has no entry. Print 0 instead of an empty cell:
+		# split() drops trailing empty fields, so a row ending in empty cells
+		# would have fewer cells than the header and be formatted as a header.
+		push(@cells, defined($edges) ? $edges : 0);
+	}
+	$ret=$keys[$i].($useIndexLabels ? "($i)" : "")."\t".join("\t",@cells)."\n";
+	processLine($ret);
+}
+
+# start a new table that may use half of the terminal width
+$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();
+$maxNumOfCharsInOneLine=`tput cols`;
+chomp($maxNumOfCharsInOneLine);$maxNumOfCharsInOneLine/=2;
+if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
+
+print STDERR "\n";
+$ret= "# file\taverage number of edges\n";
+processLine($ret);
+
+# One row per species: the row sum of the adjacency matrix divided by the
+# number of species.
+for(my $i = 0 ; $i < scalar @keys; $i++){
+	my $sum=0;
+	for(my $j = 0 ; $j < scalar @keys; $j++){
+		$sum+=($species_matrix{$keys[$i]}{$keys[$j]} || 0);	# pairs without an entry count as 0
+	}
+	# The row ends right after the average. The former "\t" appended here was
+	# guarded by $j, which is out of scope after the loop and therefore always
+	# undefined, so a stray trailing tab was written for two or more species.
+	$ret=$keys[$i]."\t".($sum/scalar(@keys))."\n";
+	processLine($ret);
+}
+
+# start a new table that may use the full terminal width
+$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();
+$maxNumOfCharsInOneLine=`tput cols`;
+chomp($maxNumOfCharsInOneLine);
+if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
+
+print STDERR "\n";
+$ret= "# The 2-path matrix, the number of paths between 2 species of length 2\n";
+processLine($ret);
+$ret= "# file\t";
+for(my $i = 0 ; $i < scalar @keys; $i++){
+	if(scalar @keys>10 && !$notableformat){$ret.= "($i)\t";}
+	else{$ret.=$keys[$i]."\t";}
+}
+$ret.= "\n";
+processLine($ret);
+
+# Entry (i,j) is the (i,j) element of the squared adjacency matrix:
+# the sum over k of edges(i,k) * edges(k,j).
+for(my $i = 0 ; $i < scalar @keys; $i++){
+	$ret= $keys[$i]."($i)\t";
+	for(my $j = 0 ; $j < scalar @keys; $j++){
+		# Each cell is computed from scratch and assigned exactly once. Adding
+		# onto the stored cell and mirroring it to (j,i), as done before, made
+		# the later visit of (j,i) add the same sum a second time, so the lower
+		# triangle (and the averages derived from the matrix) came out doubled.
+		my $paths=0;
+		for(my $k = 0 ; $k < scalar @keys; $k++){
+			$paths+=($species_matrix{$keys[$i]}{$keys[$k]} || 0)*($species_matrix{$keys[$k]}{$keys[$j]} || 0);
+		}
+		$species_matrix_pow2{$keys[$i]}{$keys[$j]}=$paths;
+		$ret.= $paths;
+		if($j<scalar @keys -1){$ret.= "\t";}
+	}
+	$ret.= "\n";
+	processLine($ret);
+}
+
+# Reset the formatter for a new table and give it half of the terminal width.
+$noheader=0;
+$last_isHeaderLine=0;
+$isHeaderLine=1;
+@spl_header=();
+@spl=();
+$maxNumOfCharsInOneLine=`tput cols`;
+chomp($maxNumOfCharsInOneLine);
+$maxNumOfCharsInOneLine/=2;
+$maxNumOfCharsInOneLine=160 if $maxNumOfCharsInOneLine<10;
+
+print STDERR "\n";
+processLine("# file\taverage number of 2-paths\n");
+
+# One row per species: the row sum of the 2-path matrix divided by the number
+# of species, labelled with the species name and its index.
+foreach my $row (0 .. $#keys){
+	my $total=0;
+	foreach my $col (0 .. $#keys){
+		$total+=$species_matrix_pow2{$keys[$row]}{$keys[$col]};
+	}
+	my $average=$total/scalar(@keys);
+	processLine($keys[$row]."($row)\t".$average."\n");
+}
+
+
+
+
+
+
+
+
+# padCellByOne(CELL)
+# Returns CELL with one blank added: in front if the cell has fewer leading
+# than trailing blanks, otherwise at the end. Repeated calls therefore centre
+# the text of the cell.
+sub padCellByOne{
+	my $cell=shift;
+	my $leftPad=0;
+	my $rightPad=0;
+	if($cell=~m/( +)$/){$rightPad=length $1}
+	if($cell=~m/^( +)/){$leftPad=length $1}
+	return $leftPad < $rightPad ? " ".$cell : $cell." ";
+}
+
+# processLine(LINE)
+# Prints one row of output on STDOUT.
+#  - plain mode ($notableformat == 1): LINE is printed unchanged.
+#  - table mode: LINE is split into cells on $split_delim. Lines with fewer
+#    than two cells are printed unchanged. A header row is shrunk or padded
+#    until it fills $maxNumOfCharsInOneLine, stored in @spl_header and framed
+#    by "+---+" rulers; every other row is padded or truncated cell by cell
+#    to the widths of @spl_header. Cells are joined with '|'.
+# A row whose number of cells differs from the stored header is treated as a
+# new header. Reads and updates the globals @spl, @spl_header, $isHeaderLine,
+# $last_isHeaderLine and $maxNumOfCharsInOneLine.
+# Returns 1 if the line was passed through or skipped, 0 after a formatted row.
+sub processLine{
+	my $line=shift;
+	chomp($line);
+	if($notableformat == 1){print "$line\n";return 1;}
+
+	if(length($line)<1){return 1;}
+
+	@spl=split($split_delim,$line);
+
+	if(scalar @spl <2){print "$line\n";return 1;}
+
+	# untouched copy of the cells, used when a cell has to be re-cut from its original text
+	my @spl_backup=@spl;
+
+	if(scalar @spl_header > 0 && scalar @spl != scalar @spl_header){$isHeaderLine=1;}
+	if(substr($spl[0],0,1) eq "#"){$spl[0]=~s/^# ?//g;}
+	# every column needs at least one character plus one separator between columns
+	if(scalar(@spl)*2-1>$maxNumOfCharsInOneLine){$maxNumOfCharsInOneLine= -1+2*scalar @spl;print STDERR "Corrected minimum table width: -w=$maxNumOfCharsInOneLine such that at least 1 character per column is displayed.\n";}
+
+	my $sumOfCharsLine=length(join("",@spl));
+
+	if($isHeaderLine){ # is a header row
+		while(($sumOfCharsLine + scalar @spl-1) > $maxNumOfCharsInOneLine){ # shave off chars from the widest cells
+			my $max_l=0;
+			my @max_l_is;	# indices of all cells sharing the maximal width
+			for (my $i = 0; $i < scalar @spl; $i++) {
+				if($max_l < length($spl[$i])){$max_l=length($spl[$i]);@max_l_is=();push(@max_l_is,$i)}
+				elsif($max_l == length($spl[$i])){push(@max_l_is,$i)}
+			}
+			for (my $i = 0; $i < scalar @max_l_is; $i++) {
+				my $idx=$max_l_is[$i];
+				if(length($spl[$idx]) > 8 && substr($spl[$idx],-3) ne "..." ){
+					# first cut: drop 4 characters and mark the cell with "..." (1 shorter)
+					$spl[$idx]=substr($spl[$idx],0,length($spl[$idx])-3-1)."...";
+				}
+				else{
+					# afterwards: plain prefix of the original text, 1 shorter each time
+					$spl[$idx]=substr($spl_backup[$idx],0,length($spl[$idx])-1);
+				}
+			}
+			$sumOfCharsLine=length(join("",@spl));
+		}
+
+		while(($sumOfCharsLine + scalar @spl-1) < $maxNumOfCharsInOneLine ){ # add chars to the smallest cell
+			my $min_l=$maxNumOfCharsInOneLine*10;
+			my @min_l_is;	# holds only the first cell of minimal width
+			for (my $i = 0; $i < scalar @spl; $i++) {
+				if($min_l > length($spl[$i])){$min_l=length($spl[$i]);@min_l_is=();push(@min_l_is,$i)}
+			}
+			for (my $i = 0; $i < scalar @min_l_is; $i++) {
+				$spl[$min_l_is[$i]]=padCellByOne($spl[$min_l_is[$i]]);
+			}
+			$sumOfCharsLine=length(join("",@spl));
+		}
+
+		@spl_header=@spl;
+
+	}else{ # is not a header row -> fit every cell to the width of its header cell
+
+		while(scalar @spl > scalar @spl_header){pop @spl;}
+
+		for (my $i = 0; $i < scalar @spl; $i++) {
+			while(length($spl[$i]) < length($spl_header[$i])){ # add pads
+				$spl[$i]=padCellByOne($spl[$i]);
+			}
+			while(length($spl[$i]) > length($spl_header[$i])){ # trim
+				if(length($spl[$i]) > 5 && substr($spl[$i],-3) ne "..." ){
+					$spl[$i]=substr($spl[$i],0,length($spl[$i])-3-1)."...";
+				}
+				else{
+					$spl[$i]=substr($spl_backup[$i],0,length($spl[$i])-2)."#";
+				}
+			}
+		}
+	}
+
+	# ruler under/over a header: '+' at every column separator, '-' elsewhere
+	my $ruler=join("|",@spl);$ruler=~s/\|/+/g;$ruler=~s/[^+]/-/g;
+
+	if($isHeaderLine && !$last_isHeaderLine ){print "$ruler\n";}
+	print join("|",@spl);
+	if($isHeaderLine ){print "\n";print "$ruler";}
+	print "\n";
+	$last_isHeaderLine=$isHeaderLine;
+	$isHeaderLine=0;
+
+	return 0;
+}
+



View it on GitLab: https://salsa.debian.org/med-team/proteinortho/commit/83071ae5ce0d7816bbf6ec8d0ba0ccae976ebc7e

-- 
View it on GitLab: https://salsa.debian.org/med-team/proteinortho/commit/83071ae5ce0d7816bbf6ec8d0ba0ccae976ebc7e
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191212/202f8235/attachment-0001.html>


More information about the debian-med-commit mailing list